Skip to content

Commit 75492dc

Browse files
authored
expose head code point of a segment (#75)
* expose head code point of a segment * changeset
1 parent 07642a2 commit 75492dc

File tree

5 files changed

+56
-28
lines changed

5 files changed

+56
-28
lines changed

.changeset/gentle-suns-design.md

Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,21 @@
1+
---
2+
"unicode-segmenter": minor
3+
---
4+
5+
Expose an internal state: `_hd`.
6+
7+
The first code point of a segment, which often needs to be bounds-checked.
8+
9+
For example,
10+
11+
```ts
12+
for (const { segment } of graphemeSegments(text)) {
13+
const cp = segment.codePointAt(0)!;
14+
// This also needs a `!` (non-null) assertion in TypeScript.
15+
if (isBMP(cp)) {
16+
// ...
17+
}
18+
}
19+
```
20+
21+
It can be replaced by the `_hd` state, with no additional overhead.

README.md

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -254,7 +254,7 @@ Since [Hermes doesn't support the `Intl.Segmenter` API](https://github.com/faceb
254254

255255
| Name | Unicode® | ESM? | Size | Size (min) | Size (min+gzip) | Size (min+br) |
256256
|------------------------------|----------|------|----------:|-----------:|----------------:|--------------:|
257-
| `unicode-segmenter/grapheme` | 16.0.0 | ✔️ | 15,929 | 12,110 | 5,050 | 3,738 |
257+
| `unicode-segmenter/grapheme` | 16.0.0 | ✔️ | 15,997 | 12,130 | 5,061 | 3,751 |
258258
| `graphemer` | 15.0.0 | ✖️ ️| 410,435 | 95,104 | 15,752 | 10,660 |
259259
| `grapheme-splitter` | 10.0.0 | ✖️ | 122,252 | 23,680 | 7,852 | 4,841 |
260260
| `@formatjs/intl-segmenter`* | 15.0.0 | ✖️ | 603,285 | 369,560 | 72,218 | 49,416 |
@@ -270,7 +270,7 @@ Since [Hermes doesn't support the `Intl.Segmenter` API](https://github.com/faceb
270270

271271
| Name | Bytecode size | Bytecode size (gzip)* |
272272
|------------------------------|--------------:|----------------------:|
273-
| `unicode-segmenter/grapheme` | 22,019 | 11,513 |
273+
| `unicode-segmenter/grapheme` | 22,061 | 11,539 |
274274
| `graphemer` | 133,974 | 31,715 |
275275
| `grapheme-splitter` | 63,855 | 19,133 |
276276

src/grapheme.js

Lines changed: 8 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,7 @@ import { consonant_ranges } from './_incb_data.js';
2525
* @typedef {import('./_grapheme_data.js').GraphemeCategoryRange} GraphemeCategoryRange
2626
*
2727
* @typedef {object} GraphemeSegmentExtra
28+
* @property {number} _hd The first code point of the segment
2829
* @property {GraphemeCategoryNum} _catBegin Beginning Grapheme_Cluster_Break category of the segment
2930
* @property {GraphemeCategoryNum} _catEnd Ending Grapheme_Cluster_Break category of the segment
3031
*
@@ -81,7 +82,10 @@ export function* graphemeSegments(input) {
8182
/** InCB=Consonant InCB=Linker x InCB=Consonant */
8283
let incb = false;
8384

84-
let cp = /** @type number */ (input.codePointAt(cursor));
85+
let cp = /** @type {number} */ (input.codePointAt(cursor));
86+
87+
/** Memoize the beginning code point of the segment. */
88+
let _hd = cp;
8589

8690
let index = 0;
8791
let segment = '';
@@ -117,6 +121,7 @@ export function* graphemeSegments(input) {
117121
segment,
118122
index,
119123
input,
124+
_hd,
120125
_catBegin: /** @type {typeof catBefore} */ (catBegin),
121126
_catEnd: catBefore,
122127
};
@@ -146,6 +151,7 @@ export function* graphemeSegments(input) {
146151
segment,
147152
index,
148153
input,
154+
_hd,
149155
_catBegin: /** @type {typeof catBefore} */ (catBegin),
150156
_catEnd: catBefore,
151157
};
@@ -156,6 +162,7 @@ export function* graphemeSegments(input) {
156162
emoji = false;
157163
incb = false;
158164
catBegin = catAfter;
165+
_hd = cp;
159166
}
160167
}
161168
}

test/grapheme.js

Lines changed: 16 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -21,12 +21,12 @@ test('graphemeSegments', async t => {
2121
assert.deepEqual(
2222
[...graphemeSegments('abc123')],
2323
[
24-
{ segment: 'a', index: 0, input: 'abc123', _catBegin: GraphemeCategory.Any, _catEnd: GraphemeCategory.Any },
25-
{ segment: 'b', index: 1, input: 'abc123', _catBegin: GraphemeCategory.Any, _catEnd: GraphemeCategory.Any },
26-
{ segment: 'c', index: 2, input: 'abc123', _catBegin: GraphemeCategory.Any, _catEnd: GraphemeCategory.Any },
27-
{ segment: '1', index: 3, input: 'abc123', _catBegin: GraphemeCategory.Any, _catEnd: GraphemeCategory.Any },
28-
{ segment: '2', index: 4, input: 'abc123', _catBegin: GraphemeCategory.Any, _catEnd: GraphemeCategory.Any },
29-
{ segment: '3', index: 5, input: 'abc123', _catBegin: GraphemeCategory.Any, _catEnd: GraphemeCategory.Any },
24+
{ segment: 'a', index: 0, input: 'abc123', _hd: 'a'.codePointAt(0), _catBegin: GraphemeCategory.Any, _catEnd: GraphemeCategory.Any },
25+
{ segment: 'b', index: 1, input: 'abc123', _hd: 'b'.codePointAt(0), _catBegin: GraphemeCategory.Any, _catEnd: GraphemeCategory.Any },
26+
{ segment: 'c', index: 2, input: 'abc123', _hd: 'c'.codePointAt(0), _catBegin: GraphemeCategory.Any, _catEnd: GraphemeCategory.Any },
27+
{ segment: '1', index: 3, input: 'abc123', _hd: '1'.codePointAt(0), _catBegin: GraphemeCategory.Any, _catEnd: GraphemeCategory.Any },
28+
{ segment: '2', index: 4, input: 'abc123', _hd: '2'.codePointAt(0), _catBegin: GraphemeCategory.Any, _catEnd: GraphemeCategory.Any },
29+
{ segment: '3', index: 5, input: 'abc123', _hd: '3'.codePointAt(0), _catBegin: GraphemeCategory.Any, _catEnd: GraphemeCategory.Any },
3030
],
3131
);
3232
});
@@ -35,10 +35,10 @@ test('graphemeSegments', async t => {
3535
assert.deepEqual(
3636
[...graphemeSegments('a̐éö̲\r\n')],
3737
[
38-
{ segment: 'a̐', index: 0, input: 'a̐éö̲\r\n', _catBegin: GraphemeCategory.Any, _catEnd: GraphemeCategory.Extend },
39-
{ segment: 'é', index: 2, input: 'a̐éö̲\r\n', _catBegin: GraphemeCategory.Any, _catEnd: GraphemeCategory.Extend },
40-
{ segment: 'ö̲', index: 4, input: 'a̐éö̲\r\n', _catBegin: GraphemeCategory.Any, _catEnd: GraphemeCategory.Extend },
41-
{ segment: '\r\n', index: 7, input: 'a̐éö̲\r\n', _catBegin: GraphemeCategory.CR, _catEnd: GraphemeCategory.LF },
38+
{ segment: 'a̐', index: 0, input: 'a̐éö̲\r\n', _hd: 'a̐'.codePointAt(0), _catBegin: GraphemeCategory.Any, _catEnd: GraphemeCategory.Extend },
39+
{ segment: 'é', index: 2, input: 'a̐éö̲\r\n', _hd: 'é'.codePointAt(0), _catBegin: GraphemeCategory.Any, _catEnd: GraphemeCategory.Extend },
40+
{ segment: 'ö̲', index: 4, input: 'a̐éö̲\r\n', _hd: 'ö̲'.codePointAt(0), _catBegin: GraphemeCategory.Any, _catEnd: GraphemeCategory.Extend },
41+
{ segment: '\r\n', index: 7, input: 'a̐éö̲\r\n', _hd: '\r\n'.codePointAt(0), _catBegin: GraphemeCategory.CR, _catEnd: GraphemeCategory.LF },
4242
],
4343
);
4444
});
@@ -47,8 +47,8 @@ test('graphemeSegments', async t => {
4747
assert.deepEqual(
4848
[...graphemeSegments('🇷🇸🇮🇴')],
4949
[
50-
{ segment: '🇷🇸', index: 0, input: '🇷🇸🇮🇴', _catBegin: GraphemeCategory.Regional_Indicator, _catEnd: GraphemeCategory.Regional_Indicator },
51-
{ segment: '🇮🇴', index: 4, input: '🇷🇸🇮🇴', _catBegin: GraphemeCategory.Regional_Indicator, _catEnd: GraphemeCategory.Regional_Indicator },
50+
{ segment: '🇷🇸', index: 0, input: '🇷🇸🇮🇴', _hd: '🇷🇸'.codePointAt(0), _catBegin: GraphemeCategory.Regional_Indicator, _catEnd: GraphemeCategory.Regional_Indicator },
51+
{ segment: '🇮🇴', index: 4, input: '🇷🇸🇮🇴', _hd: '🇮🇴'.codePointAt(0), _catBegin: GraphemeCategory.Regional_Indicator, _catEnd: GraphemeCategory.Regional_Indicator },
5252
],
5353
);
5454
});
@@ -57,8 +57,8 @@ test('graphemeSegments', async t => {
5757
assert.deepEqual(
5858
[...graphemeSegments('🇷🇸🇮')],
5959
[
60-
{ segment: '🇷🇸', index: 0, input: '🇷🇸🇮', _catBegin: GraphemeCategory.Regional_Indicator, _catEnd: GraphemeCategory.Regional_Indicator },
61-
{ segment: '🇮', index: 4, input: '🇷🇸🇮', _catBegin: GraphemeCategory.Regional_Indicator, _catEnd: GraphemeCategory.Regional_Indicator },
60+
{ segment: '🇷🇸', index: 0, input: '🇷🇸🇮', _hd: '🇷🇸'.codePointAt(0), _catBegin: GraphemeCategory.Regional_Indicator, _catEnd: GraphemeCategory.Regional_Indicator },
61+
{ segment: '🇮', index: 4, input: '🇷🇸🇮', _hd: '🇮'.codePointAt(0), _catBegin: GraphemeCategory.Regional_Indicator, _catEnd: GraphemeCategory.Regional_Indicator },
6262
],
6363
);
6464
});
@@ -67,8 +67,8 @@ test('graphemeSegments', async t => {
6767
assert.deepEqual(
6868
[...graphemeSegments('👻👩‍👩‍👦‍👦')],
6969
[
70-
{ segment: '👻', index: 0, input: '👻👩‍👩‍👦‍👦', _catBegin: GraphemeCategory.Extended_Pictographic, _catEnd: GraphemeCategory.Extended_Pictographic },
71-
{ segment: '👩‍👩‍👦‍👦', index: 2, input: '👻👩‍👩‍👦‍👦', _catBegin: GraphemeCategory.Extended_Pictographic, _catEnd: GraphemeCategory.Extended_Pictographic },
70+
{ segment: '👻', index: 0, input: '👻👩‍👩‍👦‍👦', _hd: '👻'.codePointAt(0), _catBegin: GraphemeCategory.Extended_Pictographic, _catEnd: GraphemeCategory.Extended_Pictographic },
71+
{ segment: '👩‍👩‍👦‍👦', index: 2, input: '👻👩‍👩‍👦‍👦', _hd: '👩‍👩‍👦‍👦'.codePointAt(0), _catBegin: GraphemeCategory.Extended_Pictographic, _catEnd: GraphemeCategory.Extended_Pictographic },
7272
],
7373
);
7474
});

test/intl-adapter.js

Lines changed: 9 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -37,39 +37,39 @@ test('containing', async _ => {
3737

3838
assert.deepEqual(
3939
segments.containing(0),
40-
{ segment: 'a̐', index: 0, input: 'a̐éö̲\r\n', _catBegin: GraphemeCategory.Any, _catEnd: GraphemeCategory.Extend },
40+
{ segment: 'a̐', index: 0, input: 'a̐éö̲\r\n', _hd: 'a̐'.codePointAt(0), _catBegin: GraphemeCategory.Any, _catEnd: GraphemeCategory.Extend },
4141
);
4242
assert.deepEqual(
4343
segments.containing(1),
44-
{ segment: 'a̐', index: 0, input: 'a̐éö̲\r\n', _catBegin: GraphemeCategory.Any, _catEnd: GraphemeCategory.Extend },
44+
{ segment: 'a̐', index: 0, input: 'a̐éö̲\r\n', _hd: 'a̐'.codePointAt(0), _catBegin: GraphemeCategory.Any, _catEnd: GraphemeCategory.Extend },
4545
);
4646
assert.deepEqual(
4747
segments.containing(2),
48-
{ segment: 'é', index: 2, input: 'a̐éö̲\r\n', _catBegin: GraphemeCategory.Any, _catEnd: GraphemeCategory.Extend },
48+
{ segment: 'é', index: 2, input: 'a̐éö̲\r\n', _hd: 'é'.codePointAt(0), _catBegin: GraphemeCategory.Any, _catEnd: GraphemeCategory.Extend },
4949
);
5050
assert.deepEqual(
5151
segments.containing(3),
52-
{ segment: 'é', index: 2, input: 'a̐éö̲\r\n', _catBegin: GraphemeCategory.Any, _catEnd: GraphemeCategory.Extend },
52+
{ segment: 'é', index: 2, input: 'a̐éö̲\r\n', _hd: 'é'.codePointAt(0), _catBegin: GraphemeCategory.Any, _catEnd: GraphemeCategory.Extend },
5353
);
5454
assert.deepEqual(
5555
segments.containing(4),
56-
{ segment: 'ö̲', index: 4, input: 'a̐éö̲\r\n', _catBegin: GraphemeCategory.Any, _catEnd: GraphemeCategory.Extend },
56+
{ segment: 'ö̲', index: 4, input: 'a̐éö̲\r\n', _hd: 'ö̲'.codePointAt(0), _catBegin: GraphemeCategory.Any, _catEnd: GraphemeCategory.Extend },
5757
);
5858
assert.deepEqual(
5959
segments.containing(5),
60-
{ segment: 'ö̲', index: 4, input: 'a̐éö̲\r\n', _catBegin: GraphemeCategory.Any, _catEnd: GraphemeCategory.Extend },
60+
{ segment: 'ö̲', index: 4, input: 'a̐éö̲\r\n', _hd: 'ö̲'.codePointAt(0), _catBegin: GraphemeCategory.Any, _catEnd: GraphemeCategory.Extend },
6161
);
6262
assert.deepEqual(
6363
segments.containing(6),
64-
{ segment: 'ö̲', index: 4, input: 'a̐éö̲\r\n', _catBegin: GraphemeCategory.Any, _catEnd: GraphemeCategory.Extend },
64+
{ segment: 'ö̲', index: 4, input: 'a̐éö̲\r\n', _hd: 'ö̲'.codePointAt(0), _catBegin: GraphemeCategory.Any, _catEnd: GraphemeCategory.Extend },
6565
);
6666
assert.deepEqual(
6767
segments.containing(7),
68-
{ segment: '\r\n', index: 7, input: 'a̐éö̲\r\n', _catBegin: GraphemeCategory.CR, _catEnd: GraphemeCategory.LF },
68+
{ segment: '\r\n', index: 7, input: 'a̐éö̲\r\n', _hd: '\r\n'.codePointAt(0), _catBegin: GraphemeCategory.CR, _catEnd: GraphemeCategory.LF },
6969
);
7070
assert.deepEqual(
7171
segments.containing(8),
72-
{ segment: '\r\n', index: 7, input: 'a̐éö̲\r\n', _catBegin: GraphemeCategory.CR, _catEnd: GraphemeCategory.LF },
72+
{ segment: '\r\n', index: 7, input: 'a̐éö̲\r\n', _hd: '\r\n'.codePointAt(0), _catBegin: GraphemeCategory.CR, _catEnd: GraphemeCategory.LF },
7373
);
7474
assert.equal(segments.containing(9), undefined);
7575
});

0 commit comments

Comments
 (0)