Skip to content

Commit be49399

Browse files
authored
grapheme: Add splitGraphemes() utility (#63)
1 parent 027c991 commit be49399

File tree

5 files changed

+95
-9
lines changed

5 files changed

+95
-9
lines changed

.changeset/many-years-deny.md

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,5 @@
1+
---
2+
"unicode-segmenter": patch
3+
---
4+
5+
Add `splitGraphemes()` utility

README.md

Lines changed: 16 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -58,6 +58,19 @@ import { graphemeSegments } from 'unicode-segmenter/grapheme';
5858
// 3: { segment: '\r\n', index: 7, input: 'a̐éö̲\r\n' }
5959
```
6060

61+
#### Example: Split graphemes
62+
63+
```js
64+
import { splitGraphemes } from 'unicode-segmenter/grapheme';
65+
66+
[...splitGraphemes('#️⃣*️⃣0️⃣1️⃣2️⃣')];
67+
// 0: #️⃣
68+
// 1: *️⃣
69+
// 2: 0️⃣
70+
// 3: 1️⃣
71+
// 4: 2️⃣
72+
```
73+
6174
#### Example: Count graphemes
6275

6376
```js
@@ -77,7 +90,7 @@ countGrapheme('a̐éö̲');
7790
> [!NOTE]
7891
> `countGrapheme()` is a small wrapper around `graphemeSegments()`.
7992
>
80-
> If you call it more than once, use `graphemeSegments()` once instead, Or memoize it yourself.
93+
> If you need it more than once at a time, consider memoizing the result, or call `graphemeSegments()` or `splitGraphemes()` once instead.
8194
8295
#### Example: Build an advanced grapheme matcher
8396

@@ -238,7 +251,7 @@ Since [Hermes doesn't support the `Intl.Segmenter` API](https://github.com/faceb
238251

239252
| Name | Unicode® | ESM? | Size | Size (min) | Size (min+gzip) | Size (min+br) |
240253
|------------------------------|----------|------|----------:|-----------:|----------------:|--------------:|
241-
| `unicode-segmenter/grapheme` | 16.0.0 | ✔️ | 17,125 | 12,720 | 5,256 | 3,913 |
254+
| `unicode-segmenter/grapheme` | 16.0.0 | ✔️ | 17,347 | 12,822 | 5,307 | 4,093 |
242255
| `graphemer` | 15.0.0 | ✖️ | 410,435 | 95,104 | 15,752 | 10,660 |
243256
| `grapheme-splitter` | 10.0.0 | ✖️ | 122,252 | 23,680 | 7,852 | 4,841 |
244257
| `@formatjs/intl-segmenter`* | 15.0.0 | ✖️ | 491,043 | 318,721 | 54,248 | 34,380 |
@@ -254,7 +267,7 @@ Since [Hermes doesn't support the `Intl.Segmenter` API](https://github.com/faceb
254267

255268
| Name | Bytecode size | Bytecode size (gzip)* |
256269
|------------------------------|--------------:|----------------------:|
257-
| `unicode-segmenter/grapheme` | 23,992 | 12,533 |
270+
| `unicode-segmenter/grapheme` | 24,521 | 12,773 |
258271
| `graphemer` | 133,949 | 31,710 |
259272
| `grapheme-splitter` | 63,810 | 19,125 |
260273
| `@formatjs/intl-segmenter`* | 315,865 | 99,063 |
Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1 +1,5 @@
1-
export { graphemeSegments } from '../../../src/grapheme.js';
1+
export {
2+
graphemeSegments,
3+
countGrapheme,
4+
splitGraphemes,
5+
} from '../../../src/grapheme.js';

src/grapheme.js

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -195,6 +195,14 @@ export function countGrapheme(str) {
195195
return count;
196196
}
197197

198+
/**
199+
* @param {string} str
200+
* @return {IterableIterator<string>}
201+
*/
202+
export function* splitGraphemes(str) {
203+
for (let s of graphemeSegments(str)) yield s.segment;
204+
}
205+
198206
/**
199207
* `Grapheme_Cluster_Break` property value of a given codepoint
200208
*

test/grapheme.js

Lines changed: 61 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,12 @@ import { test } from 'node:test';
44
import * as assert from 'node:assert/strict';
55
import fc from 'fast-check';
66

7-
import { graphemeSegments, countGrapheme, GraphemeCategory } from 'unicode-segmenter/grapheme';
7+
import {
8+
GraphemeCategory,
9+
graphemeSegments,
10+
countGrapheme,
11+
splitGraphemes,
12+
} from 'unicode-segmenter/grapheme';
813
import { assertObjectContaining } from './_helper.js';
914

1015
test('graphemeSegments', async t => {
@@ -74,10 +79,6 @@ test('countGrapheme', async t => {
7479
assert.equal(countGrapheme('abcd'), 4);
7580
});
7681

77-
await t.test('latin', () => {
78-
assert.equal(countGrapheme('abcd'), 4);
79-
});
80-
8182
await t.test('flags', () => {
8283
assert.equal(countGrapheme('🇷🇸🇮🇴'), 2);
8384
});
@@ -104,6 +105,61 @@ test('countGrapheme', async t => {
104105
});
105106
});
106107

108+
test('splitGrapheme', async t => {
109+
await t.test('latin', () => {
110+
assert.deepEqual(
111+
[...splitGraphemes('abcd')],
112+
['a', 'b', 'c', 'd'],
113+
);
114+
});
115+
116+
await t.test('flags', () => {
117+
assert.deepEqual(
118+
[...splitGraphemes('🇷🇸🇮🇴')],
119+
['🇷🇸', '🇮🇴'],
120+
);
121+
});
122+
123+
await t.test('emoji', () => {
124+
assert.deepEqual(
125+
[...splitGraphemes('👻👩‍👩‍👦‍👦')],
126+
['👻', '👩‍👩‍👦‍👦'],
127+
);
128+
assert.deepEqual(
129+
[...splitGraphemes('🌷🎁💩😜👍🏳️‍🌈')],
130+
['🌷', '🎁', '💩', '😜', '👍', '🏳️‍🌈'],
131+
);
132+
});
133+
134+
await t.test('diacritics as combining marks', () => {
135+
assert.deepEqual(
136+
[...splitGraphemes('Ĺo͂řȩm̅')],
137+
['Ĺ', 'o͂', 'ř', 'ȩ', 'm̅'],
138+
);
139+
});
140+
141+
await t.test('Jamo', () => {
142+
assert.deepEqual(
143+
[...splitGraphemes('가갉')],
144+
['가', '갉'],
145+
);
146+
});
147+
148+
await t.test('Hindi', () => {
149+
assert.deepEqual(
150+
[...splitGraphemes('अनुच्छेद')],
151+
['अ', 'नु', 'च्छे', 'द'],
152+
);
153+
});
154+
155+
await t.test('demonic', () => {
156+
assert.deepEqual(
157+
[...splitGraphemes('Z͑ͫ̓ͪ̂ͫ̽͏̴̙̤̞͉͚̯̞̠͍A̴̵̜̰͔ͫ͗͢L̠ͨͧͩ͘G̴̻͈͍͔̹̑͗̎̅͛́Ǫ̵̹̻̝̳͂̌̌͘!͖̬̰̙̗̿̋ͥͥ̂ͣ̐́́͜͞')],
158+
['Z͑ͫ̓ͪ̂ͫ̽͏̴̙̤̞͉͚̯̞̠͍', 'A̴̵̜̰͔ͫ͗͢', 'L̠ͨͧͩ͘', 'G̴̻͈͍͔̹̑͗̎̅͛́', 'Ǫ̵̹̻̝̳͂̌̌͘', '!͖̬̰̙̗̿̋ͥͥ̂ͣ̐́́͜͞'],
159+
);
160+
});
161+
});
162+
107163
test('spec compliant', async t => {
108164
fc.configureGlobal({
109165
// Fix seed here for stable coverage report

0 commit comments

Comments
 (0)