Skip to content

Commit 3665cf7

Browse files
authored
fix GB9c rule check to fix Hindi break (#55)
* fix GB9c rule check to fix Hindi break * update bundle stats * add changeset * clarify a bit * what a compression... * add notes
1 parent 4605f74 commit 3665cf7

File tree

4 files changed

+40
-16
lines changed

4 files changed

+40
-16
lines changed

.changeset/young-readers-act.md

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,5 @@
1+
---
2+
"unicode-segmenter": patch
3+
---
4+
5+
Fix Hindi text segmentation

README.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -233,7 +233,7 @@ See more on [benchmark](benchmark).
233233

234234
| Name | Unicode® | ESM? | Size | Size (min) | Size (min+gzip) | Size (min+br) |
235235
|------------------------------|----------|------|----------:|-----------:|----------------:|--------------:|
236-
| `unicode-segmenter/grapheme` | 15.1.0 | ✔️ | 28,288 | 24,616 | 6,592 | 4,433 |
236+
| `unicode-segmenter/grapheme` | 15.1.0 | ✔️ | 28,337 | 24,623 | 6,599 | 4,360 |
237237
| `graphemer` | 15.0.0 | ✖️ | 410,424 | 95,104 | 15,752 | 10,660 |
238238
| `grapheme-splitter` | 10.0.0 | ✖️ | 122,241 | 23,680 | 7,852 | 4,841 |
239239
| `unicode-segmentation`* | 15.0.0 | ✔️ | 51,251 | 51,251 | 22,545 | 16,614 |

src/grapheme.js

Lines changed: 32 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -108,9 +108,12 @@ export function* graphemeSegments(input) {
108108
catBegin = catBefore;
109109
}
110110

111+
// Note: Lazily update `consonant` and `linker` state
112+
// which is an extra overhead only for Hindi text.
111113
if (!consonant && catBefore === 0) {
112114
consonant = isIndicConjunctCosonant(cp);
113-
} else if (catBefore === 3) {
115+
} else if (catBefore === 3 /* Extend */) {
116+
// Note: \p{InCB=Linker} is a subset of \p{Extend}
114117
linker = isIndicConjunctLinker(cp);
115118
}
116119

@@ -130,7 +133,7 @@ export function* graphemeSegments(input) {
130133
return;
131134
}
132135

133-
if (catBefore === 10 /* Regional_Indicator*/) {
136+
if (catBefore === 10 /* Regional_Indicator */) {
134137
risCount += 1;
135138
} else {
136139
risCount = 0;
@@ -140,9 +143,11 @@ export function* graphemeSegments(input) {
140143
) {
141144
emoji = true;
142145

143-
// Put GB9c rule checking here to reduce.
146+
// Note: The GB9c rule check is placed in this branch to reduce how often it runs.
144147
} else if (catAfter === 0 /* Any */) {
145-
incb = consonant && linker && isIndicConjunctCosonant(cp);
148+
incb = consonant && linker && (consonant = isIndicConjunctCosonant(cp));
149+
// It cannot be both a linker and a consonant.
150+
linker = linker && !consonant;
146151
}
147152
}
148153

@@ -160,8 +165,7 @@ export function* graphemeSegments(input) {
160165
index = cursor;
161166
segment = '';
162167
emoji = false;
163-
consonant = false;
164-
linker = false;
168+
incb = false;
165169
catBegin = catAfter;
166170
}
167171
}
@@ -227,7 +231,14 @@ function isIndicConjunctCosonant(cp) {
227231
* @return {boolean}
228232
*/
229233
function isIndicConjunctLinker(cp) {
230-
return (cp === 0x094D || cp === 0x09CD || cp === 0x0ACD || cp === 0x0B4D || cp === 0x0C4D || cp === 0x0D4D);
234+
return (
235+
cp === 2381 /* 0x094D */ ||
236+
cp === 2509 /* 0x09CD */ ||
237+
cp === 2765 /* 0x0ACD */ ||
238+
cp === 2893 /* 0x0B4D */ ||
239+
cp === 3149 /* 0x0C4D */ ||
240+
cp === 3405 /* 0x0D4D */
241+
);
231242
}
232243

233244
/**
@@ -246,12 +257,13 @@ function isBoundary(catBefore, catAfter, risCount, emoji, incb) {
246257
return false;
247258
}
248259

249-
if (
250-
// GB4
251-
(catBefore === 1 || catBefore === 2 || catBefore === 6) ||
252-
// GB5
253-
(catAfter === 1 || catAfter === 2 || catAfter === 6)
254-
) {
260+
// GB4
261+
if (catBefore === 1 || catBefore === 2 || catBefore === 6) {
262+
return true;
263+
}
264+
265+
// GB5
266+
if (catAfter === 1 || catAfter === 2 || catAfter === 6) {
255267
return true;
256268
}
257269

@@ -279,8 +291,13 @@ function isBoundary(catBefore, catAfter, risCount, emoji, incb) {
279291
return false;
280292
}
281293

282-
// GB9. GB9a
283-
if (catAfter === 3 || catAfter === 11 || catAfter === 14) {
294+
// GB9
295+
if (catAfter === 3 || catAfter === 14) {
296+
return false;
297+
}
298+
299+
// GB9a
300+
if (catAfter === 11) {
284301
return false;
285302
}
286303

test/grapheme.js

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -178,6 +178,8 @@ test('counterexamples', async t => {
178178
'🇷🇸A',
179179
'👩‍🦰👩‍👩‍👦‍👦🏳️‍🌈',
180180
'अनुच्छेद',
181+
'पक्षियों',
182+
'क्‍त',
181183
'് ',
182184
'्क',
183185
];

0 commit comments

Comments
 (0)