Skip to content

Support supplementary CPs in Unicode identifiers #2522

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 17 commits into from
Sep 26, 2022
42 changes: 42 additions & 0 deletions scripts/unicode-identifier.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,42 @@
// see https://github.com/microsoft/TypeScript/blob/main/scripts/regenerate-unicode-identifier-parts.js

const MAX_UNICODE_CODEPOINT = 0x10FFFF;
const isStart = c => /[\p{ID_Start}\u{2118}\u{212E}\u{309B}\u{309C}]/u.test(c); // Other_ID_Start explicitly included for back compat - see http://www.unicode.org/reports/tr31/#Introduction
const isPart = c => /[\p{ID_Continue}\u{00B7}\u{0387}\u{19DA}\u{1369}\u{136A}\u{136B}\u{136C}\u{136D}\u{136E}\u{136F}\u{1370}\u{1371}]/u.test(c) || isStart(c); // Likewise for Other_ID_Continue
const parts = [];
let partsActive = false;
let startsActive = false;
const starts = [];

// Skip 0-9 (48..57), A-Z (65..90), a-z (97..122) - checked otherwise
for (let cp = 123; cp <= MAX_UNICODE_CODEPOINT; cp++) {
if (isStart(String.fromCodePoint(cp)) !== startsActive) {
starts.push(cp - +startsActive);
startsActive = !startsActive;
}
if (isPart(String.fromCodePoint(cp)) !== partsActive) {
parts.push(cp - +partsActive);
partsActive = !partsActive;
}
}
if (startsActive) starts.push(MAX_UNICODE_CODEPOINT);
if (partsActive) parts.push(MAX_UNICODE_CODEPOINT);

/**
 * Renders a flat list of code points as the text of an array-literal body.
 *
 * The result starts with a block comment that labels the columns, followed by
 * rows of eight entries. Each entry is left-aligned to at least six
 * characters and followed by ", ". The result always ends with a newline.
 *
 * @param cps Code points to render, in order.
 * @returns The formatted text, ready to be placed between `[` and `]`.
 */
function tablify(cps) {
  const ENTRIES_PER_ROW = 8;
  let text = "/*\n| from ... to | from ... to | from ... to | from ... to |*/";
  for (const [index, cp] of cps.entries()) {
    // A new row begins before every eighth entry, including the very first.
    if (index % ENTRIES_PER_ROW === 0) {
      text += "\n ";
    }
    text += cp.toString().padEnd(6) + ", ";
  }
  return text + "\n";
}

// Print the identifier-start table: a doc comment naming the Unicode version
// reported by the runtime, the range array itself, and its first and last
// bounds. The last line carries an extra newline so a blank line follows.
const startOutputLines = [
  `/** Unicode ${process.versions.unicode} ID_Start/Other_ID_Start ranges */`,
  `const unicodeIdentifierStart: i32[] = [${tablify(starts)}];`,
  `const unicodeIdentifierStartMin = ${starts[0]};`,
  `const unicodeIdentifierStartMax = ${starts[starts.length - 1]};\n`,
];
for (const line of startOutputLines) console.log(line);
Comment on lines +37 to +38
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
console.log(`const unicodeIdentifierStartMin = ${starts[0]};`);
console.log(`const unicodeIdentifierStartMax = ${starts[starts.length - 1]};\n`);
console.log(`const UnicodeIdentifierStartMin = ${starts[0]};`);
console.log(`const UnicodeIdentifierStartMax = ${starts[starts.length - 1]};\n`);

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Using title case for SomeEnum.SomeValue seems fine, but just SomeValue conflicts visually with classes, enums and interfaces. Not sure it's preferable?

// Print the identifier-part table: a doc comment naming the Unicode version
// reported by the runtime, the range array itself, and its first and last
// bounds. The last line carries an extra newline so a blank line follows.
const partOutputLines = [
  `/** Unicode ${process.versions.unicode} ID_Continue/Other_ID_Continue + ID_Start/Other_ID_Start ranges*/`,
  `const unicodeIdentifierPart: i32[] = [${tablify(parts)}];`,
  `const unicodeIdentifierPartMin = ${parts[0]};`,
  `const unicodeIdentifierPartMax = ${parts[parts.length - 1]};\n`,
];
for (const line of partOutputLines) console.log(line);
2 changes: 1 addition & 1 deletion src/diagnostics.ts
Original file line number Diff line number Diff line change
Expand Up @@ -267,7 +267,7 @@ function formatDiagnosticContext(range: Range): string {
var lineSpace = " ".repeat(lineNumber.length);
// Find preceding line break
while (start > 0 && !isLineBreak(text.charCodeAt(start - 1))) start--;
// Skip leading whitespace
// Skip leading whitespace (assume no supplementary whitespaces)
while (start < len && isWhiteSpace(text.charCodeAt(start))) start++;
// Find next line break
while (end < len && !isLineBreak(text.charCodeAt(end))) end++;
Expand Down
26 changes: 15 additions & 11 deletions src/tokenizer.ts
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,8 @@ import {
isOctal,
isHexBase,
isHighSurrogate,
isLowSurrogate
combineSurrogates,
numCodeUnits
} from "./util";

/** Named token types. */
Expand Down Expand Up @@ -913,11 +914,15 @@ export class Tokenizer extends DiagnosticEmitter {
return Token.AT;
}
default: {
// Unicode-aware from here on
if (isHighSurrogate(c) && pos + 1 < end) {
c = combineSurrogates(c, text.charCodeAt(pos + 1));
}
if (isIdentifierStart(c)) {
let posBefore = pos;
while (
++pos < end &&
isIdentifierPart(c = text.charCodeAt(pos))
(pos += numCodeUnits(c)) < end &&
isIdentifierPart(c = <i32>text.codePointAt(pos))
) { /* nop */ }
if (identifierHandling != IdentifierHandling.ALWAYS) {
let maybeKeywordToken = tokenFromKeyword(text.substring(posBefore, pos));
Expand All @@ -935,14 +940,11 @@ export class Tokenizer extends DiagnosticEmitter {
this.pos = posBefore;
return Token.IDENTIFIER;
} else if (isWhiteSpace(c)) {
++pos;
++pos; // assume no supplementary whitespaces
break;
}
let start = pos++;
if (
isHighSurrogate(c) && pos < end &&
isLowSurrogate(text.charCodeAt(pos))
) ++pos;
let start = pos;
pos += numCodeUnits(c);
this.error(
DiagnosticCode.Invalid_character,
this.range(start, pos)
Expand Down Expand Up @@ -1055,9 +1057,11 @@ export class Tokenizer extends DiagnosticEmitter {
var end = this.end;
var pos = this.pos;
var start = pos;
var c = <i32>text.codePointAt(pos);
assert(isIdentifierStart(c));
while (
++pos < end &&
isIdentifierPart(text.charCodeAt(pos))
(pos += numCodeUnits(c)) < end &&
isIdentifierPart(c = <i32>text.codePointAt(pos))
);
this.pos = pos;
return text.substring(start, pos);
Expand Down
Loading