Skip to content

Commit 8fb5fe2

Browse files
committed
util: improve unicode support
The array grouping function relies on the width of the characters. It was not calculated correctly so far, since it used the string length instead. This improves the unicode output by calculating the mono-spaced font width (other fonts might differ). PR-URL: nodejs#31319 Reviewed-By: James M Snell <[email protected]> Reviewed-By: Steven R Loomis <[email protected]> Reviewed-By: Rich Trott <[email protected]> Reviewed-By: Minwoo Jung <[email protected]>
1 parent 2606e1e commit 8fb5fe2

11 files changed

+211
-192
lines changed

lib/internal/cli_table.js

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,7 @@ const {
66
ObjectPrototypeHasOwnProperty,
77
} = primordials;
88

9-
const { getStringWidth } = require('internal/readline/utils');
9+
const { getStringWidth } = require('internal/util/inspect');
1010

1111
// The use of Unicode characters below is the only non-comment use of non-ASCII
1212
// Unicode characters in Node.js built-in modules. If they are ever removed or

lib/internal/readline/utils.js

Lines changed: 0 additions & 117 deletions
Original file line numberDiff line numberDiff line change
@@ -1,25 +1,13 @@
11
'use strict';
22

33
const {
4-
RegExp,
54
Symbol,
65
} = primordials;
76

8-
// Regex used for ansi escape code splitting
9-
// Adopted from https://github.com/chalk/ansi-regex/blob/master/index.js
10-
// License: MIT, authors: @sindresorhus, Qix-, arjunmehta and LitoMore
11-
// Matches all ansi escape code sequences in a string
12-
const ansiPattern = '[\\u001B\\u009B][[\\]()#;?]*' +
13-
'(?:(?:(?:[a-zA-Z\\d]*(?:;[-a-zA-Z\\d\\/#&.:=?%@~_]*)*)?\\u0007)' +
14-
'|(?:(?:\\d{1,4}(?:;\\d{0,4})*)?[\\dA-PR-TZcf-ntqry=><~]))';
15-
const ansi = new RegExp(ansiPattern, 'g');
16-
177
const kUTF16SurrogateThreshold = 0x10000; // 2 ** 16
188
const kEscape = '\x1b';
199
const kSubstringSearch = Symbol('kSubstringSearch');
2010

21-
let getStringWidth;
22-
2311
function CSI(strings, ...args) {
2412
let ret = `${kEscape}[`;
2513
for (let n = 0; n < strings.length; n++) {
@@ -59,109 +47,6 @@ function charLengthAt(str, i) {
5947
return str.codePointAt(i) >= kUTF16SurrogateThreshold ? 2 : 1;
6048
}
6149

62-
if (internalBinding('config').hasIntl) {
63-
const icu = internalBinding('icu');
64-
// icu.getStringWidth(string, ambiguousAsFullWidth, expandEmojiSequence)
65-
// Defaults: ambiguousAsFullWidth = false; expandEmojiSequence = true;
66-
// TODO(BridgeAR): Expose the options to the user. That is probably the
67-
// best thing possible at the moment, since it's difficult to know what
68-
// the receiving end supports.
69-
getStringWidth = function getStringWidth(str) {
70-
let width = 0;
71-
str = stripVTControlCharacters(str);
72-
for (let i = 0; i < str.length; i++) {
73-
// Try to avoid calling into C++ by first handling the ASCII portion of
74-
// the string. If it is fully ASCII, we skip the C++ part.
75-
const code = str.charCodeAt(i);
76-
if (code >= 127) {
77-
width += icu.getStringWidth(str.slice(i));
78-
break;
79-
}
80-
width += code >= 32 ? 1 : 0;
81-
}
82-
return width;
83-
};
84-
} else {
85-
/**
86-
* Returns the number of columns required to display the given string.
87-
*/
88-
getStringWidth = function getStringWidth(str) {
89-
let width = 0;
90-
91-
str = stripVTControlCharacters(str);
92-
93-
for (const char of str) {
94-
const code = char.codePointAt(0);
95-
if (isFullWidthCodePoint(code)) {
96-
width += 2;
97-
} else if (!isZeroWidthCodePoint(code)) {
98-
width++;
99-
}
100-
}
101-
102-
return width;
103-
};
104-
105-
/**
106-
* Returns true if the character represented by a given
107-
* Unicode code point is full-width. Otherwise returns false.
108-
*/
109-
const isFullWidthCodePoint = (code) => {
110-
// Code points are partially derived from:
111-
// http://www.unicode.org/Public/UNIDATA/EastAsianWidth.txt
112-
return code >= 0x1100 && (
113-
code <= 0x115f || // Hangul Jamo
114-
code === 0x2329 || // LEFT-POINTING ANGLE BRACKET
115-
code === 0x232a || // RIGHT-POINTING ANGLE BRACKET
116-
// CJK Radicals Supplement .. Enclosed CJK Letters and Months
117-
(code >= 0x2e80 && code <= 0x3247 && code !== 0x303f) ||
118-
// Enclosed CJK Letters and Months .. CJK Unified Ideographs Extension A
119-
(code >= 0x3250 && code <= 0x4dbf) ||
120-
// CJK Unified Ideographs .. Yi Radicals
121-
(code >= 0x4e00 && code <= 0xa4c6) ||
122-
// Hangul Jamo Extended-A
123-
(code >= 0xa960 && code <= 0xa97c) ||
124-
// Hangul Syllables
125-
(code >= 0xac00 && code <= 0xd7a3) ||
126-
// CJK Compatibility Ideographs
127-
(code >= 0xf900 && code <= 0xfaff) ||
128-
// Vertical Forms
129-
(code >= 0xfe10 && code <= 0xfe19) ||
130-
// CJK Compatibility Forms .. Small Form Variants
131-
(code >= 0xfe30 && code <= 0xfe6b) ||
132-
// Halfwidth and Fullwidth Forms
133-
(code >= 0xff01 && code <= 0xff60) ||
134-
(code >= 0xffe0 && code <= 0xffe6) ||
135-
// Kana Supplement
136-
(code >= 0x1b000 && code <= 0x1b001) ||
137-
// Enclosed Ideographic Supplement
138-
(code >= 0x1f200 && code <= 0x1f251) ||
139-
// Miscellaneous Symbols and Pictographs .. Emoticons
140-
(code >= 0x1f300 && code <= 0x1f64f) ||
141-
// CJK Unified Ideographs Extension B .. Tertiary Ideographic Plane
142-
(code >= 0x20000 && code <= 0x3fffd)
143-
);
144-
};
145-
146-
const isZeroWidthCodePoint = (code) => {
147-
return code <= 0x1F || // C0 control codes
148-
(code > 0x7F && code <= 0x9F) || // C1 control codes
149-
(code >= 0x0300 && code <= 0x036F) || // Combining Diacritical Marks
150-
(code >= 0x200B && code <= 0x200F) || // Modifying Invisible Characters
151-
(code >= 0xFE00 && code <= 0xFE0F) || // Variation Selectors
152-
(code >= 0xFE20 && code <= 0xFE2F) || // Combining Half Marks
153-
(code >= 0xE0100 && code <= 0xE01EF); // Variation Selectors
154-
};
155-
}
156-
157-
/**
158-
* Tries to remove all VT control characters. Use to estimate displayed
159-
* string width. May be buggy due to not running a real state machine
160-
*/
161-
function stripVTControlCharacters(str) {
162-
return str.replace(ansi, '');
163-
}
164-
16550
/*
16651
Some patterns seen in terminal key escape codes, derived from combos seen
16752
at http://www.midnight-commander.org/browser/lib/tty/key.c
@@ -477,8 +362,6 @@ module.exports = {
477362
charLengthLeft,
478363
commonPrefix,
479364
emitKeys,
480-
getStringWidth,
481365
kSubstringSearch,
482-
stripVTControlCharacters,
483366
CSI
484367
};

lib/internal/repl/utils.js

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -32,11 +32,13 @@ const {
3232

3333
const {
3434
commonPrefix,
35-
getStringWidth,
3635
kSubstringSearch,
3736
} = require('internal/readline/utils');
3837

39-
const { inspect } = require('util');
38+
const {
39+
getStringWidth,
40+
inspect,
41+
} = require('internal/util/inspect');
4042

4143
const debug = require('internal/util/debuglog').debuglog('repl');
4244

lib/internal/util/inspect.js

Lines changed: 122 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -192,6 +192,17 @@ const meta = [
192192
'\\x98', '\\x99', '\\x9A', '\\x9B', '\\x9C', '\\x9D', '\\x9E', '\\x9F', // x9F
193193
];
194194

195+
// Regex used for ansi escape code splitting
196+
// Adopted from https://github.com/chalk/ansi-regex/blob/master/index.js
197+
// License: MIT, authors: @sindresorhus, Qix-, arjunmehta and LitoMore
198+
// Matches all ansi escape code sequences in a string
199+
const ansiPattern = '[\\u001B\\u009B][[\\]()#;?]*' +
200+
'(?:(?:(?:[a-zA-Z\\d]*(?:;[-a-zA-Z\\d\\/#&.:=?%@~_]*)*)?\\u0007)' +
201+
'|(?:(?:\\d{1,4}(?:;\\d{0,4})*)?[\\dA-PR-TZcf-ntqry=><~]))';
202+
const ansi = new RegExp(ansiPattern, 'g');
203+
204+
let getStringWidth;
205+
195206
function getUserOptions(ctx) {
196207
return {
197208
stylize: ctx.stylize,
@@ -1154,7 +1165,7 @@ function groupArrayElements(ctx, output, value) {
11541165
// entries length of all output entries. We have to remove colors first,
11551166
// otherwise the length would not be calculated properly.
11561167
for (; i < outputLength; i++) {
1157-
const len = ctx.colors ? removeColors(output[i]).length : output[i].length;
1168+
const len = getStringWidth(output[i], ctx.colors);
11581169
dataLen[i] = len;
11591170
totalLength += len + separatorSpace;
11601171
if (maxLength < len)
@@ -1197,8 +1208,6 @@ function groupArrayElements(ctx, output, value) {
11971208
if (columns <= 1) {
11981209
return output;
11991210
}
1200-
// TODO(BridgeAR): Add unicode support. Use the readline getStringWidth
1201-
// function.
12021211
const tmp = [];
12031212
const maxLineLength = [];
12041213
for (let i = 0; i < columns; i++) {
@@ -1565,11 +1574,8 @@ function formatProperty(ctx, value, recurseTimes, key, type, desc) {
15651574
const diff = (ctx.compact !== true || type !== kObjectType) ? 2 : 3;
15661575
ctx.indentationLvl += diff;
15671576
str = formatValue(ctx, desc.value, recurseTimes);
1568-
if (diff === 3) {
1569-
const len = ctx.colors ? removeColors(str).length : str.length;
1570-
if (ctx.breakLength < len) {
1571-
extra = `\n${' '.repeat(ctx.indentationLvl)}`;
1572-
}
1577+
if (diff === 3 && ctx.breakLength < getStringWidth(str, ctx.colors)) {
1578+
extra = `\n${' '.repeat(ctx.indentationLvl)}`;
15731579
}
15741580
ctx.indentationLvl -= diff;
15751581
} else if (desc.get !== undefined) {
@@ -1889,9 +1895,116 @@ function formatWithOptionsInternal(inspectOptions, ...args) {
18891895
return str;
18901896
}
18911897

1898+
if (internalBinding('config').hasIntl) {
1899+
const icu = internalBinding('icu');
1900+
// icu.getStringWidth(string, ambiguousAsFullWidth, expandEmojiSequence)
1901+
// Defaults: ambiguousAsFullWidth = false; expandEmojiSequence = true;
1902+
// TODO(BridgeAR): Expose the options to the user. That is probably the
1903+
// best thing possible at the moment, since it's difficult to know what
1904+
// the receiving end supports.
1905+
getStringWidth = function getStringWidth(str, removeControlChars = true) {
1906+
let width = 0;
1907+
if (removeControlChars)
1908+
str = stripVTControlCharacters(str);
1909+
for (let i = 0; i < str.length; i++) {
1910+
// Try to avoid calling into C++ by first handling the ASCII portion of
1911+
// the string. If it is fully ASCII, we skip the C++ part.
1912+
const code = str.charCodeAt(i);
1913+
if (code >= 127) {
1914+
width += icu.getStringWidth(str.slice(i));
1915+
break;
1916+
}
1917+
width += code >= 32 ? 1 : 0;
1918+
}
1919+
return width;
1920+
};
1921+
} else {
1922+
/**
1923+
* Returns the number of columns required to display the given string.
1924+
*/
1925+
getStringWidth = function getStringWidth(str, removeControlChars = true) {
1926+
let width = 0;
1927+
1928+
if (removeControlChars)
1929+
str = stripVTControlCharacters(str);
1930+
1931+
for (const char of str) {
1932+
const code = char.codePointAt(0);
1933+
if (isFullWidthCodePoint(code)) {
1934+
width += 2;
1935+
} else if (!isZeroWidthCodePoint(code)) {
1936+
width++;
1937+
}
1938+
}
1939+
1940+
return width;
1941+
};
1942+
1943+
/**
1944+
* Returns true if the character represented by a given
1945+
* Unicode code point is full-width. Otherwise returns false.
1946+
*/
1947+
const isFullWidthCodePoint = (code) => {
1948+
// Code points are partially derived from:
1949+
// http://www.unicode.org/Public/UNIDATA/EastAsianWidth.txt
1950+
return code >= 0x1100 && (
1951+
code <= 0x115f || // Hangul Jamo
1952+
code === 0x2329 || // LEFT-POINTING ANGLE BRACKET
1953+
code === 0x232a || // RIGHT-POINTING ANGLE BRACKET
1954+
// CJK Radicals Supplement .. Enclosed CJK Letters and Months
1955+
(code >= 0x2e80 && code <= 0x3247 && code !== 0x303f) ||
1956+
// Enclosed CJK Letters and Months .. CJK Unified Ideographs Extension A
1957+
(code >= 0x3250 && code <= 0x4dbf) ||
1958+
// CJK Unified Ideographs .. Yi Radicals
1959+
(code >= 0x4e00 && code <= 0xa4c6) ||
1960+
// Hangul Jamo Extended-A
1961+
(code >= 0xa960 && code <= 0xa97c) ||
1962+
// Hangul Syllables
1963+
(code >= 0xac00 && code <= 0xd7a3) ||
1964+
// CJK Compatibility Ideographs
1965+
(code >= 0xf900 && code <= 0xfaff) ||
1966+
// Vertical Forms
1967+
(code >= 0xfe10 && code <= 0xfe19) ||
1968+
// CJK Compatibility Forms .. Small Form Variants
1969+
(code >= 0xfe30 && code <= 0xfe6b) ||
1970+
// Halfwidth and Fullwidth Forms
1971+
(code >= 0xff01 && code <= 0xff60) ||
1972+
(code >= 0xffe0 && code <= 0xffe6) ||
1973+
// Kana Supplement
1974+
(code >= 0x1b000 && code <= 0x1b001) ||
1975+
// Enclosed Ideographic Supplement
1976+
(code >= 0x1f200 && code <= 0x1f251) ||
1977+
// Miscellaneous Symbols and Pictographs 0x1f300 - 0x1f5ff
1978+
// Emoticons 0x1f600 - 0x1f64f
1979+
(code >= 0x1f300 && code <= 0x1f64f) ||
1980+
// CJK Unified Ideographs Extension B .. Tertiary Ideographic Plane
1981+
(code >= 0x20000 && code <= 0x3fffd)
1982+
);
1983+
};
1984+
1985+
const isZeroWidthCodePoint = (code) => {
1986+
return code <= 0x1F || // C0 control codes
1987+
(code > 0x7F && code <= 0x9F) || // C1 control codes
1988+
(code >= 0x300 && code <= 0x36F) || // Combining Diacritical Marks
1989+
(code >= 0x200B && code <= 0x200F) || // Modifying Invisible Characters
1990+
(code >= 0xFE00 && code <= 0xFE0F) || // Variation Selectors
1991+
(code >= 0xFE20 && code <= 0xFE2F) || // Combining Half Marks
1992+
(code >= 0xE0100 && code <= 0xE01EF); // Variation Selectors
1993+
};
1994+
}
1995+
1996+
/**
1997+
* Remove all VT control characters. Use to estimate displayed string width.
1998+
*/
1999+
function stripVTControlCharacters(str) {
2000+
return str.replace(ansi, '');
2001+
}
2002+
18922003
module.exports = {
18932004
inspect,
18942005
format,
18952006
formatWithOptions,
1896-
inspectDefaultOptions
2007+
getStringWidth,
2008+
inspectDefaultOptions,
2009+
stripVTControlCharacters
18972010
};

lib/readline.js

Lines changed: 5 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -46,17 +46,19 @@ const {
4646
ERR_INVALID_OPT_VALUE
4747
} = require('internal/errors').codes;
4848
const { validateString } = require('internal/validators');
49-
const { inspect } = require('internal/util/inspect');
49+
const {
50+
inspect,
51+
getStringWidth,
52+
stripVTControlCharacters,
53+
} = require('internal/util/inspect');
5054
const EventEmitter = require('events');
5155
const {
5256
charLengthAt,
5357
charLengthLeft,
5458
commonPrefix,
5559
CSI,
5660
emitKeys,
57-
getStringWidth,
5861
kSubstringSearch,
59-
stripVTControlCharacters
6062
} = require('internal/readline/utils');
6163

6264
const { clearTimeout, setTimeout } = require('timers');

0 commit comments

Comments
 (0)