From 13b613de15828fa79a6c6dc9c894bf9a770ddfda Mon Sep 17 00:00:00 2001 From: Sam Saccone Date: Thu, 8 Apr 2021 23:01:21 -0700 Subject: [PATCH 1/2] While profiling postcss I found that a significant amount of time was being spent in [`unesc`](https://github.com/postcss/postcss-selector-parser/commits/master/src/util/unesc.js). This was due to the expensive regex checks that were being performed on the fly for every selector in the codebase, which appeared to perform quite poorly in modern Node and V8. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ![image](https://user-images.githubusercontent.com/883126/114136698-fdd98a80-98bf-11eb-8068-ace4f6f2274d.png) ---- As an experiment, and based on some prior experience with this class of slowdown, I migrated the implementation to one that performs a scan through the string instead of running a regex replace. By testing this on my local application I instantly saw the work from this function go from > 900 ms to ~100 ms. ![image](https://user-images.githubusercontent.com/883126/114136734-0c27a680-98c0-11eb-82ab-f0c9529fd32d.png) This implementation passes all of the existing test cases and aims to mirror the prior implementation's behavior :) ----- In my application, the major wins come from purgecss, reducing my total application build time by multiple seconds! 
🔥 --- src/__tests__/classes.js | 1 + src/__tests__/util/unesc.js | 50 +++++++++++++++++++++ src/util/unesc.js | 86 ++++++++++++++++++++++++++++++------- 3 files changed, 121 insertions(+), 16 deletions(-) create mode 100644 src/__tests__/util/unesc.js diff --git a/src/__tests__/classes.js b/src/__tests__/classes.js index 7a9c396..d6eb6a7 100644 --- a/src/__tests__/classes.js +++ b/src/__tests__/classes.js @@ -264,3 +264,4 @@ test('class selector with escaping (36)', '.not-pseudo\\:\\:focus', (t, tree) => t.deepEqual(tree.nodes[0].nodes[0].type, 'class'); t.deepEqual(tree.nodes[0].nodes[0].raws.value, 'not-pseudo\\:\\:focus'); }); + diff --git a/src/__tests__/util/unesc.js b/src/__tests__/util/unesc.js new file mode 100644 index 0000000..1ab86db --- /dev/null +++ b/src/__tests__/util/unesc.js @@ -0,0 +1,50 @@ +import {test} from '../util/helpers'; + +test('id selector', '#foo', (t, tree) => { + t.deepEqual(tree.nodes[0].nodes[0].value, 'foo'); +}); + +test('escaped special char', '#w\\+', (t, tree) => { + t.deepEqual(tree.nodes[0].nodes[0].value, 'w+'); +}); + +test('tailing escape', '#foo\\', (t, tree) => { + t.deepEqual(tree.nodes[0].nodes[0].value, 'foo\\'); +}); + +test('double escape', '#wow\\\\k', (t, tree) => { + t.deepEqual(tree.nodes[0].nodes[0].value, 'wow\\k'); +}); + +test('leading numeric', '.\\31 23', (t, tree) => { + t.deepEqual(tree.nodes[0].nodes[0].value, '123'); +}); + +test('emoji', '.\\🐐', (t, tree) => { + t.deepEqual(tree.nodes[0].nodes[0].value, '🐐'); +}); + +// https://www.w3.org/International/questions/qa-escapes#cssescapes +test('hex escape', '.\\E9motion', (t, tree) => { + t.deepEqual(tree.nodes[0].nodes[0].value, 'émotion'); +}); + +test('hex escape with space', '.\\E9 dition', (t, tree) => { + t.deepEqual(tree.nodes[0].nodes[0].value, 'édition'); +}); + +test('hex escape with hex number', '.\\0000E9dition', (t, tree) => { + t.deepEqual(tree.nodes[0].nodes[0].value, 'édition'); +}); + +test('class selector with escaping', '.\\1D306', (t, 
tree) => { + t.deepEqual(tree.nodes[0].nodes[0].value, '𝌆'); +}); + +test('class selector with escaping with more chars', '.\\1D306k', (t, tree) => { + t.deepEqual(tree.nodes[0].nodes[0].value, '𝌆k'); +}); + +test('class selector with escaping with more chars with whitespace', '.wow\\1D306 k', (t, tree) => { + t.deepEqual(tree.nodes[0].nodes[0].value, 'wow𝌆k'); +}); diff --git a/src/util/unesc.js b/src/util/unesc.js index bb18ad9..eb4f93a 100644 --- a/src/util/unesc.js +++ b/src/util/unesc.js @@ -1,19 +1,73 @@ -const whitespace = '[\\x20\\t\\r\\n\\f]'; -const unescapeRegExp = new RegExp('\\\\([\\da-f]{1,6}' + whitespace + '?|(' + whitespace + ')|.)', 'ig'); +// Many thanks for this post which made this migration much easier. +// https://mathiasbynens.be/notes/css-escapes + +/** + * + * @param {string} str + * @returns {[string, number]|undefined} + */ +function gobbleHex (str) { + const lower = str.toLowerCase(); + let hex = ''; + let spaceTerminated = false; + for (let i = 0; i < 6 && lower[i] !== undefined; i++) { + const code = lower.charCodeAt(i); + // check to see if we are dealing with a valid hex char [a-f|0-9] + const valid = (code >= 97 && code <= 102) || (code >= 48 && code <= 57); + // https://drafts.csswg.org/css-syntax/#consume-escaped-code-point + spaceTerminated = code === 32; + if (!valid) { + break; + } + hex += lower[i]; + } + + if (hex.length === 0) { + return undefined; + } + + return [ + String.fromCodePoint(parseInt(hex, 16)), + hex.length + (spaceTerminated ? 1 : 0), + ]; +} + +const CONTAINS_ESCAPE = /\\/; export default function unesc (str) { - return str.replace(unescapeRegExp, (_, escaped, escapedWhitespace) => { - const high = '0x' + escaped - 0x10000; - - // NaN means non-codepoint - // Workaround erroneous numeric interpretation of +"0x" - // eslint-disable-next-line no-self-compare - return high !== high || escapedWhitespace - ? escaped - : high < 0 - ? 
// BMP codepoint - String.fromCharCode(high + 0x10000) - : // Supplemental Plane codepoint (surrogate pair) - String.fromCharCode((high >> 10) | 0xd800, (high & 0x3ff) | 0xdc00); - }); + let needToProcess = CONTAINS_ESCAPE.test(str); + if (!needToProcess) { + return str; + } + let ret = ""; + + for (let i = 0; i < str.length; i++) { + if ((str[i] === "\\")) { + const gobbled = gobbleHex(str.slice(i + 1, i + 7)); + if (gobbled !== undefined) { + ret += gobbled[0]; + i += gobbled[1]; + continue; + } + + // Retain a pair of \\ if double escaped `\\\\` + // https://github.com/postcss/postcss-selector-parser/commit/268c9a7656fb53f543dc620aa5b73a30ec3ff20e + if (str[i + 1] === "\\") { + ret += "\\"; + i++; + continue; + } + + // if \\ is at the end of the string retain it + // https://github.com/postcss/postcss-selector-parser/commit/01a6b346e3612ce1ab20219acc26abdc259ccefb + if (str.length === i + 1) { + ret += str[i]; + } + continue; + } + + ret += str[i]; + } + + return ret; } From 5c6c988854844e1e1e7dddab5c271c0870eea86b Mon Sep 17 00:00:00 2001 From: Sam Saccone Date: Sun, 11 Apr 2021 11:11:55 -0700 Subject: [PATCH 2/2] Expand unesc handling to correctly handle spec edgecase for lone surrogates and out of bound codepoint values. 
--- src/__tests__/util/unesc.js | 12 ++++++++++++ src/util/unesc.js | 11 ++++++++++- 2 files changed, 22 insertions(+), 1 deletion(-) diff --git a/src/__tests__/util/unesc.js b/src/__tests__/util/unesc.js index 1ab86db..87ccaf1 100644 --- a/src/__tests__/util/unesc.js +++ b/src/__tests__/util/unesc.js @@ -48,3 +48,15 @@ test('class selector with escaping with more chars', '.\\1D306k', (t, tree) => { test('class selector with escaping with more chars with whitespace', '.wow\\1D306 k', (t, tree) => { t.deepEqual(tree.nodes[0].nodes[0].value, 'wow𝌆k'); }); + +test('handles 0 value hex', '\\0', (t, tree) => { + t.deepEqual(tree.nodes[0].nodes[0].value, String.fromCodePoint(0xFFFD)); +}); + +test('handles lone surrogate value hex', '\\DBFF', (t, tree) => { + t.deepEqual(tree.nodes[0].nodes[0].value, String.fromCodePoint(0xFFFD)); +}); + +test('handles out of bound values', '\\110000', (t, tree) => { + t.deepEqual(tree.nodes[0].nodes[0].value, String.fromCodePoint(0xFFFD)); +}); diff --git a/src/util/unesc.js b/src/util/unesc.js index eb4f93a..5b3cd17 100644 --- a/src/util/unesc.js +++ b/src/util/unesc.js @@ -25,9 +25,18 @@ function gobbleHex (str) { if (hex.length === 0) { return undefined; } + const codePoint = parseInt(hex, 16); + + const isSurrogate = codePoint >= 0xD800 && codePoint <= 0xDFFF; + // Add special case for + // "If this number is zero, or is for a surrogate, or is greater than the maximum allowed code point" + // https://drafts.csswg.org/css-syntax/#maximum-allowed-code-point + if (isSurrogate || codePoint === 0x0000 || codePoint > 0x10FFFF) { + return ['\uFFFD', hex.length + (spaceTerminated ? 1 : 0)]; + } return [ - String.fromCodePoint(parseInt(hex, 16)), + String.fromCodePoint(codePoint), hex.length + (spaceTerminated ? 1 : 0), ]; }