
Commit 3b1abfa

normalize out unicode ligatures
Fix: GHSA-r6q2-hw4h-h46w
1 parent a43478c

File tree

6 files changed (+88 −12 lines)


src/normalize-unicode.ts

Lines changed: 5 additions & 1 deletion
@@ -9,7 +9,11 @@ const MAX = 10000
 const cache = new Set<string>()
 export const normalizeUnicode = (s: string): string => {
   if (!cache.has(s)) {
-    normalizeCache[s] = s.normalize('NFD')
+    // shake out identical accents and ligatures
+    normalizeCache[s] = s
+      .normalize('NFD')
+      .toLocaleLowerCase('en')
+      .toLocaleUpperCase('en')
   } else {
     cache.delete(s)
   }
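As context (not part of the commit): NFD alone only decomposes accents, while ligature code points such as U+FB01 'ﬁ' survive it; the added lower/upper-case round trip expands them via Unicode case mapping, so spellings that collide on APFS fold to the same cache key. A minimal sketch, using a hypothetical fold helper that simply mirrors the chain above:

// hypothetical helper mirroring the normalization chain in the diff above
const fold = (s: string): string =>
  s.normalize('NFD').toLocaleLowerCase('en').toLocaleUpperCase('en')

// U+FB01 'ﬁ' upper-cases to 'FI', so the ligature spelling and the ASCII
// spelling now produce the same key
console.log(fold('\uFB01le.txt') === fold('file.txt')) // true ('FILE.TXT')

// accent equivalence is still handled by the NFD step
console.log(fold('caf\u00E9') === fold('cafe\u0301')) // true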

src/path-reservations.ts

Lines changed: 1 addition & 3 deletions
@@ -56,9 +56,7 @@ export class PathReservations {
       ['win32 parallelization disabled']
     : paths.map(p => {
         // don't need normPath, because we skip this entirely for windows
-        return stripTrailingSlashes(
-          join(normalizeUnicode(p)),
-        ).toLowerCase()
+        return stripTrailingSlashes(join(normalizeUnicode(p)))
       })
 
   const dirs = new Set<string>(

tap-snapshots/test/normalize-unicode.js-win32.test.cjs

Lines changed: 6 additions & 6 deletions
@@ -6,25 +6,25 @@
  */
 'use strict'
 exports[`test/normalize-unicode.js win32 > TAP > normalize with strip slashes > "\\\\\eee\\\\\\" > normalized 1`] = `
-\\\\\eee\\\\\\
+\\\\\EEE\\\\\\
 `
 
 exports[`test/normalize-unicode.js win32 > TAP > normalize with strip slashes > "\\\\a\\\\b\\\\c\\\\d\\\\" > normalized 1`] = `
-/a/b/c/d
+/A/B/C/D
 `
 
 exports[`test/normalize-unicode.js win32 > TAP > normalize with strip slashes > "﹨aaaa﹨dddd﹨" > normalized 1`] = `
-aaaa﹨dddd
+AAAA﹨DDDD
 `
 
 exports[`test/normalize-unicode.js win32 > TAP > normalize with strip slashes > "\bbb\eee\" > normalized 1`] = `
-bbb\eee
+BBB\EEE
 `
 
 exports[`test/normalize-unicode.js win32 > TAP > normalize with strip slashes > "1/4foo.txt" > normalized 1`] = `
-1/4foo.txt
+1/4FOO.TXT
 `
 
 exports[`test/normalize-unicode.js win32 > TAP > normalize with strip slashes > "¼foo.txt" > normalized 1`] = `
-¼foo.txt
+¼FOO.TXT
 `

test/ghsa-8qq5-rm4j-mr97.ts

Lines changed: 4 additions & 1 deletion
@@ -42,7 +42,10 @@ t.test('verify that linkpaths get sanitized properly', async t => {
   })
 
   writeFileSync(resolve(out, 'exploit_hard'), 'OVERWRITTEN')
-  t.equal(readFileSync(resolve(dir, 'secret.txt'), 'utf8'), 'ORIGINAL DATA')
+  t.equal(
+    readFileSync(resolve(dir, 'secret.txt'), 'utf8'),
+    'ORIGINAL DATA',
+  )
 
   t.not(readlinkSync(resolve(out, 'exploit_sym')), targetSym)
 })

test/ghsa-r6q2-hw4h-h46w.ts

Lines changed: 71 additions & 0 deletions
@@ -0,0 +1,71 @@
+import t from 'tap'
+import { normalizeUnicode } from '../src/normalize-unicode.js'
+import { Header } from '../src/header.js'
+import { extract } from '../src/extract.js'
+import { resolve } from 'node:path'
+import { lstatSync, readFileSync, statSync } from 'node:fs'
+
+// these characters are problems on macOS's APFS
+const chars = {
+  ['ﬀ'.normalize('NFC')]: 'FF',
+  ['ﬁ'.normalize('NFC')]: 'FI',
+  ['ﬂ'.normalize('NFC')]: 'FL',
+  ['ﬃ'.normalize('NFC')]: 'FFI',
+  ['ﬄ'.normalize('NFC')]: 'FFL',
+  ['ﬅ'.normalize('NFC')]: 'ST',
+  ['ﬆ'.normalize('NFC')]: 'ST',
+  ['ẛ'.normalize('NFC')]: 'Ṡ',
+  ['ß'.normalize('NFC')]: 'SS',
+  ['ẞ'.normalize('NFC')]: 'SS',
+  ['ſ'.normalize('NFC')]: 'S',
+}
+
+for (const [c, n] of Object.entries(chars)) {
+  t.test(`${c} => ${n}`, async t => {
+    t.equal(normalizeUnicode(c), n)
+
+    t.test('link then file', async t => {
+      const tarball = Buffer.alloc(2048)
+      new Header({
+        path: c,
+        type: 'SymbolicLink',
+        linkpath: './target',
+      }).encode(tarball, 0)
+      new Header({
+        path: n,
+        type: 'File',
+        size: 1,
+      }).encode(tarball, 512)
+      tarball[1024] = 'x'.charCodeAt(0)
+
+      const cwd = t.testdir({ tarball })
+
+      await extract({ cwd, file: resolve(cwd, 'tarball') })
+
+      t.throws(() => statSync(resolve(cwd, 'target')))
+      t.equal(readFileSync(resolve(cwd, n), 'utf8'), 'x')
+    })
+
+    t.test('file then link', { saveFixture: true }, async t => {
+      const tarball = Buffer.alloc(2048)
+      new Header({
+        path: n,
+        type: 'File',
+        size: 1,
+      }).encode(tarball, 0)
+      tarball[512] = 'x'.charCodeAt(0)
+      new Header({
+        path: c,
+        type: 'SymbolicLink',
+        linkpath: './target',
+      }).encode(tarball, 1024)
+
+      const cwd = t.testdir({ tarball })
+
+      await extract({ cwd, file: resolve(cwd, 'tarball') })
+
+      t.throws(() => statSync(resolve(cwd, 'target')))
+      t.equal(lstatSync(resolve(cwd, c)).isSymbolicLink(), true)
+    })
+  })
+}

test/normalize-unicode.js

Lines changed: 1 addition & 1 deletion
@@ -21,7 +21,7 @@ t.equal(
   'matching unicodes',
 )
 t.equal(normalizeUnicode(cafe1), normalizeUnicode(cafe2), 'cached')
-t.equal(normalizeUnicode('foo'), 'foo', 'non-unicode string')
+t.equal(normalizeUnicode('foo'), 'FOO', 'non-unicode string')
 
 if (fakePlatform === 'win32') {
   t.test('normalize with strip slashes', t => {
