
Commit 15d13b1

feat: add an LRU BPE merge cache
Defaults to 100k items, but can be disabled or increased for higher performance. Fixes #68
1 parent 15009fa commit 15d13b1

48 files changed

Lines changed: 482 additions & 61 deletions

Some content is hidden: large commits have some content hidden by default, so only a subset of the 48 changed files appears below.

README.md

Lines changed: 24 additions & 1 deletion
@@ -235,9 +235,10 @@ const tokenLimit = 10
 const withinTokenLimit = isWithinTokenLimit(text, tokenLimit)
 ```
 
-### `countTokens(text: string | Iterable<ChatMessage>): number`
+### `countTokens(text: string | Iterable<ChatMessage>, encodeOptions?: EncodeOptions): number`
 
 Counts the number of tokens in the input text or chat. Use this method when you need to determine the number of tokens without checking against a limit.
+The optional `encodeOptions` parameter allows you to specify custom sets of allowed or disallowed special tokens.
 
 Example:
 
@@ -370,6 +371,28 @@ const encoded = encode(inputText, undefined, disallowedSpecial)
 
 In this example, an Error is thrown, because the input text contains a disallowed special token.
 
+## Performance Optimization
+
+### LRU Merge Cache
+
+The tokenizer uses an LRU (Least Recently Used) cache to improve encoding performance for similar strings. By default, it stores up to 100,000 merged token pairs. You can adjust this value to optimize for your specific use case:
+
+- Increasing the cache size will make encoding similar strings faster but consume more memory
+- Setting it to 0 will disable caching completely
+- For applications processing many unique strings, a smaller cache might be more efficient
+
+You can modify the cache size using the `setMergeCacheSize` function:
+
+```ts
+import { setMergeCacheSize } from 'gpt-tokenizer'
+
+// Set to 5000 entries
+setMergeCacheSize(5000)
+
+// Disable caching completely
+setMergeCacheSize(0)
+```
+
 ## Testing and Validation
 
 `gpt-tokenizer` includes a set of test cases in the [TestPlans.txt](./data/TestPlans.txt) file to ensure its compatibility with OpenAI's Python `tiktoken` library. These test cases validate the functionality and behavior of `gpt-tokenizer`, providing a reliable reference for developers.

benchmark/src/benchmarkRunner.ts

Lines changed: 65 additions & 10 deletions
@@ -15,16 +15,16 @@ const calculateAverage = (numbers: number[]): number => {
   return numbers.reduce((a, b) => a + b, 0) / numbers.length
 }
 
-// Define the number of executions for performance testing
-const EXECUTIONS = 10000
-
 // Define the number of iterations for averaging
-const ITERATIONS = 3
+const ITERATIONS = 1
+
+// Define the execution multiplier for performance testing
+const EXECUTIONS_MULTIPLIER = 1
 
 // Function to run a single benchmark iteration in a child process
 const runSingleBenchmark = (
   tokenizerIndex: number,
-  executions: number,
+  executionsMultiplier: number,
 ): Promise<BenchmarkResult> => {
   return new Promise((resolve, reject) => {
     const workerPath = path.resolve(__dirname, 'benchmarkWorker.js')
@@ -36,7 +36,10 @@ const runSingleBenchmark = (
       reject(new Error('Failed to spawn child process'))
       return
     }
-    const message: WorkerInput = { tokenizerIndex, executions }
+    const message: WorkerInput = {
+      tokenizerIndex,
+      executionsMultiplier,
+    }
     child.send(message)
     child.on('message', (msg: any) => {
       // Changed to any to avoid TypeScript issues
@@ -117,12 +120,17 @@ const displayUnifiedResults = (results: BenchmarkResult[]) => {
       label: 'Encode Avg (ms)',
       better: 'lower' as const,
       precision: 4,
-    }, // Increased precision
+    },
     decodeTimeAvg: {
       label: 'Decode Avg (ms)',
       better: 'lower' as const,
       precision: 4,
-    }, // Increased precision
+    },
+    countTokensTimeAvg: {
+      label: 'Count Tokens Avg (ms)',
+      better: 'lower' as const,
+      precision: 4,
+    },
     memoryIncrease: {
       label: 'Memory Increase (MB)',
       better: 'lower' as const,
@@ -146,6 +154,8 @@ const displayUnifiedResults = (results: BenchmarkResult[]) => {
         return r.datasetsAverage?.encodeTimeMs || 0
       case 'decodeTimeAvg':
         return r.datasetsAverage?.decodeTimeMs || 0
+      case 'countTokensTimeAvg':
+        return r.datasetsAverage?.countTimeMs || 0
       case 'memoryIncrease':
         return r.memoryChangeAfterRunMb
       default:
@@ -166,6 +176,7 @@ const displayUnifiedResults = (results: BenchmarkResult[]) => {
       chalk.green('Init\nMem RSS'),
       chalk.yellow('Encode\nAvg (ms)'),
       chalk.yellow('Decode\nAvg (ms)'),
+      chalk.yellow('Count\nAvg (ms)'),
       chalk.red('Memory\nIncrease'),
       chalk.red('Mem\nLeak?'),
     ],
@@ -238,6 +249,13 @@ const displayUnifiedResults = (results: BenchmarkResult[]) => {
         changes.decodeTimeMs,
       ),
     )
+    row.push(
+      applyHighlight(
+        res.datasetsAverage?.countTimeMs || 0,
+        'countTokensTimeAvg',
+        changes.countTimeMs,
+      ),
+    )
     row.push(
       applyHighlight(
         res.memoryChangeAfterRunMb,
@@ -277,7 +295,10 @@ const runBenchmarks = async (
   for (let i = 0; i < ITERATIONS; i++) {
     console.log(`  ${chalk.yellow(`Iteration ${i + 1}/${ITERATIONS}`)}`)
     try {
-      const result = await runSingleBenchmark(tokenizerIndex, EXECUTIONS)
+      const result = await runSingleBenchmark(
+        tokenizerIndex,
+        EXECUTIONS_MULTIPLIER,
+      )
       tokenizerResults.push(result)
     } catch (error) {
       console.error(
@@ -317,6 +338,9 @@ const runBenchmarks = async (
     const decodeTimes = tokenizerResults.map(
       (r) => r.datasets[dataset].decode.averageTimeMs,
     )
+    const countTimes = tokenizerResults.map(
+      (r) => r.datasets[dataset].countTokens.averageTimeMs,
+    )
     const memoryChanges = tokenizerResults.map(
       (r) => r.datasets[dataset].memoryChangeAfterExecutionsMb,
     )
@@ -327,6 +351,9 @@ const runBenchmarks = async (
       decode: {
        averageTimeMs: calculateAverage(decodeTimes),
      },
+      countTokens: {
+        averageTimeMs: calculateAverage(countTimes),
+      },
       memoryChangeAfterExecutionsMb: calculateAverage(memoryChanges),
     }
   }
@@ -344,6 +371,13 @@ const runBenchmarks = async (
         Object.values(r.datasets).map((d) => d.decode.averageTimeMs),
       ),
     ),
+    countTimeMs: calculateAverage(
+      tokenizerResults.flatMap((r) =>
+        Object.values(r.datasets).map(
+          (d) => d.countTokens?.averageTimeMs || 0,
+        ),
+      ),
+    ),
   },
 })
 }
@@ -406,7 +440,10 @@ const watchMode = async (previousResults: BenchmarkResult[] | null) => {
   for (let i = 0; i < ITERATIONS; i++) {
     console.log(`  ${chalk.yellow(`Iteration ${i + 1}/${ITERATIONS}`)}`)
     try {
-      const result = await runSingleBenchmark(tokenizerIndex, EXECUTIONS)
+      const result = await runSingleBenchmark(
+        tokenizerIndex,
+        EXECUTIONS_MULTIPLIER,
+      )
       tokenizerResults.push(result)
     } catch (error) {
       console.error(
@@ -447,6 +484,13 @@ const watchMode = async (previousResults: BenchmarkResult[] | null) => {
         Object.values(r.datasets).map((d) => d.decode.averageTimeMs),
       ),
     ),
+    countTimeMs: calculateAverage(
+      tokenizerResults.flatMap((r) =>
+        Object.values(r.datasets).map(
+          (d) => d.countTokens.averageTimeMs,
+        ),
+      ),
+    ),
   },
 }
 // Aggregate per-dataset results
@@ -458,6 +502,9 @@ const watchMode = async (previousResults: BenchmarkResult[] | null) => {
     const decodeTimes = tokenizerResults.map(
       (r) => r.datasets[dataset].decode.averageTimeMs,
     )
+    const countTimes = tokenizerResults.map(
+      (r) => r.datasets[dataset].countTokens.averageTimeMs,
+    )
     const memoryChanges = tokenizerResults.map(
       (r) => r.datasets[dataset].memoryChangeAfterExecutionsMb,
     )
@@ -468,6 +515,9 @@ const watchMode = async (previousResults: BenchmarkResult[] | null) => {
      decode: {
        averageTimeMs: calculateAverage(decodeTimes),
      },
+      countTokens: {
+        averageTimeMs: calculateAverage(countTimes),
+      },
       memoryChangeAfterExecutionsMb: calculateAverage(memoryChanges),
     }
   }
@@ -506,6 +556,11 @@ const watchMode = async (previousResults: BenchmarkResult[] | null) => {
         (lastResult.datasetsAverage?.decodeTimeMs || 0)) /
         (lastResult.datasetsAverage?.decodeTimeMs || 1)) *
       100,
+    countTimeMs:
+      (((newAggregated.datasetsAverage?.countTimeMs || 0) -
+        (lastResult.datasetsAverage?.countTimeMs || 0)) /
+        (lastResult.datasetsAverage?.countTimeMs || 1)) *
+      100,
     memoryChangeAfterRunMb:
       ((newAggregated.memoryChangeAfterRunMb -
         lastResult.memoryChangeAfterRunMb) /

benchmark/src/benchmarkWorker.ts

Lines changed: 73 additions & 18 deletions
@@ -10,7 +10,7 @@ import { memoryUsage } from 'process'
 import { tokenizers } from './tokenizers.js'
 
 const runWorker = async (message: WorkerInput) => {
-  const { tokenizerIndex, executions } = message
+  const { tokenizerIndex, executionsMultiplier } = message
   const tokenizer = tokenizers[tokenizerIndex]
   const result: BenchmarkResult = {
     packageName: tokenizer.packageName,
@@ -19,10 +19,10 @@ const runWorker = async (message: WorkerInput) => {
     datasets: {},
     memoryChangeAfterRunMb: 0,
     memoryLeakWarning: false,
-    datasetsAverage: { encodeTimeMs: 0, decodeTimeMs: 0 },
+    datasetsAverage: { encodeTimeMs: 0, decodeTimeMs: 0, countTimeMs: 0 },
   }
-  const encodeTimes: number[] = new Array(executions)
-  const decodeTimes: number[] = new Array(executions)
+  const testData = Object.entries(datasets)
+
   try {
     const initMemoryUsageBefore = memoryUsage()
     const initStart = performance.now()
@@ -46,38 +46,66 @@ const runWorker = async (message: WorkerInput) => {
     }
 
     // Prepare datasets
-    const testData = Object.entries(datasets)
-    for (const [name, text] of testData) {
-      // Warm-up encode and decode
+    for (const [name, data] of testData) {
+      // Calculate actual execution counts
+      const encodeExecs = Math.max(
+        1,
+        Math.round(data.encodeExecutionsCount * executionsMultiplier),
+      )
+      const decodeExecs = Math.max(
+        1,
+        Math.round(data.decodeExecutionsCount * executionsMultiplier),
+      )
+      const countExecs = Math.max(
+        1,
+        Math.round(data.countTokensExecutionsCount * executionsMultiplier),
+      )
+
+      // Warm-up encode, decode and countTokens (using 5% of execution count)
       let encodedTokens: number[] | Uint8Array = []
-      for (let i = 0; i < 50; i++) {
-        encodedTokens = tokenizerModule.encode(text)
+      const warmUpCount = Math.max(1, Math.round(encodeExecs * 0.05))
+      for (let i = 0; i < warmUpCount; i++) {
+        encodedTokens = tokenizerModule.encode(data.text)
         tokenizerModule.decode(encodedTokens)
+        tokenizerModule.countTokens(data.text)
       }
 
       // Encode benchmark
-      for (let i = 0; i < executions; i++) {
+      const encodeTimes: number[] = new Array(encodeExecs)
+      for (let i = 0; i < encodeExecs; i++) {
        const start = performance.now()
-        encodedTokens = tokenizerModule.encode(text)
+        encodedTokens = tokenizerModule.encode(data.text)
        const end = performance.now()
        encodeTimes[i] = end - start
      }
-      const avgEncodeTime = encodeTimes.reduce((a, b) => a + b, 0) / executions
+      const avgEncodeTime = encodeTimes.reduce((a, b) => a + b, 0) / encodeExecs
 
       // Decode benchmark
+      const decodeTimes: number[] = new Array(decodeExecs)
       let decodedText: string = ''
-      for (let i = 0; i < executions; i++) {
+      for (let i = 0; i < decodeExecs; i++) {
        const start = performance.now()
        decodedText = tokenizerModule.decode(encodedTokens)
        const end = performance.now()
        decodeTimes[i] = end - start
      }
-      const avgDecodeTime = decodeTimes.reduce((a, b) => a + b, 0) / executions
+      const avgDecodeTime = decodeTimes.reduce((a, b) => a + b, 0) / decodeExecs
+
+      // Count tokens benchmark
+      const countTokensTimes: number[] = new Array(countExecs)
+      for (let i = 0; i < countExecs; i++) {
+        const start = performance.now()
+        tokenizerModule.countTokens(data.text)
+        const end = performance.now()
+        countTokensTimes[i] = end - start
+      }
+      const avgCountTokensTime =
+        countTokensTimes.reduce((a, b) => a + b, 0) / countExecs
 
       // Verify correctness
-      if (decodedText !== text) {
+      if (decodedText !== data.text) {
         console.warn(
-          `Warning: Decoded text does not match original for dataset ${name}. \nExpected:\n${text}\nGot:\n${decodedText}`,
+          `Warning: Decoded text does not match original for dataset ${name}. \nExpected:\n${data.text}\nGot:\n${decodedText}`,
         )
       }
 
@@ -97,20 +125,44 @@ const runWorker = async (message: WorkerInput) => {
         decode: {
           averageTimeMs: parseFloat(avgDecodeTime.toFixed(4)),
         },
+        countTokens: {
+          averageTimeMs: parseFloat(avgCountTokensTime.toFixed(4)),
+        },
         memoryChangeAfterExecutionsMb: parseFloat(
           (memoryUsed / 1024 / 1024).toFixed(2),
         ),
       }
     }
 
+    // Calculate dataset averages
+    const datasetCount = Object.keys(result.datasets).length
+    const encodeTimeSum = Object.values(result.datasets).reduce(
+      (sum, dataset) => sum + dataset.encode.averageTimeMs,
+      0,
+    )
+    const decodeTimeSum = Object.values(result.datasets).reduce(
+      (sum, dataset) => sum + dataset.decode.averageTimeMs,
+      0,
+    )
+    const countTimeSum = Object.values(result.datasets).reduce(
+      (sum, dataset) => sum + dataset.countTokens.averageTimeMs,
+      0,
+    )
+
+    result.datasetsAverage = {
+      encodeTimeMs: parseFloat((encodeTimeSum / datasetCount).toFixed(4)),
+      decodeTimeMs: parseFloat((decodeTimeSum / datasetCount).toFixed(4)),
+      countTimeMs: parseFloat((countTimeSum / datasetCount).toFixed(4)),
+    }
+
     // Overall memory leak detection
     const finalMemoryUsage = memoryUsage()
     const totalMemoryIncrease =
       finalMemoryUsage.heapUsed - initMemoryUsageAfter.heapUsed
     result.memoryChangeAfterRunMb = parseFloat(
       (totalMemoryIncrease / 1024 / 1024).toFixed(2),
     )
-    result.memoryLeakWarning = totalMemoryIncrease > 1 * 1024 * 1024 // 1 MB threshold
+    result.memoryLeakWarning = totalMemoryIncrease > 10 * 1024 * 1024 // 10 MB threshold
 
     // Send the result back to the parent process
     const output: WorkerOutput = {
@@ -130,7 +182,10 @@ const runWorker = async (message: WorkerInput) => {
 }
 
 if (process.argv.length > 2) {
-  runWorker({ executions: 100000, tokenizerIndex: tokenizers.length - 1 })
+  runWorker({
+    executionsMultiplier: 1,
+    tokenizerIndex: tokenizers.length - 1,
+  })
 } else {
   process.on('message', runWorker)
 }
