import { expect, test } from "bun:test"
import { encoding_for_model } from "tiktoken"
test("compare tiktoken and gpt-tokenizer for string", async () => {
const testString = "What's the weather like in San Francisco?"
// Test with cl100k_base encoding (used by GPT-3.5 and GPT-4)
const tiktokenEncoderCl100k = encoding_for_model("gpt-3.5-turbo")
const tiktokenTokensCl100k = tiktokenEncoderCl100k.encode(testString).length
const { encode: gptEncodeCl100k } = await import("gpt-tokenizer/encoding/cl100k_base")
const gptTokensCl100k = gptEncodeCl100k(testString).length
console.log(`cl100k_base - tiktoken: ${tiktokenTokensCl100k}, gpt-tokenizer: ${gptTokensCl100k}`)
tiktokenEncoderCl100k.free()
// Test with o200k_base encoding (used by GPT-4o)
const tiktokenEncoderO200k = encoding_for_model("gpt-4o")
const tiktokenTokensO200k = tiktokenEncoderO200k.encode(testString).length
const { encode: gptEncodeO200k } = await import("gpt-tokenizer/encoding/o200k_base")
const gptTokensO200k = gptEncodeO200k(testString).length
console.log(`o200k_base - tiktoken: ${tiktokenTokensO200k}, gpt-tokenizer: ${gptTokensO200k}`)
tiktokenEncoderO200k.free()
})
/*
Sample run (the same log appeared twice in the original paste; one copy kept):

$ bun test tests/tokenizer-compare.test.ts
bun test v1.2.21 (7c45ed97)

tests\tokenizer-compare.test.ts:
cl100k_base - tiktoken: 9, gpt-tokenizer: 9
o200k_base - tiktoken: 8, gpt-tokenizer: 9
✓ compare tiktoken and gpt-tokenizer for string [1766.00ms]

 1 pass
 0 fail
Ran 1 test across 1 file. [2.02s]
*/