
Commit 84887b4

fix: workaround for webpack not exposing the default export in UMD correctly

Fixes #12

1 parent: 774cf36
6 files changed: 18 additions, 15 deletions

README.md (9 additions, 6 deletions)

@@ -4,21 +4,22 @@

 `gpt-tokenizer` is a highly optimized Token Byte Pair Encoder/Decoder for all OpenAI's models (including those used by GPT-2, GPT-3, GPT-3.5 and GPT-4). It's written in TypeScript, and is fully compatible with all modern JavaScript environments.

+This package is a port of OpenAI's [tiktoken](https://github.com/openai/tiktoken), with some additional features sprinkled on top.
+
 OpenAI's GPT models utilize byte pair encoding to transform text into a sequence of integers before feeding them into the model.

 As of 2023, it is the most feature-complete, open-source GPT tokenizer on NPM. It implements some unique features, such as:

+- Support for easily tokenizing chats thanks to the `encodeChat` function
 - Support for all current OpenAI models (available encodings: `r50k_base`, `p50k_base`, `p50k_edit` and `cl100k_base`)
-- Generator function versions of both the decoder and encoder
+- Generator function versions of both the decoder and encoder functions
 - Provides the ability to decode an asynchronous stream of data (using `decodeAsyncGenerator` and `decodeGenerator` with any iterable input)
 - No global cache (no accidental memory leaks, as with the original GPT-3-Encoder implementation)
-- Includes a highly performant `isWithinTokenLimit` function to assess token limit without encoding the entire text
+- Includes a highly performant `isWithinTokenLimit` function to assess token limit without encoding the entire text/chat
 - Improves overall performance by eliminating transitive arrays
 - Type-safe (written in TypeScript)
 - Works in the browser out-of-the-box

-This package is a port of OpenAI's [tiktoken](https://github.com/openai/tiktoken), with some additional features sprinkled on top.
-
 Thanks to @dmitry-brazhenko's [SharpToken](https://github.com/dmitry-brazhenko/SharpToken), whose code was served as a reference for the port.

 Historical note: This package started off as a fork of [latitudegames/GPT-3-Encoder](https://github.com/latitudegames/GPT-3-Encoder), but version 2.0 was rewritten from scratch.

@@ -38,17 +39,19 @@ npm install gpt-tokenizer

 <script>
 // the package is now available as a global:
-const { encode, decode } = GPTTokenizer
+const { encode, decode } = GPTTokenizer_cl100k_base
 </script>
 ```

-If you wish to use a custom encoding, fetch the relevant script:
+If you wish to use a custom encoding, fetch the relevant script.

 - https://unpkg.com/gpt-tokenizer/dist/cl100k_base.js
 - https://unpkg.com/gpt-tokenizer/dist/p50k_base.js
 - https://unpkg.com/gpt-tokenizer/dist/p50k_edit.js
 - https://unpkg.com/gpt-tokenizer/dist/r50k_base.js

+The global name is a concatenation: `GPTTokenizer_${encoding}`.
+
 Refer to [supported models and their encodings](#Supported-models-and-their-encodings) section for more information.

 ## Playground
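
For orientation, a minimal sketch of how the renamed browser global is meant to be consumed after this change; the `encode`/`decode` signatures in the `declare` block are assumptions for illustration and are not part of this diff:

```ts
// Assumes https://unpkg.com/gpt-tokenizer/dist/cl100k_base.js has been loaded
// via a <script> tag; per this commit, the bundle now defines a global named
// GPTTokenizer_cl100k_base (previously GPTTokenizer).
// The method signatures below are assumptions for illustration only.
declare const GPTTokenizer_cl100k_base: {
  encode(text: string): number[]
  decode(tokens: Iterable<number>): string
}

const { encode, decode } = GPTTokenizer_cl100k_base
const tokens = encode('Hello, world!')
console.log(tokens.length) // number of tokens in the string
console.log(decode(tokens)) // round-trips back to 'Hello, world!'
```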

package.json (5 additions, 5 deletions)

@@ -1,7 +1,7 @@
 {
 "name": "gpt-tokenizer",
 "version": "0.0.0",
-"description": "BPE Encoder Decoder for GPT-2 / GPT-3",
+"description": "A pure JavaScript implementation of a BPE tokenizer (Encoder/Decoder) for GPT-2 / GPT-3 / GPT-4 and other OpenAI models",
 "keywords": [
 "BPE",
 "encoder",

@@ -76,10 +76,10 @@
 "build:cjs": "yarn rrun tsc --outDir cjs --module commonjs --target es2022 --project tsconfig-cjs.json",
 "build:esm": "yarn rrun tsc --outDir esm --module esnext --target es2022 && echo '{\"name\": \"gpt-tokenizer\", \"type\": \"module\"}' > ./esm/package.json",
 "build:umd": "yarn build:umd:cl100k_base && yarn build:umd:p50k_base && yarn build:umd:p50k_edit && yarn build:umd:r50k_base",
-"build:umd:cl100k_base": "beemo webpack --entry='./src/main.ts' --env 'outDir=dist' --env 'moduleTarget=umd' --env 'engineTarget=web' --env 'codeTarget=es2022' --env 'name=GPTTokenizer' --env 'export=default' --env 'filename=cl100k_base.js'",
-"build:umd:p50k_base": "beemo webpack --entry='./src/encoding/p50k_base.ts' --env 'outDir=dist' --env 'moduleTarget=umd' --env 'engineTarget=web' --env 'codeTarget=es2022' --env 'name=GPTTokenizer' --env 'export=default' --env 'filename=p50k_base.js'",
-"build:umd:p50k_edit": "beemo webpack --entry='./src/encoding/p50k_edit.ts' --env 'outDir=dist' --env 'moduleTarget=umd' --env 'engineTarget=web' --env 'codeTarget=es2022' --env 'name=GPTTokenizer' --env 'export=default' --env 'filename=p50k_edit.js'",
-"build:umd:r50k_base": "beemo webpack --entry='./src/encoding/r50k_base.ts' --env 'outDir=dist' --env 'moduleTarget=umd' --env 'engineTarget=web' --env 'codeTarget=es2022' --env 'name=GPTTokenizer' --env 'export=default' --env 'filename=r50k_base.js'",
+"build:umd:cl100k_base": "beemo webpack --entry='./src/main.ts' --env 'outDir=dist' --env 'moduleTarget=umd' --env 'engineTarget=web' --env 'codeTarget=es2022' --env 'name=GPTTokenizer_cl100k_base' --env 'export=api' --env 'filename=cl100k_base.js'",
+"build:umd:p50k_base": "beemo webpack --entry='./src/encoding/p50k_base.ts' --env 'outDir=dist' --env 'moduleTarget=umd' --env 'engineTarget=web' --env 'codeTarget=es2022' --env 'name=GPTTokenizer_p50k_base' --env 'export=api' --env 'filename=p50k_base.js'",
+"build:umd:p50k_edit": "beemo webpack --entry='./src/encoding/p50k_edit.ts' --env 'outDir=dist' --env 'moduleTarget=umd' --env 'engineTarget=web' --env 'codeTarget=es2022' --env 'name=GPTTokenizer_p50k_edit' --env 'export=api' --env 'filename=p50k_edit.js'",
+"build:umd:r50k_base": "beemo webpack --entry='./src/encoding/r50k_base.ts' --env 'outDir=dist' --env 'moduleTarget=umd' --env 'engineTarget=web' --env 'codeTarget=es2022' --env 'name=GPTTokenizer_r50k_base' --env 'export=api' --env 'filename=r50k_base.js'",
 "clean": "git clean -dfX --exclude=node_modules src && beemo typescript:sync-project-refs",
 "format": "yarn rrun prettier --write \"./{src,tests,.config}/**/!(*.d).{.js,jsx,ts,tsx,json,md}\"",
 "postinstallDev": "yarn prepare",

src/encoding/cl100k_base.ts (1 addition, 1 deletion)

@@ -5,7 +5,7 @@ import { GptEncoding } from '../GptEncoding.js'

 export * from '../specialTokens.js'

-const api = GptEncoding.getEncodingApi('cl100k_base', () =>
+export const api = GptEncoding.getEncodingApi('cl100k_base', () =>
   convertTokenBytePairEncodingFromTuples(encoder),
 )
 const {

src/encoding/p50k_base.ts (1 addition, 1 deletion)

@@ -5,7 +5,7 @@ import { GptEncoding } from '../GptEncoding.js'

 export * from '../specialTokens.js'

-const api = GptEncoding.getEncodingApi('p50k_base', () =>
+export const api = GptEncoding.getEncodingApi('p50k_base', () =>
   convertTokenBytePairEncodingFromTuples(encoder),
 )
 const {

src/encoding/p50k_edit.ts (1 addition, 1 deletion)

@@ -5,7 +5,7 @@ import { GptEncoding } from '../GptEncoding.js'

 export * from '../specialTokens.js'

-const api = GptEncoding.getEncodingApi('p50k_edit', () =>
+export const api = GptEncoding.getEncodingApi('p50k_edit', () =>
   convertTokenBytePairEncodingFromTuples(encoder),
 )
 const {

src/encoding/r50k_base.ts (1 addition, 1 deletion)

@@ -5,7 +5,7 @@ import { GptEncoding } from '../GptEncoding.js'

 export * from '../specialTokens.js'

-const api = GptEncoding.getEncodingApi('r50k_base', () =>
+export const api = GptEncoding.getEncodingApi('r50k_base', () =>
   convertTokenBytePairEncodingFromTuples(encoder),
 )
 const {
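
All four encoding modules receive the same one-line change: the `api` instance returned by `GptEncoding.getEncodingApi` is now exported by name, which is what the `export=api` webpack setting above points the UMD global at. A small sketch of consuming that named export directly; the import specifier is hypothetical, and treating `encode`/`decode` as methods on `api` is an assumption based on the destructuring that follows in each module:

```ts
// Hypothetical import path for illustration; the package's real subpath
// exports are not part of this diff.
import { api } from 'gpt-tokenizer/esm/encoding/cl100k_base.js'

// Assumption: `api` is the GptEncoding instance, and the functions the module
// re-exports (encode, decode, ...) are also available as methods on it.
const tokens = api.encode('Hello, world!')
console.log(api.decode(tokens))
```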
