
Commit a6d4930

benbrandt and zurawiki authored
Update to latest tiktoken + new model support (#106)
Co-authored-by: Roger Zurawicki <[email protected]>
1 parent f597108 commit a6d4930

5 files changed: 132 additions, 302 deletions


.gitmodules

Lines changed: 1 addition & 1 deletion
@@ -1,4 +1,4 @@
 [submodule "vendor/tiktoken"]
 	path = vendor/tiktoken
 	url = https://github.com/openai/tiktoken.git
-	ref = refs/tags/0.8.0
+	ref = refs/tags/0.9.0

tiktoken-rs/src/patched_tiktoken.rs

Lines changed: 9 additions & 27 deletions
@@ -3,10 +3,6 @@ use anyhow::anyhow;
 use anyhow::Result;
 use fancy_regex::Regex;
 use rustc_hash::FxHashMap as HashMap;
-use std::collections::HashSet;
-
-// used to handle errors in the core below
-impl std::error::Error for DecodeKeyError {}
 
 /// Rust API
 impl CoreBPE {
@@ -21,20 +17,23 @@ impl CoreBPE {
         special_tokens_encoder: HashMap<String, Rank>,
         pattern: &str,
     ) -> Result<Self> {
-        let regex = Regex::new(pattern).map_err(|e| anyhow!(e.to_string()))?;
+        let regex = Regex::new(pattern)?;
 
         let special_regex = {
-            let _parts = special_tokens_encoder
+            let parts = special_tokens_encoder
                 .keys()
                 .map(|s| fancy_regex::escape(s))
                 .collect::<Vec<_>>();
-            Regex::new(&_parts.join("|")).map_err(|e| anyhow!(e.to_string()))?
+            Regex::new(&parts.join("|"))?
         };
 
         let decoder: HashMap<Rank, Vec<u8>> =
             encoder.iter().map(|(k, v)| (*v, k.clone())).collect();
 
-        assert!(encoder.len() == decoder.len());
+        assert!(
+            encoder.len() == decoder.len(),
+            "Encoder and decoder must be of equal length; maybe you had duplicate token indices in your encoder?"
+        );
 
         let special_tokens_decoder: HashMap<Rank, Vec<u8>> = special_tokens_encoder
             .iter()
@@ -45,7 +44,7 @@ impl CoreBPE {
         let mut sorted_token_bytes: Vec<Vec<u8>> = encoder.keys().cloned().collect();
         sorted_token_bytes.sort();
 
-        Ok(CoreBPE {
+        Ok(Self {
             encoder,
             special_tokens_encoder,
             decoder,
@@ -58,23 +57,6 @@ impl CoreBPE {
         })
     }
 
-    pub fn encode_ordinary(&self, text: &str) -> Vec<Rank> {
-        self._encode_ordinary_native(text)
-    }
-
-    pub fn encode(&self, text: &str, allowed_special: HashSet<&str>) -> Vec<Rank> {
-        self._encode_native(text, &allowed_special).0
-    }
-
-    pub fn encode_with_special_tokens(&self, text: &str) -> Vec<Rank> {
-        let allowed_special = self
-            .special_tokens_encoder
-            .keys()
-            .map(|s| s.as_str())
-            .collect();
-        self._encode_native(text, &allowed_special).0
-    }
-
     // ====================
     // Decoding
     // ====================
@@ -83,7 +65,7 @@ impl CoreBPE {
     ///
     /// If unicode validation is not wanted, see _decode_native.
    pub fn decode(&self, tokens: Vec<Rank>) -> Result<String> {
-        match String::from_utf8(self._decode_native(&tokens)?) {
+        match String::from_utf8(self.decode_bytes(&tokens)?) {
         Ok(text) => Ok(text),
         Err(e) => Err(anyhow!("Unable to decode into a valid UTF-8 string: {}", e)),
     }
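
For orientation, a minimal round-trip sketch against the tiktoken-rs public API. It assumes o200k_base(), encode_with_special_tokens(), and decode() keep their current shapes (the encode methods deleted above are presumably now provided by the vendored upstream implementation); this is illustrative, not part of the diff.

// Hypothetical usage sketch; not part of this commit.
use tiktoken_rs::o200k_base;

fn main() -> anyhow::Result<()> {
    // Load the o200k_base BPE used by the newly supported models.
    let bpe = o200k_base()?;

    // Encode, permitting any registered special-token strings in the input.
    let tokens = bpe.encode_with_special_tokens("hello world");

    // decode() now routes through decode_bytes() and then validates UTF-8.
    let text = bpe.decode(tokens)?;
    assert_eq!(text, "hello world");
    Ok(())
}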

tiktoken-rs/src/tokenizer.rs

Lines changed: 8 additions & 0 deletions
@@ -32,13 +32,17 @@ pub enum Tokenizer {
 // https://github.com/openai/tiktoken/blob/63527649963def8c759b0f91f2eb69a40934e468/tiktoken/model.py#L7
 const MODEL_PREFIX_TO_TOKENIZER: &[(&str, Tokenizer)] = &[
     ("o1-", Tokenizer::O200kBase),
+    ("o3-", Tokenizer::O200kBase),
+    ("o4-", Tokenizer::O200kBase),
     // chat
+    ("gpt-4.1-", Tokenizer::O200kBase),
     ("chatgpt-4o-", Tokenizer::O200kBase),
     ("gpt-4o-", Tokenizer::O200kBase),
     ("gpt-4-", Tokenizer::Cl100kBase),
     ("gpt-3.5-turbo-", Tokenizer::Cl100kBase),
     ("gpt-35-turbo-", Tokenizer::Cl100kBase),
     // fine-tuned
+    ("ft:gpt-4o", Tokenizer::O200kBase),
     ("ft:gpt-4", Tokenizer::Cl100kBase),
     ("ft:gpt-3.5-turbo", Tokenizer::Cl100kBase),
     ("ft:davinci-002", Tokenizer::Cl100kBase),
@@ -48,7 +52,11 @@ const MODEL_PREFIX_TO_TOKENIZER: &[(&str, Tokenizer)] = &[
 // Keep this in sync with:
 // https://github.com/openai/tiktoken/blob/63527649963def8c759b0f91f2eb69a40934e468/tiktoken/model.py#L22
 const MODEL_TO_TOKENIZER: &[(&str, Tokenizer)] = &[
+    // reasoning
+    ("o1", Tokenizer::O200kBase),
+    ("o3", Tokenizer::O200kBase),
     // chat
+    ("gpt-4.1", Tokenizer::O200kBase),
     ("chatgpt-4o-latest", Tokenizer::O200kBase),
     ("gpt-4o", Tokenizer::O200kBase),
     ("gpt-4", Tokenizer::Cl100kBase),
