
Commit 55d66e2

refinements
1 parent af84d7b commit 55d66e2

7 files changed: +199 -27 lines changed

ch05/11_qwen3/README.md

Lines changed: 54 additions & 17 deletions
@@ -1,12 +1,18 @@
 # Qwen3 From Scratch
 
-This [standalone-qwen3.ipynb](standalone-qwen3.ipynb) Jupyter notebook in this folder contains a from-scratch implementation of Qwen3 0.6B, 1.7B, 4B, 8B, and 32 B.
+This [standalone-qwen3.ipynb](standalone-qwen3.ipynb) Jupyter notebook in this folder contains a from-scratch implementation of Qwen3 0.6B, 1.7B, 4B, 8B, and 32B.
 
 <img src="https://sebastianraschka.com/images/LLMs-from-scratch-images/bonus/qwen/qwen-overview.webp">
 
 
+The [standalone-qwen3-moe.ipynb](standalone-qwen3-moe.ipynb) and [standalone-qwen3-moe-plus-kvcache.ipynb](standalone-qwen3-moe-plus-kvcache.ipynb) Jupyter notebooks in this folder contain a from-scratch implementation of the Qwen3 30B-A3B Mixture-of-Experts (MoE) model, including the Thinking, Instruct, and Coder model variants.
+
+<img src="https://sebastianraschka.com/images/LLMs-from-scratch-images/bonus/qwen/qwen3-coder-flash-overview.webp?123" width="430px">
+
+
+
 &nbsp;
-### Using Qwen3 via the `llms-from-scratch` package
+# Using Qwen3 via the `llms-from-scratch` package
 
 For an easy way to use the Qwen3 from-scratch implementation, you can also use the `llms-from-scratch` PyPI package based on the source code in this repository at [pkg/llms_from_scratch](../../pkg/llms_from_scratch).
 
@@ -23,8 +29,9 @@ pip install llms_from_scratch tokenizers
 Specify which model to use:
 
 ```python
-USE_REASONING_MODEL = True   # The "thinking" model
 USE_REASONING_MODEL = False  # The base model
+USE_REASONING_MODEL = True   # The "thinking" model
+
 
 # Use
 # USE_REASONING_MODEL = True
@@ -130,22 +137,22 @@ from llms_from_scratch.qwen3 import (
     load_weights_into_qwen
 )
 
-model = Qwen3Model(QWEN3_CONFIG)
+device = (
+    torch.device("cuda") if torch.cuda.is_available() else
+    torch.device("mps") if torch.backends.mps.is_available() else
+    torch.device("cpu")
+)
+
+with device:
+    model = Qwen3Model(QWEN3_CONFIG)
 
 weights_dict = download_from_huggingface_from_snapshots(
     repo_id=repo_id,
     local_dir=local_dir
 )
 load_weights_into_qwen(model, QWEN3_CONFIG, weights_dict)
+model.to(device)  # only required for the MoE models
 del weights_dict  # delete weight dictionary to free up disk space
-
-device = (
-    torch.device("cuda") if torch.cuda.is_available() else
-    torch.device("mps") if torch.backends.mps.is_available() else
-    torch.device("cpu")
-)
-
-model.to(device);
 ```
 
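The `with device:` pattern in the hunk above relies on PyTorch's device context manager (available since PyTorch 2.0), which makes factory calls and module constructors allocate parameters directly on the target device instead of creating them on the CPU and moving them afterwards. A minimal sketch of the effect, illustrative only (`nn.Linear` stands in here for the much larger `Qwen3Model`):

```python
import torch
import torch.nn as nn

device = (
    torch.device("cuda") if torch.cuda.is_available() else
    torch.device("mps") if torch.backends.mps.is_available() else
    torch.device("cpu")
)

# Entering a torch.device context makes newly created tensors and module
# parameters land on that device by default.
with device:
    layer = nn.Linear(4, 4)

print(layer.weight.device)  # e.g. cuda:0, mps:0, or cpu
```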

@@ -236,6 +243,33 @@ Large language models (LLMs) are advanced artificial intelligence systems design
 
 
 
+For the larger models, you may prefer the streaming variant, which prints each token as soon as it's generated:
+
+```python
+from llms_from_scratch.generate import generate_text_simple_stream
+
+input_token_ids_tensor = torch.tensor(input_token_ids, device=device).unsqueeze(0)
+
+for token in generate_text_simple_stream(
+    model=model,
+    token_ids=input_token_ids_tensor,
+    max_new_tokens=150,
+    eos_token_id=tokenizer.eos_token_id
+):
+    token_id = token.squeeze(0).tolist()
+    print(
+        tokenizer.decode(token_id),
+        end="",
+        flush=True
+    )
+```
+
+```
+<|im_start|>user
+Give me a short introduction to large language models.<|im_end|>
+Large language models (LLMs) are advanced artificial intelligence systems designed to generate human-like text. They are trained on vast amounts of text data, allowing them to understand and generate coherent, contextually relevant responses. LLMs are used in a variety of applications, including chatbots, virtual assistants, content generation, and more. They are powered by deep learning algorithms and can be fine-tuned for specific tasks, making them versatile tools for a wide range of industries.<|endoftext|>Human resources department of a company is planning to hire 100 new employees. The company has a budget of $100,000 for the recruitment process. The company has a minimum wage of $10 per hour. The company has a total of...
+```
+
 
 
 &nbsp;
@@ -252,18 +286,19 @@ model.to(device)
 with
 
 ```python
-model = torch.compile(model)
 model.to(device)
+model = torch.compile(model)
 ```
 
 Note: There is a significant multi-minute upfront cost when compiling, and the speed-up takes effect after the first `generate` call.
 
 The following table shows a performance comparison on an A100 for consequent `generate` calls:
 
-|                          | Tokens/sec | Memory  |
-| ------------------------ | ---------- | ------- |
-| Qwen3Model 0.6B          | 25         | 1.49 GB |
-| Qwen3Model 0.6B compiled | 107        | 1.99 GB |
+|                          | Hardware        | Tokens/sec | Memory  |
+| ------------------------ | --------------- | ---------- | ------- |
+| Qwen3Model 0.6B          | Nvidia A100 GPU | 25         | 1.49 GB |
+| Qwen3Model 0.6B compiled | Nvidia A100 GPU | 107        | 1.99 GB |
+
 
 &nbsp;
 #### Pro tip 2: speed up inference with KV cache
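The tokens/sec figures in the table above can be approximated with a simple timing loop. A rough, illustrative sketch only (not the repository's benchmark script); it assumes `model`, `device`, `input_token_ids`, and the `generate_text_simple` variant imported earlier are already set up as in the preceding snippets:

```python
import time
import torch

input_ids = torch.tensor(input_token_ids, device=device).unsqueeze(0)

# Synchronize so the timer measures actual GPU work, not just kernel launches
if torch.cuda.is_available():
    torch.cuda.synchronize()
start = time.time()

output_ids = generate_text_simple(
    model=model,
    idx=input_ids,
    max_new_tokens=150,
)

if torch.cuda.is_available():
    torch.cuda.synchronize()
elapsed = time.time() - start

num_new_tokens = output_ids.shape[1] - input_ids.shape[1]
print(f"{num_new_tokens / elapsed:.1f} tokens/sec")
if torch.cuda.is_available():
    print(f"{torch.cuda.max_memory_allocated() / 1e9:.2f} GB peak memory")
```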
@@ -305,6 +340,8 @@ Note that the peak memory usage is only listed for Nvidia CUDA devices, as it is
 
 Note that all settings above have been tested to produce the same text outputs.
 
+
+
 &nbsp;
 
 #### Pro tip 3: batched inference

ch05/11_qwen3/standalone-qwen3-moe-plus-kvcache.ipynb

Lines changed: 1 addition & 1 deletion
@@ -55,7 +55,7 @@
 "\n",
 "<br>\n",
 "\n",
-"<img src=\"https://sebastianraschka.com/images/LLMs-from-scratch-images/bonus/qwen/qwen3-coder-flash-overview.webp?123\" width=\"700px\">\n",
+"<img src=\"https://sebastianraschka.com/images/LLMs-from-scratch-images/bonus/qwen/qwen3-coder-flash-overview.webp?123\" width=\"600px\">\n",
 "\n",
 "<br>\n",
 " \n",

ch05/11_qwen3/standalone-qwen3-moe.ipynb

Lines changed: 1 addition & 1 deletion
@@ -55,7 +55,7 @@
 "\n",
 "<br>\n",
 "\n",
-"<img src=\"https://sebastianraschka.com/images/LLMs-from-scratch-images/bonus/qwen/qwen3-coder-flash-overview.webp?123\" width=\"700px\">\n",
+"<img src=\"https://sebastianraschka.com/images/LLMs-from-scratch-images/bonus/qwen/qwen3-coder-flash-overview.webp?123\" width=\"600px\">\n",
 "\n",
 "<br>\n",
 " \n",

ch05/11_qwen3/standalone-qwen3.ipynb

Lines changed: 0 additions & 6 deletions
@@ -925,12 +925,6 @@
 "        self.add_thinking = add_thinking\n",
 "\n",
 "        tok_file = Path(tokenizer_file_path)\n",
-"        if not tok_file.is_file() and repo_id:\n",
-"            download_from_huggingface(\n",
-"                repo_id=repo_id,\n",
-"                filename=tok_file.name,\n",
-"                local_dir=str(tok_file.parent),\n",
-"            )\n",
 "        self._tok = Tokenizer.from_file(str(tok_file))\n",
 "        self._special_to_id = {t: self._tok.token_to_id(t) for t in self._SPECIALS}\n",
 "\n",

pkg/llms_from_scratch/README.md

Lines changed: 8 additions & 2 deletions
@@ -160,10 +160,16 @@ from llms_from_scratch.qwen3 import (
 
 # KV cache drop-in replacements
 from llms_from_scratch.kv_cache.qwen3 import Qwen3Model
-from llms_from_scratch.kv_cache.generate import generate_text_simple
+from llms_from_scratch.kv_cache.generate import (
+    generate_text_simple,
+    generate_text_simple_stream
+)
 
 # KV cache drop-in replacements with batched inference support
-from llms_from_scratch.kv_cache_batched.generate import generate_text_simple
+from llms_from_scratch.kv_cache_batched.generate import (
+    generate_text_simple,
+    generate_text_simple_stream
+)
 from llms_from_scratch.kv_cache_batched.qwen3 import Qwen3Model
 ```
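Since the KV-cache and batched modules export the same function and class names, importing both sets into a single script shadows the earlier import. A small sketch of one way to keep them apart; the aliases below are illustrative, not part of the package API:

```python
from llms_from_scratch.kv_cache.generate import (
    generate_text_simple as generate_kv,
    generate_text_simple_stream as generate_stream_kv,
)
from llms_from_scratch.kv_cache_batched.generate import (
    generate_text_simple as generate_kv_batched,
    generate_text_simple_stream as generate_stream_kv_batched,
)
from llms_from_scratch.kv_cache.qwen3 import Qwen3Model as Qwen3ModelKV
from llms_from_scratch.kv_cache_batched.qwen3 import Qwen3Model as Qwen3ModelKVBatched
```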

pkg/llms_from_scratch/kv_cache/generate.py

Lines changed: 24 additions & 0 deletions
@@ -28,3 +28,27 @@ def generate_text_simple(model, idx, max_new_tokens, context_size=None, use_cach
         idx = torch.cat([idx, next_idx], dim=1)
 
     return idx
+
+
+def generate_text_simple_stream(model, token_ids, max_new_tokens, eos_token_id=None, context_size=None):
+    model.eval()
+
+    with torch.no_grad():
+        cache = KVCache(n_layers=model.cfg["n_layers"])
+        model.reset_kv_cache()
+
+        # Prime the cache with the initial context
+        logits = model(token_ids, cache=cache)
+
+        for _ in range(max_new_tokens):
+            next_token = torch.argmax(logits[:, -1], dim=-1, keepdim=True)
+
+            if eos_token_id is not None and torch.all(next_token == eos_token_id):
+                break
+
+            yield next_token
+
+            token_ids = torch.cat([token_ids, next_token], dim=1)
+
+            # Feed only the new token to the model; cache handles history
+            logits = model(next_token, cache=cache)
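A minimal consumption sketch for the new generator. Assumptions: `model` is a KV-cache `Qwen3Model`, `device` is set as in the README, and `tokenizer` exposes `encode`/`decode`/`eos_token_id` as used elsewhere in this repository; the prompt string is only an example:

```python
import torch
from llms_from_scratch.kv_cache.generate import generate_text_simple_stream

prompt_ids = torch.tensor(
    tokenizer.encode("Give me a short introduction to large language models."),
    device=device,
).unsqueeze(0)

collected = []
for next_token in generate_text_simple_stream(
    model=model,
    token_ids=prompt_ids,
    max_new_tokens=50,
    eos_token_id=tokenizer.eos_token_id,
):
    collected.append(next_token)
    # Decode and print each token as soon as it is yielded
    print(tokenizer.decode(next_token.squeeze(0).tolist()), end="", flush=True)

# Reassemble the full sequence, mirroring what generate_text_simple returns
full_ids = torch.cat([prompt_ids, *collected], dim=1)
```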

pkg/llms_from_scratch/tests/test_qwen3.py

Lines changed: 111 additions & 0 deletions
@@ -13,6 +13,7 @@
     Qwen3Tokenizer
 )
 from llms_from_scratch.kv_cache.qwen3 import Qwen3Model as Qwen3ModelKV
+from llms_from_scratch.kv_cache.utils import KVCache
 from llms_from_scratch.kv_cache.generate import generate_text_simple as generate_text_simple_cached
 
 from llms_from_scratch.kv_cache_batched.qwen3 import Qwen3Model as Qwen3ModelKVBatched
@@ -50,6 +51,116 @@ def extra_repr(self):
 transformers_installed = importlib.util.find_spec("transformers") is not None
 
 
+@pytest.fixture
+def dummy_input():
+    torch.manual_seed(123)
+    return torch.randint(0, 100, (1, 8))  # batch size 1, seq length 8
+
+
+@pytest.fixture
+def dummy_cfg_base():
+    return {
+        "vocab_size": 100,
+        "emb_dim": 32,
+        "hidden_dim": 64,
+        "n_layers": 2,
+        "n_heads": 4,
+        "head_dim": 8,
+        "n_kv_groups": 1,
+        "qk_norm": False,
+        "dtype": torch.float32,
+        "rope_base": 10000,
+        "context_length": 64,
+        "num_experts": 0,
+    }
+
+
+@pytest.fixture
+def dummy_cfg_moe(dummy_cfg_base):
+    cfg = dummy_cfg_base.copy()
+    cfg.update({
+        "num_experts": 4,
+        "num_experts_per_tok": 2,
+        "moe_intermediate_size": 64,
+    })
+    return cfg
+
+
+def test_dummy_qwen3_forward(dummy_cfg_base, dummy_input):
+    model = Qwen3Model(dummy_cfg_base)
+    out = model(dummy_input)
+    assert out.shape == (1, dummy_input.size(1), dummy_cfg_base["vocab_size"]), \
+        f"Expected shape (1, seq_len, vocab_size), got {out.shape}"
+
+
+def test_dummy_qwen3_moe_forward(dummy_cfg_moe, dummy_input):
+    model = Qwen3Model(dummy_cfg_moe)
+    out = model(dummy_input)
+    assert out.shape == (1, dummy_input.size(1), dummy_cfg_moe["vocab_size"]), \
+        f"Expected shape (1, seq_len, vocab_size), got {out.shape}"
+    assert any(hasattr(block.ff, 'gate') for block in model.trf_blocks), \
+        "Expected MoEFeedForward in at least one transformer block"
+
+
+def test_qwen3_base_kvcache_equivalence(dummy_cfg_base):
+    model_regular = Qwen3Model(dummy_cfg_base)
+    model_regular.eval()
+
+    model_kv = Qwen3ModelKV(dummy_cfg_base)
+    model_kv.eval()
+    model_kv.load_state_dict(model_regular.state_dict())  # ensure same weights
+
+    model_kv.reset_kv_cache()
+    cache = KVCache(n_layers=dummy_cfg_base["n_layers"])
+
+    torch.manual_seed(123)
+    input_ids = torch.randint(0, dummy_cfg_base["vocab_size"], (1, 6))  # batch_size=1, seq_len=6
+
+    # full-sequence output
+    out_full = model_regular(input_ids)
+
+    # stepwise with KV cache
+    logits_stepwise = []
+    for t in range(input_ids.size(1)):
+        input_token = input_ids[:, t:t + 1]  # shape (1,1)
+        logits = model_kv(input_token, cache=cache)
+        logits_stepwise.append(logits)
+
+    out_kv = torch.cat(logits_stepwise, dim=1)
+
+    assert out_full.shape == out_kv.shape, f"Shape mismatch: {out_full.shape} vs {out_kv.shape}"
+    assert torch.allclose(out_full, out_kv, atol=1e-5, rtol=1e-3)
+
+
+@pytest.mark.parametrize("cfg_name", ["dummy_cfg_base", "dummy_cfg_moe"])
+def test_qwen3_moe_kvcache_equivalence(cfg_name, request):
+    cfg = request.getfixturevalue(cfg_name)  # resolve the fixture name to its config dict
+    model_regular = Qwen3Model(cfg)
+    model_regular.eval()
+    torch.manual_seed(123)
+    input_ids = torch.randint(0, cfg["vocab_size"], (1, 6))  # batch_size=1, seq_len=6
+
+    # No KV cache forward
+    out_full = model_regular(input_ids)
+
+    # Now with KV cache
+    model_kv = Qwen3ModelKV(cfg)
+    model_kv.eval()
+    model_kv.reset_kv_cache()
+    cache = KVCache(n_layers=cfg["n_layers"])
+
+    logits_stepwise = []
+    for t in range(input_ids.size(1)):
+        input_token = input_ids[:, t:t+1]  # shape (1, 1)
+        logits = model_kv(input_token, cache=cache)
+        logits_stepwise.append(logits)
+
+    # Concatenate all stepwise outputs
+    out_kv = torch.cat(logits_stepwise, dim=1)
+
+    assert torch.allclose(out_full, out_kv, atol=1e-5, rtol=1e-3)
+
+
 @pytest.mark.skipif(not transformers_installed, reason="transformers not installed")
 def test_rope():
 