tangbinh
diff --git a/‎docs/user_guide/diffusion/parallelism_acceleration.md‎
Lines changed: 1 addition & 1 deletion b/‎docs/user_guide/diffusion/parallelism_acceleration.md‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎vllm_omni/diffusion/layers/rope.py‎
Lines changed: 26 additions & 0 deletions b/‎vllm_omni/diffusion/layers/rope.py‎
Lines changed: 26 additions & 0 deletions
diff --git a/‎vllm_omni/diffusion/models/flux/flux_transformer.py‎
Lines changed: 2 additions & 7 deletions b/‎vllm_omni/diffusion/models/flux/flux_transformer.py‎
Lines changed: 2 additions & 7 deletions
@@ -35,7 +35,7 @@ The following table shows which models are currently supported by parallelism me
 | **Qwen-Image-Layered**   | `Qwen/Qwen-Image-Layered`            |     ✅      |    ✅    |      ✅       |        ✅        |         ❌          |        N/A        |
 | **Z-Image**              | `Tongyi-MAI/Z-Image-Turbo`           |     ✅      |    ✅    |      ❌       |  ✅ (TP=2 only)  |         ✅          |        N/A        |
 | **Stable-Diffusion3.5**  | `stabilityai/stable-diffusion-3.5`   |     ❌      |    ❌    |      ❌       |        ✅        |         ✅          |        N/A        |
-| **FLUX.2-klein**         | `black-forest-labs/FLUX.2-klein-4B`  |     ❌      |    ❌    |      ❌       |        ✅        |         ❌          |        N/A        |
+| **FLUX.2-klein**         | `black-forest-labs/FLUX.2-klein-4B`  |     ✅      |    ✅    |      ❌       |        ✅        |         ❌          |        N/A        |
 | **FLUX.1-dev**           | `black-forest-labs/FLUX.1-dev`       |     ❌      |    ❌    |      ✅       |        ✅        |         ❌          |        N/A        |
 | **FLUX.2-dev**           | `black-forest-labs/FLUX.2-dev`       |     ❌      |    ❌    |      ❌       |        ✅        |         ❌          |        N/A        |
 | **HunyuanImage3.0**      | `tencent/HunyuanImage-3.0`, `tencent/HunyuanImage-3.0-Instruct` |     ❌      |    ❌    |      ❌       |        ✅        |         ❌          |        ✅        |
 
@@ -157,3 +157,29 @@ def forward_native(
             sin,
             interleaved=self.interleaved,
         )
+
+
+def apply_rope_to_qk(
+    rope: RotaryEmbedding,
+    query: torch.Tensor,
+    key: torch.Tensor,
+    image_rotary_emb: tuple[torch.Tensor, torch.Tensor] | None,
+) -> tuple[torch.Tensor, torch.Tensor]:
+    """Apply rotary positional embeddings to query and key tensors.
+
+    Unpacks the (cos, sin) pair, casts both to the query's dtype, and runs
+    ``rope`` on query and key with the same pair. When no embeddings are
+    supplied the inputs are handed back untouched.
+
+    Args:
+        rope: RotaryEmbedding instance for applying position embeddings;
+            invoked as ``rope(x, cos, sin)``
+        query: Query tensor [B, S, H, D]
+        key: Key tensor [B, S, H, D]
+        image_rotary_emb: Tuple of (cos, sin) tensors or None
+            (presumably each [S, D/2] -- confirm against the caller)
+
+    Returns:
+        Tuple of (query, key) with RoPE applied if rotary embeddings provided;
+        the original (query, key) objects when ``image_rotary_emb`` is None
+    """
+    if image_rotary_emb is not None:
+        cos, sin = image_rotary_emb
+        # Cast the tables to the query's dtype before rotating. The key is
+        # rotated with these same casted tables, so this assumes key shares
+        # query's dtype -- TODO confirm with callers.
+        cos = cos.to(query.dtype)
+        sin = sin.to(query.dtype)
+        query = rope(query, cos, sin)
+        key = rope(key, cos, sin)
+    return query, key
@@ -30,7 +30,7 @@
 
 from vllm_omni.diffusion.attention.layer import Attention
 from vllm_omni.diffusion.data import OmniDiffusionConfig
-from vllm_omni.diffusion.layers.rope import RotaryEmbedding
+from vllm_omni.diffusion.layers.rope import RotaryEmbedding, apply_rope_to_qk
 
 logger = init_logger(__name__)
 
@@ -224,12 +224,7 @@ def forward(
             key = torch.cat([encoder_key, key], dim=1)
             value = torch.cat([encoder_value, value], dim=1)
 
-        if image_rotary_emb is not None:
-            cos, sin = image_rotary_emb  # [S, D/2]
-            cos = cos.to(query.dtype)
-            sin = sin.to(query.dtype)
-            query = self.rope(query, cos, sin)
-            key = self.rope(key, cos, sin)
+        query, key = apply_rope_to_qk(self.rope, query, key, image_rotary_emb)  # [S, D/2]
 
         hidden_states = self.attn(
             query,