- import gradio as gr
- import mindspore
- import mindnlp
from mindnlp import core
+ import gradio as gr
from janus.janusflow.models import MultiModalityCausalLM, VLChatProcessor
from PIL import Image
+ from transformers import DynamicCache
from diffusers.models import AutoencoderKL
import numpy as np

+ device = 'cpu'
+ if core.npu.is_available():
+     device = 'npu'
+ elif core.cuda.is_available():
+     device = 'cuda'
+
# Load model and processor
model_path = "deepseek-ai/JanusFlow-1.3B"
vl_chat_processor = VLChatProcessor.from_pretrained(model_path)
tokenizer = vl_chat_processor.tokenizer

- vl_gpt = MultiModalityCausalLM.from_pretrained(model_path, ms_dtype=mindspore.float16)
- vl_gpt = vl_gpt.eval()
+ vl_gpt = MultiModalityCausalLM.from_pretrained(model_path)
+ vl_gpt = vl_gpt.to(core.bfloat16).to(device).eval()

# remember to use bfloat16 dtype, this vae doesn't work with fp16
- vae = AutoencoderKL.from_pretrained("stabilityai/sdxl-vae", ms_dtype=mindspore.float16)
- vae = vae.eval()
+ vae = AutoencoderKL.from_pretrained("stabilityai/sdxl-vae")
+ vae = vae.to(core.bfloat16).to(device).eval()

+ # Multimodal Understanding function
+ @core.inference_mode()
# Multimodal Understanding function
def multimodal_understanding(image, question, seed, top_p, temperature):
+     # Clear CUDA cache before generating
+     core.cuda.empty_cache()
+
    # set seed
-     mindspore.manual_seed(seed)
+     core.manual_seed(seed)
    np.random.seed(seed)
+     core.cuda.manual_seed(seed)

    conversation = [
        {
@@ -37,9 +48,9 @@ def multimodal_understanding(image, question, seed, top_p, temperature):
    pil_images = [Image.fromarray(image)]
    prepare_inputs = vl_chat_processor(
        conversations=conversation, images=pil_images, force_batchify=True
-     ).to(core.get_default_device(), mindspore.float16)
-
-
+     ).to(device, dtype=core.bfloat16 if core.cuda.is_available() else core.float16)
+
+
    inputs_embeds = vl_gpt.prepare_inputs_embeds(**prepare_inputs)

    outputs = vl_gpt.language_model.generate(
@@ -60,13 +71,14 @@ def multimodal_understanding(image, question, seed, top_p, temperature):
    return answer


+ @core.inference_mode()
def generate(
    input_ids,
    cfg_weight: float = 2.0,
    num_inference_steps: int = 30
):
    # we generate 5 images at a time, *2 for CFG
-     tokens = core.stack([input_ids] * 10)
+     tokens = core.stack([input_ids] * 10).cuda()
    tokens[5:, 1:] = vl_chat_processor.pad_id
    inputs_embeds = vl_gpt.language_model.get_input_embeddings()(tokens)
    print(inputs_embeds.shape)
@@ -76,10 +88,10 @@ def generate(

    # generate with rectified flow ode
    # step 1: encode with vision_gen_enc
-     z = core.randn((5, 4, 48, 48), dtype=mindspore.float16)
+     z = core.randn((5, 4, 48, 48), dtype=core.bfloat16).cuda()

    dt = 1.0 / num_inference_steps
-     dt = core.zeros_like(z).to(mindspore.float16) + dt
+     dt = core.zeros_like(z).cuda().to(core.bfloat16) + dt

    # step 2: run ode
    attention_mask = core.ones((10, inputs_embeds.shape[1]+577)).to(vl_gpt.device)
@@ -103,18 +115,21 @@ def generate(
                                                  use_cache=True,
                                                  attention_mask=attention_mask,
                                                  past_key_values=None)
-             past_key_values = []
-             for kv_cache in past_key_values:
-                 k, v = kv_cache[0], kv_cache[1]
-                 past_key_values.append((k[:, :, :inputs_embeds.shape[1], :], v[:, :, :inputs_embeds.shape[1], :]))
-             past_key_values = tuple(past_key_values)
+             past_key_values = DynamicCache.from_legacy_cache(outputs.past_key_values)
+
        else:
            outputs = vl_gpt.language_model.model(inputs_embeds=llm_emb,
                                                  use_cache=True,
                                                  attention_mask=attention_mask,
                                                  past_key_values=past_key_values)
+             past_key_values = []
+             for kv_cache in outputs.past_key_values:
+                 k, v = kv_cache[0], kv_cache[1]
+                 past_key_values.append((k[:, :, :inputs_embeds.shape[1], :], v[:, :, :inputs_embeds.shape[1], :]))
+             past_key_values = DynamicCache.from_legacy_cache(past_key_values)
+
        hidden_states = outputs.last_hidden_state
-
+
        # transform hidden_states back to v
        hidden_states = vl_gpt.vision_gen_dec_aligner(vl_gpt.vision_gen_dec_aligner_norm(hidden_states[:, -576:, :]))
        hidden_states = hidden_states.reshape(z_emb.shape[0], 24, 24, 768).permute(0, 3, 1, 2)
@@ -141,13 +156,17 @@ def unpack(dec, width, height, parallel_size=5):
    return visual_img


+ @core.inference_mode()
def generate_image(prompt,
                   seed=None,
                   guidance=5,
                   num_inference_steps=30):
+     # Clear CUDA cache and avoid tracking gradients
+     core.cuda.empty_cache()
    # Set the seed for reproducible results
    if seed is not None:
-         mindspore.manual_seed(seed)
+         core.manual_seed(seed)
+         core.cuda.manual_seed(seed)
        np.random.seed(seed)

    with core.no_grad():
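
For reference, a minimal standalone sketch (not part of this commit) of the KV-cache handling the new `+` lines rely on: the legacy tuple-of-tuples cache returned by the language model is truncated to the prompt length and wrapped in a transformers `DynamicCache`, so the same prompt prefix can be reused at every rectified-flow step. It assumes `mindnlp.core` tensors behave like torch tensors and are accepted by `DynamicCache.from_legacy_cache`; the helper name and shapes are illustrative only.

```python
# Illustrative sketch only -- not part of this commit.
# Assumes mindnlp.core tensors behave like torch tensors and that
# transformers.DynamicCache.from_legacy_cache accepts them.
from mindnlp import core
from transformers import DynamicCache

def prompt_only_cache(legacy_cache, prompt_len):
    """Keep the first `prompt_len` positions of each (key, value) pair and
    wrap the result in a DynamicCache for reuse on later ODE steps."""
    trimmed = []
    for k, v in legacy_cache:
        # k, v: [batch, num_heads, seq_len, head_dim]
        trimmed.append((k[:, :, :prompt_len, :], v[:, :, :prompt_len, :]))
    return DynamicCache.from_legacy_cache(tuple(trimmed))

# Toy example: 2 layers, batch 10 (5 images * 2 for CFG), 4 heads, head_dim 8.
legacy = tuple(
    (core.randn((10, 4, 16, 8)), core.randn((10, 4, 16, 8))) for _ in range(2)
)
cache = prompt_only_cache(legacy, prompt_len=12)
```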