-
Notifications
You must be signed in to change notification settings - Fork 95
Expand file tree
/
Copy pathapp.py
More file actions
270 lines (243 loc) · 12.6 KB
/
app.py
File metadata and controls
270 lines (243 loc) · 12.6 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
import tempfile
import time
import gradio as gr
import spaces
import torch
from diffusers import AutoencoderKLWan, HeliosDMDScheduler, HeliosPyramidPipeline
from diffusers.utils import export_to_video, load_image, load_video
# ---------------------------------------------------------------------------
# Pre-load model
# ---------------------------------------------------------------------------
# Module-level side effect: weights are downloaded and the whole pipeline is
# moved to the GPU at import time, so the Space is ready before the first request.
MODEL_ID = "BestWishYsh/Helios-Distilled"
# The VAE is kept in float32 while the rest of the pipeline runs in bfloat16.
vae = AutoencoderKLWan.from_pretrained(MODEL_ID, subfolder="vae", torch_dtype=torch.float32)
scheduler = HeliosDMDScheduler.from_pretrained(MODEL_ID, subfolder="scheduler")
pipe = HeliosPyramidPipeline.from_pretrained(
    MODEL_ID, vae=vae, scheduler=scheduler, torch_dtype=torch.bfloat16, is_distilled=True
)
pipe.to("cuda")
# Prefer the Flash-Attention 3 hub backend; fall back to the generic flash hub
# backend on hardware/builds where FA3 is unavailable.
try:
    pipe.transformer.set_attention_backend("_flash_3_hub")
except Exception:
    pipe.transformer.set_attention_backend("flash_hub")
# Optional ahead-of-time compilation path for HF Spaces (currently disabled).
# @spaces.GPU(duration=1500)
# def compile_transformer():
#     with spaces.aoti_capture(pipe.transformer) as call:
#         pipe("arbitrary example prompt")
#     exported = torch.export.export(
#         pipe.transformer,
#         args=call.args,
#         kwargs=call.kwargs,
#     )
#     return spaces.aoti_compile(exported)
# compiled_transformer = compile_transformer()
# spaces.aoti_apply(compiled_transformer, pipe.transformer)
# ---------------------------------------------------------------------------
# Generation
# ---------------------------------------------------------------------------
@spaces.GPU(duration=300)
def generate_video(
    mode: str,
    prompt: str,
    image_input,
    video_input,
    height: int,
    width: int,
    num_frames: int,
    num_inference_steps: int,
    seed: int,
    is_amplify_first_chunk: bool,
    progress=gr.Progress(track_tqdm=True),
):
    """Run the Helios pipeline and return (video_path, info_string).

    Args:
        mode: one of "Text-to-Video", "Image-to-Video", "Video-to-Video".
        prompt: text prompt; required in every mode.
        image_input: filepath/URL of the conditioning image (I2V mode).
        video_input: filepath/URL of the conditioning video (V2V mode).
        height, width, num_frames: output geometry (coerced to int).
        num_inference_steps: steps per pyramid stage (used for all 3 stages).
        seed: RNG seed for reproducible sampling.
        is_amplify_first_chunk: pipeline flag forwarded as-is.
        progress: Gradio progress tracker (tqdm passthrough).

    Raises:
        gr.Error: when the prompt is missing, or when the selected mode
            requires a conditioning image/video that was not supplied.
    """
    if not prompt:
        raise gr.Error("Please provide a prompt.")
    # Fail loudly instead of silently degrading to text-to-video when the
    # conditioning media for the chosen mode is missing.
    if mode == "Image-to-Video" and image_input is None:
        raise gr.Error("Image-to-Video mode requires an input image.")
    if mode == "Video-to-Video" and video_input is None:
        raise gr.Error("Video-to-Video mode requires an input video.")
    generator = torch.Generator(device="cuda").manual_seed(int(seed))
    kwargs = {
        "prompt": prompt,
        "height": int(height),
        "width": int(width),
        "num_frames": int(num_frames),
        "guidance_scale": 1.0,  # distilled model: no CFG
        "generator": generator,
        "output_type": "np",
        # One step count per pyramid stage (three stages).
        "pyramid_num_inference_steps_list": [
            int(num_inference_steps),
            int(num_inference_steps),
            int(num_inference_steps),
        ],
        "is_amplify_first_chunk": is_amplify_first_chunk,
    }
    if mode == "Image-to-Video":
        # Resize the conditioning image to the exact generation resolution.
        img = load_image(image_input).resize((int(width), int(height)))
        kwargs["image"] = img
    elif mode == "Video-to-Video":
        kwargs["video"] = load_video(video_input)
    t0 = time.time()
    output = pipe(**kwargs).frames[0]
    elapsed = time.time() - t0
    # delete=False so the file survives this function for Gradio to serve;
    # close the handle immediately so export_to_video can (re)open the path
    # (an open handle would also make the write fail on Windows).
    tmp = tempfile.NamedTemporaryFile(suffix=".mp4", delete=False)
    tmp.close()
    export_to_video(output, tmp.name, fps=24)
    info = f"Generated in {elapsed:.1f}s · {num_frames} frames · {height}×{width}"
    return tmp.name, info
# ---------------------------------------------------------------------------
# UI Setup
# ---------------------------------------------------------------------------
def update_conditional_visibility(mode):
    """Toggle the conditioning inputs to match the selected generation mode.

    Returns a pair of gr.update objects: visibility for the image input
    (shown only in Image-to-Video mode) and for the video input (shown only
    in Video-to-Video mode). In Text-to-Video mode both are hidden.
    """
    show_image = mode == "Image-to-Video"
    show_video = mode == "Video-to-Video"
    return gr.update(visible=show_image), gr.update(visible=show_video)
# Custom CSS for the demo UI: centered header, pill-style link buttons with a
# hover lift effect, and an overall page-width cap. Gradio applies custom CSS
# via the `css=` argument of gr.Blocks().
CSS = """
#header { text-align: center; margin-bottom: 1.5em; }
#header h1 { font-size: 2.2em; margin-bottom: 0.2em; }
.logo { max-height: 100px; margin: 0 auto 10px auto; display: block; }
.link-buttons { display: flex; justify-content: center; gap: 15px; margin-top: 10px; }
.link-buttons a {
background-color: #2b3137;
color: #ffffff !important;
padding: 8px 20px;
border-radius: 6px;
text-decoration: none;
font-weight: 600;
font-size: 1em;
transition: all 0.2s ease-in-out;
box-shadow: 0 2px 4px rgba(0,0,0,0.1);
}
.link-buttons a:hover { background-color: #4a535c; transform: translateY(-1px); }
.contain { max-width: 1350px; margin: 0 auto !important; }
"""
# FIX: `css` and `theme` were previously passed to demo.launch(), but
# Blocks.launch() accepts neither keyword (it would raise TypeError and the
# styling would never apply) — they are gr.Blocks() constructor arguments.
with gr.Blocks(title="Helios Video Generation", css=CSS, theme=gr.themes.Soft()) as demo:
    # Header: logo, title, and external project links.
    gr.HTML(
        """
        <div style='display: flex; align-items: center; justify-content: center; width: 100%;'>
            <img src="https://raw.githubusercontent.com/SHYuanBest/shyuanbest_media/main/Helios/logo_white.png" style='width: 400px; height: auto;' />
        </div>
        <div id="header">
            <h1>🎬 Helios 14B Distilled: Real Real-Time Long Video Generation Model</h1>
            <p style="font-size: 1.1em; color: #666; margin-top: 0.5em; margin-bottom: 1em;">
                If you like our project, please give us a star ⭐ on GitHub for the latest update.
            </p>
            <div class="link-buttons">
                <a href="https://github.com/PKU-YuanGroup/Helios" target="_blank">💻 Code</a>
                <a href="https://pku-yuangroup.github.io/Helios-Page" target="_blank">📄 Page</a>
                <a href="https://www.youtube.com/watch?v=vd_AgHtOUFQ" target="_blank">🎥 Main Feature</a>
                <a href="https://www.youtube.com/watch?v=1GeIU2Dn7UY" target="_blank">⚡ Inference Speed</a>
            </div>
        </div>
        """
    )
    with gr.Row():
        # Left column: generation mode, conditioning inputs, prompt, settings.
        with gr.Column(scale=1):
            mode = gr.Radio(
                choices=["Text-to-Video", "Image-to-Video", "Video-to-Video"],
                value="Text-to-Video",
                label="Generation Mode",
            )
            # Hidden by default; visibility is driven by the mode radio (see
            # mode.change wiring below).
            image_input = gr.Image(label="Image (for I2V)", type="filepath", visible=False)
            video_input = gr.Video(label="Video (for V2V)", visible=False)
            prompt = gr.Textbox(
                label="Prompt",
                lines=4,
                value=(
                    "A vibrant tropical fish swimming gracefully among colorful coral reefs in "
                    "a clear, turquoise ocean. The fish has bright blue and yellow scales with a "
                    "small, distinctive orange spot on its side, its fins moving fluidly. The coral "
                    "reefs are alive with a variety of marine life, including small schools of "
                    "colorful fish and sea turtles gliding by. The water is crystal clear, allowing "
                    "for a view of the sandy ocean floor below. The reef itself is adorned with a mix "
                    "of hard and soft corals in shades of red, orange, and green. The photo captures "
                    "the fish from a slightly elevated angle, emphasizing its lively movements and the "
                    "vivid colors of its surroundings. A close-up shot with dynamic movement."
                ),
            )
            with gr.Accordion("Advanced Settings", open=False):
                with gr.Row():
                    # Resolution is fixed for this checkpoint, hence non-interactive.
                    height = gr.Number(value=384, label="Height", precision=0, interactive=False)
                    width = gr.Number(value=640, label="Width", precision=0, interactive=False)
                with gr.Row():
                    num_frames = gr.Slider(33, 231, value=231, step=33, label="Num Frames")
                    num_inference_steps = gr.Slider(1, 10, value=2, step=1, label="Steps per stage")
                with gr.Row():
                    seed = gr.Number(value=42, label="Seed", precision=0)
                    is_amplify_first_chunk = gr.Checkbox(label="Amplify First Chunk", value=True)
            generate_btn = gr.Button("🚀 Generate Video", variant="primary", size="lg")
        # Right column: generated video and timing info.
        with gr.Column(scale=1):
            video_output = gr.Video(label="Generated Video", autoplay=True)
            info_output = gr.Textbox(label="Info", interactive=False)
    # Event wiring: toggle conditioning inputs on mode change; run the
    # pipeline on button click.
    mode.change(fn=update_conditional_visibility, inputs=[mode], outputs=[image_input, video_input])
    generate_btn.click(
        fn=generate_video,
        inputs=[
            mode,
            prompt,
            image_input,
            video_input,
            height,
            width,
            num_frames,
            num_inference_steps,
            seed,
            is_amplify_first_chunk,
        ],
        outputs=[video_output, info_output],
    )
    # Click-to-load examples covering all three generation modes.
    gr.Examples(
        examples=[
            [
                "Text-to-Video",
                "A vibrant tropical fish swimming gracefully among colorful coral reefs in "
                "a clear, turquoise ocean. The fish has bright blue and yellow scales with a "
                "small, distinctive orange spot on its side, its fins moving fluidly. The coral "
                "reefs are alive with a variety of marine life, including small schools of "
                "colorful fish and sea turtles gliding by. The water is crystal clear, allowing "
                "for a view of the sandy ocean floor below. The reef itself is adorned with a mix "
                "of hard and soft corals in shades of red, orange, and green. The photo captures "
                "the fish from a slightly elevated angle, emphasizing its lively movements and the "
                "vivid colors of its surroundings. A close-up shot with dynamic movement.",
                None,
                None,
            ],
            [
                "Text-to-Video",
                "An extreme close-up of an gray-haired man with a beard in his 60s, he is deep in "
                "thought pondering the history of the universe as he sits at a cafe in Paris, his eyes "
                "focus on people offscreen as they walk as he sits mostly motionless, he is dressed in "
                "a wool coat suit coat with a button-down shirt , he wears a brown beret and glasses "
                "and has a very professorial appearance, and the end he offers a subtle closed-mouth "
                "smile as if he found the answer to the mystery of life, the lighting is very cinematic "
                "with the golden light and the Parisian streets and city in the background, depth of "
                "field, cinematic 35mm film.",
                None,
                None,
            ],
            [
                "Text-to-Video",
                "A drone camera circles around a beautiful historic church built on a rocky outcropping "
                "along the Amalfi Coast, the view showcases historic and magnificent architectural "
                "details and tiered pathways and patios, waves are seen crashing against the rocks "
                "below as the view overlooks the horizon of the coastal waters and hilly landscapes "
                "of the Amalfi Coast Italy, several distant people are seen walking and enjoying vistas "
                "on patios of the dramatic ocean views, the warm glow of the afternoon sun creates a "
                "magical and romantic feeling to the scene, the view is stunning captured with beautiful photography.",
                None,
                None,
            ],
            [
                "Image-to-Video",
                "A towering emerald wave surges forward, its crest curling with raw power and energy. Sunlight glints off the translucent water, illuminating the intricate textures and deep green hues within the wave’s body. A thick spray erupts from the breaking crest, casting a misty veil that dances above the churning surface. As the perspective widens, the immense scale of the wave becomes apparent, revealing the restless expanse of the ocean stretching beyond. The scene captures the ocean’s untamed beauty and relentless force, with every droplet and ripple shimmering in the light. The dynamic motion and vivid colors evoke both awe and respect for nature’s might.",
                "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/helios/wave.jpg",
                None,
            ],
            [
                "Video-to-Video",
                "A bright yellow Lamborghini Huracn Tecnica speeds along a curving mountain road, surrounded by lush green trees under a partly cloudy sky. The car's sleek design and vibrant color stand out against the natural backdrop, emphasizing its dynamic movement. The road curves gently, with a guardrail visible on one side, adding depth to the scene. The motion blur captures the sense of speed and energy, creating a thrilling and exhilarating atmosphere. A front-facing shot from a slightly elevated angle, highlighting the car's aggressive stance and the surrounding greenery.",
                None,
                "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/helios/car.mp4",
            ],
        ],
        inputs=[mode, prompt, image_input, video_input],
        label="Example Prompts",
    )

if __name__ == "__main__":
    # css/theme are configured on gr.Blocks above; launch only handles serving.
    demo.launch(share=True)