# %pip install gradio diffusers
import os

from huggingface_hub import login

login(token=os.getenv("HUGGINGFACEHUB_API_KEY"))

import random

import gradio as gr
import numpy as np
import torch
# import spaces  # [uncomment to use ZeroGPU]
from diffusers import DiffusionPipeline
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline


def get_device_type(idx):
    """Return a (device, dtype) pair for the requested GPU index, with CPU/MPS fallbacks."""
    if torch.cuda.is_available():
        # Fall back to the default CUDA device if the requested index does not exist.
        return (f"cuda:{idx}" if torch.cuda.device_count() > idx else "cuda"), torch.float16
    elif torch.backends.mps.is_available():
        return "mps", torch.float16
    else:
        return "cpu", torch.float32


device0, torch_dtype = get_device_type(0)
device1, torch_dtype = get_device_type(1)

text_generation_model_name = "meta-llama/Llama-3.2-3B-Instruct"
tokenizer = AutoTokenizer.from_pretrained(text_generation_model_name)
model = AutoModelForCausalLM.from_pretrained(text_generation_model_name).to(device0, dtype=torch.float16)

text_to_speech_model_name = "suno/bark"
# Only raw PyTorch models and Diffusers pipelines have .to(); the transformers
# pipeline takes the target device as an argument instead.
text_to_speech_pipeline = pipeline("text-to-speech", model=text_to_speech_model_name, device=device0)

text_to_image_model_name = "stabilityai/sdxl-turbo"
text_to_image_pipeline = DiffusionPipeline.from_pretrained(text_to_image_model_name, torch_dtype=torch_dtype).to(device1)
# from diffusers import StableDiffusionPipeline
# text_to_image_model_name = "sd-legacy/stable-diffusion-v1-5"
# text_to_image_pipeline = StableDiffusionPipeline.from_pretrained(text_to_image_model_name, torch_dtype=torch_dtype).to(device1)

MAX_SEED = np.iinfo(np.int32).max
MAX_IMAGE_SIZE = 1024

system_prompt = (
    "You are a helpful AI assistant that generates a story for kids based on the input provided. "
    "The story should be engaging and creative. "
    "Here is the input: {input} "
    "Please respond with the story."
)

history = []


def generate_text(message):
    global history
    sys_prompt = system_prompt.replace("{input}", message)
    # Keep the system prompt as the first message in the conversation history.
    if not history or history[0].get("role") != "system":
        history = [{"role": "system", "content": sys_prompt}] + history
    else:
        history[0]["content"] = sys_prompt
    history.append({"role": "user", "content": message})

    # Build the prompt from history using the model's chat template; the generation
    # prompt makes the model continue as the assistant.
    prompt = tokenizer.apply_chat_template(history, tokenize=False, add_generation_prompt=True)
    # The chat template already inserts special tokens, so don't add them again.
    inputs = tokenizer(prompt, return_tensors="pt", add_special_tokens=False).to(device0)
    outputs = model.generate(**inputs, max_new_tokens=128)
    decoded = tokenizer.decode(outputs[0], skip_special_tokens=False)

    if "<|start_header_id|>assistant<|end_header_id|>" in decoded:
        # Llama-3 style template: keep only the assistant's reply.
        analysis_response = decoded.split("<|start_header_id|>assistant<|end_header_id|>")[-1]
        analysis_response = analysis_response.replace("<|eot_id|>", "").strip()
    elif "<|im_start|>assistant" in decoded:
        # This works for ChatML-style templates that append the assistant's reply at the end.
        analysis_response = decoded.split("<|im_start|>assistant")[-1]
        analysis_response = analysis_response.replace("<|im_end|>", "").strip()
    else:
        # Fallback: just return the decoded output.
        analysis_response = decoded.strip()
    return analysis_response


def generate_audio(text):
    tts_result = text_to_speech_pipeline(text)
    # tts_result["audio"] is a numpy waveform, e.g.
    # [[ 0.00073422  0.00038968  0.00035801 ... -0.01280548 -0.0147996 -0.01798675]]
    audio = tts_result["audio"]
    audio_array = np.array(audio, dtype=np.float32).flatten()
    # Use the sampling rate reported by the pipeline (24 kHz for Bark).
    sample_rate = tts_result["sampling_rate"]
    # gr.Audio expects a (sample_rate, np.ndarray) tuple.
    return (sample_rate, audio_array)


# @spaces.GPU  # [uncomment to use ZeroGPU]
def generate_image(
    prompt,
    negative_prompt,
    guidance_scale,
    num_inference_steps,
    width,
    height,
    seed,
    randomize_seed,
    progress=gr.Progress(track_tqdm=True),
):
    if randomize_seed:
        seed = random.randint(0, MAX_SEED)
    generator = torch.Generator().manual_seed(seed)
    image = text_to_image_pipeline(
        prompt=prompt,
        negative_prompt=negative_prompt,
        guidance_scale=guidance_scale,
        num_inference_steps=num_inference_steps,
        width=width,
        height=height,
        generator=generator,
    ).images[0]
    return image, seed


def generate_all(
    prompt,
    negative_prompt,
    guidance_scale,
    num_inference_steps,
    width,
    height,
    seed,
    randomize_seed,
    progress=gr.Progress(track_tqdm=True),
):
    # Generate the story text from the prompt.
    story = generate_text(prompt)
    # Generate audio from the story.
    audio = generate_audio(story)
    # Generate an image from the story.
    image, seed = generate_image(
        story,
        negative_prompt,
        guidance_scale,
        num_inference_steps,
        width,
        height,
        seed,
        randomize_seed,
        progress=progress,
    )
    return story, audio, image, seed


examples = [
    "sky",
    "sea",
]

css = """
#col-container {
    margin: 0 auto;
    max-width: 640px;
}
"""

with gr.Blocks(css=css) as demo:
    with gr.Column(elem_id="col-container"):
        gr.Markdown("# Story Generator (text & audio on cuda:0, image on cuda:1)")

        with gr.Row():
            prompt = gr.Text(
                label="Prompt",
                show_label=False,
                max_lines=1,
                placeholder="Enter your prompt",
                container=False,
            )
            run_button = gr.Button("Run", scale=0, variant="primary")

        story = gr.Text(label="Story", show_label=False)
        audio = gr.Audio(label="Audio", show_label=False)
        image = gr.Image(label="Image", show_label=False)

        with gr.Accordion("Advanced Settings", open=False):
            negative_prompt = gr.Text(
                label="Negative prompt",
                max_lines=1,
                placeholder="Enter a negative prompt",
                visible=False,
            )
            seed = gr.Slider(
                label="Seed",
                minimum=0,
                maximum=MAX_SEED,
                step=1,
                value=0,
            )
            randomize_seed = gr.Checkbox(label="Randomize seed", value=True)

            with gr.Row():
                width = gr.Slider(
                    label="Width",
                    minimum=256,
                    maximum=MAX_IMAGE_SIZE,
                    step=32,
                    value=1024,  # Replace with defaults that work for your model
                )
                height = gr.Slider(
                    label="Height",
                    minimum=256,
                    maximum=MAX_IMAGE_SIZE,
                    step=32,
                    value=1024,  # Replace with defaults that work for your model
                )

            with gr.Row():
                guidance_scale = gr.Slider(
                    label="Guidance scale",
                    minimum=0.0,
                    maximum=10.0,
                    step=0.1,
                    value=0.0,  # Replace with defaults that work for your model
                )
                num_inference_steps = gr.Slider(
                    label="Number of inference steps",
                    minimum=1,
                    maximum=50,
                    step=1,
                    value=2,  # Replace with defaults that work for your model
                )

        gr.Examples(examples=examples, inputs=[prompt])

    gr.on(
        triggers=[run_button.click, prompt.submit],
        fn=generate_all,
        inputs=[
            prompt,
            negative_prompt,
            guidance_scale,
            num_inference_steps,
            width,
            height,
            seed,
            randomize_seed,
        ],
        outputs=[story, audio, image, seed],
    )

if __name__ == "__main__":
    demo.launch()