ivxivx committed
Commit 2866fc1 · unverified · Parent: 0d717b7

Files changed (3)
  1. README.md +1 -1
  2. app.py +263 -0
  3. requirements.txt +4 -0
README.md CHANGED
@@ -1,5 +1,5 @@
  ---
- title: HF Story Generator
+ title: Story Generator
  emoji: 😻
  colorFrom: indigo
  colorTo: red
app.py ADDED
@@ -0,0 +1,263 @@
+ # %pip install gradio diffusers
+
+ import os
+ from huggingface_hub import login
+ login(token=os.getenv("HUGGINGFACEHUB_API_KEY"))
+
+ import gradio as gr
+ import numpy as np
+ import random
+
+ # import spaces  # [uncomment to use ZeroGPU]
+ from diffusers import DiffusionPipeline
+ import torch
+
+ def get_device_type(idx):
+     if torch.cuda.is_available():
+         # Use cuda:<idx> only when that GPU actually exists (devices are 0-indexed);
+         # otherwise fall back to the default CUDA device.
+         return (f"cuda:{idx}" if torch.cuda.device_count() > idx else "cuda"), torch.float16
+     elif torch.backends.mps.is_available():
+         return "mps", torch.float16
+     else:
+         return "cpu", torch.float32
+
+ device0, torch_dtype = get_device_type(0)
+ device1, torch_dtype = get_device_type(1)
+
+ text_generation_model_name = "meta-llama/Llama-3.2-3B-Instruct"
+
+ from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
+ tokenizer = AutoTokenizer.from_pretrained(text_generation_model_name)
+ model = AutoModelForCausalLM.from_pretrained(text_generation_model_name).to(device0, dtype=torch.float16)
+
+ text_to_speech_model_name = "suno/bark"
+ # Only PyTorch models and some Diffusers pipelines have .to().
+ text_to_speech_pipeline = pipeline("text-to-speech", model=text_to_speech_model_name, device=device0)
+
+ text_to_image_model_name = "stabilityai/sdxl-turbo"
+ text_to_image_pipeline = DiffusionPipeline.from_pretrained(text_to_image_model_name, torch_dtype=torch_dtype).to(device1)
+
+ # from diffusers import StableDiffusionPipeline
+ # text_to_image_model_name = "sd-legacy/stable-diffusion-v1-5"
+ # text_to_image_pipeline = StableDiffusionPipeline.from_pretrained(text_to_image_model_name, torch_dtype=torch_dtype).to(device1)
+
+ MAX_SEED = np.iinfo(np.int32).max
+ MAX_IMAGE_SIZE = 1024
+
+ system_prompt = (
+     "You are a helpful AI assistant that generates a story for kids based on the input provided. "
+     "The story should be engaging and creative. "
+     "Here is the input: {input} "
+     "Please respond with the story."
+ )
+
+ history = []
+
+ def generate_text(message):
+     global history
+     sys_prompt = system_prompt.replace("{input}", message)
+     if not history or history[0].get("role") != "system":
+         history = [{"role": "system", "content": sys_prompt}] + history
+     else:
+         history[0]["content"] = sys_prompt
+
+     history.append({"role": "user", "content": message})
+
+     # 1. Build prompt from history using the chat template; add_generation_prompt
+     #    appends the assistant header so the model starts a fresh reply.
+     prompt = tokenizer.apply_chat_template(history, tokenize=False, add_generation_prompt=True)
+     # The chat template already inserts special tokens, so don't add them again here.
+     inputs = tokenizer(prompt, return_tensors="pt", add_special_tokens=False).to(device0)
+     outputs = model.generate(**inputs, max_new_tokens=128)
+     decoded = tokenizer.decode(outputs[0], skip_special_tokens=False)
+
+     if "<|start_header_id|>assistant<|end_header_id|>" in decoded:
+         # Llama-3-style template: the reply follows the assistant header.
+         analysis_response = decoded.split("<|start_header_id|>assistant<|end_header_id|>")[-1]
+         analysis_response = analysis_response.replace("<|eot_id|>", "").strip()
+     elif "<|im_start|>assistant" in decoded:
+         # This works for most ChatML-style templates that append the assistant's reply at the end.
+         analysis_response = decoded.split("<|im_start|>assistant")[-1]
+         analysis_response = analysis_response.replace("<|im_end|>", "").strip()
+     else:
+         # Fallback: just return the decoded output
+         analysis_response = decoded.strip()
+
+     return analysis_response
+
+ def generate_audio(text):
+     tts_result = text_to_speech_pipeline(text)
+
+     # example: [[ 0.00073422 0.00038968 0.00035801 ... -0.01280548 -0.0147996 -0.01798675]]
+     audio = tts_result["audio"]  # numpy array of waveform samples
+
+     audio_array = np.array(audio, dtype=np.float32).flatten()
+
+     # Use the sample rate reported by the pipeline instead of hard-coding one
+     # (Bark generates 24 kHz audio).
+     sample_rate = tts_result["sampling_rate"]
+
+     # gr.Audio expects a (sample_rate, np.ndarray) tuple
+     return (sample_rate, audio_array)
+
+
+ # @spaces.GPU  # [uncomment to use ZeroGPU]
+ def generate_image(
+     prompt,
+     negative_prompt,
+     guidance_scale,
+     num_inference_steps,
+     width,
+     height,
+     seed,
+     randomize_seed,
+     progress=gr.Progress(track_tqdm=True),
+ ):
+     if randomize_seed:
+         seed = random.randint(0, MAX_SEED)
+
+     generator = torch.Generator().manual_seed(seed)
+
+     image = text_to_image_pipeline(
+         prompt=prompt,
+         negative_prompt=negative_prompt,
+         guidance_scale=guidance_scale,
+         num_inference_steps=num_inference_steps,
+         width=width,
+         height=height,
+         generator=generator,
+     ).images[0]
+
+     return image, seed
+
+ def generate_all(
+     prompt,
+     negative_prompt,
+     guidance_scale,
+     num_inference_steps,
+     width,
+     height,
+     seed,
+     randomize_seed,
+     progress=gr.Progress(track_tqdm=True),
+ ):
+     # Generate text from the prompt
+     story = generate_text(prompt)
+
+     # Generate audio from the text
+     audio = generate_audio(story)
+
+     # Generate image from the text
+     image, seed = generate_image(
+         story,
+         negative_prompt,
+         guidance_scale,
+         num_inference_steps,
+         width,
+         height,
+         seed,
+         randomize_seed,
+         progress=progress,
+     )
+
+     return story, audio, image, seed
+
+
+ examples = [
+     "sky",
+     "sea",
+ ]
+
+ css = """
+ #col-container {
+     margin: 0 auto;
+     max-width: 640px;
+ }
+ """
+
+ with gr.Blocks(css=css) as demo:
+     with gr.Column(elem_id="col-container"):
+         gr.Markdown("# Story Generator (text & audio on cuda0, image on cuda1)")
+
+         with gr.Row():
+             prompt = gr.Text(
+                 label="Prompt",
+                 show_label=False,
+                 max_lines=1,
+                 placeholder="Enter your prompt",
+                 container=False,
+             )
+
+             run_button = gr.Button("Run", scale=0, variant="primary")
+
+         story = gr.Text(label="Story", show_label=False)
+
+         audio = gr.Audio(label="Audio", show_label=False)
+
+         image = gr.Image(label="Image", show_label=False)
+
+         with gr.Accordion("Advanced Settings", open=False):
+             negative_prompt = gr.Text(
+                 label="Negative prompt",
+                 max_lines=1,
+                 placeholder="Enter a negative prompt",
+                 visible=False,
+             )
+
+             seed = gr.Slider(
+                 label="Seed",
+                 minimum=0,
+                 maximum=MAX_SEED,
+                 step=1,
+                 value=0,
+             )
+
+             randomize_seed = gr.Checkbox(label="Randomize seed", value=True)
+
+             with gr.Row():
+                 width = gr.Slider(
+                     label="Width",
+                     minimum=256,
+                     maximum=MAX_IMAGE_SIZE,
+                     step=32,
+                     value=1024,  # Replace with defaults that work for your model
+                 )
+
+                 height = gr.Slider(
+                     label="Height",
+                     minimum=256,
+                     maximum=MAX_IMAGE_SIZE,
+                     step=32,
+                     value=1024,  # Replace with defaults that work for your model
+                 )
+
+             with gr.Row():
+                 guidance_scale = gr.Slider(
+                     label="Guidance scale",
+                     minimum=0.0,
+                     maximum=10.0,
+                     step=0.1,
+                     value=0.0,  # Replace with defaults that work for your model
+                 )
+
+                 num_inference_steps = gr.Slider(
+                     label="Number of inference steps",
+                     minimum=1,
+                     maximum=50,
+                     step=1,
+                     value=2,  # Replace with defaults that work for your model
+                 )
+
+         gr.Examples(examples=examples, inputs=[prompt])
+     gr.on(
+         triggers=[run_button.click, prompt.submit],
+         fn=generate_all,
+         inputs=[
+             prompt,
+             negative_prompt,
+             guidance_scale,
+             num_inference_steps,
+             width,
+             height,
+             seed,
+             randomize_seed,
+         ],
+         outputs=[story, audio, image, seed],
+     )
+
+ if __name__ == "__main__":
+     demo.launch()
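
The app is normally driven through the Gradio UI above, but the generation functions can also be exercised directly for a quick local check. The following is a minimal sketch, not part of this commit: importing app loads all three models, so it assumes enough GPU/CPU memory and that HUGGINGFACEHUB_API_KEY points to a token with access to the gated meta-llama/Llama-3.2-3B-Instruct checkpoint. The script name and the prompt reuse one of the committed examples.

# local_smoke_test.py — hypothetical helper, not included in this commit
from app import generate_text, generate_audio

story = generate_text("sky")                   # assistant reply extracted from the chat-template output
print(story[:200])

sample_rate, waveform = generate_audio(story)  # (int, 1-D float32 ndarray), the tuple gr.Audio expects
print(sample_rate, waveform.shape, waveform.dtype)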
requirements.txt ADDED
@@ -0,0 +1,4 @@
+ accelerate
+ diffusers
+ torch
+ transformers
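
Note for running outside a Gradio-SDK Space (where gradio is preinstalled): app.py also imports gradio and huggingface_hub, neither of which is listed here. huggingface_hub is pulled in as a dependency of transformers, but gradio would need to be installed separately, as the commented %pip install gradio diffusers line at the top of app.py hints.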