pip install diffusers transformers torch torchvision torchaudio moviepy pillow openai opencv-python soundfile accelerate safetensors

# ai_cartoon_animated.py
import os

import cv2
import soundfile as sf
import torch
from diffusers import StableDiffusionPipeline, StableVideoDiffusionPipeline
from moviepy.editor import AudioFileClip, VideoFileClip, concatenate_videoclips
from moviepy.video.fx.all import loop
from PIL import Image
from transformers import pipeline

# ========= SETTINGS =========
huggingface_token = "YOUR_HUGGINGFACE_TOKEN"
output_dir = "ai_cartoon_movie"
device = "cuda" if torch.cuda.is_available() else "cpu"
os.makedirs(output_dir, exist_ok=True)

# ========= 1. STORY CREATION =========
story_theme = "A brave robot exploring an alien planet with its cat companion."
num_scenes = 20  # number of scenes to generate

story_gen = pipeline("text-generation", model="gpt2")
story_text = story_gen(
    f"Write a {num_scenes}-scene cartoon story about {story_theme} "
    "Each scene should include a one-sentence visual description and one sentence of narration.",
    max_new_tokens=800,  # GPT-2's context window is 1024 tokens, so stay below it
    temperature=0.8,
    do_sample=True,
)[0]["generated_text"]

# Naive scene splitting: treat each sufficiently long sentence as one scene description.
scenes = [s.strip() for s in story_text.split(".") if len(s.strip()) > 10][:num_scenes]

# ========= 2. IMAGE GENERATION =========
print("\n🎨 Generating base cartoon frames...")
sd_pipe = StableDiffusionPipeline.from_pretrained(
    "runwayml/stable-diffusion-v1-5",
    token=huggingface_token,
    torch_dtype=torch.float16 if device == "cuda" else torch.float32,
).to(device)

image_paths = []
for i, desc in enumerate(scenes):
    prompt = f"cartoon style, vibrant colors, {desc}"
    img = sd_pipe(prompt).images[0]
    img_path = os.path.join(output_dir, f"scene_{i}.png")
    img.save(img_path)
    image_paths.append(img_path)

# ========= 3. VIDEO ANIMATION =========
print("\n🎞️ Generating motion for each scene (Stable Video Diffusion)...")
svd_pipe = StableVideoDiffusionPipeline.from_pretrained(
    "stabilityai/stable-video-diffusion-img2vid",
    token=huggingface_token,
    torch_dtype=torch.float16 if device == "cuda" else torch.float32,
).to(device)

video_paths = []
for i, img_path in enumerate(image_paths):
    image = Image.open(img_path).convert("RGB")
    # SVD expects roughly 1024x576 input, so resize the square SD output first.
    image = image.resize((1024, 576))
    print(f"Animating scene {i + 1}/{len(image_paths)}...")
    frames = svd_pipe(image, num_frames=16).frames[0]  # a short burst of 16 frames

    frame_dir = os.path.join(output_dir, f"frames_{i}")
    os.makedirs(frame_dir, exist_ok=True)
    frame_paths = []
    for j, frame in enumerate(frames):
        fpath = os.path.join(frame_dir, f"frame_{j}.png")
        frame.save(fpath)
        frame_paths.append(fpath)

    # Combine the saved frames into a short MP4 clip with OpenCV.
    first_frame = cv2.imread(frame_paths[0])
    height, width, _ = first_frame.shape
    out_path = os.path.join(output_dir, f"scene_{i}.mp4")
    out = cv2.VideoWriter(out_path, cv2.VideoWriter_fourcc(*"mp4v"), 8, (width, height))
    for fpath in frame_paths:
        out.write(cv2.imread(fpath))
    out.release()
    video_paths.append(out_path)

# ========= 4. VOICE / NARRATION =========
print("\n🎤 Generating narration...")
# NOTE: the original ESPnet checkpoint cannot be loaded through transformers.pipeline,
# so this uses a transformers-compatible VITS model instead.
tts = pipeline("text-to-speech", model="facebook/mms-tts-eng")

audio_paths = []
for i, desc in enumerate(scenes):
    narration = f"Scene {i + 1}. {desc}."
    speech = tts(narration)  # returns {"audio": np.ndarray, "sampling_rate": int}
    audio_path = os.path.join(output_dir, f"scene_{i}.wav")
    sf.write(audio_path, speech["audio"].squeeze(), speech["sampling_rate"])
    audio_paths.append(audio_path)
# ========= 5. ASSEMBLE FINAL MOVIE =========
print("\n🎬 Combining all animated scenes and voiceovers...")
final_clips = []
for i, video_path in enumerate(video_paths):
    clip = VideoFileClip(video_path)
    audio = AudioFileClip(audio_paths[i])
    # Each animation is only ~2 s (16 frames at 8 fps); loop it so it lasts as
    # long as the narration, then attach the audio track.
    if audio.duration > clip.duration:
        clip = clip.fx(loop, duration=audio.duration)
    clip = clip.set_audio(audio).set_duration(audio.duration)
    final_clips.append(clip)

final_video = concatenate_videoclips(final_clips, method="compose")
output_path = os.path.join(output_dir, "ai_cartoon_animated.mp4")
final_video.write_videofile(output_path, fps=24)

print(f"\n✅ Finished cartoon: {output_path}")
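
If the image, video, and speech models do not all fit in GPU memory at once, one option (a sketch, not part of the original script) is diffusers' model CPU offloading, which moves each sub-model to the GPU only while it runs; it relies on accelerate, already in the install line above, and replaces the .to(device) call made after loading a pipeline:

# Optional low-VRAM variant: load in half precision and let diffusers/accelerate
# move sub-models onto the GPU on demand instead of keeping everything resident.
sd_pipe = StableDiffusionPipeline.from_pretrained(
    "runwayml/stable-diffusion-v1-5",
    token=huggingface_token,
    torch_dtype=torch.float16,
)
sd_pipe.enable_model_cpu_offload()  # use instead of sd_pipe.to(device)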