import gradio
import av, pathlib, diffusers, torch, transformers, builtins, numpy, re
import huggingface_hub
import animatediff.models.unet
import animatediff.pipelines
from animatediff.generate import controlnet_preprocess, img2img_preprocess, wild_card_conversion, region_preprocess, unload_controlnet_models
from animatediff.settings import get_model_config, get_infer_config
from animatediff.utils.pipeline import send_to_device
from animatediff.utils.util import set_tensor_interpolation_method
from animatediff.pipelines import load_text_embeddings
from animatediff.pipelines.lora import load_lcm_lora

# Output geometry: 1440 frames at 8 fps -> a 180 s clip.
width = 432
height = 768
length = 1440

# Prompt-travel configuration and inference settings.
model_config = get_model_config('config/prompts/prompt_travel.json')
is_sdxl = False
infer_config = get_infer_config(True, is_sdxl)
set_tensor_interpolation_method(model_config.tensor_interpolation_slerp)
device = torch.device('cuda')
save_dir = pathlib.Path('output')

# Preprocess the ControlNet and img2img inputs declared in the config.
controlnet_image_map, controlnet_type_map, controlnet_ref_map, controlnet_no_shrink = controlnet_preprocess(model_config.controlnet_map, width, height, length, save_dir, device, is_sdxl)
img2img_map = img2img_preprocess(model_config.img2img_map, width, height, length, save_dir)

# Materialize the SD 1.5 base model on disk, then load its components.
base_model = pathlib.Path('/tmp/base')
diffusers.StableDiffusionPipeline.from_pretrained('stable-diffusion-v1-5/stable-diffusion-v1-5').save_pretrained(base_model)
tokenizer = transformers.CLIPTokenizer.from_pretrained(base_model, subfolder='tokenizer')
text_encoder = transformers.CLIPTextModel.from_pretrained(base_model, subfolder='text_encoder')
feature_extractor = transformers.CLIPImageProcessor.from_pretrained(base_model, subfolder='feature_extractor')
vae = diffusers.AutoencoderKL.from_single_file('https://huggingface.co/chaowenguoback/pal/blob/main/vae-ft-mse-840000-ema-pruned.safetensors')

# Build the motion-aware UNet from the 2D base weights plus the AnimateLCM motion module.
huggingface_hub.hf_hub_download(repo_id='wangfuyun/AnimateLCM', filename='AnimateLCM_sd15_t2v.ckpt', local_dir=pathlib.Path.cwd())
unet = animatediff.models.unet.UNet2DConditionModel.from_pretrained_2d(
    pretrained_model_path=base_model,
    motion_module_path=pathlib.Path.cwd().joinpath('AnimateLCM_sd15_t2v.ckpt'),
    subfolder='unet',
    unet_additional_kwargs=infer_config.unet_additional_kwargs,
)

# Swap in the checkpoint's UNet and text-encoder weights, then drop the temporary pipeline.
pipeline = diffusers.StableDiffusionPipeline.from_single_file('https://huggingface.co/chaowenguoback/15/blob/main/chilloutMix-Ni.safetensors', config='stable-diffusion-v1-5/stable-diffusion-v1-5', safety_checker=None, use_safetensors=True)
unet.load_state_dict(pipeline.unet.state_dict(), strict=False)
text_encoder.load_state_dict(pipeline.text_encoder.state_dict(), strict=False)
del pipeline
unet.enable_xformers_memory_efficient_attention()

pipeline = animatediff.pipelines.AnimationPipeline(
    vae=vae,
    text_encoder=text_encoder,
    tokenizer=tokenizer,
    unet=unet,
    scheduler=diffusers.LCMScheduler.from_config(infer_config.noise_scheduler_kwargs),
    feature_extractor=feature_extractor,
    controlnet_map=None,
)

# LCM LoRA plus detail/style/character LoRAs.
lcm_lora = pathlib.Path.cwd().joinpath('data/models/lcm_lora/sd15')
lcm_lora.mkdir(parents=True, exist_ok=True)
huggingface_hub.hf_hub_download(repo_id='wangfuyun/AnimateLCM', filename='AnimateLCM_sd15_t2v_lora.safetensors', local_dir=lcm_lora)
load_lcm_lora(pipeline, {'start_scale': 0.15, 'end_scale': 0.75, 'gradient_start': 0.2, 'gradient_end': 0.75}, is_sdxl=is_sdxl)
pipeline.lora_map = None
pipeline.load_lora_weights('chaowenguoback/15', weight_name='add_detail.safetensors', adapter_name='detail')
pipeline.load_lora_weights('chaowenguoback/15', weight_name='b1r1av5-000007.safetensors', adapter_name='bikini')
pipeline.load_lora_weights('chaowenguoback/15', weight_name='btcstr.safetensors', adapter_name='c-string')
pipeline.load_lora_weights('chaowenguoback/15', weight_name='蓝洁瑛.safetensors', adapter_name='character')
pipeline.set_adapters(['detail', 'bikini', 'c-string', 'character'], [1, 0.4, 0.2, 0.8])

# Half precision, textual-inversion embeddings, then move everything to the GPU.
pipeline.unet = pipeline.unet.half()
pipeline.text_encoder = pipeline.text_encoder.half()
pipeline.text_encoder = pipeline.text_encoder.to(device)
load_text_embeddings(pipeline)
pipeline.text_encoder = pipeline.text_encoder.to('cpu')
pipeline = send_to_device(pipeline, device, freeze=True, force_half=False, compile=False, is_sdxl=is_sdxl)

# Expand wildcards and build the per-region prompt/condition lists.
wild_card_conversion(model_config)
is_init_img_exist = img2img_map is not None
region_condi_list, region_list, ip_adapter_config_map, region2index = region_preprocess(model_config, width, height, length, save_dir, is_init_img_exist, is_sdxl)
if controlnet_type_map:
    for c in controlnet_type_map:
        tmp_r = [region2index[r] for r in controlnet_type_map[c]["control_region_list"]]
        controlnet_type_map[c]["control_region_list"] = [r for r in tmp_r if r != -1]

# Short tag string derived from the first prompt (useful for naming outputs).
prompt_map = region_condi_list[0]["prompt_map"]
prompt_tags = [re.compile(r"[^\w\-, ]").sub("", tag).strip().replace(" ", "-") for tag in prompt_map[list(prompt_map.keys())[0]].split(",")]
prompt_str = "_".join(prompt_tags[:6])[:50]

output = pipeline(
    n_prompt='nipple, waistband, back view, monochrome, longbody, lowres, bad anatomy, bad hands, fused fingers, missing fingers, too many fingers, cropped, worst quality, low quality, deformed body, bloated, ugly, unrealistic, extra hands and arms',
    num_inference_steps=8,
    guidance_scale=3,
    unet_batch_size=1,
    width=width,
    height=height,
    video_length=length,
    return_dict=False,
    context_frames=16,
    context_stride=1,
    context_overlap=16 // 4,
    context_schedule='composite',
    clip_skip=2,
    controlnet_type_map=controlnet_type_map,
    controlnet_image_map=controlnet_image_map,
    controlnet_ref_map=controlnet_ref_map,
    controlnet_no_shrink=controlnet_no_shrink,
    controlnet_max_samples_on_vram=model_config.controlnet_map["max_samples_on_vram"] if "max_samples_on_vram" in model_config.controlnet_map else 999,
    controlnet_max_models_on_vram=model_config.controlnet_map["max_models_on_vram"] if "max_models_on_vram" in model_config.controlnet_map else 99,
    controlnet_is_loop=model_config.controlnet_map["is_loop"] if "is_loop" in model_config.controlnet_map else True,
    img2img_map=img2img_map,
    ip_adapter_config_map=ip_adapter_config_map,
    region_list=region_list,
    region_condi_list=region_condi_list,
    interpolation_factor=1,
    is_single_prompt_mode=model_config.is_single_prompt_mode,
    gradual_latent_map=model_config.gradual_latent_hires_fix_map,
    callback=None,
    callback_steps=None,
)
unload_controlnet_models(pipe=pipeline)

# (b, c, f, h, w) -> (f, h, w, c) uint8 frames for the encoder.
frames = output.permute(0, 2, 1, 3, 4).squeeze(0)
frames = frames.mul(255).add_(0.5).clamp_(0, 255).permute(0, 2, 3, 1).to("cpu", torch.uint8).numpy()
del pipeline
torch.cuda.empty_cache()

# Generate a matching 180 s music track with AudioLDM2.
pipeline = diffusers.AudioLDM2Pipeline.from_pretrained('cvssp/audioldm2-music', torch_dtype=torch.float16).to('cuda')
pipeline.scheduler = diffusers.DPMSolverMultistepScheduler.from_config(pipeline.scheduler.config)
music = pipeline(prompt='Light rhythm techno', negative_prompt='low quality, average quality', num_inference_steps=20, audio_length_in_s=180).audios[0]
del pipeline
torch.cuda.empty_cache()

# Mux the 8 fps video and the 16 kHz AAC audio into a single MP4.
with av.open('video.mp4', mode='w') as writer:
    video = writer.add_stream('h264', rate=8)
    # Stream is configured at 4x the generation resolution; frames are rescaled on encode.
    video.width = width * 4
    video.height = height * 4
    video.pix_fmt = 'yuv420p'
    audio = writer.add_stream('aac', rate=16000)
    for frame in frames:
        writer.mux(video.encode(av.VideoFrame.from_ndarray(frame)))
    writer.mux(video.encode())  # flush the video encoder
    # Slice the waveform into encoder-sized chunks; pts is expressed in samples.
    for _ in builtins.range(0, music.shape[0], audio.frame_size):
        frame = av.AudioFrame.from_ndarray(music[_:_ + audio.frame_size][None], format='fltp', layout='mono')
        frame.sample_rate = audio.sample_rate
        frame.pts = _
        writer.mux(audio.encode(frame))
    writer.mux(audio.encode())  # flush the audio encoder

def greet(name, intensity):
    return "Hello, " + name + "!" * int(intensity)

demo = gradio.Interface(
    fn=greet,
    inputs=["text", "slider"],
    outputs=["text"],
    api_name="predict",
)
demo.launch()