exact-railcar committed on
Commit 1716635 · verified · 1 Parent(s): dbb5b65

Update app.py

Files changed (1)
  1. app.py +145 -0
app.py CHANGED
@@ -1,5 +1,150 @@
  import gradio

+ import basicsr, realesrgan, gfpgan, av, pathlib, diffusers, torch, transformers, builtins, numpy, re
+ from animatediff.generate import controlnet_preprocess, img2img_preprocess, wild_card_conversion, region_preprocess, unload_controlnet_models
+ from animatediff.settings import get_model_config, get_infer_config
+ from animatediff.utils.pipeline import send_to_device
+ from animatediff.utils.util import set_tensor_interpolation_method
+ from animatediff.pipelines import load_text_embeddings
+ from animatediff.pipelines.lora import load_lcm_lora
+ import huggingface_hub
+ import animatediff
+
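+ # generation settings: 432x768 frames, 1440 frames in total (180 s at the 8 fps used below), driven by the prompt_travel config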
+ width=432
+ height=768
+ length=1440
+ model_config = get_model_config('config/prompts/prompt_travel.json')
+ is_sdxl = False
+ infer_config = get_infer_config(True, is_sdxl)
+ set_tensor_interpolation_method(model_config.tensor_interpolation_slerp)
+ device = torch.device('cuda')
+ save_dir = pathlib.Path('output')
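+ # pre-process ControlNet inputs and img2img reference frames as configured in the model config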
+ controlnet_image_map, controlnet_type_map, controlnet_ref_map, controlnet_no_shrink = controlnet_preprocess(model_config.controlnet_map, width, height, length, save_dir, device, is_sdxl)
+ img2img_map = img2img_preprocess(model_config.img2img_map, width, height, length, save_dir)
+
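+ # cache the Stable Diffusion 1.5 base weights locally so individual components can be loaded from its subfolders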
+ base_model = pathlib.Path('/tmp/base')
+ diffusers.StableDiffusionPipeline.from_pretrained('stable-diffusion-v1-5/stable-diffusion-v1-5').save_pretrained(base_model)
+
+ tokenizer = transformers.CLIPTokenizer.from_pretrained(base_model, subfolder='tokenizer')
+ text_encoder = transformers.CLIPTextModel.from_pretrained(base_model, subfolder='text_encoder')
+ vae = diffusers.AutoencoderKL.from_single_file('https://huggingface.co/chaowenguoback/pal/blob/main/vae-ft-mse-840000-ema-pruned.safetensors')
+ huggingface_hub.hf_hub_download(repo_id='wangfuyun/AnimateLCM', filename='AnimateLCM_sd15_t2v.ckpt', local_dir=pathlib.Path.cwd())
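+ # build the AnimateDiff UNet from the SD 1.5 weights and inject the AnimateLCM motion module downloaded above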
+ unet = animatediff.models.unet.UNet2DConditionModel.from_pretrained_2d(
+     pretrained_model_path=base_model,
+     motion_module_path=pathlib.Path.cwd().joinpath('AnimateLCM_sd15_t2v.ckpt'),
+     subfolder='unet',
+     unet_additional_kwargs=infer_config.unet_additional_kwargs,
+ )
+ feature_extractor = transformers.CLIPImageProcessor.from_pretrained(base_model, subfolder='feature_extractor')
+
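+ # load the chilloutMix-Ni checkpoint and copy its UNet and text-encoder weights into the AnimateDiff components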
+ pipeline = diffusers.StableDiffusionPipeline.from_single_file('https://huggingface.co/chaowenguoback/15/blob/main/chilloutMix-Ni.safetensors', config='stable-diffusion-v1-5/stable-diffusion-v1-5', safety_checker=None, use_safetensors=True)
+ unet.load_state_dict(pipeline.unet.state_dict(), strict=False)
+ text_encoder.load_state_dict(pipeline.text_encoder.state_dict(), strict=False)
+ del pipeline
+
+ unet.enable_xformers_memory_efficient_attention()
+
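+ # assemble the AnimateDiff animation pipeline with an LCM scheduler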
+ pipeline = animatediff.pipelines.AnimationPipeline(
+     vae=vae,
+     text_encoder=text_encoder,
+     tokenizer=tokenizer,
+     unet=unet,
+     scheduler=diffusers.LCMScheduler.from_config(infer_config.noise_scheduler_kwargs),
+     feature_extractor=feature_extractor,
+     controlnet_map=None,
+ )
+
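+ # download and apply the AnimateLCM LoRA, then stack additional detail/character LoRAs on top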
+ lcm_lora = pathlib.Path.cwd().joinpath('data/models/lcm_lora/sd15')
+ lcm_lora.mkdir(parents=True, exist_ok=True)
+ huggingface_hub.hf_hub_download(repo_id='wangfuyun/AnimateLCM', filename='AnimateLCM_sd15_t2v_lora.safetensors', local_dir=lcm_lora)
+ load_lcm_lora(pipeline, {'start_scale':0.15, 'end_scale':0.75, 'gradient_start':0.2, 'gradient_end':0.75}, is_sdxl=is_sdxl)
+ pipeline.lora_map = None
+ pipeline.load_lora_weights('chaowenguoback/15', weight_name='add_detail.safetensors', adapter_name='detail')
+ pipeline.load_lora_weights('chaowenguoback/15', weight_name='b1r1av5-000007.safetensors', adapter_name='bikini')
+ pipeline.load_lora_weights('chaowenguoback/15', weight_name='btcstr.safetensors', adapter_name='c-string')
+ pipeline.load_lora_weights('chaowenguoback/15', weight_name='蓝洁瑛.safetensors', adapter_name='character')
+ pipeline.set_adapters(['detail', 'bikini', 'c-string', 'character'], [1, 0.4, 0.2, 0.8])
+
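+ # cast UNet and text encoder to fp16, load text embeddings with the encoder temporarily on the GPU, then move the full pipeline to the device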
+ pipeline.unet = pipeline.unet.half()
+ pipeline.text_encoder = pipeline.text_encoder.half()
+ pipeline.text_encoder = pipeline.text_encoder.to(device)
+ load_text_embeddings(pipeline)
+ pipeline.text_encoder = pipeline.text_encoder.to('cpu')
+ pipeline = send_to_device(pipeline, device, freeze=True, force_half=False, compile=False, is_sdxl=is_sdxl)
+ wild_card_conversion(model_config)
+
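+ # build per-region prompt conditioning and map each ControlNet's control_region_list onto region indices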
+ is_init_img_exist = img2img_map is not None
+ region_condi_list, region_list, ip_adapter_config_map, region2index = region_preprocess(model_config, width, height, length, save_dir, is_init_img_exist, is_sdxl)
+
+ if controlnet_type_map:
+     for c in controlnet_type_map:
+         tmp_r = [region2index[r] for r in controlnet_type_map[c]["control_region_list"]]
+         controlnet_type_map[c]["control_region_list"] = [r for r in tmp_r if r != -1]
+
+ prompt_map = region_condi_list[0]["prompt_map"]
+ prompt_tags = [re.compile(r"[^\w\-, ]").sub("", tag).strip().replace(" ", "-") for tag in prompt_map[list(prompt_map.keys())[0]].split(",")]
+ prompt_str = "_".join((prompt_tags[:6]))[:50]
+
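+ # render the full 1440-frame clip in 16-frame windows using the composite context scheduler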
+ output = pipeline(
+     n_prompt='nipple, waistband, back view, monochrome, longbody, lowres, bad anatomy, bad hands, fused fingers, missing fingers, too many fingers, cropped, worst quality, low quality, deformed body, bloated, ugly, unrealistic, extra hands and arms',
+     num_inference_steps=8,
+     guidance_scale=3,
+     unet_batch_size=1,
+     width=width,
+     height=height,
+     video_length=length,
+     return_dict=False,
+     context_frames=16,
+     context_stride=1,
+     context_overlap=16 // 4,
+     context_schedule='composite',
+     clip_skip=2,
+     controlnet_type_map=controlnet_type_map,
+     controlnet_image_map=controlnet_image_map,
+     controlnet_ref_map=controlnet_ref_map,
+     controlnet_no_shrink=controlnet_no_shrink,
+     controlnet_max_samples_on_vram=model_config.controlnet_map["max_samples_on_vram"] if "max_samples_on_vram" in model_config.controlnet_map else 999,
+     controlnet_max_models_on_vram=model_config.controlnet_map["max_models_on_vram"] if "max_models_on_vram" in model_config.controlnet_map else 99,
+     controlnet_is_loop=model_config.controlnet_map["is_loop"] if "is_loop" in model_config.controlnet_map else True,
+     img2img_map=img2img_map,
+     ip_adapter_config_map=ip_adapter_config_map,
+     region_list=region_list,
+     region_condi_list=region_condi_list,
+     interpolation_factor=1,
+     is_single_prompt_mode=model_config.is_single_prompt_mode,
+     gradual_latent_map=model_config.gradual_latent_hires_fix_map,
+     callback=None,
+     callback_steps=None,
+ )
+
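+ # free the ControlNet models, then convert the output tensor to uint8 HWC frames on the CPU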
+ unload_controlnet_models(pipe=pipeline)
+ frames = output.permute(0, 2, 1, 3, 4).squeeze(0)
+ frames = frames.mul(255).add_(0.5).clamp_(0, 255).permute(0, 2, 3, 1).to("cpu", torch.uint8).numpy()
+ del pipeline
+ torch.cuda.empty_cache()
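+ # generate a 180 s music track with AudioLDM2 to match the 180 s video (1440 frames at 8 fps)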
+ pipeline = diffusers.AudioLDM2Pipeline.from_pretrained('cvssp/audioldm2-music', torch_dtype=torch.float16).to('cuda')
+ pipeline.scheduler = diffusers.DPMSolverMultistepScheduler.from_config(pipeline.scheduler.config)
+ music = pipeline(prompt='Light rhythm techno', negative_prompt='low quality, average quality', num_inference_steps=20, audio_length_in_s=180).audios[0]
+ del pipeline
+ torch.cuda.empty_cache()
+
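+ # 4x upscale every frame with Real-ESRGAN, restore faces with GFPGAN, and mux frames plus music into video.mp4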
+ model = basicsr.archs.rrdbnet_arch.RRDBNet(num_in_ch=3, num_out_ch=3, num_feat=64, num_block=23, num_grow_ch=32, scale=4)
+ upsampler = realesrgan.RealESRGANer(scale=4, model_path='https://huggingface.co/chaowenguoback/pal/resolve/main/RealESRGAN_x4plus.pth', model=model, half=True, device='cuda')
+ face_enhancer = gfpgan.GFPGANer(model_path='https://huggingface.co/chaowenguoback/pal/resolve/main/GFPGANv1.4.pth', upscale=4, bg_upsampler=upsampler)
+ with av.open('video.mp4', mode='w') as writer:
+     video = writer.add_stream('h264', rate=8)
+     video.width = width * 4
+     video.height = height * 4
+     video.pix_fmt = 'yuv420p'
+     audio = writer.add_stream('aac', rate=16000)
+     for frame in frames: writer.mux(video.encode(av.VideoFrame.from_ndarray(face_enhancer.enhance(frame)[-1])))
+     writer.mux(video.encode())
+     for _ in builtins.range(0, music.shape[0], audio.frame_size):
+         frame = av.AudioFrame.from_ndarray(music[_:_ + audio.frame_size][None], format='fltp', layout='mono')
+         frame.sample_rate = audio.sample_rate
+         frame.pts = _
+         writer.mux(audio.encode(frame))
+     writer.mux(audio.encode())
+
  def greet(name, intensity):
      return "Hello, " + name + "!" * int(intensity)