Spaces: Runtime error
Update app.py
app.py CHANGED
@@ -1,5 +1,150 @@
 import gradio
 
+import basicsr, realesrgan, gfpgan, av, pathlib, diffusers, torch, transformers, builtins, numpy, re
+from animatediff.generate import controlnet_preprocess, img2img_preprocess, wild_card_conversion, region_preprocess, unload_controlnet_models
+from animatediff.settings import get_model_config, get_infer_config
+from animatediff.utils.pipeline import send_to_device
+from animatediff.utils.util import set_tensor_interpolation_method
+from animatediff.pipelines import load_text_embeddings
+from animatediff.pipelines.lora import load_lcm_lora
+import huggingface_hub
+import animatediff
+
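+# Generation settings: 432x768 portrait frames; 1440 frames at the 8 fps mux rate below is a 3-minute clip.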
+width = 432
+height = 768
+length = 1440
+model_config = get_model_config('config/prompts/prompt_travel.json')
+is_sdxl = False
+infer_config = get_infer_config(True, is_sdxl)
+set_tensor_interpolation_method(model_config.tensor_interpolation_slerp)
+device = torch.device('cuda')
+save_dir = pathlib.Path('output')
+controlnet_image_map, controlnet_type_map, controlnet_ref_map, controlnet_no_shrink = controlnet_preprocess(model_config.controlnet_map, width, height, length, save_dir, device, is_sdxl)
+img2img_map = img2img_preprocess(model_config.img2img_map, width, height, length, save_dir)
+
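+# Cache the SD 1.5 base weights locally so individual components can be loaded by subfolder.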
+base_model = pathlib.Path('/tmp/base')
+diffusers.StableDiffusionPipeline.from_pretrained('stable-diffusion-v1-5/stable-diffusion-v1-5').save_pretrained(base_model)
+
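+# Build the components: CLIP tokenizer/text encoder, the ft-MSE VAE, and a UNet inflated with the AnimateLCM motion module.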
+tokenizer = transformers.CLIPTokenizer.from_pretrained(base_model, subfolder='tokenizer')
+text_encoder = transformers.CLIPTextModel.from_pretrained(base_model, subfolder='text_encoder')
+vae = diffusers.AutoencoderKL.from_single_file('https://huggingface.co/chaowenguoback/pal/blob/main/vae-ft-mse-840000-ema-pruned.safetensors')
+huggingface_hub.hf_hub_download(repo_id='wangfuyun/AnimateLCM', filename='AnimateLCM_sd15_t2v.ckpt', local_dir=pathlib.Path.cwd())
+unet = animatediff.models.unet.UNet2DConditionModel.from_pretrained_2d(
+    pretrained_model_path=base_model,
+    motion_module_path=pathlib.Path.cwd().joinpath('AnimateLCM_sd15_t2v.ckpt'),
+    subfolder='unet',
+    unet_additional_kwargs=infer_config.unet_additional_kwargs,
+)
+feature_extractor = transformers.CLIPImageProcessor.from_pretrained(base_model, subfolder='feature_extractor')
+
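+# Replace the UNet and text-encoder weights with the ChilloutMix checkpoint, then drop the loader pipeline.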
+pipeline = diffusers.StableDiffusionPipeline.from_single_file('https://huggingface.co/chaowenguoback/15/blob/main/chilloutMix-Ni.safetensors', config='stable-diffusion-v1-5/stable-diffusion-v1-5', safety_checker=None, use_safetensors=True)
+unet.load_state_dict(pipeline.unet.state_dict(), strict=False)
+text_encoder.load_state_dict(pipeline.text_encoder.state_dict(), strict=False)
+del pipeline
+
+unet.enable_xformers_memory_efficient_attention()
+
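+# Assemble the AnimateDiff AnimationPipeline with an LCM scheduler; no ControlNet models are attached.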
+pipeline = animatediff.pipelines.AnimationPipeline(
+    vae=vae,
+    text_encoder=text_encoder,
+    tokenizer=tokenizer,
+    unet=unet,
+    scheduler=diffusers.LCMScheduler.from_config(infer_config.noise_scheduler_kwargs),
+    feature_extractor=feature_extractor,
+    controlnet_map=None,
+)
+
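+# Apply the AnimateLCM LoRA with a ramped scale schedule, then stack detail, clothing, and character LoRAs.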
+lcm_lora = pathlib.Path.cwd().joinpath('data/models/lcm_lora/sd15')
+lcm_lora.mkdir(parents=True, exist_ok=True)
+huggingface_hub.hf_hub_download(repo_id='wangfuyun/AnimateLCM', filename='AnimateLCM_sd15_t2v_lora.safetensors', local_dir=lcm_lora)
+load_lcm_lora(pipeline, {'start_scale':0.15, 'end_scale':0.75, 'gradient_start':0.2, 'gradient_end':0.75}, is_sdxl=is_sdxl)
+pipeline.lora_map = None
+pipeline.load_lora_weights('chaowenguoback/15', weight_name='add_detail.safetensors', adapter_name='detail')
+pipeline.load_lora_weights('chaowenguoback/15', weight_name='b1r1av5-000007.safetensors', adapter_name='bikini')
+pipeline.load_lora_weights('chaowenguoback/15', weight_name='btcstr.safetensors', adapter_name='c-string')
+pipeline.load_lora_weights('chaowenguoback/15', weight_name='蓝洁瑛.safetensors', adapter_name='character')
+pipeline.set_adapters(['detail', 'bikini', 'c-string', 'character'], [1, 0.4, 0.2, 0.8])
+
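+# Switch to fp16, load text embeddings while the encoder is on the GPU, then move the frozen pipeline to CUDA.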
+pipeline.unet = pipeline.unet.half()
+pipeline.text_encoder = pipeline.text_encoder.half()
+pipeline.text_encoder = pipeline.text_encoder.to(device)
+load_text_embeddings(pipeline)
+pipeline.text_encoder = pipeline.text_encoder.to('cpu')
+pipeline = send_to_device(pipeline, device, freeze=True, force_half=False, compile=False, is_sdxl=is_sdxl)
+wild_card_conversion(model_config)
+
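+# Resolve prompt regions and remap each ControlNet's control_region_list to region indices, dropping unknown regions.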
+is_init_img_exist = img2img_map is not None
+region_condi_list, region_list, ip_adapter_config_map, region2index = region_preprocess(model_config, width, height, length, save_dir, is_init_img_exist, is_sdxl)
+
+if controlnet_type_map:
+    for c in controlnet_type_map:
+        tmp_r = [region2index[r] for r in controlnet_type_map[c]["control_region_list"]]
+        controlnet_type_map[c]["control_region_list"] = [r for r in tmp_r if r != -1]
+
+prompt_map = region_condi_list[0]["prompt_map"]
+prompt_tags = [re.compile(r"[^\w\-, ]").sub("", tag).strip().replace(" ", "-") for tag in prompt_map[list(prompt_map.keys())[0]].split(",")]
+prompt_str = "_".join(prompt_tags[:6])[:50]
+
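+# Denoise with a sliding window: 16-frame contexts, 4-frame overlap, 8 LCM steps per window.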
+output = pipeline(
+    n_prompt='nipple, waistband, back view, monochrome, longbody, lowres, bad anatomy, bad hands, fused fingers, missing fingers, too many fingers, cropped, worst quality, low quality, deformed body, bloated, ugly, unrealistic, extra hands and arms',
+    num_inference_steps=8,
+    guidance_scale=3,
+    unet_batch_size=1,
+    width=width,
+    height=height,
+    video_length=length,
+    return_dict=False,
+    context_frames=16,
+    context_stride=1,
+    context_overlap=16 // 4,
+    context_schedule='composite',
+    clip_skip=2,
+    controlnet_type_map=controlnet_type_map,
+    controlnet_image_map=controlnet_image_map,
+    controlnet_ref_map=controlnet_ref_map,
+    controlnet_no_shrink=controlnet_no_shrink,
+    controlnet_max_samples_on_vram=model_config.controlnet_map["max_samples_on_vram"] if "max_samples_on_vram" in model_config.controlnet_map else 999,
+    controlnet_max_models_on_vram=model_config.controlnet_map["max_models_on_vram"] if "max_models_on_vram" in model_config.controlnet_map else 99,
+    controlnet_is_loop=model_config.controlnet_map["is_loop"] if "is_loop" in model_config.controlnet_map else True,
+    img2img_map=img2img_map,
+    ip_adapter_config_map=ip_adapter_config_map,
+    region_list=region_list,
+    region_condi_list=region_condi_list,
+    interpolation_factor=1,
+    is_single_prompt_mode=model_config.is_single_prompt_mode,
+    gradual_latent_map=model_config.gradual_latent_hires_fix_map,
+    callback=None,
+    callback_steps=None,
+)
+
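+# Free ControlNet VRAM, then reorder the output tensor to (frames, H, W, C) uint8 on the CPU.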
+unload_controlnet_models(pipe=pipeline)
+frames = output.permute(0, 2, 1, 3, 4).squeeze(0)
+frames = frames.mul(255).add_(0.5).clamp_(0, 255).permute(0, 2, 3, 1).to("cpu", torch.uint8).numpy()
+del pipeline
+torch.cuda.empty_cache()
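+# Generate a 180-second techno track with AudioLDM 2 to pair with the 3-minute video.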
+pipeline = diffusers.AudioLDM2Pipeline.from_pretrained('cvssp/audioldm2-music', torch_dtype=torch.float16).to('cuda')
+pipeline.scheduler = diffusers.DPMSolverMultistepScheduler.from_config(pipeline.scheduler.config)
+music = pipeline(prompt='Light rhythm techno', negative_prompt='low quality, average quality', num_inference_steps=20, audio_length_in_s=180).audios[0]
+del pipeline
+torch.cuda.empty_cache()
+
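+# Upscale 4x with Real-ESRGAN plus GFPGAN face restoration and mux H.264 video with AAC audio into video.mp4.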
+model = basicsr.archs.rrdbnet_arch.RRDBNet(num_in_ch=3, num_out_ch=3, num_feat=64, num_block=23, num_grow_ch=32, scale=4)
+upsampler = realesrgan.RealESRGANer(scale=4, model_path='https://huggingface.co/chaowenguoback/pal/resolve/main/RealESRGAN_x4plus.pth', model=model, half=True, device='cuda')
+face_enhancer = gfpgan.GFPGANer(model_path='https://huggingface.co/chaowenguoback/pal/resolve/main/GFPGANv1.4.pth', upscale=4, bg_upsampler=upsampler)
+with av.open('video.mp4', mode='w') as writer:
+    video = writer.add_stream('h264', rate=8)
+    video.width = width * 4
+    video.height = height * 4
+    video.pix_fmt = 'yuv420p'
+    audio = writer.add_stream('aac', rate=16000)
+    for frame in frames: writer.mux(video.encode(av.VideoFrame.from_ndarray(face_enhancer.enhance(frame)[-1])))
+    writer.mux(video.encode())
+    for _ in builtins.range(0, music.shape[0], audio.frame_size):
+        frame = av.AudioFrame.from_ndarray(music[_:_ + audio.frame_size][None], format='fltp', layout='mono')
+        frame.sample_rate = audio.sample_rate
+        frame.pts = _
+        writer.mux(audio.encode(frame))
+    writer.mux(audio.encode())
+
 def greet(name, intensity):
     return "Hello, " + name + "!" * int(intensity)
 