import gradio
import av, pathlib, diffusers, torch, transformers, builtins, numpy, re
import huggingface_hub
import animatediff.models.unet
import animatediff.pipelines
from animatediff.generate import controlnet_preprocess, img2img_preprocess, wild_card_conversion, region_preprocess, unload_controlnet_models
from animatediff.settings import get_model_config, get_infer_config
from animatediff.utils.pipeline import send_to_device
from animatediff.utils.util import set_tensor_interpolation_method
from animatediff.pipelines import load_text_embeddings
from animatediff.pipelines.lora import load_lcm_lora

# Output geometry: 1440 frames at 8 fps -> a 180 s clip.
width = 432
height = 768
length = 1440

# Prompt-travel configuration and inference settings.
model_config = get_model_config('config/prompts/prompt_travel.json')
is_sdxl = False
infer_config = get_infer_config(True, is_sdxl)
set_tensor_interpolation_method(model_config.tensor_interpolation_slerp)
device = torch.device('cuda')
save_dir = pathlib.Path('output')

# Preprocess the ControlNet and img2img inputs declared in the config.
controlnet_image_map, controlnet_type_map, controlnet_ref_map, controlnet_no_shrink = controlnet_preprocess(model_config.controlnet_map, width, height, length, save_dir, device, is_sdxl)
img2img_map = img2img_preprocess(model_config.img2img_map, width, height, length, save_dir)

# Materialize the SD 1.5 base model on disk, then load its components.
base_model = pathlib.Path('/tmp/base')
diffusers.StableDiffusionPipeline.from_pretrained('stable-diffusion-v1-5/stable-diffusion-v1-5').save_pretrained(base_model)
tokenizer = transformers.CLIPTokenizer.from_pretrained(base_model, subfolder='tokenizer')
text_encoder = transformers.CLIPTextModel.from_pretrained(base_model, subfolder='text_encoder')
feature_extractor = transformers.CLIPImageProcessor.from_pretrained(base_model, subfolder='feature_extractor')
vae = diffusers.AutoencoderKL.from_single_file('https://huggingface.co/chaowenguoback/pal/blob/main/vae-ft-mse-840000-ema-pruned.safetensors')

# Build the motion-aware UNet from the 2D base weights plus the AnimateLCM motion module.
huggingface_hub.hf_hub_download(repo_id='wangfuyun/AnimateLCM', filename='AnimateLCM_sd15_t2v.ckpt', local_dir=pathlib.Path.cwd())
unet = animatediff.models.unet.UNet2DConditionModel.from_pretrained_2d(
    pretrained_model_path=base_model,
    motion_module_path=pathlib.Path.cwd().joinpath('AnimateLCM_sd15_t2v.ckpt'),
    subfolder='unet',
    unet_additional_kwargs=infer_config.unet_additional_kwargs,
)

# Swap in the checkpoint's UNet and text-encoder weights, then drop the temporary pipeline.
pipeline = diffusers.StableDiffusionPipeline.from_single_file('https://huggingface.co/chaowenguoback/15/blob/main/chilloutMix-Ni.safetensors', config='stable-diffusion-v1-5/stable-diffusion-v1-5', safety_checker=None, use_safetensors=True)
unet.load_state_dict(pipeline.unet.state_dict(), strict=False)
text_encoder.load_state_dict(pipeline.text_encoder.state_dict(), strict=False)
del pipeline
unet.enable_xformers_memory_efficient_attention()

pipeline = animatediff.pipelines.AnimationPipeline(
    vae=vae,
    text_encoder=text_encoder,
    tokenizer=tokenizer,
    unet=unet,
    scheduler=diffusers.LCMScheduler.from_config(infer_config.noise_scheduler_kwargs),
    feature_extractor=feature_extractor,
    controlnet_map=None,
)

# LCM LoRA plus detail/style/character LoRAs.
lcm_lora = pathlib.Path.cwd().joinpath('data/models/lcm_lora/sd15')
lcm_lora.mkdir(parents=True, exist_ok=True)
huggingface_hub.hf_hub_download(repo_id='wangfuyun/AnimateLCM', filename='AnimateLCM_sd15_t2v_lora.safetensors', local_dir=lcm_lora)
load_lcm_lora(pipeline, {'start_scale': 0.15, 'end_scale': 0.75, 'gradient_start': 0.2, 'gradient_end': 0.75}, is_sdxl=is_sdxl)
pipeline.lora_map = None
pipeline.load_lora_weights('chaowenguoback/15', weight_name='add_detail.safetensors', adapter_name='detail')
pipeline.load_lora_weights('chaowenguoback/15', weight_name='b1r1av5-000007.safetensors', adapter_name='bikini')
pipeline.load_lora_weights('chaowenguoback/15', weight_name='btcstr.safetensors', adapter_name='c-string')
pipeline.load_lora_weights('chaowenguoback/15', weight_name='蓝洁瑛.safetensors', adapter_name='character')
pipeline.set_adapters(['detail', 'bikini', 'c-string', 'character'], [1, 0.4, 0.2, 0.8])

# Half precision, textual-inversion embeddings, then move everything to the GPU.
pipeline.unet = pipeline.unet.half()
pipeline.text_encoder = pipeline.text_encoder.half()
pipeline.text_encoder = pipeline.text_encoder.to(device)
load_text_embeddings(pipeline)
pipeline.text_encoder = pipeline.text_encoder.to('cpu')
pipeline = send_to_device(pipeline, device, freeze=True, force_half=False, compile=False, is_sdxl=is_sdxl)

# Expand wildcards and build the per-region prompt/condition lists.
wild_card_conversion(model_config)
is_init_img_exist = img2img_map is not None
region_condi_list, region_list, ip_adapter_config_map, region2index = region_preprocess(model_config, width, height, length, save_dir, is_init_img_exist, is_sdxl)
if controlnet_type_map:
    for c in controlnet_type_map:
        tmp_r = [region2index[r] for r in controlnet_type_map[c]["control_region_list"]]
        controlnet_type_map[c]["control_region_list"] = [r for r in tmp_r if r != -1]

# Short tag string derived from the first prompt (useful for naming outputs).
prompt_map = region_condi_list[0]["prompt_map"]
prompt_tags = [re.compile(r"[^\w\-, ]").sub("", tag).strip().replace(" ", "-") for tag in prompt_map[list(prompt_map.keys())[0]].split(",")]
prompt_str = "_".join(prompt_tags[:6])[:50]

output = pipeline(
    n_prompt='nipple, waistband, back view, monochrome, longbody, lowres, bad anatomy, bad hands, fused fingers, missing fingers, too many fingers, cropped, worst quality, low quality, deformed body, bloated, ugly, unrealistic, extra hands and arms',
    num_inference_steps=8,
    guidance_scale=3,
    unet_batch_size=1,
    width=width,
    height=height,
    video_length=length,
    return_dict=False,
    context_frames=16,
    context_stride=1,
    context_overlap=16 // 4,
    context_schedule='composite',
    clip_skip=2,
    controlnet_type_map=controlnet_type_map,
    controlnet_image_map=controlnet_image_map,
    controlnet_ref_map=controlnet_ref_map,
    controlnet_no_shrink=controlnet_no_shrink,
    controlnet_max_samples_on_vram=model_config.controlnet_map["max_samples_on_vram"] if "max_samples_on_vram" in model_config.controlnet_map else 999,
    controlnet_max_models_on_vram=model_config.controlnet_map["max_models_on_vram"] if "max_models_on_vram" in model_config.controlnet_map else 99,
    controlnet_is_loop=model_config.controlnet_map["is_loop"] if "is_loop" in model_config.controlnet_map else True,
    img2img_map=img2img_map,
    ip_adapter_config_map=ip_adapter_config_map,
    region_list=region_list,
    region_condi_list=region_condi_list,
    interpolation_factor=1,
    is_single_prompt_mode=model_config.is_single_prompt_mode,
    gradual_latent_map=model_config.gradual_latent_hires_fix_map,
    callback=None,
    callback_steps=None,
)
unload_controlnet_models(pipe=pipeline)

# (b, c, f, h, w) -> (f, h, w, c) uint8 frames for the encoder.
frames = output.permute(0, 2, 1, 3, 4).squeeze(0)
frames = frames.mul(255).add_(0.5).clamp_(0, 255).permute(0, 2, 3, 1).to("cpu", torch.uint8).numpy()
del pipeline
torch.cuda.empty_cache()

# Generate a matching 180 s music track with AudioLDM2.
pipeline = diffusers.AudioLDM2Pipeline.from_pretrained('cvssp/audioldm2-music', torch_dtype=torch.float16).to('cuda')
pipeline.scheduler = diffusers.DPMSolverMultistepScheduler.from_config(pipeline.scheduler.config)
music = pipeline(prompt='Light rhythm techno', negative_prompt='low quality, average quality', num_inference_steps=20, audio_length_in_s=180).audios[0]
del pipeline
torch.cuda.empty_cache()

# Mux the 8 fps video and the 16 kHz AAC audio into a single MP4.
with av.open('video.mp4', mode='w') as writer:
    video = writer.add_stream('h264', rate=8)
    # Stream is configured at 4x the generation resolution; frames are rescaled on encode.
    video.width = width * 4
    video.height = height * 4
    video.pix_fmt = 'yuv420p'
    audio = writer.add_stream('aac', rate=16000)
    for frame in frames:
        writer.mux(video.encode(av.VideoFrame.from_ndarray(frame)))
    writer.mux(video.encode())  # flush the video encoder
    # Slice the waveform into encoder-sized chunks; pts is expressed in samples.
    for _ in builtins.range(0, music.shape[0], audio.frame_size):
        frame = av.AudioFrame.from_ndarray(music[_:_ + audio.frame_size][None], format='fltp', layout='mono')
        frame.sample_rate = audio.sample_rate
        frame.pts = _
        writer.mux(audio.encode(frame))
    writer.mux(audio.encode())  # flush the audio encoder

def greet(name, intensity):
    return "Hello, " + name + "!" * int(intensity)

demo = gradio.Interface(
    fn=greet,
    inputs=["text", "slider"],
    outputs=["text"],
    api_name="predict",
)
demo.launch()