Spaces:

mattb512
/

fastai-lesson-10-diffusers

Sleeping

App Files Community

mattb512 committed on Jan 26, 2024

Commit

29f02ac

1 Parent(s): f65c76f

remove notebook - too big

Browse files

Files changed (3) hide show

.gitignore +3 -0
image_generator.py +15 -24
requirements.txt +8 -0

.gitignore ADDED Viewed

	@@ -0,0 +1,3 @@

+.venv/**
+.DS_Store
+__pycache__

image_generator.py CHANGED Viewed

@@ -16,6 +16,7 @@ from diffusers import AutoencoderKL, UNet2DConditionModel
 from diffusers import LMSDiscreteScheduler
 from tqdm.auto import tqdm
 logging.disable(logging.WARNING)
 class ImageGenerator():
     def __init__(self,
@@ -27,23 +28,16 @@ class ImageGenerator():
         self.height = 512
         self.generator = torch.manual_seed(32)
         self.bs = 1
-        if torch.cuda.is_available():
-            self.device = torch.device("cuda")
-            self.dtype = torch.float16
-        else:
-            self.device = torch.device("cpu")
-            self.dtype = torch.float32
-        print(f"Working on device: {self.device=}")
     def __repr__(self):
         return f"Image Generator with {self.g=}"
     def load_models(self):
-        self.tokenizer    = CLIPTokenizer.from_pretrained("openai/clip-vit-large-patch14", torch_dtype=self.dtype)
-        self.text_encoder = CLIPTextModel.from_pretrained("openai/clip-vit-large-patch14", torch_dtype=self.dtype).to(self.device)
-        # vae             = AutoencoderKL.from_pretrained("stabilityai/sd-vae-ft-ema",     torch_dtype=self.dtype ).to(self.device)
-        self.vae          = AutoencoderKL.from_pretrained("CompVis/stable-diffusion-v1-4", subfolder="vae").to(self.device)
-        self.unet         = UNet2DConditionModel.from_pretrained("CompVis/stable-diffusion-v1-4", subfolder="unet").to(self.device) #torch_dtype=torch.float16,
     def load_scheduler( self,
                         beta_start : float=0.00085,
@@ -57,13 +51,10 @@ class ImageGenerator():
             beta_schedule="scaled_linear",
             num_train_timesteps=num_train_timesteps)
-    def load_image(self, filepath:str) -> Image:
         return Image.open(filepath).resize(size=(self.width,self.height))
         #.convert("RGB") # RGB = 3 dimensions, RGBA = 4 dimensions
-    def nparray_to_pil(self, np_image: np.array) -> Image:
-        return Image.fromarray(np_image).resize(size=(self.width,self.height))
     def pil_to_latent(self, image: Image) -> torch.Tensor:
         with torch.no_grad():
             np_img = np.transpose( (( np.array(image) / 255)-0.5)*2, (2,0,1)) # turn pil image into np array with values between -1 and 1
@@ -72,7 +63,7 @@ class ImageGenerator():
             np_images = np.repeat(np_img[np.newaxis, :, :], self.bs, axis=0) # adding a new dimension and repeating the image for each prompt
             # print(f"{np_images.shape=}")
-            decoded_latent = torch.from_numpy(np_images).to(self.device).float() #<-- stability-ai vae uses half(), compvis vae uses float?
             # print(f"{decoded_latent.shape=}")
             encoded_latent = 0.18215 * self.vae.encode(decoded_latent).latent_dist.sample()
@@ -84,7 +75,7 @@ class ImageGenerator():
         # noise = torch.randn_like(latent) # missing generator parameter
         noise = torch.randn(
                 size = (self.bs, self.unet.config.in_channels, self.height//8, self.width//8),
-                generator = self.generator).to(self.device)
         timesteps = torch.tensor([self.scheduler.timesteps[scheduler_steps]])
         noisy_latent = self.scheduler.add_noise(latent, noise, timesteps)
         # print(f"add_noise: {timesteps.shape=} {timesteps=} {noisy_latent.shape=}")
@@ -112,7 +103,7 @@ class ImageGenerator():
         if maxlen is None: maxlen = self.tokenizer.model_max_length
         inp = self.tokenizer([prompt], padding="max_length", max_length=maxlen, truncation=True, return_tensors="pt")
-        return self.text_encoder(inp.input_ids.to(self.device))[0].float()
     def tensor_to_pil(self, t:torch.Tensor) -> Image:
         '''transforms a tensor decoded by the vae to a pil image'''
@@ -135,7 +126,7 @@ class ImageGenerator():
                  seed : int=32,
                  steps : int=30,
                  start_step_ratio : float=1/5,
-                 init_image : Image=None,
                  latent_callback_mod : int=10):
         self.latent_images = []
         if not negative_prompt: negative_prompt = ""
@@ -162,13 +153,13 @@ class ImageGenerator():
         else:
             start_steps = int(steps * start_step_ratio) # 0%: too much noise, 100% no noise
             # print(f"{start_steps=}")
-            # img = self.load_image(init_image)
-            latents =self.pil_to_latent(init_image)
             self.latent_callback(latents)
-            latents = self.add_noise(latents, start_steps).to(self.device).float()
             self.latent_callback(latents)
-        latents = latents.to(self.device).float()
         for i,ts in enumerate(tqdm(self.scheduler.timesteps, leave=False)):
             if i >= start_steps:

 from diffusers import LMSDiscreteScheduler
 from tqdm.auto import tqdm
 logging.disable(logging.WARNING)
 class ImageGenerator():
     def __init__(self,
         self.height = 512
         self.generator = torch.manual_seed(32)
         self.bs = 1
     def __repr__(self):
         return f"Image Generator with {self.g=}"
     def load_models(self):
+        self.tokenizer    = CLIPTokenizer.from_pretrained("openai/clip-vit-large-patch14",        torch_dtype=torch.float16)
+        self.text_encoder = CLIPTextModel.from_pretrained("openai/clip-vit-large-patch14",        torch_dtype=torch.float16                          ).to("cuda")
+        # vae             = AutoencoderKL.from_pretrained("stabilityai/sd-vae-ft-ema",             torch_dtype=torch.float16                          ).to("cuda")
+        self.vae          = AutoencoderKL.from_pretrained("CompVis/stable-diffusion-v1-4",                                    subfolder="vae"         ).to("cuda")
+        self.unet         = UNet2DConditionModel.from_pretrained("CompVis/stable-diffusion-v1-4",                             subfolder="unet"        ).to("cuda") #torch_dtype=torch.float16,
     def load_scheduler( self,
                         beta_start : float=0.00085,
             beta_schedule="scaled_linear",
             num_train_timesteps=num_train_timesteps)
+    def load_image(self, filepath:str):
         return Image.open(filepath).resize(size=(self.width,self.height))
         #.convert("RGB") # RGB = 3 dimensions, RGBA = 4 dimensions
     def pil_to_latent(self, image: Image) -> torch.Tensor:
         with torch.no_grad():
             np_img = np.transpose( (( np.array(image) / 255)-0.5)*2, (2,0,1)) # turn pil image into np array with values between -1 and 1
             np_images = np.repeat(np_img[np.newaxis, :, :], self.bs, axis=0) # adding a new dimension and repeating the image for each prompt
             # print(f"{np_images.shape=}")
+            decoded_latent = torch.from_numpy(np_images).to("cuda").float() #<-- stability-ai vae uses half(), compvis vae uses float?
             # print(f"{decoded_latent.shape=}")
             encoded_latent = 0.18215 * self.vae.encode(decoded_latent).latent_dist.sample()
         # noise = torch.randn_like(latent) # missing generator parameter
         noise = torch.randn(
                 size = (self.bs, self.unet.config.in_channels, self.height//8, self.width//8),
+                generator = self.generator).to("cuda")
         timesteps = torch.tensor([self.scheduler.timesteps[scheduler_steps]])
         noisy_latent = self.scheduler.add_noise(latent, noise, timesteps)
         # print(f"add_noise: {timesteps.shape=} {timesteps=} {noisy_latent.shape=}")
         if maxlen is None: maxlen = self.tokenizer.model_max_length
         inp = self.tokenizer([prompt], padding="max_length", max_length=maxlen, truncation=True, return_tensors="pt")
+        return self.text_encoder(inp.input_ids.to("cuda"))[0].float()
     def tensor_to_pil(self, t:torch.Tensor) -> Image:
         '''transforms a tensor decoded by the vae to a pil image'''
                  seed : int=32,
                  steps : int=30,
                  start_step_ratio : float=1/5,
+                 init_image : str=None,
                  latent_callback_mod : int=10):
         self.latent_images = []
         if not negative_prompt: negative_prompt = ""
         else:
             start_steps = int(steps * start_step_ratio) # 0%: too much noise, 100% no noise
             # print(f"{start_steps=}")
+            img = self.load_image(init_image)
+            latents =self. pil_to_latent(img)
             self.latent_callback(latents)
+            latents = self.add_noise(latents, start_steps).to("cuda").float()
             self.latent_callback(latents)
+        latents = latents.to("cuda").float()
         for i,ts in enumerate(tqdm(self.scheduler.timesteps, leave=False)):
             if i >= start_steps:

requirements.txt ADDED Viewed

	@@ -0,0 +1,8 @@

+diffusers
+transformers
+fastcore
+matplotlib
+scipy
+torch
+torchvision
+gradio