Add Replicate demo and API #9

Status: Open. Wants to merge 2 commits into base: main.
1 change: 1 addition & 0 deletions README.md
@@ -1,6 +1,7 @@
# stable-diffusion-videos

Try it yourself in Colab: [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/nateraw/stable-diffusion-videos/blob/main/stable_diffusion_videos.ipynb)
[![Replicate](https://replicate.com/nateraw/stable-diffusion-videos/badge)](https://replicate.com/nateraw/stable-diffusion-videos)

**Example** - morphing between "blueberry spaghetti" and "strawberry spaghetti"

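For reference, calling the published model from Python would look roughly like the sketch below. This is a minimal example, assuming the model is published as nateraw/stable-diffusion-videos, that REPLICATE_API_TOKEN is set in the environment, and that no explicit version hash is required (older versions of the client need one appended to the model name). The input names mirror the `predict()` signature in predict.py below.

```python
# Minimal sketch using the Replicate Python client (pip install replicate).
# Model name and version resolution are assumptions; inputs mirror predict.py.
import replicate

output = replicate.run(
    "nateraw/stable-diffusion-videos",
    input={
        "prompts": "blueberry spaghetti | strawberry spaghetti",
        "seeds": "42 | 1337",
        "scheduler": "klms",
        "num_inference_steps": 50,
        "guidance_scale": 7.5,
        "num_steps": 60,  # interpolation frames per prompt pair
        "fps": 15,
    },
)
print(output)  # URL of the generated mp4
```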
17 changes: 17 additions & 0 deletions cog.yaml
@@ -0,0 +1,17 @@
build:
  gpu: true
  cuda: "11.6.2"
  python_version: "3.10"
  python_packages:
    - "git+https://github.com/huggingface/diffusers@f085d2f5c6569a1c0d90327c51328622036ef76e"
    - "torch==1.12.1 --extra-index-url=https://download.pytorch.org/whl/cu116"
    - "ftfy==6.1.1"
    - "scipy==1.9.0"
    - "transformers==4.21.1"
    - "fire==0.4.0"
    - "ipython==8.5.0"
  run:
    - apt-get update && apt-get install -y software-properties-common
    - add-apt-repository ppa:ubuntu-toolchain-r/test
    - apt update -y && apt-get install ffmpeg -y
predict: "predict.py:Predictor"
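With this config, the Cog CLI can build the environment and invoke the predictor locally. Assuming `cog` is installed and the `diffusers-cache` directory has been populated (see download_weights.py below), a quick smoke test would be something like `cog predict -i prompts="a cat | a dog" -i num_steps=3`, which runs `Predictor.setup()` and a single `predict()` call inside the built image.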
26 changes: 26 additions & 0 deletions download_weights.py
@@ -0,0 +1,26 @@
import os
Owner comment: Is this file being used anywhere?

import argparse

import torch
from diffusers import StableDiffusionPipeline

os.makedirs("diffusers-cache", exist_ok=True)


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--auth_token",
        required=True,
        help="Authentication token from Hugging Face for downloading weights.",
    )
    args = parser.parse_args()

    # Download the fp16 weights into the local cache directory.
    pipe = StableDiffusionPipeline.from_pretrained(
        "CompVis/stable-diffusion-v1-4",
        cache_dir="diffusers-cache",
        torch_dtype=torch.float16,
        use_auth_token=args.auth_token,
        revision="fp16",
    )
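Note: `setup()` in predict.py loads the pipeline from the same `diffusers-cache` directory with `local_files_only=True`, so this script presumably needs to be run once with a valid Hugging Face token (e.g. `python download_weights.py --auth_token <token>`) to populate the cache before the image is built.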
225 changes: 225 additions & 0 deletions predict.py
@@ -0,0 +1,225 @@
import os
import sys  # used by run_cmd below
from subprocess import call
from typing import Optional, List
Owner comment: Unused

import shutil
import numpy as np
import torch
from torch import autocast
Owner comment: Unused; you refer to it later as torch.autocast

from diffusers import DDIMScheduler, LMSDiscreteScheduler, PNDMScheduler
from PIL import Image
Owner comment: Unused

from cog import BasePredictor, Input, Path

from stable_diffusion_videos import StableDiffusionPipeline


MODEL_CACHE = "diffusers-cache"
Owner comment: Is this used anywhere?



class Predictor(BasePredictor):
    def setup(self):
        """Load the model into memory to make running multiple predictions efficient"""
        print("Loading pipeline...")

        self.pipeline = StableDiffusionPipeline.from_pretrained(
            "CompVis/stable-diffusion-v1-4",
            # scheduler=lms,
            cache_dir="diffusers-cache",
            local_files_only=True,
            revision="fp16",
            torch_dtype=torch.float16,
        ).to("cuda")

        default_scheduler = PNDMScheduler(
            beta_start=0.00085, beta_end=0.012, beta_schedule="scaled_linear"
        )
        ddim_scheduler = DDIMScheduler(
            beta_start=0.00085,
            beta_end=0.012,
            beta_schedule="scaled_linear",
            clip_sample=False,
            set_alpha_to_one=False,
        )
        klms_scheduler = LMSDiscreteScheduler(
            beta_start=0.00085, beta_end=0.012, beta_schedule="scaled_linear"
        )
        self.SCHEDULERS = dict(
            default=default_scheduler, ddim=ddim_scheduler, klms=klms_scheduler
        )

    @torch.inference_mode()
    @torch.cuda.amp.autocast()
    def predict(
        self,
        prompts: str = Input(
            description="Input prompts, separate each prompt with '|'.",
            default="a cat | a dog | a horse",
        ),
        seeds: str = Input(
            description="Random seeds, separated with '|' to use a different seed for each prompt provided above. Leave blank to randomize the seeds.",
            default=None,
        ),
        scheduler: str = Input(
            description="Choose the scheduler",
            choices=["default", "ddim", "klms"],
            default="klms",
        ),
        num_inference_steps: int = Input(
            description="Number of denoising steps for each image generated from the prompt",
            ge=1,
            le=500,
            default=50,
        ),
        guidance_scale: float = Input(
            description="Scale for classifier-free guidance", ge=1, le=20, default=7.5
        ),
        num_steps: int = Input(
            description="Number of interpolation frames between each pair of prompts. Use 3 to 5 for testing, then 60 to 200 for better results.",
            default=50,
        ),
        fps: int = Input(
            description="Frame rate for the video.", default=15, ge=5, le=60
        ),
    ) -> Path:
"""Run a single prediction on the model"""

prompts = [p.strip() for p in prompts.split("|")]
if seeds is None:
print("Setting Random seeds.")
seeds = [int.from_bytes(os.urandom(2), "big") for s in range(len(prompts))]
else:
seeds = [s.strip() for s in seeds.split("|")]
for s in seeds:
assert s.isdigit(), "Please provide integer seed."
seeds = [int(s) for s in seeds]

if len(seeds) > len(prompts):
seeds = seeds[: len(prompts)]
else:
seeds_not_set = len(prompts) - len(seeds)
print("Setting Random seeds.")
seeds = seeds + [
int.from_bytes(os.urandom(2), "big") for s in range(seeds_not_set)
]

print("Seeds used for prompts are:")
for prompt, seed in zip(prompts, seeds):
print(f"{prompt}: {seed}")

# use the default settings for the demo
height = 512
width = 512
eta = 0.0
disable_tqdm = False
use_lerp_for_text = False

self.pipeline.set_progress_bar_config(disable=disable_tqdm)
self.pipeline.scheduler = self.SCHEDULERS[scheduler]

outdir = "cog_out"
if os.path.exists(outdir):
shutil.rmtree(outdir)
os.makedirs(outdir)

first_prompt, *prompts = prompts
embeds_a = self.pipeline.embed_text(first_prompt)

first_seed, *seeds = seeds
latents_a = torch.randn(
(1, self.pipeline.unet.in_channels, height // 8, width // 8),
device=self.pipeline.device,
generator=torch.Generator(device=self.pipeline.device).manual_seed(
first_seed
),
)

frame_index = 0
for prompt, seed in zip(prompts, seeds):
# Text
embeds_b = self.pipeline.embed_text(prompt)

# Latent Noise
latents_b = torch.randn(
(1, self.pipeline.unet.in_channels, height // 8, width // 8),
device=self.pipeline.device,
generator=torch.Generator(device=self.pipeline.device).manual_seed(
seed
),
)

for i, t in enumerate(np.linspace(0, 1, num_steps)):
do_print_progress = (i == 0) or ((frame_index + 1) % 20 == 0)
if do_print_progress:
print(f"COUNT: {frame_index+1}/{len(seeds)*num_steps}")

if use_lerp_for_text:
embeds = torch.lerp(embeds_a, embeds_b, float(t))
else:
embeds = slerp(float(t), embeds_a, embeds_b)
latents = slerp(float(t), latents_a, latents_b)

with torch.autocast("cuda"):
im = self.pipeline(
latents=latents,
text_embeddings=embeds,
height=height,
width=width,
guidance_scale=guidance_scale,
eta=eta,
num_inference_steps=num_inference_steps,
)["sample"][0]

im.save(os.path.join(outdir, "frame%06d.jpg" % frame_index))
frame_index += 1

embeds_a = embeds_b
latents_a = latents_b

image_path = os.path.join(outdir, "frame%06d.jpg")
video_path = f"/tmp/out.mp4"

cmdd = (
"ffmpeg -y -r "
+ str(fps)
+ " -i "
+ image_path
+ " -vcodec libx264 -crf 25 -pix_fmt yuv420p "
+ video_path
)
run_cmd(cmdd)

return Path(video_path)


def run_cmd(command):
    try:
        call(command, shell=True)
    except KeyboardInterrupt:
        print("Process interrupted")
        sys.exit(1)

def slerp(t, v0, v1, DOT_THRESHOLD=0.9995):
    """Helper function to spherically interpolate two arrays v0 and v1"""

    # Convert torch tensors to numpy, remembering the device to restore later.
    inputs_are_torch = False
    if not isinstance(v0, np.ndarray):
        inputs_are_torch = True
        input_device = v0.device
        v0 = v0.cpu().numpy()
        v1 = v1.cpu().numpy()

    dot = np.sum(v0 * v1 / (np.linalg.norm(v0) * np.linalg.norm(v1)))
    if np.abs(dot) > DOT_THRESHOLD:
        # Vectors are nearly parallel; fall back to linear interpolation.
        v2 = (1 - t) * v0 + t * v1
    else:
        theta_0 = np.arccos(dot)
        sin_theta_0 = np.sin(theta_0)
        theta_t = theta_0 * t
        sin_theta_t = np.sin(theta_t)
        s0 = np.sin(theta_0 - theta_t) / sin_theta_0
        s1 = sin_theta_t / sin_theta_0
        v2 = s0 * v0 + s1 * v1

    if inputs_are_torch:
        v2 = torch.from_numpy(v2).to(input_device)

    return v2
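As a quick sanity check of the slerp helper above, here is a minimal standalone example on NumPy unit vectors (values rounded). It shows why slerp is used for latents: the interpolated point stays on the sphere, whereas plain lerp shrinks the norm at the midpoint.

```python
import numpy as np

# Standalone check of slerp (defined above) on two orthogonal unit vectors.
v0 = np.array([1.0, 0.0])
v1 = np.array([0.0, 1.0])

mid = slerp(0.5, v0, v1)
print(mid)                  # [0.7071 0.7071]
print(np.linalg.norm(mid))  # ~1.0: slerp stays on the unit sphere

lin = 0.5 * v0 + 0.5 * v1
print(np.linalg.norm(lin))  # ~0.707: plain lerp shrinks the norm
```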