Hacky Stable Diffusion code for generating videos

This is a Python script called `stablediffusionwalk.py` that creates hypnotic moving videos by smoothly walking randomly through Stable Diffusion's sample space. The script uses the Diffusers library and requires access to Stable Diffusion checkpoints from Hugging Face, along with various dependencies. Users can generate videos by running the script with a text prompt and then stitching the output images together using FFmpeg.

# stablediffusionwalk.py
"""
stable diffusion dreaming
creates hypnotic moving videos by smoothly walking randomly through the sample space

example way to run this script:

$ python stablediffusionwalk.py --prompt "blueberry spaghetti" --name blueberry

to stitch together the images, e.g.:
$ ffmpeg -r 10 -f image2 -s 512x512 -i blueberry/frame%06d.jpg -vcodec libx264 -crf 10 -pix_fmt yuv420p blueberry.mp4

nice slerp def from @xsteenbrugge ty
you have to have access to stablediffusion checkpoints from https://huggingface.co/CompVis
and install all the other dependencies (e.g. diffusers library)
"""

import os
import inspect
import fire
from diffusers import StableDiffusionPipeline
from diffusers.schedulers import DDIMScheduler, LMSDiscreteScheduler, PNDMScheduler
from time import time
from PIL import Image
from einops import rearrange
import numpy as np
import torch
from torch import autocast
from torchvision.utils import make_grid

# -----------------------------------------------------------------------------

@torch.no_grad()
def diffuse(
        pipe,
        cond_embeddings,  # text conditioning, should be (1, 77, 768)
        cond_latents,     # image conditioning, should be (1, 4, 64, 64)
        num_inference_steps,
        guidance_scale,
        eta,
    ):
    """Denoise one latent into one image with classifier-free guidance.

    Args:
        pipe: pipeline object providing ``tokenizer``, ``text_encoder``,
            ``unet``, ``vae`` and ``scheduler``.
        cond_embeddings: text-encoder output for the prompt.
        cond_latents: starting noise latent; its device is where the
            unconditional embedding is computed.
        num_inference_steps: number of scheduler steps to run.
        guidance_scale: weight applied to (text - unconditional) noise.
        eta: forwarded to ``scheduler.step`` only if that method accepts it.

    Returns:
        A uint8 numpy array holding the first image of the decoded batch,
        with the channel axis moved last.

    NOTE(review): the scheduler/VAE calls below (index-based LMS ``step``,
    dict-style ``["sample"]`` / ``["prev_sample"]`` results, raw tensor from
    ``vae.decode``) look tied to an early diffusers release - confirm against
    the installed version before upgrading.
    """
    # .device works for any tensor; get_device() only yields a bare index.
    torch_device = cond_latents.device

    # classifier guidance: add the unconditional embedding
    max_length = cond_embeddings.shape[1]  # 77
    uncond_input = pipe.tokenizer(
        [""], padding="max_length", max_length=max_length, return_tensors="pt"
    )
    uncond_embeddings = pipe.text_encoder(uncond_input.input_ids.to(torch_device))[0]
    text_embeddings = torch.cat([uncond_embeddings, cond_embeddings])

    # init the scheduler FIRST, so that sigmas[0] below belongs to this call's
    # step count instead of whatever an earlier call left on the scheduler
    # (previously the very first frame was scaled from stale state).
    accepts_offset = "offset" in inspect.signature(pipe.scheduler.set_timesteps).parameters
    extra_set_kwargs = {}
    if accepts_offset:
        extra_set_kwargs["offset"] = 1
    pipe.scheduler.set_timesteps(num_inference_steps, **extra_set_kwargs)

    # if we use LMSDiscreteScheduler, let's make sure latents are multiplied by sigmas
    is_lms = isinstance(pipe.scheduler, LMSDiscreteScheduler)
    if is_lms:
        cond_latents = cond_latents * pipe.scheduler.sigmas[0]

    # prepare extra kwargs for the scheduler step, since not all schedulers have the same signature
    # eta (η) is only used with the DDIMScheduler, it will be ignored for other schedulers.
    # eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502
    # and should be between [0, 1]
    accepts_eta = "eta" in inspect.signature(pipe.scheduler.step).parameters
    extra_step_kwargs = {}
    if accepts_eta:
        extra_step_kwargs["eta"] = eta

    # diffuse!
    for i, t in enumerate(pipe.scheduler.timesteps):

        # expand the latents for classifier free guidance
        latent_model_input = torch.cat([cond_latents] * 2)
        if is_lms:
            sigma = pipe.scheduler.sigmas[i]
            latent_model_input = latent_model_input / ((sigma**2 + 1) ** 0.5)

        # predict the noise residual
        noise_pred = pipe.unet(
            latent_model_input, t, encoder_hidden_states=text_embeddings
        )["sample"]

        # cfg
        noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
        noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond)

        # compute the previous noisy sample x_t -> x_t-1
        # (the LMS scheduler is stepped by loop index, the others by timestep)
        step_key = i if is_lms else t
        cond_latents = pipe.scheduler.step(
            noise_pred, step_key, cond_latents, **extra_step_kwargs
        )["prev_sample"]

    # scale and decode the image latents with vae
    cond_latents = 1 / 0.18215 * cond_latents
    image = pipe.vae.decode(cond_latents)

    # generate output numpy image as uint8
    image = (image / 2 + 0.5).clamp(0, 1)
    image = image.cpu().permute(0, 2, 3, 1).numpy()
    image = (image[0] * 255).astype(np.uint8)

    return image


def slerp(t, v0, v1, DOT_THRESHOLD=0.9995):
    """Spherically interpolate between two arrays ``v0`` and ``v1``.

    Args:
        t: interpolation fraction; 0 gives ``v0`` and 1 gives ``v1``.
        v0: start point, a numpy array or a torch tensor.
        v1: end point, same kind and shape as ``v0``.
        DOT_THRESHOLD: when the absolute cosine between the inputs exceeds
            this value, plain linear interpolation is used instead.

    Returns:
        The interpolated array. Torch inputs produce a torch tensor on the
        device of ``v0``; numpy inputs produce a numpy array.
    """
    # Always bind the flag: it used to be set only in the torch branch, so
    # numpy inputs crashed with UnboundLocalError at the final check.
    inputs_are_torch = not isinstance(v0, np.ndarray)
    if inputs_are_torch:
        input_device = v0.device
        v0 = v0.cpu().numpy()
        v1 = v1.cpu().numpy()

    dot = np.sum(v0 * v1 / (np.linalg.norm(v0) * np.linalg.norm(v1)))
    if np.abs(dot) > DOT_THRESHOLD:
        # nearly (anti)parallel: sin(theta_0) would be ~0, fall back to lerp
        v2 = (1 - t) * v0 + t * v1
    else:
        theta_0 = np.arccos(dot)
        sin_theta_0 = np.sin(theta_0)
        theta_t = theta_0 * t
        sin_theta_t = np.sin(theta_t)
        s0 = np.sin(theta_0 - theta_t) / sin_theta_0
        s1 = sin_theta_t / sin_theta_0
        v2 = s0 * v0 + s1 * v1

    if inputs_are_torch:
        v2 = torch.from_numpy(v2).to(input_device)

    return v2


def run(
        # --------------------------------------
        # args you probably want to change
        prompt = "blueberry spaghetti",  # prompt to dream about
        gpu = 0,  # id of the gpu to run on
        name = 'blueberry',  # name of this project, for the output directory
        rootdir = '/home/ubuntu/dreams',
        num_steps = 200,  # number of steps between each pair of sampled points
        max_frames = 10000,  # number of frames to write and then exit the script
        num_inference_steps = 50,  # more (e.g. 100, 200 etc) can create slightly better images
        guidance_scale = 7.5,  # can depend on the prompt. usually somewhere between 3-10 is good
        seed = 1337,
        # --------------------------------------
        # args you probably don't want to change
        quality = 90,  # for jpeg compression of the output images
        eta = 0.0,
        width = 512,
        height = 512,
        weights_path = "/home/ubuntu/stable-diffusion-v1-3-diffusers",
        # --------------------------------------
    ):
    """Write JPEG frames of a random walk through latent space for one prompt.

    Repeatedly samples a destination noise latent, slerps from the current
    latent to it in ``num_steps`` steps, diffuses every intermediate latent
    and saves the result as ``<rootdir>/<name>/frame%06d.jpg``. Stops once
    ``max_frames`` frames have been written.
    """
    assert torch.cuda.is_available(), "a CUDA device is required"
    assert height % 8 == 0 and width % 8 == 0, "height and width must be multiples of 8"
    # with no interpolation steps the outer loop below would never advance
    assert num_steps >= 1, "num_steps must be at least 1"
    torch.manual_seed(seed)
    torch_device = f"cuda:{gpu}"

    # init the output dir
    outdir = os.path.join(rootdir, name)
    os.makedirs(outdir, exist_ok=True)

    # init all of the models and move them to a given GPU
    lms = LMSDiscreteScheduler(beta_start=0.00085, beta_end=0.012, beta_schedule="scaled_linear")
    pipe = StableDiffusionPipeline.from_pretrained(weights_path, scheduler=lms, use_auth_token=True)

    pipe.unet.to(torch_device)
    pipe.vae.to(torch_device)
    pipe.text_encoder.to(torch_device)

    # get the conditional text embeddings based on the prompt
    text_input = pipe.tokenizer(
        prompt,
        padding="max_length",
        max_length=pipe.tokenizer.model_max_length,
        truncation=True,
        return_tensors="pt",
    )
    cond_embeddings = pipe.text_encoder(text_input.input_ids.to(torch_device))[0]  # shape [1, 77, 768]

    # sample a source
    latent_shape = (1, pipe.unet.in_channels, height // 8, width // 8)
    init1 = torch.randn(latent_shape, device=torch_device)

    # iterate the loop
    frame_index = 0
    while frame_index < max_frames:

        # sample the destination
        init2 = torch.randn(latent_shape, device=torch_device)

        # NOTE: linspace includes both endpoints, so the last frame of one
        # segment and the first frame of the next use the same latent.
        for t in np.linspace(0, 1, num_steps):
            # honour max_frames mid-segment instead of overshooting to the
            # next multiple of num_steps
            if frame_index >= max_frames:
                break

            init = slerp(float(t), init1, init2)

            print("dreaming... ", frame_index)
            with autocast("cuda"):
                image = diffuse(pipe, cond_embeddings, init, num_inference_steps, guidance_scale, eta)
            im = Image.fromarray(image)
            outpath = os.path.join(outdir, 'frame%06d.jpg' % frame_index)
            im.save(outpath, quality=quality)
            frame_index += 1

        init1 = init2


if __name__ == '__main__':
    fire.Fire(run)