-
Notifications
You must be signed in to change notification settings - Fork 23
/
audiotext2img.py
24 lines (21 loc) · 783 Bytes
/
audiotext2img.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
import anything2image.imagebind as ib
import torch
from diffusers import StableUnCLIPImg2ImgPipeline
# Audio + text -> image demo.
#
# An audio clip is embedded with ImageBind, and that embedding is passed to the
# Stable unCLIP img2img pipeline as `image_embeds`, together with a text prompt.
# The first generated image is written to "audiotext2img.png".

# construct models
device = "cuda:0" if torch.cuda.is_available() else "cpu"
# Use half precision only on GPU: diffusers warns that float16 pipelines are
# not supported on CPU, so the CPU fallback runs in float32 instead.
torch_dtype = torch.float16 if device.startswith("cuda") else torch.float32
pipe = StableUnCLIPImg2ImgPipeline.from_pretrained(
    "stabilityai/stable-diffusion-2-1-unclip", torch_dtype=torch_dtype
)
pipe = pipe.to(device)
model = ib.imagebind_huge(pretrained=True)
model.eval()  # inference only
model.to(device)

# generate image
with torch.no_grad():
    audio_paths = ["assets/wav/bird_audio.wav"]
    # Call the module itself rather than `.forward` so registered hooks run.
    embeddings = model({
        ib.ModalityType.AUDIO: ib.load_and_transform_audio_data(audio_paths, device),
    })
    # The model output is indexed by modality; keep only the audio entry.
    embeddings = embeddings[ib.ModalityType.AUDIO]
    # Cast the embedding to the pipeline's dtype so it matches the weights.
    images = pipe(
        prompt="a painting",
        image_embeds=embeddings.to(torch_dtype),
    ).images
    images[0].save("audiotext2img.png")