Add the MC (multi-card) example #891

Open: wants to merge 12 commits into main
75 changes: 74 additions & 1 deletion examples/stable-diffusion/README.md
@@ -60,11 +60,27 @@ python text_to_image_generation.py \
--bf16
```

### Distributed inference with multiple HPUs
Here is how to generate images with two prompts on two HPUs:
```bash
python ../gaudi_spawn.py \
--world_size 2 text_to_image_generation.py \
--model_name_or_path runwayml/stable-diffusion-v1-5 \
--prompts "An image of a squirrel in Picasso style" "A shiny flying horse taking off" \
--num_images_per_prompt 20 \
--batch_size 4 \
--image_save_dir /tmp/stable_diffusion_images \
--use_habana \
--use_hpu_graphs \
--gaudi_config Habana/stable-diffusion \
--bf16 \
--distributed
```
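
Under the hood, the `--distributed` flag splits the prompt list across processes with `accelerate`'s `PartialState`. Here is a minimal, standalone sketch of that mechanism (an illustration only, assuming a two-process launch, e.g. through `gaudi_spawn.py --world_size 2`):
```python
# Illustrative sketch only: shows how accelerate's PartialState divides a
# prompt list between processes when running under a 2-process launcher.
from accelerate import PartialState

distributed_state = PartialState()
prompts = ["An image of a squirrel in Picasso style", "A shiny flying horse taking off"]

# With 2 prompts and 2 processes, process 0 receives the first prompt
# and process 1 receives the second.
with distributed_state.split_between_processes(prompts) as prompt:
    print(f"process {distributed_state.process_index} handles: {prompt}")
```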

> HPU graphs are recommended when generating images in batches to get the fastest possible generations.
> The first batch of images entails a performance penalty, but all subsequent batches are generated much faster.
> You can enable this mode with `--use_hpu_graphs`.


### Stable Diffusion 2

[Stable Diffusion 2](https://huggingface.co/docs/diffusers/main/en/api/pipelines/stable_diffusion_2) can also be used to generate images with this script. Here is an example for a single prompt:
@@ -109,6 +125,23 @@ python text_to_image_generation.py \
--gaudi_config Habana/stable-diffusion-2 \
--ldm3d
```
Here is how to generate images and depth maps with two prompts on two HPUs:
```bash
python ../gaudi_spawn.py \
--world_size 2 text_to_image_generation.py \
--model_name_or_path "Intel/ldm3d-4c" \
--prompts "An image of a squirrel in Picasso style" "A shiny flying horse taking off" \
--num_images_per_prompt 10 \
--batch_size 2 \
--height 768 \
--width 768 \
--image_save_dir /tmp/stable_diffusion_images \
--use_habana \
--use_hpu_graphs \
--gaudi_config Habana/stable-diffusion-2 \
--ldm3d \
--distributed
```

> There are three different checkpoints for LDM3D:
> - use the [original checkpoint](https://huggingface.co/Intel/ldm3d) to reproduce the outputs from the paper
@@ -173,6 +206,27 @@ python text_to_image_generation.py \
--bf16
```

Here is how to generate SDXL images with two prompts on two HPUs:
```bash
python ../gaudi_spawn.py \
--world_size 2 text_to_image_generation.py \
--model_name_or_path stabilityai/stable-diffusion-xl-base-1.0 \
--prompts "Sailing ship painting by Van Gogh" "A shiny flying horse taking off" \
--prompts_2 "Red tone" "Blue tone" \
--negative_prompts "Low quality" "Sketch" \
--negative_prompts_2 "Clouds" "Clouds" \
--num_images_per_prompt 20 \
--batch_size 8 \
--image_save_dir /tmp/stable_diffusion_xl_images \
--scheduler euler_discrete \
--use_habana \
--use_hpu_graphs \
--gaudi_config Habana/stable-diffusion \
--bf16 \
--distributed
```


> HPU graphs are recommended when generating images in batches to get the fastest possible generations.
> The first batch of images entails a performance penalty, but all subsequent batches are generated much faster.
> You can enable this mode with `--use_hpu_graphs`.
@@ -244,6 +298,25 @@ python text_to_image_generation.py \
--bf16
```

Here is how to generate images conditioned by the canny edge model, using two prompts on two HPUs:
```bash
pip install -r requirements.txt
python ../gaudi_spawn.py \
--world_size 2 text_to_image_generation.py \
--model_name_or_path runwayml/stable-diffusion-v1-5 \
--controlnet_model_name_or_path lllyasviel/sd-controlnet-canny \
--prompts "futuristic-looking woman" "a rusty robot" \
--control_image https://hf.co/datasets/huggingface/documentation-images/resolve/main/diffusers/input_image_vermeer.png \
--num_images_per_prompt 10 \
--batch_size 4 \
--image_save_dir /tmp/controlnet_images \
--use_habana \
--use_hpu_graphs \
--gaudi_config Habana/stable-diffusion \
--bf16 \
--distributed
```
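
> Note that with `--distributed`, the prompts are split between the two HPUs while the single `--control_image` is shared by every process.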

Here is how to generate images conditioned by the open pose model:
```bash
pip install -r requirements.txt
97 changes: 50 additions & 47 deletions examples/stable-diffusion/text_to_image_generation.py
@@ -20,6 +20,7 @@

import numpy as np
import torch
+from accelerate import PartialState

from optimum.habana.diffusers import (
GaudiDDIMScheduler,
@@ -220,6 +221,7 @@ def main():
default=0,
help="Number of steps to capture for profiling.",
)
parser.add_argument("--distributed", action="store_true", help="Use distributed inference on multi-cards")
parser.add_argument(
"--unet_adapter_name_or_path",
default=None,
@@ -241,10 +243,10 @@
args = parser.parse_args()

# Set image resolution
-    kwargs_call = {}
+    res = {}
     if args.width > 0 and args.height > 0:
-        kwargs_call["width"] = args.width
-        kwargs_call["height"] = args.height
+        res["width"] = args.width
+        res["height"] = args.height

# ControlNet
if args.control_image is not None:
@@ -317,8 +319,27 @@
if args.bf16:
kwargs["torch_dtype"] = torch.bfloat16

+    negative_prompts = args.negative_prompts
+    if args.distributed:
+        distributed_state = PartialState()
+        if args.negative_prompts is not None:
+            with distributed_state.split_between_processes(args.negative_prompts) as negative_prompt:
+                negative_prompts = negative_prompt
+
+    infer_kwargs = {
+        "num_images_per_prompt": args.num_images_per_prompt,
+        "batch_size": args.batch_size,
+        "num_inference_steps": args.num_inference_steps,
+        "guidance_scale": args.guidance_scale,
+        "negative_prompt": negative_prompts,
+        "eta": args.eta,
+        "output_type": args.output_type,
+        "profiling_warmup_steps": args.profiling_warmup_steps,
+        "profiling_steps": args.profiling_steps,
+    }
+
     if args.throughput_warmup_steps is not None:
-        kwargs_call["throughput_warmup_steps"] = args.throughput_warmup_steps
+        infer_kwargs["throughput_warmup_steps"] = args.throughput_warmup_steps

# Generate images
if args.control_image is not None:
@@ -334,21 +355,8 @@

# Set seed before running the model
set_seed(args.seed)
infer_kwargs["image"] = control_image

-        outputs = pipeline(
-            prompt=args.prompts,
-            image=control_image,
-            num_images_per_prompt=args.num_images_per_prompt,
-            batch_size=args.batch_size,
-            num_inference_steps=args.num_inference_steps,
-            guidance_scale=args.guidance_scale,
-            negative_prompt=args.negative_prompts,
-            eta=args.eta,
-            output_type=args.output_type,
-            profiling_warmup_steps=args.profiling_warmup_steps,
-            profiling_steps=args.profiling_steps,
-            **kwargs_call,
-        )
elif sdxl:
pipeline = GaudiStableDiffusionXLPipeline.from_pretrained(
args.model_name_or_path,
@@ -360,21 +368,18 @@
# Set seed before running the model
set_seed(args.seed)

-        outputs = pipeline(
-            prompt=args.prompts,
-            prompt_2=args.prompts_2,
-            num_images_per_prompt=args.num_images_per_prompt,
-            batch_size=args.batch_size,
-            num_inference_steps=args.num_inference_steps,
-            guidance_scale=args.guidance_scale,
-            negative_prompt=args.negative_prompts,
-            negative_prompt_2=args.negative_prompts_2,
-            eta=args.eta,
-            output_type=args.output_type,
-            profiling_warmup_steps=args.profiling_warmup_steps,
-            profiling_steps=args.profiling_steps,
-            **kwargs_call,
-        )
+        prompts_2 = args.prompts_2
+        negative_prompts_2 = args.negative_prompts_2
+        if args.distributed and args.prompts_2 is not None:
+            with distributed_state.split_between_processes(args.prompts_2) as prompt_2:
+                prompts_2 = prompt_2
+        if args.distributed and args.negative_prompts_2 is not None:
+            with distributed_state.split_between_processes(args.negative_prompts_2) as negative_prompt_2:
+                negative_prompts_2 = negative_prompt_2
+
+        infer_kwargs["prompt_2"] = prompts_2
+        infer_kwargs["negative_prompt_2"] = negative_prompts_2

else:
pipeline = GaudiStableDiffusionPipeline.from_pretrained(
args.model_name_or_path,
@@ -394,28 +399,26 @@
pipeline.text_encoder = pipeline.text_encoder.merge_and_unload()
set_seed(args.seed)

-        outputs = pipeline(
-            prompt=args.prompts,
-            num_images_per_prompt=args.num_images_per_prompt,
-            batch_size=args.batch_size,
-            num_inference_steps=args.num_inference_steps,
-            guidance_scale=args.guidance_scale,
-            negative_prompt=args.negative_prompts,
-            eta=args.eta,
-            output_type=args.output_type,
-            profiling_warmup_steps=args.profiling_warmup_steps,
-            profiling_steps=args.profiling_steps,
-            **kwargs_call,
-        )
+    if args.distributed:
+        with distributed_state.split_between_processes(args.prompts) as prompt:
+            outputs = pipeline(prompt=prompt, **infer_kwargs, **res)
+    else:
+        outputs = pipeline(prompt=args.prompts, **infer_kwargs, **res)

# Save the pipeline in the specified directory if not None
if args.pipeline_save_dir is not None:
-        pipeline.save_pretrained(args.pipeline_save_dir)
+        save_dir = args.pipeline_save_dir
+        if args.distributed:
+            save_dir = f"{args.pipeline_save_dir}_{distributed_state.process_index}"
+        pipeline.save_pretrained(save_dir)

# Save images in the specified directory if not None and if they are in PIL format
if args.image_save_dir is not None:
if args.output_type == "pil":
image_save_dir = Path(args.image_save_dir)
+            if args.distributed:
+                image_save_dir = Path(f"{image_save_dir}_{distributed_state.process_index}")

image_save_dir.mkdir(parents=True, exist_ok=True)
logger.info(f"Saving images in {image_save_dir.resolve()}...")
if args.ldm3d: