From f2af45a9dd65667af0e97ed6c6f9c14cf96deee2 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?R=C3=A9mi=20Louf?=
Date: Mon, 15 Apr 2024 14:51:23 +0200
Subject: [PATCH] Add vLLM documentation for multiple GPUs

---
 docs/reference/models/vllm.md | 15 +++++++++++++++
 docs/reference/serve/vllm.md  |  8 ++++++++
 2 files changed, 23 insertions(+)

diff --git a/docs/reference/models/vllm.md b/docs/reference/models/vllm.md
index 25581a1bc..7fc29f00c 100644
--- a/docs/reference/models/vllm.md
+++ b/docs/reference/models/vllm.md
@@ -84,6 +84,21 @@ model = models.vllm("https://huggingface.co/squeeze-ai-lab/sq-llama-30b-w4-s5",
 
 To use GPTQ models you need to install the autoGTPQ and optimum libraries `pip install auto-gptq optimum`.
 
+
+### Multi-GPU usage
+
+To run multi-GPU inference with vLLM, set the `tensor_parallel_size` argument to the number of GPUs available when initializing the model. For instance, to run inference on 2 GPUs:
+
+
+```python
+from outlines import models
+
+model = models.vllm(
+    "mistralai/Mistral-7B-v0.1",
+    tensor_parallel_size=2
+)
+```
+
 ### Load LoRA adapters
 
 You can load LoRA adapters and alternate between them dynamically:
diff --git a/docs/reference/serve/vllm.md b/docs/reference/serve/vllm.md
index 0a6f5dc62..1b4d4bf14 100644
--- a/docs/reference/serve/vllm.md
+++ b/docs/reference/serve/vllm.md
@@ -18,6 +18,14 @@ python -m outlines.serve.serve --model="mistralai/Mistral-7B-Instruct-v0.2"
 
 This will by default start a server at `http://127.0.0.1:8000` (check what the console says, though). Without the `--model` argument set, the OPT-125M model is used. The `--model` argument allows you to specify any model of your choosing.
 
+To run inference on multiple GPUs, pass the `--tensor-parallel-size` argument when initializing the server. For instance, to run inference on 2 GPUs:
+
+
+```bash
+python -m outlines.serve.serve --model="mistralai/Mistral-7B-Instruct-v0.2" --tensor-parallel-size 2
+```
+
+
 ### Alternative Method: Via Docker
 
 You can install and run the server with Outlines' official Docker image using the command
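A minimal usage sketch, assuming two GPUs are visible and that the `outlines.generate.text` API documented elsewhere in the Outlines docs is unchanged, shows how the tensor-parallel model added above would be called once loaded:

```python
from outlines import models, generate

# Load the model across 2 GPUs with tensor parallelism
# (sketch only: assumes 2 GPUs are available on the machine).
model = models.vllm(
    "mistralai/Mistral-7B-v0.1",
    tensor_parallel_size=2,
)

# Plain text generation; the prompt below is only an example.
generator = generate.text(model)
answer = generator("What is tensor parallelism?")
print(answer)
```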