
Commit

Updated memory requirements in constants.py
Added information about the VRAM requirements for running both the LLM and the embedding models.
PromtEngineer committed Aug 29, 2023
1 parent 379b214 commit 9743221
Showing 1 changed file with 81 additions and 56 deletions.
137 changes: 81 additions & 56 deletions constants.py
@@ -36,26 +36,45 @@
}

# Default Instructor Model
EMBEDDING_MODEL_NAME = "hkunlp/instructor-large"
# You can also choose a smaller model; if you do, change HuggingFaceInstructEmbeddings
# to HuggingFaceEmbeddings in both ingest.py and run_localGPT.py (see the loading sketch below)
# EMBEDDING_MODEL_NAME = "all-MiniLM-L6-v2"
EMBEDDING_MODEL_NAME = "hkunlp/instructor-large" # Uses 1.5 GB of VRAM (High Accuracy with lower VRAM usage)

####
#### OTHER EMBEDDING MODEL OPTIONS
####

# EMBEDDING_MODEL_NAME = "hkunlp/instructor-xl" # Uses 5 GB of VRAM (Most Accurate of all models)
# EMBEDDING_MODEL_NAME = "intfloat/e5-large-v2" # Uses 1.5 GB of VRAM (A little less accurate than instructor-large)
# EMBEDDING_MODEL_NAME = "intfloat/e5-base-v2" # Uses 0.5 GB of VRAM (A good model for lower VRAM GPUs)
# EMBEDDING_MODEL_NAME = "all-MiniLM-L6-v2" # Uses 0.2 GB of VRAM (Less accurate but fastest - only requires 150mb of vram)

####
#### MULTILINGUAL EMBEDDING MODELS
####

# EMBEDDING_MODEL_NAME = "intfloat/multilingual-e5-large" # Uses 2.5 GB of VRAM
# EMBEDDING_MODEL_NAME = "intfloat/multilingual-e5-base" # Uses 1.2 GB of VRAM

# Select the Model ID and model_basename
# to load the LLM used for generating natural language responses

#### GPU VRAM required per LLM, by parameter count in billions (B Model)
#### SELECT AN OPEN SOURCE LLM (LARGE LANGUAGE MODEL)
# Select the Model ID and model_basename
# to load the LLM used for generating natural language responses

#### GPU VRAM required for the LLM only, by parameter count in billions (B Model)
#### Does not include VRAM used by the embedding model, which needs an additional 2 GB - 7 GB of VRAM depending on the model.
####
#### (B Model)   (float32)    (float16)    (GPTQ 8bit)    (GPTQ 4bit)
####    7b        28 GB        14 GB         7 GB           3.5 GB
####   13b        52 GB        26 GB        13 GB           6.5 GB
####   32b       130 GB        65 GB        32.5 GB        16.25 GB
####   65b       260.8 GB     130.4 GB      65.2 GB        32.6 GB
#### (B Model)   (float32)    (float16)    (GPTQ 8bit)         (GPTQ 4bit)
####    7b        28 GB        14 GB         7 GB - 9 GB         3.5 GB - 5 GB
####   13b        52 GB        26 GB        13 GB - 15 GB        6.5 GB - 8 GB
####   32b       130 GB        65 GB        32.5 GB - 35 GB     16.25 GB - 19 GB
####   65b       260.8 GB     130.4 GB      65.2 GB - 67 GB      32.6 GB - 35 GB
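
The table above boils down to a bytes-per-parameter rule of thumb: roughly 4 bytes for float32, 2 for float16, about 1 for 8-bit GPTQ and 0.5 for 4-bit GPTQ, plus some headroom for activations and the embedding model. A rough sketch of that arithmetic; the headroom and embedding-model figures below are assumptions for illustration, not measurements.

# Rough VRAM estimate: parameters x bytes-per-parameter, plus assumed
# headroom for activations/KV cache and the embedding model.
BYTES_PER_PARAM = {"float32": 4.0, "float16": 2.0, "gptq_8bit": 1.0, "gptq_4bit": 0.5}

def estimate_vram_gb(params_billions: float, precision: str,
                     runtime_headroom_gb: float = 2.0,          # assumption
                     embedding_model_gb: float = 1.5) -> float:  # e.g. instructor-large
    weights_gb = params_billions * BYTES_PER_PARAM[precision]
    return weights_gb + runtime_headroom_gb + embedding_model_gb

# Example: a 13B model in 4-bit GPTQ -> 6.5 GB of weights, ~10 GB total budget.
print(round(estimate_vram_gb(13, "gptq_4bit"), 1))  # 10.0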

MODEL_ID = "TheBloke/Llama-2-7B-Chat-GGML"
MODEL_BASENAME = "llama-2-7b-chat.ggmlv3.q4_0.bin"

# for HF models
####
#### (FOR HF MODELS)
####

# MODEL_ID = "TheBloke/vicuna-7B-1.1-HF"
# MODEL_BASENAME = None
# MODEL_ID = "TheBloke/Wizard-Vicuna-7B-Uncensored-HF"
@@ -64,49 +83,55 @@
# alongside will 100% create OOM on 24GB cards.
# llm = load_model(device_type, model_id=model_id)
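
For the unquantized HF checkpoints listed here, loading usually goes through transformers. A minimal sketch, assuming float16 weights and accelerate's device_map="auto" to keep the model on a single GPU; the repository's load_model() wraps additional logic (tokenizer settings, generation pipeline, etc.).

# Hypothetical sketch: load a full-precision HF checkpoint in float16.
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

hf_model_id = "TheBloke/vicuna-7B-1.1-HF"   # one of the options above
tokenizer = AutoTokenizer.from_pretrained(hf_model_id)
model = AutoModelForCausalLM.from_pretrained(hf_model_id,
                                             torch_dtype=torch.float16,
                                             device_map="auto")  # requires accelerate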

# For GPTQ (quantized), select an LLM based on your GPU and available VRAM (GB)

##### 48GB VRAM Graphics Cards (RTX 6000, RTX A6000 and other 48GB VRAM GPUs) #####

### 65b GPTQ Models for 48GB GPUs
# model_id = "TheBloke/guanaco-65B-GPTQ"
# model_basename = "model.safetensors"
# model_id = "TheBloke/Airoboros-65B-GPT4-2.0-GPTQ"
# model_basename = "model.safetensors"
# model_id = "TheBloke/gpt4-alpaca-lora_mlp-65B-GPTQ"
# model_basename = "model.safetensors"
# model_id = "TheBloke/Upstage-Llama1-65B-Instruct-GPTQ"
# model_basename = "model.safetensors"

##### 24GB VRAM Graphics Cards (RTX 3090 - RTX 4090 (~35% faster than the 3090) - RTX A5000 - RTX A5500) #####

### 13b GPTQ Models for 24GB GPUs (*** With best embedding model: hkunlp/instructor-xl ***)
# model_id = "TheBloke/Wizard-Vicuna-13B-Uncensored-GPTQ"
# model_basename = "Wizard-Vicuna-13B-Uncensored-GPTQ-4bit-128g.compat.no-act-order.safetensors"
# model_id = "TheBloke/vicuna-13B-v1.5-GPTQ"
# model_basename = "model.safetensors"
# model_id = "TheBloke/Nous-Hermes-13B-GPTQ"
# model_basename = "nous-hermes-13b-GPTQ-4bit-128g.no-act.order"
# model_id = "TheBloke/WizardLM-13B-V1.2-GPTQ"
# model_basename = "gptq_model-4bit-128g.safetensors

### 30b GPTQ Models for 24GB GPUs (*** Requires using intfloat/e5-base-v2 instead of hkunlp/instructor-large as embedding model ***)
# model_id = "TheBloke/Wizard-Vicuna-30B-Uncensored-GPTQ"
# model_basename = "Wizard-Vicuna-30B-Uncensored-GPTQ-4bit--1g.act.order.safetensors"
# model_id = "TheBloke/WizardLM-30B-Uncensored-GPTQ"
# model_basename = "WizardLM-30B-Uncensored-GPTQ-4bit.act-order.safetensors"

##### 8-10GB VRAM Graphics Cards (RTX 3080 - RTX 3080 Ti - RTX 3070 Ti - 3060 Ti - RTX 2000 Series, Quadro RTX 4000, 5000, 6000) #####

### 7b GPTQ Models for 8GB GPUs
# model_id = "TheBloke/Wizard-Vicuna-7B-Uncensored-GPTQ"
# model_basename = "Wizard-Vicuna-7B-Uncensored-GPTQ-4bit-128g.no-act.order.safetensors"
# model_id = "TheBloke/WizardLM-7B-uncensored-GPTQ"
# model_basename = "WizardLM-7B-uncensored-GPTQ-4bit-128g.compat.no-act-order.safetensors"
# model_id = "TheBloke/wizardLM-7B-GPTQ"
# model_basename = "wizardLM-7B-GPTQ-4bit.compat.no-act-order.safetensors"

# For GGML (quantized CPU+GPU+MPS) models - check that llama.cpp supports them
####
#### (FOR GPTQ QUANTIZED) Select an LLM based on your GPU and available VRAM (GB). Does not include the embedding model's VRAM usage.
####

##### 48GB VRAM Graphics Cards (RTX 6000, RTX A6000 and other 48GB VRAM GPUs) #####

### 65b GPTQ LLM Models for 48GB GPUs (*** With best embedding model: hkunlp/instructor-xl ***)
# model_id = "TheBloke/guanaco-65B-GPTQ"
# model_basename = "model.safetensors"
# model_id = "TheBloke/Airoboros-65B-GPT4-2.0-GPTQ"
# model_basename = "model.safetensors"
# model_id = "TheBloke/gpt4-alpaca-lora_mlp-65B-GPTQ"
# model_basename = "model.safetensors"
# model_id = "TheBloke/Upstage-Llama1-65B-Instruct-GPTQ"
# model_basename = "model.safetensors"

##### 24GB VRAM Graphics Cards (RTX 3090 - RTX 4090 (~35% faster than the 3090) - RTX A5000 - RTX A5500) #####

### 13b GPTQ Models for 24GB GPUs (*** With best embedding model: hkunlp/instructor-xl ***)
# model_id = "TheBloke/Wizard-Vicuna-13B-Uncensored-GPTQ"
# model_basename = "Wizard-Vicuna-13B-Uncensored-GPTQ-4bit-128g.compat.no-act-order.safetensors"
# model_id = "TheBloke/vicuna-13B-v1.5-GPTQ"
# model_basename = "model.safetensors"
# model_id = "TheBloke/Nous-Hermes-13B-GPTQ"
# model_basename = "nous-hermes-13b-GPTQ-4bit-128g.no-act.order"
# model_id = "TheBloke/WizardLM-13B-V1.2-GPTQ"
# model_basename = "gptq_model-4bit-128g.safetensors

### 30b GPTQ Models for 24GB GPUs (*** Requires using intfloat/e5-base-v2 instead of hkunlp/instructor-large as embedding model ***)
# model_id = "TheBloke/Wizard-Vicuna-30B-Uncensored-GPTQ"
# model_basename = "Wizard-Vicuna-30B-Uncensored-GPTQ-4bit--1g.act.order.safetensors"
# model_id = "TheBloke/WizardLM-30B-Uncensored-GPTQ"
# model_basename = "WizardLM-30B-Uncensored-GPTQ-4bit.act-order.safetensors"

##### 8-10GB VRAM Graphics Cards (RTX 3080 - RTX 3080 Ti - RTX 3070 Ti - 3060 Ti - RTX 2000 Series, Quadro RTX 4000, 5000, 6000) #####
### (*** Requires using intfloat/e5-small-v2 instead of hkunlp/instructor-large as embedding model ***)

### 7b GPTQ Models for 8GB GPUs
# model_id = "TheBloke/Wizard-Vicuna-7B-Uncensored-GPTQ"
# model_basename = "Wizard-Vicuna-7B-Uncensored-GPTQ-4bit-128g.no-act.order.safetensors"
# model_id = "TheBloke/WizardLM-7B-uncensored-GPTQ"
# model_basename = "WizardLM-7B-uncensored-GPTQ-4bit-128g.compat.no-act-order.safetensors"
# model_id = "TheBloke/wizardLM-7B-GPTQ"
# model_basename = "wizardLM-7B-GPTQ-4bit.compat.no-act-order.safetensors"

####
#### (FOR GGML) Quantized (CPU+GPU+MPS) models - check that llama.cpp supports them
####

# MODEL_ID = "TheBloke/wizard-vicuna-13B-GGML"
# MODEL_BASENAME = "wizard-vicuna-13B.ggmlv3.q4_0.bin"
# MODEL_BASENAME = "wizard-vicuna-13B.ggmlv3.q6_K.bin"
