
Commit

Updated memory requirements in constants.py
Added information about the VRAM requirements for running both the LLM and the embedding models.
PromtEngineer committed Aug 29, 2023
1 parent 379b214 commit 9743221
Showing 1 changed file with 81 additions and 56 deletions.
137 changes: 81 additions & 56 deletions constants.py
@@ -36,26 +36,45 @@
}

# Default Instructor Model
EMBEDDING_MODEL_NAME = "hkunlp/instructor-large"
# You can also choose a smaller model; if you do, change HuggingFaceInstructEmbeddings
# to HuggingFaceEmbeddings in both ingest.py and run_localGPT.py (see the loading sketch below)
# EMBEDDING_MODEL_NAME = "all-MiniLM-L6-v2"
EMBEDDING_MODEL_NAME = "hkunlp/instructor-large" # Uses 1.5 GB of VRAM (High Accuracy with lower VRAM usage)

####
#### OTHER EMBEDDING MODEL OPTIONS
####

# EMBEDDING_MODEL_NAME = "hkunlp/instructor-xl" # Uses 5 GB of VRAM (Most Accurate of all models)
# EMBEDDING_MODEL_NAME = "intfloat/e5-large-v2" # Uses 1.5 GB of VRAM (A little less accurate than instructor-large)
# EMBEDDING_MODEL_NAME = "intfloat/e5-base-v2" # Uses 0.5 GB of VRAM (A good model for lower VRAM GPUs)
# EMBEDDING_MODEL_NAME = "all-MiniLM-L6-v2" # Uses 0.2 GB of VRAM (Less accurate but fastest - only requires 150mb of vram)

####
#### MULTILINGUAL EMBEDDING MODELS
####

# EMBEDDING_MODEL_NAME = "intfloat/multilingual-e5-large" # Uses 2.5 GB of VRAM
# EMBEDDING_MODEL_NAME = "intfloat/multilingual-e5-base" # Uses 1.2 GB of VRAM

# Select the Model ID and model_basename
# to load the LLM used for generating natural language responses

#### GPU VRAM required per LLM, by parameter count in billions (B Model)
#### SELECT AN OPEN SOURCE LLM (LARGE LANGUAGE MODEL)
# Select the Model ID and model_basename
# to load the LLM used for generating natural language responses

#### GPU VRAM required for the LLM only, by parameter count in billions (B Model)
#### Does not include VRAM used by the embedding model, which needs an additional 2 GB - 7 GB of VRAM depending on the model.
####
#### (B Model)   (float32)    (float16)    (GPTQ 8bit)    (GPTQ 4bit)
####    7b        28 GB        14 GB         7 GB           3.5 GB
####   13b        52 GB        26 GB        13 GB           6.5 GB
####   32b       130 GB        65 GB        32.5 GB        16.25 GB
####   65b       260.8 GB     130.4 GB      65.2 GB        32.6 GB
#### (B Model)   (float32)    (float16)    (GPTQ 8bit)         (GPTQ 4bit)
####    7b        28 GB        14 GB         7 GB - 9 GB         3.5 GB - 5 GB
####   13b        52 GB        26 GB        13 GB - 15 GB        6.5 GB - 8 GB
####   32b       130 GB        65 GB        32.5 GB - 35 GB     16.25 GB - 19 GB
####   65b       260.8 GB     130.4 GB      65.2 GB - 67 GB      32.6 GB - 35 GB
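
The table above boils down to a bytes-per-parameter rule of thumb: roughly 4 bytes for float32, 2 for float16, about 1 for 8-bit GPTQ and 0.5 for 4-bit GPTQ, plus some headroom for activations and the embedding model. A rough sketch of that arithmetic; the headroom and embedding-model figures below are assumptions for illustration, not measurements.

# Rough VRAM estimate: parameters x bytes-per-parameter, plus assumed
# headroom for activations/KV cache and the embedding model.
BYTES_PER_PARAM = {"float32": 4.0, "float16": 2.0, "gptq_8bit": 1.0, "gptq_4bit": 0.5}

def estimate_vram_gb(params_billions: float, precision: str,
                     runtime_headroom_gb: float = 2.0,          # assumption
                     embedding_model_gb: float = 1.5) -> float:  # e.g. instructor-large
    weights_gb = params_billions * BYTES_PER_PARAM[precision]
    return weights_gb + runtime_headroom_gb + embedding_model_gb

# Example: a 13B model in 4-bit GPTQ -> 6.5 GB of weights, ~10 GB total budget.
print(round(estimate_vram_gb(13, "gptq_4bit"), 1))  # 10.0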

MODEL_ID = "TheBloke/Llama-2-7B-Chat-GGML"
MODEL_BASENAME = "llama-2-7b-chat.ggmlv3.q4_0.bin"

# for HF models
####
#### (FOR HF MODELS)
####

# MODEL_ID = "TheBloke/vicuna-7B-1.1-HF"
# MODEL_BASENAME = None
# MODEL_ID = "TheBloke/Wizard-Vicuna-7B-Uncensored-HF"
@@ -64,49 +83,55 @@
# alongside will 100% create OOM on 24GB cards.
# llm = load_model(device_type, model_id=model_id)
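
For the unquantized HF checkpoints listed here, loading usually goes through transformers. A minimal sketch, assuming float16 weights and accelerate's device_map="auto" to keep the model on a single GPU; the repository's load_model() wraps additional logic (tokenizer settings, generation pipeline, etc.).

# Hypothetical sketch: load a full-precision HF checkpoint in float16.
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

hf_model_id = "TheBloke/vicuna-7B-1.1-HF"   # one of the options above
tokenizer = AutoTokenizer.from_pretrained(hf_model_id)
model = AutoModelForCausalLM.from_pretrained(hf_model_id,
                                             torch_dtype=torch.float16,
                                             device_map="auto")  # requires accelerate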

# For GPTQ (quantized), select an LLM based on your GPU and available VRAM (GB)

##### 48GB VRAM Graphics Cards (RTX 6000, RTX A6000 and other 48GB VRAM GPUs) #####

### 65b GPTQ Models for 48GB GPUs
# model_id = "TheBloke/guanaco-65B-GPTQ"
# model_basename = "model.safetensors"
# model_id = "TheBloke/Airoboros-65B-GPT4-2.0-GPTQ"
# model_basename = "model.safetensors"
# model_id = "TheBloke/gpt4-alpaca-lora_mlp-65B-GPTQ"
# model_basename = "model.safetensors"
# model_id = "TheBloke/Upstage-Llama1-65B-Instruct-GPTQ"
# model_basename = "model.safetensors"

##### 24GB VRAM Graphics Cards (RTX 3090 - RTX 4090 (~35% faster than the 3090) - RTX A5000 - RTX A5500) #####

### 13b GPTQ Models for 24GB GPUs (*** With best embedding model: hkunlp/instructor-xl ***)
# model_id = "TheBloke/Wizard-Vicuna-13B-Uncensored-GPTQ"
# model_basename = "Wizard-Vicuna-13B-Uncensored-GPTQ-4bit-128g.compat.no-act-order.safetensors"
# model_id = "TheBloke/vicuna-13B-v1.5-GPTQ"
# model_basename = "model.safetensors"
# model_id = "TheBloke/Nous-Hermes-13B-GPTQ"
# model_basename = "nous-hermes-13b-GPTQ-4bit-128g.no-act.order"
# model_id = "TheBloke/WizardLM-13B-V1.2-GPTQ"
# model_basename = "gptq_model-4bit-128g.safetensors

### 30b GPTQ Models for 24GB GPUs (*** Requires using intfloat/e5-base-v2 instead of hkunlp/instructor-large as embedding model ***)
# model_id = "TheBloke/Wizard-Vicuna-30B-Uncensored-GPTQ"
# model_basename = "Wizard-Vicuna-30B-Uncensored-GPTQ-4bit--1g.act.order.safetensors"
# model_id = "TheBloke/WizardLM-30B-Uncensored-GPTQ"
# model_basename = "WizardLM-30B-Uncensored-GPTQ-4bit.act-order.safetensors"

##### 8-10GB VRAM Graphics Cards (RTX 3080 - RTX 3080 Ti - RTX 3070 Ti - 3060 Ti - RTX 2000 Series, Quadro RTX 4000, 5000, 6000) #####

### 7b GPTQ Models for 8GB GPUs
# model_id = "TheBloke/Wizard-Vicuna-7B-Uncensored-GPTQ"
# model_basename = "Wizard-Vicuna-7B-Uncensored-GPTQ-4bit-128g.no-act.order.safetensors"
# model_id = "TheBloke/WizardLM-7B-uncensored-GPTQ"
# model_basename = "WizardLM-7B-uncensored-GPTQ-4bit-128g.compat.no-act-order.safetensors"
# model_id = "TheBloke/wizardLM-7B-GPTQ"
# model_basename = "wizardLM-7B-GPTQ-4bit.compat.no-act-order.safetensors"

# For GGML (quantized CPU+GPU+MPS) models - check that llama.cpp supports them
####
#### (FOR GPTQ QUANTIZED) Select an LLM based on your GPU and available VRAM (GB). Does not include the embedding model's VRAM usage.
####

##### 48GB VRAM Graphics Cards (RTX 6000, RTX A6000 and other 48GB VRAM GPUs) #####

### 65b GPTQ LLM Models for 48GB GPUs (*** With best embedding model: hkunlp/instructor-xl ***)
# model_id = "TheBloke/guanaco-65B-GPTQ"
# model_basename = "model.safetensors"
# model_id = "TheBloke/Airoboros-65B-GPT4-2.0-GPTQ"
# model_basename = "model.safetensors"
# model_id = "TheBloke/gpt4-alpaca-lora_mlp-65B-GPTQ"
# model_basename = "model.safetensors"
# model_id = "TheBloke/Upstage-Llama1-65B-Instruct-GPTQ"
# model_basename = "model.safetensors"

##### 24GB VRAM Graphics Cards (RTX 3090 - RTX 4090 (~35% faster than the 3090) - RTX A5000 - RTX A5500) #####

### 13b GPTQ Models for 24GB GPUs (*** With best embedding model: hkunlp/instructor-xl ***)
# model_id = "TheBloke/Wizard-Vicuna-13B-Uncensored-GPTQ"
# model_basename = "Wizard-Vicuna-13B-Uncensored-GPTQ-4bit-128g.compat.no-act-order.safetensors"
# model_id = "TheBloke/vicuna-13B-v1.5-GPTQ"
# model_basename = "model.safetensors"
# model_id = "TheBloke/Nous-Hermes-13B-GPTQ"
# model_basename = "nous-hermes-13b-GPTQ-4bit-128g.no-act.order"
# model_id = "TheBloke/WizardLM-13B-V1.2-GPTQ"
# model_basename = "gptq_model-4bit-128g.safetensors

### 30b GPTQ Models for 24GB GPUs (*** Requires using intfloat/e5-base-v2 instead of hkunlp/instructor-large as embedding model ***)
# model_id = "TheBloke/Wizard-Vicuna-30B-Uncensored-GPTQ"
# model_basename = "Wizard-Vicuna-30B-Uncensored-GPTQ-4bit--1g.act.order.safetensors"
# model_id = "TheBloke/WizardLM-30B-Uncensored-GPTQ"
# model_basename = "WizardLM-30B-Uncensored-GPTQ-4bit.act-order.safetensors"

##### 8-10GB VRAM Graphics Cards (RTX 3080 - RTX 3080 Ti - RTX 3070 Ti - 3060 Ti - RTX 2000 Series, Quadro RTX 4000, 5000, 6000) #####
### (*** Requires using intfloat/e5-small-v2 instead of hkunlp/instructor-large as embedding model ***)

### 7b GPTQ Models for 8GB GPUs
# model_id = "TheBloke/Wizard-Vicuna-7B-Uncensored-GPTQ"
# model_basename = "Wizard-Vicuna-7B-Uncensored-GPTQ-4bit-128g.no-act.order.safetensors"
# model_id = "TheBloke/WizardLM-7B-uncensored-GPTQ"
# model_basename = "WizardLM-7B-uncensored-GPTQ-4bit-128g.compat.no-act-order.safetensors"
# model_id = "TheBloke/wizardLM-7B-GPTQ"
# model_basename = "wizardLM-7B-GPTQ-4bit.compat.no-act-order.safetensors"

####
#### (FOR GGML) Quantized (CPU+GPU+MPS) models - check that llama.cpp supports them
####

# MODEL_ID = "TheBloke/wizard-vicuna-13B-GGML"
# MODEL_BASENAME = "wizard-vicuna-13B.ggmlv3.q4_0.bin"
# MODEL_BASENAME = "wizard-vicuna-13B.ggmlv3.q6_K.bin"
