Merge pull request #646 from ptanov/645-support-awq-models
#645 Add support for AWQ models
PromtEngineer committed Nov 22, 2023
2 parents 54d38cf + 17e8cfb commit d30aaef
Showing 4 changed files with 42 additions and 0 deletions.
8 changes: 8 additions & 0 deletions constants.py
@@ -176,3 +176,11 @@
# MODEL_BASENAME = "wizard-vicuna-13B.ggmlv3.q2_K.bin"
# MODEL_ID = "TheBloke/orca_mini_3B-GGML"
# MODEL_BASENAME = "orca-mini-3b.ggmlv3.q4_0.bin"

####
#### (FOR AWQ QUANTIZED) Select an LLM model based on your GPU and available VRAM (GB). Does not include the embedding model's VRAM usage.
### (*** MODEL_BASENAME is not actually used but must contain .awq so that the correct model-loading path is selected ***)
### (*** Compute capability 7.5 (sm75) and CUDA Toolkit 11.8+ are required ***)
####
# MODEL_ID = "TheBloke/Llama-2-7B-Chat-AWQ"
# MODEL_BASENAME = "model.safetensors.awq"
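
An illustrative sketch, assuming the two AWQ lines above are uncommented: the MODEL_BASENAME value is never opened as a file; only the ".awq" substring matters, because run_localGPT.py checks it to route loading through the new AWQ path, roughly like this.

import logging

from constants import MODEL_ID, MODEL_BASENAME
from load_models import load_quantized_model_awq

# With the AWQ pair enabled, MODEL_BASENAME contains ".awq", so this branch is taken
# and only MODEL_ID is actually passed to the loader.
if ".awq" in MODEL_BASENAME.lower():
    model, tokenizer = load_quantized_model_awq(MODEL_ID, logging)
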
30 changes: 30 additions & 0 deletions load_models.py
@@ -149,3 +149,33 @@ def load_full_model(model_id, model_basename, device_type, logging):
        )
        model.tie_weights()
    return model, tokenizer

def load_quantized_model_awq(model_id, logging):
    """
    Load an AWQ-quantized model using AutoModelForCausalLM.

    This function loads a quantized model whose identifier ends with "AWQ".

    Parameters:
    - model_id (str): The identifier for the model on the Hugging Face Hub.
    - logging (logging.Logger): Logger instance for logging messages.

    Returns:
    - model (AutoModelForCausalLM): The loaded quantized model.
    - tokenizer (AutoTokenizer): The tokenizer associated with the model.
    """

    # The code supports all Hugging Face models whose IDs end with "AWQ".
    logging.info("Using AutoModelForCausalLM for AWQ quantized models")

    tokenizer = AutoTokenizer.from_pretrained(model_id, use_fast=True)
    logging.info("Tokenizer loaded")

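    # AWQ checkpoints on the Hub ship as safetensors; device_map="auto" lets
    # accelerate place the model's layers on the available GPU(s).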
    model = AutoModelForCausalLM.from_pretrained(
        model_id,
        use_safetensors=True,
        trust_remote_code=True,
        device_map="auto",
    )
    return model, tokenizer
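
A minimal usage sketch of the new loader, for illustration only: it assumes autoawq, transformers, and accelerate are installed, and reuses the example model ID from constants.py above.

import logging

from load_models import load_quantized_model_awq

logging.basicConfig(level=logging.INFO)
model, tokenizer = load_quantized_model_awq("TheBloke/Llama-2-7B-Chat-AWQ", logging)

# Quick smoke test: generate a short completion from the quantized model.
inputs = tokenizer("What is AWQ quantization?", return_tensors="pt").to(model.device)
outputs = model.generate(**inputs, max_new_tokens=32)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))
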
1 change: 1 addition & 0 deletions requirements.txt
@@ -7,6 +7,7 @@ sentence-transformers
faiss-cpu
huggingface_hub
transformers
autoawq
protobuf==3.20.2; sys_platform != 'darwin'
protobuf==3.20.2; sys_platform == 'darwin' and platform_machine != 'arm64'
protobuf==3.20.3; sys_platform == 'darwin' and platform_machine == 'arm64'
3 changes: 3 additions & 0 deletions run_localGPT.py
@@ -21,6 +21,7 @@
)

from load_models import (
    load_quantized_model_awq,
    load_quantized_model_gguf_ggml,
    load_quantized_model_qptq,
    load_full_model,
@@ -64,6 +65,8 @@ def load_model(device_type, model_id, model_basename=None, LOGGING=logging):
            return llm
        elif ".ggml" in model_basename.lower():
            model, tokenizer = load_quantized_model_gguf_ggml(model_id, model_basename, device_type, LOGGING)
        elif ".awq" in model_basename.lower():
            model, tokenizer = load_quantized_model_awq(model_id, LOGGING)
        else:
            model, tokenizer = load_quantized_model_qptq(model_id, model_basename, device_type, LOGGING)
    else:
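
With the AWQ pair from constants.py enabled, running run_localGPT.py as usual (for example "python run_localGPT.py --device_type cuda", assuming the script's existing --device_type option) takes the new ".awq" branch above and loads the model through load_quantized_model_awq.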
