Merge pull request #646 from ptanov/645-support-awq-models
#645 Add support for AWQ models
PromtEngineer committed Nov 22, 2023
2 parents 54d38cf + 17e8cfb commit d30aaef
Showing 4 changed files with 42 additions and 0 deletions.
8 changes: 8 additions & 0 deletions constants.py
@@ -176,3 +176,11 @@
# MODEL_BASENAME = "wizard-vicuna-13B.ggmlv3.q2_K.bin"
# MODEL_ID = "TheBloke/orca_mini_3B-GGML"
# MODEL_BASENAME = "orca-mini-3b.ggmlv3.q4_0.bin"

####
#### (FOR AWQ QUANTIZED) Select an LLM model based on your GPU and available VRAM (GB). Does not include the embedding model's VRAM usage.
### (*** MODEL_BASENAME is not actually used but must contain .awq so that the correct model-loading path is selected ***)
### (*** Compute capability 7.5 (sm75) and CUDA Toolkit 11.8+ are required ***)
####
# MODEL_ID = "TheBloke/Llama-2-7B-Chat-AWQ"
# MODEL_BASENAME = "model.safetensors.awq"
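
An illustrative sketch, assuming the two AWQ lines above are uncommented: the MODEL_BASENAME value is never opened as a file; only the ".awq" substring matters, because run_localGPT.py checks it to route loading through the new AWQ path, roughly like this.

import logging

from constants import MODEL_ID, MODEL_BASENAME
from load_models import load_quantized_model_awq

# With the AWQ pair enabled, MODEL_BASENAME contains ".awq", so this branch is taken
# and only MODEL_ID is actually passed to the loader.
if ".awq" in MODEL_BASENAME.lower():
    model, tokenizer = load_quantized_model_awq(MODEL_ID, logging)
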
30 changes: 30 additions & 0 deletions load_models.py
@@ -149,3 +149,33 @@ def load_full_model(model_id, model_basename, device_type, logging):
        )
        model.tie_weights()
    return model, tokenizer

def load_quantized_model_awq(model_id, logging):
    """
    Load an AWQ-quantized model using AutoModelForCausalLM.

    This function loads a quantized model whose identifier ends with "AWQ".

    Parameters:
    - model_id (str): The identifier for the model on the Hugging Face Hub.
    - logging (logging.Logger): Logger instance for logging messages.

    Returns:
    - model (AutoModelForCausalLM): The loaded quantized model.
    - tokenizer (AutoTokenizer): The tokenizer associated with the model.
    """

    # The code supports all Hugging Face models whose IDs end with "AWQ".
    logging.info("Using AutoModelForCausalLM for AWQ quantized models")

    tokenizer = AutoTokenizer.from_pretrained(model_id, use_fast=True)
    logging.info("Tokenizer loaded")

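    # AWQ checkpoints on the Hub ship as safetensors; device_map="auto" lets
    # accelerate place the model's layers on the available GPU(s).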
    model = AutoModelForCausalLM.from_pretrained(
        model_id,
        use_safetensors=True,
        trust_remote_code=True,
        device_map="auto",
    )
    return model, tokenizer
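
A minimal usage sketch of the new loader, for illustration only: it assumes autoawq, transformers, and accelerate are installed, and reuses the example model ID from constants.py above.

import logging

from load_models import load_quantized_model_awq

logging.basicConfig(level=logging.INFO)
model, tokenizer = load_quantized_model_awq("TheBloke/Llama-2-7B-Chat-AWQ", logging)

# Quick smoke test: generate a short completion from the quantized model.
inputs = tokenizer("What is AWQ quantization?", return_tensors="pt").to(model.device)
outputs = model.generate(**inputs, max_new_tokens=32)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))
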
1 change: 1 addition & 0 deletions requirements.txt
@@ -7,6 +7,7 @@ sentence-transformers
faiss-cpu
huggingface_hub
transformers
autoawq
protobuf==3.20.2; sys_platform != 'darwin'
protobuf==3.20.2; sys_platform == 'darwin' and platform_machine != 'arm64'
protobuf==3.20.3; sys_platform == 'darwin' and platform_machine == 'arm64'
3 changes: 3 additions & 0 deletions run_localGPT.py
@@ -21,6 +21,7 @@
)

from load_models import (
    load_quantized_model_awq,
    load_quantized_model_gguf_ggml,
    load_quantized_model_qptq,
    load_full_model,
@@ -64,6 +65,8 @@ def load_model(device_type, model_id, model_basename=None, LOGGING=logging):
            return llm
        elif ".ggml" in model_basename.lower():
            model, tokenizer = load_quantized_model_gguf_ggml(model_id, model_basename, device_type, LOGGING)
        elif ".awq" in model_basename.lower():
            model, tokenizer = load_quantized_model_awq(model_id, LOGGING)
        else:
            model, tokenizer = load_quantized_model_qptq(model_id, model_basename, device_type, LOGGING)
    else:
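
With the AWQ pair from constants.py enabled, running run_localGPT.py as usual (for example "python run_localGPT.py --device_type cuda", assuming the script's existing --device_type option) takes the new ".awq" branch above and loads the model through load_quantized_model_awq.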
