diff --git a/constants.py b/constants.py
index 007d16b4..629230e2 100644
--- a/constants.py
+++ b/constants.py
@@ -176,3 +176,11 @@
 # MODEL_BASENAME = "wizard-vicuna-13B.ggmlv3.q2_K.bin"
 # MODEL_ID = "TheBloke/orca_mini_3B-GGML"
 # MODEL_BASENAME = "orca-mini-3b.ggmlv3.q4_0.bin"
+
+####
+#### (FOR AWQ QUANTIZED) Select an LLM model based on your GPU and VRAM (GB). Does not include the embedding models' VRAM usage.
+### (*** MODEL_BASENAME is not actually used but has to contain .awq so the correct model loader is selected ***)
+### (*** Compute capability 7.5 (sm75) and CUDA Toolkit 11.8+ are required ***)
+####
+# MODEL_ID = "TheBloke/Llama-2-7B-Chat-AWQ"
+# MODEL_BASENAME = "model.safetensors.awq"
diff --git a/load_models.py b/load_models.py
index b9eb909f..c5347060 100644
--- a/load_models.py
+++ b/load_models.py
@@ -149,3 +149,33 @@ def load_full_model(model_id, model_basename, device_type, logging):
     )
     model.tie_weights()
     return model, tokenizer
+
+def load_quantized_model_awq(model_id, logging):
+    """
+    Load an AWQ-quantized model using AutoModelForCausalLM.
+
+    This function loads a quantized model whose identifier ends with AWQ.
+
+    Parameters:
+    - model_id (str): The identifier for the model on HuggingFace Hub.
+    - logging (logging.Logger): Logger instance for logging messages.
+
+    Returns:
+    - model (AutoModelForCausalLM): The loaded quantized model.
+    - tokenizer (AutoTokenizer): The tokenizer associated with the model.
+
+    """
+
+    # The code supports all Hugging Face models whose identifiers end with AWQ.
+    logging.info("Using AutoModelForCausalLM for AWQ quantized models")
+
+    tokenizer = AutoTokenizer.from_pretrained(model_id, use_fast=True)
+    logging.info("Tokenizer loaded")
+
+    model = AutoModelForCausalLM.from_pretrained(
+        model_id,
+        use_safetensors=True,
+        trust_remote_code=True,
+        device_map="auto",
+    )
+    return model, tokenizer
diff --git a/requirements.txt b/requirements.txt
index 82068f05..998e8c8e 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -7,6 +7,7 @@ sentence-transformers
 faiss-cpu
 huggingface_hub
 transformers
+autoawq
 protobuf==3.20.2; sys_platform != 'darwin'
 protobuf==3.20.2; sys_platform == 'darwin' and platform_machine != 'arm64'
 protobuf==3.20.3; sys_platform == 'darwin' and platform_machine == 'arm64'
diff --git a/run_localGPT.py b/run_localGPT.py
index 05a3d253..c9f0a2fa 100644
--- a/run_localGPT.py
+++ b/run_localGPT.py
@@ -21,6 +21,7 @@
 )
 
 from load_models import (
+    load_quantized_model_awq,
     load_quantized_model_gguf_ggml,
     load_quantized_model_qptq,
     load_full_model,
@@ -64,6 +65,8 @@ def load_model(device_type, model_id, model_basename=None, LOGGING=logging):
             return llm
         elif ".ggml" in model_basename.lower():
             model, tokenizer = load_quantized_model_gguf_ggml(model_id, model_basename, device_type, LOGGING)
+        elif ".awq" in model_basename.lower():
+            model, tokenizer = load_quantized_model_awq(model_id, LOGGING)
         else:
             model, tokenizer = load_quantized_model_qptq(model_id, model_basename, device_type, LOGGING)
     else:
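
Usage sketch (not part of the patch): a minimal way to exercise the new AWQ path directly, assuming this diff is applied, the autoawq package is installed, and a CUDA GPU with compute capability 7.5+ is available. The MODEL_ID value is taken from the commented example in constants.py above; in normal use, run_localGPT.py dispatches to this loader automatically whenever MODEL_BASENAME contains ".awq".

import logging

from load_models import load_quantized_model_awq

logging.basicConfig(level=logging.INFO)

# Example AWQ model id from the commented-out lines added to constants.py.
MODEL_ID = "TheBloke/Llama-2-7B-Chat-AWQ"

# MODEL_BASENAME itself is not passed here; it only needs to contain ".awq"
# so that load_model() in run_localGPT.py selects this loader.
model, tokenizer = load_quantized_model_awq(MODEL_ID, logging)
logging.info("Loaded %s with %s", type(model).__name__, type(tokenizer).__name__)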