diff --git a/constants.py b/constants.py
index 007d16b4..629230e2 100644
--- a/constants.py
+++ b/constants.py
@@ -176,3 +176,11 @@
 # MODEL_BASENAME = "wizard-vicuna-13B.ggmlv3.q2_K.bin"
 # MODEL_ID = "TheBloke/orca_mini_3B-GGML"
 # MODEL_BASENAME = "orca-mini-3b.ggmlv3.q4_0.bin"
+
+####
+#### (FOR AWQ QUANTIZED) Select an LLM model based on your GPU and VRAM (GB). Does not include the embedding models' VRAM usage.
+### (*** MODEL_BASENAME is not actually used but has to contain .awq so the correct model loader is selected ***)
+### (*** Compute capability 7.5 (sm75) and CUDA Toolkit 11.8+ are required ***)
+####
+# MODEL_ID = "TheBloke/Llama-2-7B-Chat-AWQ"
+# MODEL_BASENAME = "model.safetensors.awq"
diff --git a/load_models.py b/load_models.py
index b9eb909f..c5347060 100644
--- a/load_models.py
+++ b/load_models.py
@@ -149,3 +149,33 @@ def load_full_model(model_id, model_basename, device_type, logging):
     )
     model.tie_weights()
     return model, tokenizer
+
+def load_quantized_model_awq(model_id, logging):
+    """
+    Load an AWQ-quantized model using AutoModelForCausalLM.
+
+    This function loads a quantized model whose identifier ends with AWQ.
+
+    Parameters:
+    - model_id (str): The identifier for the model on HuggingFace Hub.
+    - logging (logging.Logger): Logger instance for logging messages.
+
+    Returns:
+    - model (AutoModelForCausalLM): The loaded quantized model.
+    - tokenizer (AutoTokenizer): The tokenizer associated with the model.
+
+    """
+
+    # The code supports all Hugging Face models whose identifiers end with AWQ.
+    logging.info("Using AutoModelForCausalLM for AWQ quantized models")
+
+    tokenizer = AutoTokenizer.from_pretrained(model_id, use_fast=True)
+    logging.info("Tokenizer loaded")
+
+    model = AutoModelForCausalLM.from_pretrained(
+        model_id,
+        use_safetensors=True,
+        trust_remote_code=True,
+        device_map="auto",
+    )
+    return model, tokenizer
diff --git a/requirements.txt b/requirements.txt
index 82068f05..998e8c8e 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -7,6 +7,7 @@ sentence-transformers
 faiss-cpu
 huggingface_hub
 transformers
+autoawq
 protobuf==3.20.2; sys_platform != 'darwin'
 protobuf==3.20.2; sys_platform == 'darwin' and platform_machine != 'arm64'
 protobuf==3.20.3; sys_platform == 'darwin' and platform_machine == 'arm64'
diff --git a/run_localGPT.py b/run_localGPT.py
index 05a3d253..c9f0a2fa 100644
--- a/run_localGPT.py
+++ b/run_localGPT.py
@@ -21,6 +21,7 @@
 )
 
 from load_models import (
+    load_quantized_model_awq,
     load_quantized_model_gguf_ggml,
     load_quantized_model_qptq,
     load_full_model,
@@ -64,6 +65,8 @@ def load_model(device_type, model_id, model_basename=None, LOGGING=logging):
             return llm
         elif ".ggml" in model_basename.lower():
             model, tokenizer = load_quantized_model_gguf_ggml(model_id, model_basename, device_type, LOGGING)
+        elif ".awq" in model_basename.lower():
+            model, tokenizer = load_quantized_model_awq(model_id, LOGGING)
         else:
             model, tokenizer = load_quantized_model_qptq(model_id, model_basename, device_type, LOGGING)
     else:
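
Usage sketch (not part of the patch): a minimal way to exercise the new AWQ path directly, assuming this diff is applied, the autoawq package is installed, and a CUDA GPU with compute capability 7.5+ is available. The MODEL_ID value is taken from the commented example in constants.py above; in normal use, run_localGPT.py dispatches to this loader automatically whenever MODEL_BASENAME contains ".awq".

import logging

from load_models import load_quantized_model_awq

logging.basicConfig(level=logging.INFO)

# Example AWQ model id from the commented-out lines added to constants.py.
MODEL_ID = "TheBloke/Llama-2-7B-Chat-AWQ"

# MODEL_BASENAME itself is not passed here; it only needs to contain ".awq"
# so that load_model() in run_localGPT.py selects this loader.
model, tokenizer = load_quantized_model_awq(MODEL_ID, logging)
logging.info("Loaded %s with %s", type(model).__name__, type(tokenizer).__name__)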