From e7311a2c3462c1c33856c18e91a64381bd28cdc4 Mon Sep 17 00:00:00 2001
From: PromptEngineer
Date: Fri, 3 May 2024 15:33:16 -0700
Subject: [PATCH] added support for llama3

---
 README.md                |  4 ++--
 constants.py             | 17 ++++++++++++++---
 load_models.py           | 16 +++++++++++++---
 prompt_template_utils.py | 24 ++++++++++++++++++++++++
 run_localGPT.py          |  6 +++---
 5 files changed, 56 insertions(+), 11 deletions(-)

diff --git a/README.md b/README.md
index 1a13b753..e006ebf3 100644
--- a/README.md
+++ b/README.md
@@ -71,14 +71,14 @@
 For `NVIDIA` GPUs support, use `cuBLAS`
 
 ```shell
 # Example: cuBLAS
-CMAKE_ARGS="-DLLAMA_CUBLAS=on" FORCE_CMAKE=1 pip install llama-cpp-python==0.1.83 --no-cache-dir
+CMAKE_ARGS="-DLLAMA_CUBLAS=on" FORCE_CMAKE=1 pip install llama-cpp-python --no-cache-dir
 ```
 
 For Apple Metal (`M1/M2`) support, use
 
 ```shell
 # Example: METAL
-CMAKE_ARGS="-DLLAMA_METAL=on" FORCE_CMAKE=1 pip install llama-cpp-python==0.1.83 --no-cache-dir
+CMAKE_ARGS="-DLLAMA_METAL=on" FORCE_CMAKE=1 pip install llama-cpp-python --no-cache-dir
 ```
 For more details, please refer to [llama-cpp](https://github.com/abetlen/llama-cpp-python#installation-with-openblas--cublas--clblast--metal)
diff --git a/constants.py b/constants.py
index 380c650a..876e4d71 100644
--- a/constants.py
+++ b/constants.py
@@ -29,7 +29,7 @@
 )
 
 # Context Window and Max New Tokens
-CONTEXT_WINDOW_SIZE = 4096
+CONTEXT_WINDOW_SIZE = 8096
 MAX_NEW_TOKENS = CONTEXT_WINDOW_SIZE  # int(CONTEXT_WINDOW_SIZE/4)
 
 #### If you get a "not enough space in the buffer" error, you should reduce the values below, start with half of the original values and keep halving the value until the error stops appearing
@@ -100,8 +100,19 @@
 # MODEL_ID = "TheBloke/Llama-2-13b-Chat-GGUF"
 # MODEL_BASENAME = "llama-2-13b-chat.Q4_K_M.gguf"
 
-MODEL_ID = "TheBloke/Llama-2-7b-Chat-GGUF"
-MODEL_BASENAME = "llama-2-7b-chat.Q4_K_M.gguf"
+# MODEL_ID = "TheBloke/Llama-2-7b-Chat-GGUF"
+# MODEL_BASENAME = "llama-2-7b-chat.Q4_K_M.gguf"
+
+# MODEL_ID = "QuantFactory/Meta-Llama-3-8B-Instruct-GGUF"
+# MODEL_BASENAME = "Meta-Llama-3-8B-Instruct.Q4_K_M.gguf"
+
+# LLAMA 3 # use for Apple Silicon
+# MODEL_ID = "meta-llama/Meta-Llama-3-8B-Instruct"
+# MODEL_BASENAME = None
+
+# LLAMA 3 # use for NVIDIA GPUs
+# MODEL_ID = "unsloth/llama-3-8b-bnb-4bit"
+# MODEL_BASENAME = None
 
 # MODEL_ID = "TheBloke/Mistral-7B-Instruct-v0.1-GGUF"
 # MODEL_BASENAME = "mistral-7b-instruct-v0.1.Q8_0.gguf"
diff --git a/load_models.py b/load_models.py
index 65f10e6e..ad278166 100644
--- a/load_models.py
+++ b/load_models.py
@@ -136,9 +136,19 @@ def load_full_model(model_id, model_basename, device_type, logging):
     """
 
     if device_type.lower() in ["mps", "cpu"]:
-        logging.info("Using LlamaTokenizer")
-        tokenizer = LlamaTokenizer.from_pretrained(model_id, cache_dir="./models/")
-        model = LlamaForCausalLM.from_pretrained(model_id, cache_dir="./models/")
+        logging.info("Using AutoModelForCausalLM")
+        # tokenizer = LlamaTokenizer.from_pretrained(model_id, cache_dir="./models/")
+        # model = LlamaForCausalLM.from_pretrained(model_id, cache_dir="./models/")
+
+        model = AutoModelForCausalLM.from_pretrained(model_id,
+                                                     # quantization_config=quantization_config,
+                                                     # low_cpu_mem_usage=True,
+                                                     # torch_dtype="auto",
+                                                     torch_dtype=torch.bfloat16,
+                                                     device_map="auto",
+                                                     cache_dir="./models/")
+
+        tokenizer = AutoTokenizer.from_pretrained(model_id, cache_dir="./models/")
     else:
         logging.info("Using AutoModelForCausalLM for full models")
         tokenizer = AutoTokenizer.from_pretrained(model_id, cache_dir="./models/")
diff --git a/prompt_template_utils.py b/prompt_template_utils.py
index beb56f8a..aa55c1b2 100644
--- a/prompt_template_utils.py
+++ b/prompt_template_utils.py
@@ -33,6 +33,28 @@ def get_prompt_template(system_prompt=system_prompt, promptTemplate_type=None, h
 
             prompt_template = B_INST + SYSTEM_PROMPT + instruction + E_INST
             prompt = PromptTemplate(input_variables=["context", "question"], template=prompt_template)
+
+    elif promptTemplate_type == "llama3":
+
+        B_INST, E_INST = "<|start_header_id|>user<|end_header_id|>", "<|eot_id|>"
+        B_SYS, E_SYS = "<|begin_of_text|><|start_header_id|>system<|end_header_id|> ", "<|eot_id|>"
+        ASSISTANT_INST = "<|start_header_id|>assistant<|end_header_id|>"
+        SYSTEM_PROMPT = B_SYS + system_prompt + E_SYS
+        if history:
+            instruction = """
+            Context: {history} \n {context}
+            User: {question}"""
+
+            prompt_template = SYSTEM_PROMPT + B_INST + instruction + ASSISTANT_INST
+            prompt = PromptTemplate(input_variables=["history", "context", "question"], template=prompt_template)
+        else:
+            instruction = """
+            Context: {context}
+            User: {question}"""
+
+            prompt_template = SYSTEM_PROMPT + B_INST + instruction + ASSISTANT_INST
+            prompt = PromptTemplate(input_variables=["context", "question"], template=prompt_template)
+
     elif promptTemplate_type == "mistral":
         B_INST, E_INST = "[INST] ", " [/INST]"
         if history:
@@ -82,6 +104,8 @@ def get_prompt_template(system_prompt=system_prompt, promptTemplate_type=None, h
 
     memory = ConversationBufferMemory(input_key="question", memory_key="history")
 
+    print(f"Here is the prompt used: {prompt}")
+
     return (
         prompt,
         memory,
diff --git a/run_localGPT.py b/run_localGPT.py
index 4ed53983..185c983c 100644
--- a/run_localGPT.py
+++ b/run_localGPT.py
@@ -209,11 +209,11 @@ def retrieval_qa_pipline(device_type, use_history, promptTemplate_type="llama"):
 )
 @click.option(
     "--model_type",
-    default="llama",
+    default="llama3",
     type=click.Choice(
-        ["llama", "mistral", "non_llama"],
+        ["llama3", "llama", "mistral", "non_llama"],
     ),
-    help="model type, llama, mistral or non_llama",
+    help="model type, llama3, llama, mistral or non_llama",
 )
 @click.option(
     "--save_qa",
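Note (not part of the patch): a minimal sketch of how the new "llama3" prompt template could be exercised after this change, assuming get_prompt_template keeps the signature shown above and still returns a (prompt, memory) tuple; the context and question values are placeholders.

    # Sketch only -- exercises the new "llama3" branch of get_prompt_template.
    from prompt_template_utils import get_prompt_template

    # history=False selects the branch whose template expects only {context} and {question}.
    prompt, memory = get_prompt_template(promptTemplate_type="llama3", history=False)

    # The rendered string wraps the system prompt, context, and question in the
    # Llama 3 chat tokens (<|begin_of_text|>, <|start_header_id|>...<|end_header_id|>, <|eot_id|>).
    print(prompt.format(context="<retrieved document chunks>", question="What does localGPT do?"))

Since the patch also makes "llama3" the default --model_type in run_localGPT.py, this is the prompt format queries should use unless another type is passed on the command line.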