From e7311a2c3462c1c33856c18e91a64381bd28cdc4 Mon Sep 17 00:00:00 2001
From: PromptEngineer
Date: Fri, 3 May 2024 15:33:16 -0700
Subject: [PATCH] added support for llama3

---
 README.md                |  4 ++--
 constants.py             | 17 ++++++++++++++---
 load_models.py           | 16 +++++++++++++---
 prompt_template_utils.py | 24 ++++++++++++++++++++++++
 run_localGPT.py          |  6 +++---
 5 files changed, 56 insertions(+), 11 deletions(-)

diff --git a/README.md b/README.md
index 1a13b753..e006ebf3 100644
--- a/README.md
+++ b/README.md
@@ -71,14 +71,14 @@
 For `NVIDIA` GPUs support, use `cuBLAS`
 
 ```shell
 # Example: cuBLAS
-CMAKE_ARGS="-DLLAMA_CUBLAS=on" FORCE_CMAKE=1 pip install llama-cpp-python==0.1.83 --no-cache-dir
+CMAKE_ARGS="-DLLAMA_CUBLAS=on" FORCE_CMAKE=1 pip install llama-cpp-python --no-cache-dir
 ```
 
 For Apple Metal (`M1/M2`) support, use
 
 ```shell
 # Example: METAL
-CMAKE_ARGS="-DLLAMA_METAL=on" FORCE_CMAKE=1 pip install llama-cpp-python==0.1.83 --no-cache-dir
+CMAKE_ARGS="-DLLAMA_METAL=on" FORCE_CMAKE=1 pip install llama-cpp-python --no-cache-dir
 ```
 For more details, please refer to [llama-cpp](https://github.com/abetlen/llama-cpp-python#installation-with-openblas--cublas--clblast--metal)
diff --git a/constants.py b/constants.py
index 380c650a..876e4d71 100644
--- a/constants.py
+++ b/constants.py
@@ -29,7 +29,7 @@
 )
 
 # Context Window and Max New Tokens
-CONTEXT_WINDOW_SIZE = 4096
+CONTEXT_WINDOW_SIZE = 8096
 MAX_NEW_TOKENS = CONTEXT_WINDOW_SIZE  # int(CONTEXT_WINDOW_SIZE/4)
 
 #### If you get a "not enough space in the buffer" error, you should reduce the values below, start with half of the original values and keep halving the value until the error stops appearing
@@ -100,8 +100,19 @@
 # MODEL_ID = "TheBloke/Llama-2-13b-Chat-GGUF"
 # MODEL_BASENAME = "llama-2-13b-chat.Q4_K_M.gguf"
 
-MODEL_ID = "TheBloke/Llama-2-7b-Chat-GGUF"
-MODEL_BASENAME = "llama-2-7b-chat.Q4_K_M.gguf"
+# MODEL_ID = "TheBloke/Llama-2-7b-Chat-GGUF"
+# MODEL_BASENAME = "llama-2-7b-chat.Q4_K_M.gguf"
+
+# MODEL_ID = "QuantFactory/Meta-Llama-3-8B-Instruct-GGUF"
+# MODEL_BASENAME = "Meta-Llama-3-8B-Instruct.Q4_K_M.gguf"
+
+# LLAMA 3 # use for Apple Silicon
+# MODEL_ID = "meta-llama/Meta-Llama-3-8B-Instruct"
+# MODEL_BASENAME = None
+
+# LLAMA 3 # use for NVIDIA GPUs
+# MODEL_ID = "unsloth/llama-3-8b-bnb-4bit"
+# MODEL_BASENAME = None
 
 # MODEL_ID = "TheBloke/Mistral-7B-Instruct-v0.1-GGUF"
 # MODEL_BASENAME = "mistral-7b-instruct-v0.1.Q8_0.gguf"
diff --git a/load_models.py b/load_models.py
index 65f10e6e..ad278166 100644
--- a/load_models.py
+++ b/load_models.py
@@ -136,9 +136,19 @@ def load_full_model(model_id, model_basename, device_type, logging):
     """
 
     if device_type.lower() in ["mps", "cpu"]:
-        logging.info("Using LlamaTokenizer")
-        tokenizer = LlamaTokenizer.from_pretrained(model_id, cache_dir="./models/")
-        model = LlamaForCausalLM.from_pretrained(model_id, cache_dir="./models/")
+        logging.info("Using AutoModelForCausalLM")
+        # tokenizer = LlamaTokenizer.from_pretrained(model_id, cache_dir="./models/")
+        # model = LlamaForCausalLM.from_pretrained(model_id, cache_dir="./models/")
+
+        model = AutoModelForCausalLM.from_pretrained(model_id,
+                                                     # quantization_config=quantization_config,
+                                                     # low_cpu_mem_usage=True,
+                                                     # torch_dtype="auto",
+                                                     torch_dtype=torch.bfloat16,
+                                                     device_map="auto",
+                                                     cache_dir="./models/")
+
+        tokenizer = AutoTokenizer.from_pretrained(model_id, cache_dir="./models/")
     else:
         logging.info("Using AutoModelForCausalLM for full models")
         tokenizer = AutoTokenizer.from_pretrained(model_id, cache_dir="./models/")
diff --git a/prompt_template_utils.py b/prompt_template_utils.py
index beb56f8a..aa55c1b2 100644
--- a/prompt_template_utils.py
+++ b/prompt_template_utils.py
@@ -33,6 +33,28 @@ def get_prompt_template(system_prompt=system_prompt, promptTemplate_type=None, h
 
             prompt_template = B_INST + SYSTEM_PROMPT + instruction + E_INST
             prompt = PromptTemplate(input_variables=["context", "question"], template=prompt_template)
+
+    elif promptTemplate_type == "llama3":
+
+        B_INST, E_INST = "<|start_header_id|>user<|end_header_id|>", "<|eot_id|>"
+        B_SYS, E_SYS = "<|begin_of_text|><|start_header_id|>system<|end_header_id|> ", "<|eot_id|>"
+        ASSISTANT_INST = "<|start_header_id|>assistant<|end_header_id|>"
+        SYSTEM_PROMPT = B_SYS + system_prompt + E_SYS
+        if history:
+            instruction = """
+            Context: {history} \n {context}
+            User: {question}"""
+
+            prompt_template = SYSTEM_PROMPT + B_INST + instruction + ASSISTANT_INST
+            prompt = PromptTemplate(input_variables=["history", "context", "question"], template=prompt_template)
+        else:
+            instruction = """
+            Context: {context}
+            User: {question}"""
+
+            prompt_template = SYSTEM_PROMPT + B_INST + instruction + ASSISTANT_INST
+            prompt = PromptTemplate(input_variables=["context", "question"], template=prompt_template)
+
     elif promptTemplate_type == "mistral":
         B_INST, E_INST = "[INST] ", " [/INST]"
         if history:
@@ -82,6 +104,8 @@ def get_prompt_template(system_prompt=system_prompt, promptTemplate_type=None, h
 
     memory = ConversationBufferMemory(input_key="question", memory_key="history")
 
+    print(f"Here is the prompt used: {prompt}")
+
     return (
         prompt,
         memory,
diff --git a/run_localGPT.py b/run_localGPT.py
index 4ed53983..185c983c 100644
--- a/run_localGPT.py
+++ b/run_localGPT.py
@@ -209,11 +209,11 @@ def retrieval_qa_pipline(device_type, use_history, promptTemplate_type="llama"):
 )
 @click.option(
     "--model_type",
-    default="llama",
+    default="llama3",
     type=click.Choice(
-        ["llama", "mistral", "non_llama"],
+        ["llama3", "llama", "mistral", "non_llama"],
     ),
-    help="model type, llama, mistral or non_llama",
+    help="model type, llama3, llama, mistral or non_llama",
 )
 @click.option(
     "--save_qa",
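Note (not part of the patch): a minimal sketch of how the new "llama3" prompt template could be exercised after this change, assuming get_prompt_template keeps the signature shown above and still returns a (prompt, memory) tuple; the context and question values are placeholders.

    # Sketch only -- exercises the new "llama3" branch of get_prompt_template.
    from prompt_template_utils import get_prompt_template

    # history=False selects the branch whose template expects only {context} and {question}.
    prompt, memory = get_prompt_template(promptTemplate_type="llama3", history=False)

    # The rendered string wraps the system prompt, context, and question in the
    # Llama 3 chat tokens (<|begin_of_text|>, <|start_header_id|>...<|end_header_id|>, <|eot_id|>).
    print(prompt.format(context="<retrieved document chunks>", question="What does localGPT do?"))

Since the patch also makes "llama3" the default --model_type in run_localGPT.py, this is the prompt format queries should use unless another type is passed on the command line.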