UPDATE: xpu support
rskasturi committed Oct 11, 2023
1 parent 279dfbb commit f5a68e1
Showing 3 changed files with 28 additions and 5 deletions.
8 changes: 4 additions & 4 deletions constants.py
@@ -98,8 +98,8 @@
 # MODEL_ID = "TheBloke/Llama-2-13b-Chat-GGUF"
 # MODEL_BASENAME = "llama-2-13b-chat.Q4_K_M.gguf"

-MODEL_ID = "TheBloke/Llama-2-7b-Chat-GGUF"
-MODEL_BASENAME = "llama-2-7b-chat.Q4_K_M.gguf"
+# MODEL_ID = "TheBloke/Llama-2-7b-Chat-GGUF"
+# MODEL_BASENAME = "llama-2-7b-chat.Q4_K_M.gguf"

 # MODEL_ID = "TheBloke/Mistral-7B-Instruct-v0.1-GGUF"
 # MODEL_BASENAME = "mistral-7b-instruct-v0.1.Q8_0.gguf"
@@ -111,8 +111,8 @@
 #### (FOR HF MODELS)
 ####

-# MODEL_ID = "NousResearch/Llama-2-7b-chat-hf"
-# MODEL_BASENAME = None
+MODEL_ID = "NousResearch/Llama-2-7b-chat-hf"
+MODEL_BASENAME = None
 # MODEL_ID = "TheBloke/vicuna-7B-1.1-HF"
 # MODEL_BASENAME = None
 # MODEL_ID = "TheBloke/Wizard-Vicuna-7B-Uncensored-HF"
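Why the constants flip matters: MODEL_BASENAME appears to be what steers load_model() between the llama.cpp (GGUF/GGML) path and load_full_model(), and only the latter gains the new XPU branch below. A minimal sketch of that assumed dispatch; the loader names are illustrative and not part of this diff:

# Sketch only: assumed routing based on MODEL_BASENAME; loader names are illustrative.
def pick_loader(model_basename):
    if model_basename is None:
        return "load_full_model"                 # full HF weights -> the new XPU branch applies
    if model_basename.lower().endswith((".gguf", ".ggml")):
        return "load_quantized_model_gguf_ggml"  # llama.cpp path, no ipex involved
    return "load_quantized_model_qptq"           # GPTQ-style quantized path

assert pick_loader(None) == "load_full_model"                                           # new setting
assert pick_loader("llama-2-7b-chat.Q4_K_M.gguf") == "load_quantized_model_gguf_ggml"   # old setting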
8 changes: 8 additions & 0 deletions load_models.py
@@ -9,6 +9,7 @@
     LlamaForCausalLM,
     LlamaTokenizer,
 )
+import intel_extension_for_pytorch as ipex
 from constants import CONTEXT_WINDOW_SIZE, MAX_NEW_TOKENS, N_GPU_LAYERS, N_BATCH, MODELS_PATH


@@ -131,6 +132,13 @@ def load_full_model(model_id, model_basename, device_type, logging):
         logging.info("Using LlamaTokenizer")
         tokenizer = LlamaTokenizer.from_pretrained(model_id, cache_dir="./models/")
         model = LlamaForCausalLM.from_pretrained(model_id, cache_dir="./models/")
+    elif device_type.lower() == "xpu":
+        logging.info("Using LlamaTokenizer")
+        tokenizer = LlamaTokenizer.from_pretrained(model_id, cache_dir="./models/")
+        logging.info("Using AutoModelForCausalLM")
+        model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype=torch.float16, device_map="auto", low_cpu_mem_usage=True, cache_dir="./models")
+        model = model.to('xpu')
+        model = ipex.optimize(model)
     else:
         logging.info("Using AutoModelForCausalLM for full models")
         tokenizer = AutoTokenizer.from_pretrained(model_id, cache_dir="./models/")
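The new branch follows the usual IPEX inference recipe: load the full Hugging Face model in float16, move it to the "xpu" device, then let ipex.optimize swap in Intel-optimized kernels. A standalone sketch of the same pattern, assuming intel_extension_for_pytorch is installed with XPU support; the torch.xpu.is_available() guard and the model choice are assumptions, not part of the commit:

# Minimal sketch of the XPU load path added above; not part of the commit itself.
import torch
import intel_extension_for_pytorch as ipex  # registers the "xpu" device with PyTorch
from transformers import AutoModelForCausalLM, AutoTokenizer

model_id = "NousResearch/Llama-2-7b-chat-hf"  # same model the commit enables in constants.py

# Guard on XPU availability; falling back to CPU keeps the sketch runnable elsewhere.
device = "xpu" if torch.xpu.is_available() else "cpu"

tokenizer = AutoTokenizer.from_pretrained(model_id, cache_dir="./models/")
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    torch_dtype=torch.float16,
    low_cpu_mem_usage=True,
    cache_dir="./models/",
)
model = model.to(device).eval()
if device == "xpu":
    # ipex.optimize rewrites the module with XPU-friendly kernels; dtype is optional here.
    model = ipex.optimize(model, dtype=torch.float16)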
17 changes: 16 additions & 1 deletion run_localGPT.py
@@ -7,6 +7,8 @@
 from langchain.llms import HuggingFacePipeline
 from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler # for streaming response
 from langchain.callbacks.manager import CallbackManager
+from langchain.llms.base import LLM
+import intel_extension_for_pytorch as ipex

callback_manager = CallbackManager([StreamingStdOutCallbackHandler()])

@@ -34,7 +36,6 @@
     MODELS_PATH,
 )
-

def load_model(device_type, model_id, model_basename=None, LOGGING=logging):
"""
Select a model for text generation using the HuggingFace library.
@@ -67,6 +68,20 @@ def load_model(device_type, model_id, model_basename=None, LOGGING=logging):
     else:
         model, tokenizer = load_full_model(model_id, model_basename, device_type, LOGGING)

+    class CustomLLM(LLM):
+        def _call(self, prompt, stop=None, run_manager=None) -> str:
+            input_ids = tokenizer.encode(prompt, return_tensors="pt").to('xpu')
+            result = model.generate(input_ids=input_ids, max_new_tokens=512)
+            result = tokenizer.decode(result[0])
+            return result
+
+        @property
+        def _llm_type(self) -> str:
+            return "custom"
+
+    llm = CustomLLM()
+    return llm
+
     # Load configuration from the model to avoid warnings
     generation_config = GenerationConfig.from_pretrained(model_id)
     # see here for details:
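CustomLLM adapts the raw tokenizer/model pair to LangChain's minimal LLM interface (_call plus _llm_type), and the early return llm hands that wrapper straight back to the caller, bypassing the pipeline setup that follows for other device types. A hedged usage sketch, assuming the pre-0.1 langchain API imported in this file; the prompt and the commented RetrievalQA line are illustrative, not from this commit:

# Illustrative only: driving the CustomLLM returned by load_model() on XPU.
import logging

llm = load_model(device_type="xpu", model_id="NousResearch/Llama-2-7b-chat-hf",
                 model_basename=None, LOGGING=logging)

# LLM subclasses are callable in this langchain version; _call() runs
# tokenizer.encode -> model.generate -> tokenizer.decode on the "xpu" device.
print(llm("Summarize the purpose of intel_extension_for_pytorch in one sentence."))

# Downstream, run_localGPT.py would hand the same object to a chain, e.g.:
# qa = RetrievalQA.from_chain_type(llm=llm, chain_type="stuff", retriever=retriever)

One behavioral note: _call() decodes the full generated sequence, so the returned string includes the prompt tokens as well as the completion.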
