XPU Support for HF Models #593

Open · wants to merge 1 commit into base: main
9 changes: 9 additions & 0 deletions load_models.py
@@ -9,6 +9,8 @@
    LlamaForCausalLM,
    LlamaTokenizer,
)
# Uncomment the line below if you have an Intel® discrete GPU with XPU support.
#import intel_extension_for_pytorch as ipex
from constants import CONTEXT_WINDOW_SIZE, MAX_NEW_TOKENS, N_GPU_LAYERS, N_BATCH, MODELS_PATH


@@ -131,6 +133,13 @@ def load_full_model(model_id, model_basename, device_type, logging):
logging.info("Using LlamaTokenizer")
tokenizer = LlamaTokenizer.from_pretrained(model_id, cache_dir="./models/")
model = LlamaForCausalLM.from_pretrained(model_id, cache_dir="./models/")
elif device_type.lower() == "xpu":
logging.info("Using LlamaTokenizer")
tokenizer = LlamaTokenizer.from_pretrained(model_id, cache_dir="./models/")
logging.info("Using AutoModelForCausalLM")
model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype=torch.float16, device_map="auto", low_cpu_mem_usage=True, cache_dir="./models")
model = model.to('xpu')
model = ipex.optimize(model)
else:
logging.info("Using AutoModelForCausalLM for full models")
tokenizer = AutoTokenizer.from_pretrained(model_id, cache_dir="./models/")
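For reviewers trying the XPU path: a minimal sketch (not part of this PR) for confirming that PyTorch actually sees an Intel discrete GPU before passing device_type="xpu" into load_full_model. It assumes intel_extension_for_pytorch is installed, since importing it is what registers the "xpu" device.

import torch
import intel_extension_for_pytorch as ipex  # noqa: F401 -- the import registers the "xpu" backend

# Fall back to CPU when no XPU-capable Intel GPU is visible to PyTorch.
device_type = "xpu" if torch.xpu.is_available() else "cpu"
print(f"Selected device_type: {device_type}")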
32 changes: 30 additions & 2 deletions run_localGPT.py
@@ -7,6 +7,10 @@
from langchain.llms import HuggingFacePipeline
from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler # for streaming response
from langchain.callbacks.manager import CallbackManager
from langchain.llms.base import LLM

# Uncomment the line below if you have an Intel discrete GPU with XPU support.
#import intel_extension_for_pytorch as ipex

callback_manager = CallbackManager([StreamingStdOutCallbackHandler()])

@@ -19,6 +23,18 @@
    pipeline,
)

import warnings

warnings.filterwarnings(
    "ignore", category=UserWarning, module="intel_extension_for_pytorch"
)
warnings.filterwarnings(
    "ignore", category=UserWarning, module="torchvision.io.image", lineno=13
)
warnings.filterwarnings(
    "ignore", category=UserWarning, module="transformers"
)

from load_models import (
    load_quantized_model_gguf_ggml,
    load_quantized_model_qptq,
@@ -34,7 +50,6 @@
    MODELS_PATH,
)


def load_model(device_type, model_id, model_basename=None, LOGGING=logging):
"""
Select a model for text generation using the HuggingFace library.
@@ -65,7 +80,20 @@ def load_model(device_type, model_id, model_basename=None, LOGGING=logging):
        else:
            model, tokenizer = load_quantized_model_qptq(model_id, model_basename, device_type, LOGGING)
    else:
        model, tokenizer = load_full_model(model_id, model_basename, device_type, LOGGING)
        if device_type == "xpu":
            # Wrap the XPU-resident model in a minimal LangChain LLM so prompts are
            # tokenized, generated, and decoded directly on the Intel GPU.
            class CustomLLM(LLM):
                def _call(self, prompt, stop=None, run_manager=None) -> str:
                    input_ids = tokenizer.encode(prompt, return_tensors="pt").to('xpu')
                    result = model.generate(input_ids=input_ids, max_new_tokens=MAX_NEW_TOKENS)
                    result = tokenizer.decode(result[0])
                    return result

                @property
                def _llm_type(self) -> str:
                    return "custom"

            llm = CustomLLM()
            return llm

    # Load configuration from the model to avoid warnings
    generation_config = GenerationConfig.from_pretrained(model_id)
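For context on how the new XPU return path is consumed, a minimal usage sketch: the model id below is a placeholder (not part of this PR), and the llm(prompt) call style assumes the pre-0.1 LangChain LLM API already used in this file.

from run_localGPT import load_model

# Placeholder model id; any Hugging Face causal LM handled by load_full_model should work.
llm = load_model(device_type="xpu", model_id="meta-llama/Llama-2-7b-chat-hf")
print(llm("Summarize the ingested documents."))  # dispatches to CustomLLM._call, which generates on the XPU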