XPU Support for HF Models #593

Open · wants to merge 1 commit into base: main
9 changes: 9 additions & 0 deletions load_models.py
@@ -9,6 +9,8 @@
    LlamaForCausalLM,
    LlamaTokenizer,
)
# Uncomment the line below if you have an Intel® discrete GPU with XPU support.
#import intel_extension_for_pytorch as ipex
from constants import CONTEXT_WINDOW_SIZE, MAX_NEW_TOKENS, N_GPU_LAYERS, N_BATCH, MODELS_PATH


@@ -131,6 +133,13 @@ def load_full_model(model_id, model_basename, device_type, logging):
logging.info("Using LlamaTokenizer")
tokenizer = LlamaTokenizer.from_pretrained(model_id, cache_dir="./models/")
model = LlamaForCausalLM.from_pretrained(model_id, cache_dir="./models/")
elif device_type.lower() == "xpu":
logging.info("Using LlamaTokenizer")
tokenizer = LlamaTokenizer.from_pretrained(model_id, cache_dir="./models/")
logging.info("Using AutoModelForCausalLM")
model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype=torch.float16, device_map="auto", low_cpu_mem_usage=True, cache_dir="./models")
model = model.to('xpu')
model = ipex.optimize(model)
else:
logging.info("Using AutoModelForCausalLM for full models")
tokenizer = AutoTokenizer.from_pretrained(model_id, cache_dir="./models/")
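For reviewers trying the XPU path: a minimal sketch (not part of this PR) for confirming that PyTorch actually sees an Intel discrete GPU before passing device_type="xpu" into load_full_model. It assumes intel_extension_for_pytorch is installed, since importing it is what registers the "xpu" device.

import torch
import intel_extension_for_pytorch as ipex  # noqa: F401 -- the import registers the "xpu" backend

# Fall back to CPU when no XPU-capable Intel GPU is visible to PyTorch.
device_type = "xpu" if torch.xpu.is_available() else "cpu"
print(f"Selected device_type: {device_type}")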
32 changes: 30 additions & 2 deletions run_localGPT.py
@@ -7,6 +7,10 @@
from langchain.llms import HuggingFacePipeline
from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler # for streaming response
from langchain.callbacks.manager import CallbackManager
from langchain.llms.base import LLM

# Uncomment the line below if you have an Intel discrete GPU with XPU support.
#import intel_extension_for_pytorch as ipex

callback_manager = CallbackManager([StreamingStdOutCallbackHandler()])

@@ -19,6 +23,18 @@
    pipeline,
)

import warnings

warnings.filterwarnings(
    "ignore", category=UserWarning, module="intel_extension_for_pytorch"
)
warnings.filterwarnings(
    "ignore", category=UserWarning, module="torchvision.io.image", lineno=13
)
warnings.filterwarnings(
    "ignore", category=UserWarning, module="transformers"
)

from load_models import (
    load_quantized_model_gguf_ggml,
    load_quantized_model_qptq,
@@ -34,7 +50,6 @@
    MODELS_PATH,
)


def load_model(device_type, model_id, model_basename=None, LOGGING=logging):
"""
Select a model for text generation using the HuggingFace library.
@@ -65,7 +80,20 @@ def load_model(device_type, model_id, model_basename=None, LOGGING=logging):
        else:
            model, tokenizer = load_quantized_model_qptq(model_id, model_basename, device_type, LOGGING)
    else:
        model, tokenizer = load_full_model(model_id, model_basename, device_type, LOGGING)
        if device_type == "xpu":
            # Wrap the XPU-resident model in a minimal LangChain LLM so prompts are
            # tokenized, generated, and decoded directly on the Intel GPU.
            class CustomLLM(LLM):
                def _call(self, prompt, stop=None, run_manager=None) -> str:
                    input_ids = tokenizer.encode(prompt, return_tensors="pt").to('xpu')
                    result = model.generate(input_ids=input_ids, max_new_tokens=MAX_NEW_TOKENS)
                    result = tokenizer.decode(result[0])
                    return result

                @property
                def _llm_type(self) -> str:
                    return "custom"

            llm = CustomLLM()
            return llm

    # Load configuration from the model to avoid warnings
    generation_config = GenerationConfig.from_pretrained(model_id)
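For context on how the new XPU return path is consumed, a minimal usage sketch: the model id below is a placeholder (not part of this PR), and the llm(prompt) call style assumes the pre-0.1 LangChain LLM API already used in this file.

from run_localGPT import load_model

# Placeholder model id; any Hugging Face causal LM handled by load_full_model should work.
llm = load_model(device_type="xpu", model_id="meta-llama/Llama-2-7b-chat-hf")
print(llm("Summarize the ingested documents."))  # dispatches to CustomLLM._call, which generates on the XPU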