UPDATE: xpu support
rskasturi committed Oct 11, 2023
1 parent 279dfbb commit f5a68e1
Showing 3 changed files with 28 additions and 5 deletions.
8 changes: 4 additions & 4 deletions constants.py
@@ -98,8 +98,8 @@
 # MODEL_ID = "TheBloke/Llama-2-13b-Chat-GGUF"
 # MODEL_BASENAME = "llama-2-13b-chat.Q4_K_M.gguf"

-MODEL_ID = "TheBloke/Llama-2-7b-Chat-GGUF"
-MODEL_BASENAME = "llama-2-7b-chat.Q4_K_M.gguf"
+# MODEL_ID = "TheBloke/Llama-2-7b-Chat-GGUF"
+# MODEL_BASENAME = "llama-2-7b-chat.Q4_K_M.gguf"

 # MODEL_ID = "TheBloke/Mistral-7B-Instruct-v0.1-GGUF"
 # MODEL_BASENAME = "mistral-7b-instruct-v0.1.Q8_0.gguf"
@@ -111,8 +111,8 @@
 #### (FOR HF MODELS)
 ####

-# MODEL_ID = "NousResearch/Llama-2-7b-chat-hf"
-# MODEL_BASENAME = None
+MODEL_ID = "NousResearch/Llama-2-7b-chat-hf"
+MODEL_BASENAME = None
 # MODEL_ID = "TheBloke/vicuna-7B-1.1-HF"
 # MODEL_BASENAME = None
 # MODEL_ID = "TheBloke/Wizard-Vicuna-7B-Uncensored-HF"
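Why the constants flip matters: MODEL_BASENAME appears to be what steers load_model() between the llama.cpp (GGUF/GGML) path and load_full_model(), and only the latter gains the new XPU branch below. A minimal sketch of that assumed dispatch; the loader names are illustrative and not part of this diff:

# Sketch only: assumed routing based on MODEL_BASENAME; loader names are illustrative.
def pick_loader(model_basename):
    if model_basename is None:
        return "load_full_model"                 # full HF weights -> the new XPU branch applies
    if model_basename.lower().endswith((".gguf", ".ggml")):
        return "load_quantized_model_gguf_ggml"  # llama.cpp path, no ipex involved
    return "load_quantized_model_qptq"           # GPTQ-style quantized path

assert pick_loader(None) == "load_full_model"                                           # new setting
assert pick_loader("llama-2-7b-chat.Q4_K_M.gguf") == "load_quantized_model_gguf_ggml"   # old setting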
8 changes: 8 additions & 0 deletions load_models.py
@@ -9,6 +9,7 @@
     LlamaForCausalLM,
     LlamaTokenizer,
 )
+import intel_extension_for_pytorch as ipex
 from constants import CONTEXT_WINDOW_SIZE, MAX_NEW_TOKENS, N_GPU_LAYERS, N_BATCH, MODELS_PATH


@@ -131,6 +132,13 @@ def load_full_model(model_id, model_basename, device_type, logging):
         logging.info("Using LlamaTokenizer")
         tokenizer = LlamaTokenizer.from_pretrained(model_id, cache_dir="./models/")
         model = LlamaForCausalLM.from_pretrained(model_id, cache_dir="./models/")
+    elif device_type.lower() == "xpu":
+        logging.info("Using LlamaTokenizer")
+        tokenizer = LlamaTokenizer.from_pretrained(model_id, cache_dir="./models/")
+        logging.info("Using AutoModelForCausalLM")
+        model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype=torch.float16, device_map="auto", low_cpu_mem_usage=True, cache_dir="./models")
+        model = model.to('xpu')
+        model = ipex.optimize(model)
     else:
         logging.info("Using AutoModelForCausalLM for full models")
         tokenizer = AutoTokenizer.from_pretrained(model_id, cache_dir="./models/")
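The new branch follows the usual IPEX inference recipe: load the full Hugging Face model in float16, move it to the "xpu" device, then let ipex.optimize swap in Intel-optimized kernels. A standalone sketch of the same pattern, assuming intel_extension_for_pytorch is installed with XPU support; the torch.xpu.is_available() guard and the model choice are assumptions, not part of the commit:

# Minimal sketch of the XPU load path added above; not part of the commit itself.
import torch
import intel_extension_for_pytorch as ipex  # registers the "xpu" device with PyTorch
from transformers import AutoModelForCausalLM, AutoTokenizer

model_id = "NousResearch/Llama-2-7b-chat-hf"  # same model the commit enables in constants.py

# Guard on XPU availability; falling back to CPU keeps the sketch runnable elsewhere.
device = "xpu" if torch.xpu.is_available() else "cpu"

tokenizer = AutoTokenizer.from_pretrained(model_id, cache_dir="./models/")
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    torch_dtype=torch.float16,
    low_cpu_mem_usage=True,
    cache_dir="./models/",
)
model = model.to(device).eval()
if device == "xpu":
    # ipex.optimize rewrites the module with XPU-friendly kernels; dtype is optional here.
    model = ipex.optimize(model, dtype=torch.float16)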
17 changes: 16 additions & 1 deletion run_localGPT.py
@@ -7,6 +7,8 @@
 from langchain.llms import HuggingFacePipeline
 from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler # for streaming response
 from langchain.callbacks.manager import CallbackManager
+from langchain.llms.base import LLM
+import intel_extension_for_pytorch as ipex

callback_manager = CallbackManager([StreamingStdOutCallbackHandler()])

@@ -34,7 +36,6 @@
     MODELS_PATH,
 )
-

def load_model(device_type, model_id, model_basename=None, LOGGING=logging):
"""
Select a model for text generation using the HuggingFace library.
@@ -67,6 +68,20 @@ def load_model(device_type, model_id, model_basename=None, LOGGING=logging):
     else:
         model, tokenizer = load_full_model(model_id, model_basename, device_type, LOGGING)

+    class CustomLLM(LLM):
+        def _call(self, prompt, stop=None, run_manager=None) -> str:
+            input_ids = tokenizer.encode(prompt, return_tensors="pt").to('xpu')
+            result = model.generate(input_ids=input_ids, max_new_tokens=512)
+            result = tokenizer.decode(result[0])
+            return result
+
+        @property
+        def _llm_type(self) -> str:
+            return "custom"
+
+    llm = CustomLLM()
+    return llm
+
     # Load configuration from the model to avoid warnings
     generation_config = GenerationConfig.from_pretrained(model_id)
     # see here for details:
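CustomLLM adapts the raw tokenizer/model pair to LangChain's minimal LLM interface (_call plus _llm_type), and the early return llm hands that wrapper straight back to the caller, bypassing the pipeline setup that follows for other device types. A hedged usage sketch, assuming the pre-0.1 langchain API imported in this file; the prompt and the commented RetrievalQA line are illustrative, not from this commit:

# Illustrative only: driving the CustomLLM returned by load_model() on XPU.
import logging

llm = load_model(device_type="xpu", model_id="NousResearch/Llama-2-7b-chat-hf",
                 model_basename=None, LOGGING=logging)

# LLM subclasses are callable in this langchain version; _call() runs
# tokenizer.encode -> model.generate -> tokenizer.decode on the "xpu" device.
print(llm("Summarize the purpose of intel_extension_for_pytorch in one sentence."))

# Downstream, run_localGPT.py would hand the same object to a chain, e.g.:
# qa = RetrievalQA.from_chain_type(llm=llm, chain_type="stuff", retriever=retriever)

One behavioral note: _call() decodes the full generated sequence, so the returned string includes the prompt tokens as well as the completion.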
