diff --git a/extensions/HuggingFace/python/requirements.txt b/extensions/HuggingFace/python/requirements.txt
index 6388e1c9e..79e5db10b 100644
--- a/extensions/HuggingFace/python/requirements.txt
+++ b/extensions/HuggingFace/python/requirements.txt
@@ -10,11 +10,12 @@ huggingface_hub
 #Hugging Face Libraries - Local Inference Tranformers & Diffusors
 accelerate # Used to help speed up image generation
-diffusers # Used for image + audio generation
+diffusers # Used for image generation
+scipy # array -> wav file, text-speech. torchaudio.save seems broken.
+sentencepiece # Used for text translation
 torch
 torchvision
 torchaudio
-scipy # array -> wav file, text-speech. torchaudio.save seems broken.
 transformers # Used for text generation
 
 #Other
diff --git a/extensions/HuggingFace/python/src/aiconfig_extension_hugging_face/local_inference/image_2_text.py b/extensions/HuggingFace/python/src/aiconfig_extension_hugging_face/local_inference/image_2_text.py
index ad408ca17..aab79c965 100644
--- a/extensions/HuggingFace/python/src/aiconfig_extension_hugging_face/local_inference/image_2_text.py
+++ b/extensions/HuggingFace/python/src/aiconfig_extension_hugging_face/local_inference/image_2_text.py
@@ -1,11 +1,21 @@
+import json
 from typing import Any, Dict, Optional, List, TYPE_CHECKING
+
+from transformers import (
+    Pipeline,
+    pipeline,
+)
+
 from aiconfig import ParameterizedModelParser, InferenceOptions
 from aiconfig.callback import CallbackEvent
-import torch
-from aiconfig.schema import Prompt, Output, ExecuteResult, Attachment
-
-from transformers import pipeline, Pipeline
-
+from aiconfig.schema import (
+    Attachment,
+    ExecuteResult,
+    Output,
+    OutputDataWithValue,
+    Prompt,
+)
+
+# Circular Dependency Type Hints
 if TYPE_CHECKING:
     from aiconfig import AIConfigRuntime
@@ -93,10 +103,11 @@ async def deserialize(
         await aiconfig.callback_manager.run_callbacks(CallbackEvent("on_deserialize_start", __name__, {"prompt": prompt, "params": params}))
 
         # Build Completion data
-        completion_params = self.get_model_settings(prompt, aiconfig)
+        model_settings = self.get_model_settings(prompt, aiconfig)
+        completion_params = refine_completion_params(model_settings)
+
+        #Add image inputs
         inputs = validate_and_retrieve_image_from_attachments(prompt)
-
         completion_params["inputs"] = inputs
 
         await aiconfig.callback_manager.run_callbacks(CallbackEvent("on_deserialize_complete", __name__, {"output": completion_params}))
@@ -110,24 +121,93 @@ async def run_inference(self, prompt: Prompt, aiconfig: "AIConfigRuntime", optio
                 {"prompt": prompt, "options": options, "parameters": parameters},
             )
         )
-        model_name = aiconfig.get_model_name(prompt)
-
-        self.pipelines[model_name] = pipeline(task="image-to-text", model=model_name)
-        captioner = self.pipelines[model_name]
 
         completion_data = await self.deserialize(prompt, aiconfig, parameters)
         inputs = completion_data.pop("inputs")
-        model = completion_data.pop("model")
-        response = captioner(inputs, **completion_data)
-        output = ExecuteResult(output_type="execute_result", data=response, metadata={})
+        model_name: str | None = aiconfig.get_model_name(prompt)
+        if isinstance(model_name, str) and model_name not in self.pipelines:
+            self.pipelines[model_name] = pipeline(task="image-to-text", model=model_name)
+        captioner = self.pipelines[model_name]
+
+        outputs: List[Output] = []
+        response: List[Any] = captioner(inputs, **completion_data)
+        for count, result in enumerate(response):
+            output: Output = construct_regular_output(result, count)
+            outputs.append(output)
 
-        prompt.outputs = [output]
-        await aiconfig.callback_manager.run_callbacks(CallbackEvent("on_run_complete", __name__, {"result": prompt.outputs}))
+        prompt.outputs = outputs
+        print(f"{prompt.outputs=}")
+        await aiconfig.callback_manager.run_callbacks(
+            CallbackEvent(
+                "on_run_complete",
+                __name__,
+                {"result": prompt.outputs},
+            )
+        )
         return prompt.outputs
 
-    def get_output_text(self, response: dict[str, Any]) -> str:
-        raise NotImplementedError("get_output_text is not implemented for HuggingFaceImage2TextTransformer")
+    def get_output_text(
+        self,
+        prompt: Prompt,
+        aiconfig: "AIConfigRuntime",
+        output: Optional[Output] = None,
+    ) -> str:
+        if output is None:
+            output = aiconfig.get_latest_output(prompt)
+
+        if output is None:
+            return ""
+
+        # TODO (rossdanlm): Handle multiple outputs in list
+        # https://github.com/lastmile-ai/aiconfig/issues/467
+        if output.output_type == "execute_result":
+            output_data = output.data
+            if isinstance(output_data, str):
+                return output_data
+            if isinstance(output_data, OutputDataWithValue):
+                if isinstance(output_data.value, str):
+                    return output_data.value
+                # HuggingFace image-to-text does not support function
+                # calls so shouldn't get here, but just being safe
+                return json.dumps(output_data.value, indent=2)
+        return ""
+
+
+def refine_completion_params(model_settings: Dict[str, Any]) -> Dict[str, Any]:
+    """
+    Refines the completion params for the HF image to text api. Removes any unsupported params.
+    The supported keys were found by looking at the HF ImageToTextPipeline.__call__ method
+    """
+    supported_keys = {
+        "max_new_tokens",
+        "timeout",
+    }
+
+    completion_data = {}
+    for key in model_settings:
+        if key.lower() in supported_keys:
+            completion_data[key.lower()] = model_settings[key]
+
+    return completion_data
+
+# Helper methods
+def construct_regular_output(result: Dict[str, str], execution_count: int) -> Output:
+    """
+    Construct regular output per response result, without streaming enabled
+    """
+    output = ExecuteResult(
+        **{
+            "output_type": "execute_result",
+            # For some reason result is always in list format; we haven't found
+            # a way of being able to return multiple sequences from the image
+            # to text pipeline
+            "data": result[0]["generated_text"],
+            "execution_count": execution_count,
+            "metadata": {},
+        }
+    )
+    return output
 
 
 def validate_attachment_type_is_image(attachment: Attachment):
diff --git a/extensions/HuggingFace/python/src/aiconfig_extension_hugging_face/local_inference/text_2_speech.py b/extensions/HuggingFace/python/src/aiconfig_extension_hugging_face/local_inference/text_2_speech.py
index 85dee4add..97e172fde 100644
--- a/extensions/HuggingFace/python/src/aiconfig_extension_hugging_face/local_inference/text_2_speech.py
+++ b/extensions/HuggingFace/python/src/aiconfig_extension_hugging_face/local_inference/text_2_speech.py
@@ -25,6 +25,8 @@ # Step 1: define Helpers
 def refine_pipeline_creation_params(model_settings: Dict[str, Any]) -> List[Dict[str, Any]]:
+    # These are from the transformers GitHub repo:
+    # https://github.com/huggingface/transformers/blob/main/src/transformers/modeling_utils.py#L2534
     supported_keys = {
         "torch_dtype",
         "force_download",
@@ -61,9 +63,11 @@ def refine_pipeline_creation_params(model_settings: Dict[str, Any]) -> List[Dict
 
 def refine_completion_params(unfiltered_completion_params: Dict[str, Any]) -> Dict[str, Any]:
-    supported_keys = {
-        # ???
-    }
+    # Note: There seems to be no public API docs on what completion
+    # params are supported for text to speech:
+    # https://huggingface.co/docs/transformers/tasks/text-to-speech#inference
+    # The only one mentioned is `forward_params` which can contain `speaker_embeddings`
+    supported_keys = {}
 
     completion_params: Dict[str, Any] = {}
     for key in unfiltered_completion_params:
diff --git a/extensions/HuggingFace/python/src/aiconfig_extension_hugging_face/local_inference/text_generation.py b/extensions/HuggingFace/python/src/aiconfig_extension_hugging_face/local_inference/text_generation.py
index e0941b0fd..4da5d7037 100644
--- a/extensions/HuggingFace/python/src/aiconfig_extension_hugging_face/local_inference/text_generation.py
+++ b/extensions/HuggingFace/python/src/aiconfig_extension_hugging_face/local_inference/text_generation.py
@@ -153,7 +153,7 @@ def __init__(self):
         config.register_model_parser(parser)
         """
         super().__init__()
-        self.generators : dict[str, Pipeline]= {}
+        self.generators: dict[str, Pipeline]= {}
 
     def id(self) -> str:
         """
@@ -217,14 +217,14 @@ async def deserialize(
         # Build Completion data
         model_settings = self.get_model_settings(prompt, aiconfig)
         completion_data = refine_chat_completion_params(model_settings)
-
+
         #Add resolved prompt
         resolved_prompt = resolve_prompt(prompt, params, aiconfig)
         completion_data["prompt"] = resolved_prompt
 
         return completion_data
 
     async def run_inference(
-        self, prompt: Prompt, aiconfig : "AIConfigRuntime", options : InferenceOptions, parameters: Dict[str, Any]
+        self, prompt: Prompt, aiconfig: "AIConfigRuntime", options: InferenceOptions, parameters: Dict[str, Any]
     ) -> List[Output]:
         """
         Invoked to run a prompt in the .aiconfig. This method should perform
@@ -239,8 +239,8 @@ async def run_inference(
         """
         completion_data = await self.deserialize(prompt, aiconfig, options, parameters)
         completion_data["text_inputs"] = completion_data.pop("prompt", None)
-
-        model_name : str = aiconfig.get_model_name(prompt)
+
+        model_name: str | None = aiconfig.get_model_name(prompt)
         if isinstance(model_name, str) and model_name not in self.generators:
             self.generators[model_name] = pipeline('text-generation', model=model_name)
         generator = self.generators[model_name]
@@ -251,14 +251,14 @@ async def run_inference(
             not "stream" in completion_data or completion_data.get("stream") != False
         )
         if should_stream:
-            tokenizer : AutoTokenizer = AutoTokenizer.from_pretrained(model_name)
+            tokenizer: AutoTokenizer = AutoTokenizer.from_pretrained(model_name)
             streamer = TextIteratorStreamer(tokenizer)
             completion_data["streamer"] = streamer
 
-        outputs : List[Output] = []
+        outputs: List[Output] = []
         output = None
         if not should_stream:
-            response : List[Any] = generator(**completion_data)
+            response: List[Any] = generator(**completion_data)
             for count, result in enumerate(response):
                 output = construct_regular_output(result, count)
                 outputs.append(output)
@@ -267,7 +267,7 @@ async def run_inference(
                 raise ValueError("Sorry, TextIteratorStreamer does not support multiple return sequences, please set `num_return_sequences` to 1")
             if not streamer:
                 raise ValueError("Stream option is selected but streamer is not initialized")
-
+
             # For streaming, cannot call `generator` directly otherwise response will be blocking
             thread = threading.Thread(target=generator, kwargs=completion_data)
             thread.start()
diff --git a/extensions/HuggingFace/python/src/aiconfig_extension_hugging_face/local_inference/text_summarization.py b/extensions/HuggingFace/python/src/aiconfig_extension_hugging_face/local_inference/text_summarization.py
index bba735b4f..2b3b61358 100644
--- a/extensions/HuggingFace/python/src/aiconfig_extension_hugging_face/local_inference/text_summarization.py
+++ b/extensions/HuggingFace/python/src/aiconfig_extension_hugging_face/local_inference/text_summarization.py
@@ -128,13 +128,18 @@ def construct_stream_output(
             "metadata": {},
         }
     )
+
     accumulated_message = ""
     for new_text in streamer:
         if isinstance(new_text, str):
+            # For some reason these symbols aren't filtered out by the streamer
+            new_text = new_text.replace("", "")
+            new_text = new_text.replace("", "")
+
             accumulated_message += new_text
             options.stream_callback(new_text, accumulated_message, 0)
-
     output.data = accumulated_message
+
     return output
 
 
@@ -245,18 +250,18 @@ async def run_inference(self, prompt: Prompt, aiconfig: "AIConfigRuntime", optio
         # if stream enabled in runtime options and config, then stream. Otherwise don't stream.
         streamer = None
-        should_stream = (options.stream if options else False) and (not "stream" in completion_data or completion_data.get("stream") != False)
+        should_stream = (options.stream if options else False) and (
+            not "stream" in completion_data or completion_data.get("stream") != False
+        )
         if should_stream:
             tokenizer: AutoTokenizer = AutoTokenizer.from_pretrained(model_name)
             streamer = TextIteratorStreamer(tokenizer)
             completion_data["streamer"] = streamer
 
-        outputs: List[Output] = []
-        output = None
-
         def _summarize():
             return summarizer(inputs, **completion_data)
 
+        outputs: List[Output] = []
         if not should_stream:
             response: List[Any] = _summarize()
             for count, result in enumerate(response):
diff --git a/extensions/HuggingFace/python/src/aiconfig_extension_hugging_face/local_inference/text_translation.py b/extensions/HuggingFace/python/src/aiconfig_extension_hugging_face/local_inference/text_translation.py
index 9ee8bb357..860a11e46 100644
--- a/extensions/HuggingFace/python/src/aiconfig_extension_hugging_face/local_inference/text_translation.py
+++ b/extensions/HuggingFace/python/src/aiconfig_extension_hugging_face/local_inference/text_translation.py
@@ -129,12 +129,19 @@ def construct_stream_output(
             "metadata": {},
         }
     )
+
     accumulated_message = ""
     for new_text in streamer:
         if isinstance(new_text, str):
+            # For some reason these symbols aren't filtered out by the streamer
+            new_text = new_text.replace("", "")
+            new_text = new_text.replace("", "")
+            new_text = new_text.replace("", "")
+
             accumulated_message += new_text
             options.stream_callback(new_text, accumulated_message, 0)
 
     output.data = accumulated_message
+
     return output
 
 
@@ -240,19 +247,26 @@ async def run_inference(self, prompt: Prompt, aiconfig: "AIConfigRuntime", optio
         model_name: str = aiconfig.get_model_name(prompt)
         if isinstance(model_name, str) and model_name not in self.translators:
-            self.translators[model_name] = pipeline(model_name)
+            self.translators[model_name] = pipeline("translation", model_name)
         translator = self.translators[model_name]
 
         # if stream enabled in runtime options and config, then stream. Otherwise don't stream.
         streamer = None
-        should_stream = (options.stream if options else False) and (not "stream" in completion_data or completion_data.get("stream") != False)
+        should_stream = (options.stream if options else False) and (
+            not "stream" in completion_data or completion_data.get("stream") != False
+        )
         if should_stream:
-            raise NotImplementedError("Streaming is not supported for HuggingFace Text Translation")
+            tokenizer: AutoTokenizer = AutoTokenizer.from_pretrained(model_name)
+            streamer = TextIteratorStreamer(tokenizer)
+            completion_data["streamer"] = streamer
+
+        def _translate():
+            return translator(inputs, **completion_data)
 
         outputs: List[Output] = []
         output = None
         if not should_stream:
-            response: List[Any] = translator(inputs, **completion_data)
+            response: List[Any] = _translate()
             for count, result in enumerate(response):
                 output = construct_regular_output(result, count)
                 outputs.append(output)
@@ -263,7 +277,7 @@ async def run_inference(self, prompt: Prompt, aiconfig: "AIConfigRuntime", optio
                 raise ValueError("Stream option is selected but streamer is not initialized")
 
             # For streaming, cannot call `translator` directly otherwise response will be blocking
-            thread = threading.Thread(target=translator, kwargs=completion_data)
+            thread = threading.Thread(target=_translate)
             thread.start()
             output = construct_stream_output(streamer, options)
             if output is not None:
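
For reviewers who want to poke at the new translation streaming path outside of aiconfig, here is a minimal standalone sketch of the same pattern the diff uses (run the pipeline on a worker thread, consume decoded text from TextIteratorStreamer on the caller's side). The model name and input sentence are illustrative assumptions only, not something this PR pins down.

# Standalone sketch of the streaming pattern used above (illustrative only).
import threading

from transformers import AutoTokenizer, TextIteratorStreamer, pipeline

model_name = "Helsinki-NLP/opus-mt-en-fr"  # assumed example model, not from this PR
translator = pipeline("translation", model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)
streamer = TextIteratorStreamer(tokenizer)


def _translate():
    # Calling the pipeline directly would block, so run it on a worker thread
    # and read decoded text chunks from the streamer on the main thread.
    return translator("Hello, how are you?", streamer=streamer)


thread = threading.Thread(target=_translate)
thread.start()

accumulated_message = ""
for new_text in streamer:
    accumulated_message += new_text
thread.join()
print(accumulated_message)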