diff --git a/extensions/HuggingFace/python/src/aiconfig_extension_hugging_face/local_inference/image_2_text.py b/extensions/HuggingFace/python/src/aiconfig_extension_hugging_face/local_inference/image_2_text.py index 7cbf54ccf..12455f6d0 100644 --- a/extensions/HuggingFace/python/src/aiconfig_extension_hugging_face/local_inference/image_2_text.py +++ b/extensions/HuggingFace/python/src/aiconfig_extension_hugging_face/local_inference/image_2_text.py @@ -179,7 +179,8 @@ def get_output_text( def refine_completion_params(model_settings: Dict[str, Any]) -> Dict[str, Any]: """ Refines the completion params for the HF image to text api. Removes any unsupported params. - The supported keys were found by looking at the HF ImageToTextPipeline.__call__ method + The supported keys were found by looking at the HF ImageToTextPipeline.__call__ method: + https://github.com/huggingface/transformers/blob/cbbe30749b425f7c327acdab11473b33567a6e26/src/transformers/pipelines/image_to_text.py#L83 """ supported_keys = { "max_new_tokens", diff --git a/python/src/aiconfig/editor/client/src/shared/prompt_schemas/HuggingFaceAutomaticSpeechRecognitionPromptSchema.ts b/python/src/aiconfig/editor/client/src/shared/prompt_schemas/HuggingFaceAutomaticSpeechRecognitionPromptSchema.ts index 3af87d6dd..0650678b3 100644 --- a/python/src/aiconfig/editor/client/src/shared/prompt_schemas/HuggingFaceAutomaticSpeechRecognitionPromptSchema.ts +++ b/python/src/aiconfig/editor/client/src/shared/prompt_schemas/HuggingFaceAutomaticSpeechRecognitionPromptSchema.ts @@ -15,7 +15,14 @@ export const HuggingFaceAutomaticSpeechRecognitionPromptSchema: PromptSchema = { items: { type: "attachment", required: ["data"], - mime_types: ["audio/mpeg", "audio/wav", "audio/webm", "audio/flac", "audio/ogg", "audio/ogg"], + mime_types: [ + "audio/mpeg", + "audio/wav", + "audio/webm", + "audio/flac", + "audio/ogg", + "audio/ogg", + ], properties: { data: { type: "string", @@ -45,10 +52,10 @@ export const HuggingFaceAutomaticSpeechRecognitionPromptSchema: PromptSchema = { bits at the end to make the final reconstitution as perfect as possible. Defaults to defaults to chunk_length_s / 6`, }, - device:{ + device: { type: "string", enum: ["cuda", "mps", "cpu"], - description: `The device to load the pipeline to. Mps backend not supported for all models.` + description: `The device to load the pipeline to. Mps backend not supported for all models.`, }, framework: { type: "string", @@ -65,15 +72,12 @@ export const HuggingFaceAutomaticSpeechRecognitionPromptSchema: PromptSchema = { return_timestamps: { type: "string", enum: ["word", "char", "True", ""], - description: `Only available for pure CTC models (Wav2Vec2, HuBERT, etc) and the Whisper model. Not available for other sequence-to-sequence models.` + description: `Only available for pure CTC models (Wav2Vec2, HuBERT, etc) and the Whisper model. Not available for other sequence-to-sequence models.`, }, max_new_tokens: { - type: "number", - description: `The maximum numbers of tokens to generate, ignoring the number of tokens in the prompt` - } + type: "integer", + description: `The maximum numbers of tokens to generate, ignoring the number of tokens in the prompt`, + }, }, }, }; - - - diff --git a/python/src/aiconfig/editor/client/src/shared/prompt_schemas/HuggingFaceImage2TextTransformerPromptSchema.ts b/python/src/aiconfig/editor/client/src/shared/prompt_schemas/HuggingFaceImage2TextTransformerPromptSchema.ts index ea0a55ab4..337efd291 100644 --- a/python/src/aiconfig/editor/client/src/shared/prompt_schemas/HuggingFaceImage2TextTransformerPromptSchema.ts +++ b/python/src/aiconfig/editor/client/src/shared/prompt_schemas/HuggingFaceImage2TextTransformerPromptSchema.ts @@ -1,6 +1,10 @@ import { PromptSchema } from "../../utils/promptUtils"; export const HuggingFaceImage2TextTransformerPromptSchema: PromptSchema = { + // See https://github.com/huggingface/transformers/blob/cbbe30749b425f7c327acdab11473b33567a6e26/src/transformers/pipelines/image_to_text.py#L83 + // for settings and defaults. The settings below are supported settings + // specified in the HuggingFaceImage2TextTransformer + // refine_completion_params implementation. input: { type: "object", required: ["data"], @@ -28,6 +32,16 @@ export const HuggingFaceImage2TextTransformerPromptSchema: PromptSchema = { type: "string", description: `Hugging Face model to use`, }, + max_new_tokens: { + type: "integer", + description: `The amount of maximum tokens to generate. + By default it will use \`generate\` default.`, + }, + timeout: { + type: "number", + description: `The maximum time in seconds to wait for fetching images + from the web. If None, no timeout is set and the call may block forever.`, + }, }, }, };