Add model setting completion params for Image2Text prompt schema (#875)

Add model setting completion params for Image2Text prompt schema

TSIA (title says it all); pretty simple, actually.

## Test Plan

Follow the README from AIConfig Editor (https://github.com/lastmile-ai/aiconfig/tree/main/python/src/aiconfig/editor#dev), then run the commands below. Also make sure that you don't have the `python-aiconfig-test` package installed: `pip3 uninstall python-aiconfig-test`

```bash
aiconfig_path=/Users/rossdancraig/Projects/aiconfig/cookbooks/Gradio/huggingface.aiconfig.json
parsers_path=/Users/rossdancraig/Projects/aiconfig/cookbooks/Gradio/hf_model_parsers.py
alias aiconfig="python3 -m 'aiconfig.scripts.aiconfig_cli'"
aiconfig edit --aiconfig-path=$aiconfig_path --server-port=8080 --server-mode=debug_servers --parsers-module-path=$parsers_path
```

<img width="1261" alt="Screenshot 2024-01-11 at 01 21 21" src="https://github.com/lastmile-ai/aiconfig/assets/151060367/63ef4830-1163-4229-90ea-d49b914d1ec2">

---

Stack created with [Sapling](https://sapling-scm.com). Best reviewed with [ReviewStack](https://reviewstack.dev/lastmile-ai/aiconfig/pull/875).

* #877
* __->__ #875
lastmile-ai · Jan 11, 2024 · cfc00c3 · cfc00c3
2 parents 0ceb176 + cdbd431
commit cfc00c3
Show file tree

Hide file tree

Showing 3 changed files with 30 additions and 11 deletions.
diff --git a/...ns/HuggingFace/python/src/aiconfig_extension_hugging_face/local_inference/image_2_text.py b/...ns/HuggingFace/python/src/aiconfig_extension_hugging_face/local_inference/image_2_text.py
@@ -179,7 +179,8 @@ def get_output_text(
 def refine_completion_params(model_settings: Dict[str, Any]) -> Dict[str, Any]:
  """
  Refines the completion params for the HF image to text api. Removes any unsupported params.
- The supported keys were found by looking at the HF ImageToTextPipeline.__call__ method
+ The supported keys were found by looking at the HF ImageToTextPipeline.__call__ method:
+ https://github.com/huggingface/transformers/blob/cbbe30749b425f7c327acdab11473b33567a6e26/src/transformers/pipelines/image_to_text.py#L83
  """
  supported_keys = {
  "max_new_tokens",

diff --git a/...tor/client/src/shared/prompt_schemas/HuggingFaceAutomaticSpeechRecognitionPromptSchema.ts b/...tor/client/src/shared/prompt_schemas/HuggingFaceAutomaticSpeechRecognitionPromptSchema.ts
@@ -15,7 +15,14 @@ export const HuggingFaceAutomaticSpeechRecognitionPromptSchema: PromptSchema = {
  items: {
  type: "attachment",
  required: ["data"],
- mime_types: ["audio/mpeg", "audio/wav", "audio/webm", "audio/flac", "audio/ogg", "audio/ogg"],
+ mime_types: [
+ "audio/mpeg",
+ "audio/wav",
+ "audio/webm",
+ "audio/flac",
+ "audio/ogg",
+ "audio/ogg",
+ ],
  properties: {
  data: {
  type: "string",
@@ -45,10 +52,10 @@ export const HuggingFaceAutomaticSpeechRecognitionPromptSchema: PromptSchema = {
  bits at the end to make the final reconstitution as perfect as possible.
  Defaults to defaults to chunk_length_s / 6`,
  },
- device:{
+ device: {
  type: "string",
  enum: ["cuda", "mps", "cpu"],
- description: `The device to load the pipeline to. Mps backend not supported for all models.`
+ description: `The device to load the pipeline to. Mps backend not supported for all models.`,
  },
  framework: {
  type: "string",
@@ -65,15 +72,12 @@ export const HuggingFaceAutomaticSpeechRecognitionPromptSchema: PromptSchema = {
  return_timestamps: {
  type: "string",
  enum: ["word", "char", "True", ""],
- description: `Only available for pure CTC models (Wav2Vec2, HuBERT, etc) and the Whisper model. Not available for other sequence-to-sequence models.`
+ description: `Only available for pure CTC models (Wav2Vec2, HuBERT, etc) and the Whisper model. Not available for other sequence-to-sequence models.`,
  },
  max_new_tokens: {
- type: "number",
- description: `The maximum numbers of tokens to generate, ignoring the number of tokens in the prompt`
- }
+ type: "integer",
+ description: `The maximum numbers of tokens to generate, ignoring the number of tokens in the prompt`,
+ },
  },
  },
 };
-
-
-
diff --git a/...g/editor/client/src/shared/prompt_schemas/HuggingFaceImage2TextTransformerPromptSchema.ts b/...g/editor/client/src/shared/prompt_schemas/HuggingFaceImage2TextTransformerPromptSchema.ts
@@ -1,6 +1,10 @@
 import { PromptSchema } from "../../utils/promptUtils";
 
 export const HuggingFaceImage2TextTransformerPromptSchema: PromptSchema = {
+ // See https://github.com/huggingface/transformers/blob/cbbe30749b425f7c327acdab11473b33567a6e26/src/transformers/pipelines/image_to_text.py#L83
+ // for settings and defaults. The settings below are supported settings
+ // specified in the HuggingFaceImage2TextTransformer
+ // refine_completion_params implementation.
  input: {
  type: "object",
  required: ["data"],
@@ -28,6 +32,16 @@ export const HuggingFaceImage2TextTransformerPromptSchema: PromptSchema = {
  type: "string",
  description: `Hugging Face model to use`,
  },
+ max_new_tokens: {
+ type: "integer",
+ description: `The amount of maximum tokens to generate. 
+ By default it will use \`generate\` default.`,
+ },
+ timeout: {
+ type: "number",
+ description: `The maximum time in seconds to wait for fetching images 
+ from the web. If None, no timeout is set and the call may block forever.`,
+ },
  },
  },
 };