Add model setting completion params for Image2Text prompt schema #875

Merged 1 commit on Jan 11, 2024
@@ -179,7 +179,8 @@ def get_output_text(
def refine_completion_params(model_settings: Dict[str, Any]) -> Dict[str, Any]:
"""
Refines the completion params for the HF image to text api. Removes any unsupported params.
The supported keys were found by looking at the HF ImageToTextPipeline.__call__ method
The supported keys were found by looking at the HF ImageToTextPipeline.__call__ method:
https://github.com/huggingface/transformers/blob/cbbe30749b425f7c327acdab11473b33567a6e26/src/transformers/pipelines/image_to_text.py#L83
"""
supported_keys = {
"max_new_tokens",
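As context for the docstring above, here is a minimal sketch of how such a parameter filter might behave. The supported-key set is an assumption taken from the visible portion of the diff; the actual set in the PR may include more entries.

```python
from typing import Any, Dict

# Assumed supported keys, based on the visible diff excerpt; the real
# implementation may list additional keys from ImageToTextPipeline.__call__.
SUPPORTED_KEYS = {"max_new_tokens", "timeout"}

def refine_completion_params(model_settings: Dict[str, Any]) -> Dict[str, Any]:
    """Remove any params the HF image-to-text pipeline does not support."""
    return {k: v for k, v in model_settings.items() if k in SUPPORTED_KEYS}
```

Unsupported keys such as `temperature` are silently dropped rather than raising, which keeps the call resilient to over-specified model settings.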
@@ -15,7 +15,14 @@ export const HuggingFaceAutomaticSpeechRecognitionPromptSchema: PromptSchema = {
items: {
type: "attachment",
required: ["data"],
mime_types: ["audio/mpeg", "audio/wav", "audio/webm", "audio/flac", "audio/ogg", "audio/ogg"],
mime_types: [
"audio/mpeg",
"audio/wav",
"audio/webm",
"audio/flac",
"audio/ogg",
"audio/ogg",
Contributor: This looks duplicated. Error?

Contributor Author: It's just auto-formatting when I hit CMD + S.

],
properties: {
data: {
type: "string",
@@ -45,10 +52,10 @@ export const HuggingFaceAutomaticSpeechRecognitionPromptSchema: PromptSchema = {
bits at the end to make the final reconstitution as perfect as possible.
Defaults to chunk_length_s / 6`,
},
device:{
device: {
type: "string",
enum: ["cuda", "mps", "cpu"],
description: `The device to load the pipeline to. Mps backend not supported for all models.`
description: `The device to load the pipeline to. Mps backend not supported for all models.`,
},
framework: {
type: "string",
@@ -65,15 +72,12 @@ export const HuggingFaceAutomaticSpeechRecognitionPromptSchema: PromptSchema = {
return_timestamps: {
type: "string",
enum: ["word", "char", "True", ""],
description: `Only available for pure CTC models (Wav2Vec2, HuBERT, etc) and the Whisper model. Not available for other sequence-to-sequence models.`
description: `Only available for pure CTC models (Wav2Vec2, HuBERT, etc) and the Whisper model. Not available for other sequence-to-sequence models.`,
},
max_new_tokens: {
type: "number",
description: `The maximum numbers of tokens to generate, ignoring the number of tokens in the prompt`
}
type: "integer",
Contributor: Why this change?

Contributor Author: Because `number` allows floats while `integer` requires a whole value. I think this was transcribed incorrectly the first time.

description: `The maximum numbers of tokens to generate, ignoring the number of tokens in the prompt`,
},
},
},
};
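The reviewer exchange above turns on the JSON Schema distinction between `number` (any numeric value) and `integer` (whole values only). A minimal hand-rolled sketch of that check, not the validator the project actually uses:

```python
def check_numeric_type(value, schema_type: str) -> bool:
    """Sketch of JSON Schema numeric typing: 'integer' rejects fractional
    values, while 'number' accepts any int or float."""
    if isinstance(value, bool):
        # In JSON, booleans are not numbers.
        return False
    if schema_type == "number":
        return isinstance(value, (int, float))
    if schema_type == "integer":
        # Per modern JSON Schema drafts, floats with zero fraction count.
        return isinstance(value, int) or (
            isinstance(value, float) and value.is_integer()
        )
    return False
```

Under this reading, `max_new_tokens: 50.5` would validate against `number` but fail against `integer`, which is why the stricter type is the better fit for a token count.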



@@ -1,6 +1,10 @@
import { PromptSchema } from "../../utils/promptUtils";

export const HuggingFaceImage2TextTransformerPromptSchema: PromptSchema = {
// See https://github.com/huggingface/transformers/blob/cbbe30749b425f7c327acdab11473b33567a6e26/src/transformers/pipelines/image_to_text.py#L83
// for settings and defaults. The settings below are supported settings
// specified in the HuggingFaceImage2TextTransformer
// refine_completion_params implementation.
input: {
type: "object",
required: ["data"],
@@ -28,6 +32,16 @@ export const HuggingFaceImage2TextTransformerPromptSchema: PromptSchema = {
type: "string",
description: `Hugging Face model to use`,
},
max_new_tokens: {
type: "integer",
description: `The amount of maximum tokens to generate.
By default it will use \`generate\` default.`,
},
timeout: {
type: "number",
description: `The maximum time in seconds to wait for fetching images
from the web. If None, no timeout is set and the call may block forever.`,
},
Contributor: Oh wow, that's a short list of possible params.

Contributor Author: Yeah, it's actually pretty wild. It's why I left a comment about it in the summary for #855.

},
},
};
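Putting the two new settings together, a hypothetical model-settings payload for this schema, and the subset that would survive refinement, might look like the following. The model id is illustrative only, not taken from the PR.

```python
# Hypothetical settings for an image-to-text call; the model id is an
# invented placeholder, not a name from the PR.
model_settings = {
    "model": "example-org/example-image-captioning-model",
    "max_new_tokens": 50,  # integer: cap on generated tokens
    "timeout": 10.0,       # number: seconds to wait when fetching images
}

# Only the keys the pipeline's __call__ accepts are forwarded to it.
supported = {"max_new_tokens", "timeout"}
completion_params = {k: v for k, v in model_settings.items() if k in supported}
print(completion_params)  # {'max_new_tokens': 50, 'timeout': 10.0}
```

The `model` key is used to select the pipeline rather than passed through as a completion param, which matches the short supported list the reviewers remark on above.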