Skip to content

Commit

Permalink
finalizing audio feature
Browse files Browse the repository at this point in the history
  • Loading branch information
willydouhard committed May 5, 2024
1 parent 276bbef commit 49d2c2a
Show file tree
Hide file tree
Showing 18 changed files with 170 additions and 93 deletions.
7 changes: 3 additions & 4 deletions backend/chainlit/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -52,7 +52,7 @@
from chainlit.step import Step, step
from chainlit.sync import make_async, run_sync
from chainlit.telemetry import trace
from chainlit.types import ChatProfile, ThreadDict
from chainlit.types import AudioChunk, ChatProfile, ThreadDict
from chainlit.user import PersistedUser, User
from chainlit.user_session import user_session
from chainlit.utils import make_module_getattr, wrap_user_function
Expand Down Expand Up @@ -230,9 +230,7 @@ def on_audio_chunk(func: Callable) -> Callable:
Hook to react to the audio chunks being sent.
Args:
is_start (bool): Whether this is the start of the audio stream.
mime_type (str): The mime type of the audio chunk.
chunk (bytes): The audio chunk.
chunk (AudioChunk): The audio chunk being sent.
Returns:
Callable[[], Any]: The decorated hook.
Expand Down Expand Up @@ -352,6 +350,7 @@ def acall(self):
__all__ = [
"user_session",
"CopilotFunction",
"AudioChunk",
"Action",
"User",
"PersistedUser",
Expand Down
13 changes: 8 additions & 5 deletions backend/chainlit/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@
from chainlit.action import Action
from chainlit.element import ElementBased
from chainlit.message import Message
from chainlit.types import ChatProfile, ThreadDict
from chainlit.types import AudioChunk, ChatProfile, ThreadDict
from chainlit.user import User
from fastapi import Request, Response

Expand Down Expand Up @@ -87,6 +87,8 @@
initial_silence_timeout = 3000
# Delay for the user to continue speaking in MS. If the user stops speaking for this duration, the recording will stop.
silence_timeout = 1500
# Above this duration (MS), the recording will forcefully stop.
max_duration = 15000
# Duration of the audio chunks in MS
chunk_duration = 1000
# Sample rate of the audio
Expand All @@ -97,7 +99,7 @@
name = "Chatbot"
# Show the readme while the thread is empty.
show_readme_as_default = false
show_readme_as_default = true
# Description of the app and chatbot. This is used for HTML tags.
# description = ""
Expand Down Expand Up @@ -207,9 +209,10 @@ class SpontaneousFileUploadFeature(DataClassJsonMixin):
@dataclass
class AudioFeature(DataClassJsonMixin):
min_decibels: int = -45
initial_silence_timeout: int = 3000
initial_silence_timeout: int = 2000
silence_timeout: int = 1500
chunk_duration: int = 1000
max_duration: int = 15000
sample_rate: int = 44100
enabled: bool = False

Expand All @@ -227,7 +230,7 @@ class FeaturesSettings(DataClassJsonMixin):
@dataclass()
class UISettings(DataClassJsonMixin):
name: str
show_readme_as_default: bool = False
show_readme_as_default: bool = True
description: str = ""
hide_cot: bool = False
# Large size content are by default collapsed for a cleaner ui
Expand Down Expand Up @@ -260,7 +263,7 @@ class CodeSettings:
on_chat_end: Optional[Callable[[], Any]] = None
on_chat_resume: Optional[Callable[["ThreadDict"], Any]] = None
on_message: Optional[Callable[["Message"], Any]] = None
on_audio_chunk: Optional[Callable[[bool, str, str], Any]] = None
on_audio_chunk: Optional[Callable[["AudioChunk"], Any]] = None
on_audio_end: Optional[Callable[[List["ElementBased"]], Any]] = None

author_rename: Optional[Callable[[str], str]] = None
Expand Down
1 change: 1 addition & 0 deletions backend/chainlit/data/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -156,6 +156,7 @@ def attachment_to_element_dict(self, attachment: Attachment) -> "ElementDict":
"chainlitKey": None,
"display": metadata.get("display", "side"),
"language": metadata.get("language"),
"autoPlay": metadata.get("autoPlay", None),
"page": metadata.get("page"),
"size": metadata.get("size"),
"type": metadata.get("type", "file"),
Expand Down
31 changes: 19 additions & 12 deletions backend/chainlit/data/sql_alchemy.py
Original file line number Diff line number Diff line change
Expand Up @@ -170,12 +170,14 @@ async def update_thread(
raise ValueError("User not found in session context")
data = {
"id": thread_id,
"createdAt": await self.get_current_timestamp()
if metadata is None
else None,
"name": name
if name is not None
else (metadata.get("name") if metadata and "name" in metadata else None),
"createdAt": (
await self.get_current_timestamp() if metadata is None else None
),
"name": (
name
if name is not None
else (metadata.get("name") if metadata and "name" in metadata else None)
),
"userId": user_id,
"userIdentifier": user_identifier,
"tags": tags,
Expand Down Expand Up @@ -552,13 +554,17 @@ async def get_all_user_threads(
streaming=step_feedback.get("step_streaming", False),
waitForAnswer=step_feedback.get("step_waitforanswer"),
isError=step_feedback.get("step_iserror"),
metadata=step_feedback["step_metadata"]
if step_feedback.get("step_metadata") is not None
else {},
metadata=(
step_feedback["step_metadata"]
if step_feedback.get("step_metadata") is not None
else {}
),
tags=step_feedback.get("step_tags"),
input=step_feedback.get("step_input", "")
if step_feedback["step_showinput"]
else "",
input=(
step_feedback.get("step_input", "")
if step_feedback["step_showinput"]
else ""
),
output=step_feedback.get("step_output", ""),
createdAt=step_feedback.get("step_createdat"),
start=step_feedback.get("step_start"),
Expand Down Expand Up @@ -587,6 +593,7 @@ async def get_all_user_threads(
display=element["element_display"],
size=element.get("element_size"),
language=element.get("element_language"),
autoPlay=element.get("element_autoPlay"),
page=element.get("element_page"),
forId=element.get("element_forid"),
mime=element.get("element_mime"),
Expand Down
3 changes: 3 additions & 0 deletions backend/chainlit/element.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,7 @@ class ElementDict(TypedDict):
size: Optional[ElementSize]
language: Optional[str]
page: Optional[int]
autoPlay: Optional[bool]
forId: Optional[str]
mime: Optional[str]

Expand Down Expand Up @@ -93,6 +94,7 @@ def to_dict(self) -> ElementDict:
"objectKey": getattr(self, "object_key", None),
"size": getattr(self, "size", None),
"page": getattr(self, "page", None),
"autoPlay": getattr(self, "auto_play", None),
"language": getattr(self, "language", None),
"forId": getattr(self, "for_id", None),
"mime": getattr(self, "mime", None),
Expand Down Expand Up @@ -306,6 +308,7 @@ async def preprocess_content(self):
@dataclass
class Audio(Element):
    """Audio element displayed in the UI.

    `auto_play` is exposed to the frontend as the "autoPlay" key via
    Element.to_dict() (see the `getattr(self, "auto_play", None)` entry above).
    """

    type: ClassVar[ElementType] = "audio"
    # Whether the audio should start playing automatically when rendered.
    auto_play: bool = False


@dataclass
Expand Down
13 changes: 7 additions & 6 deletions backend/chainlit/llama_index/callbacks.py
Original file line number Diff line number Diff line change
Expand Up @@ -70,7 +70,7 @@ def on_event_start(
) -> str:
"""Run when an event starts and return id of event."""
self._restore_context()

step_type: StepType = "undefined"
if event_type == CBEventType.RETRIEVE:
step_type = "retrieval"
Expand Down Expand Up @@ -104,7 +104,6 @@ def on_event_end(
"""Run when an event ends."""
step = self.steps.get(event_id, None)


if payload is None or step is None:
return

Expand All @@ -117,11 +116,13 @@ def on_event_end(
source_nodes = getattr(response, "source_nodes", None)
if source_nodes:
source_refs = ", ".join(
[f"Source {idx}" for idx, _ in enumerate(source_nodes)])
[f"Source {idx}" for idx, _ in enumerate(source_nodes)]
)
step.elements = [
Text(
name=f"Source {idx}",
content=source.text or "Empty node",
display="side",
)
for idx, source in enumerate(source_nodes)
]
Expand All @@ -137,6 +138,7 @@ def on_event_end(
step.elements = [
Text(
name=f"Source {idx}",
display="side",
content=source.node.get_text() or "Empty node",
)
for idx, source in enumerate(sources)
Expand Down Expand Up @@ -173,7 +175,7 @@ def on_event_end(
token_count = self.total_llm_token_count or None
raw_response = response.raw if response else None
model = raw_response.get("model", None) if raw_response else None

if messages and isinstance(response, ChatResponse):
msg: ChatMessage = response.message
step.generation = ChatGeneration(
Expand All @@ -198,12 +200,11 @@ def on_event_end(
else:
step.output = payload
self.context.loop.create_task(step.update())

self.steps.pop(event_id, None)

def _noop(self, *args, **kwargs):
    # Intentional no-op: aliased to start_trace / end_trace below so the
    # callback-handler interface is satisfied without doing any work.
    pass

start_trace = _noop
end_trace = _noop

13 changes: 7 additions & 6 deletions backend/chainlit/socket.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,12 @@
from chainlit.server import socket
from chainlit.session import WebsocketSession
from chainlit.telemetry import trace_event
from chainlit.types import AudioChunkPayload, AudioEndPayload, UIMessagePayload
from chainlit.types import (
AudioChunk,
AudioChunkPayload,
AudioEndPayload,
UIMessagePayload,
)
from chainlit.user_session import user_sessions


Expand Down Expand Up @@ -272,11 +277,7 @@ async def audio_chunk(sid, payload: AudioChunkPayload):
init_ws_context(session)

if config.code.on_audio_chunk:
asyncio.create_task(
config.code.on_audio_chunk(
payload["isStart"], payload["mimeType"], payload["data"]
)
)
asyncio.create_task(config.code.on_audio_chunk(AudioChunk(**payload)))


@socket.on("audio_end")
Expand Down
9 changes: 9 additions & 0 deletions backend/chainlit/types.py
Original file line number Diff line number Diff line change
Expand Up @@ -157,6 +157,15 @@ class UIMessagePayload(TypedDict):
class AudioChunkPayload(TypedDict):
    """Wire-format payload for one chunk of a streamed audio recording,
    received over the "audio_chunk" socket event."""

    isStart: bool  # True only for the first chunk of a recording stream
    mimeType: str  # MIME type of the encoded audio -- exact values come from the frontend recorder; not shown here
    elapsedTime: float  # elapsed recording time when the chunk was captured -- units not shown here; presumably ms, TODO confirm
    data: bytes  # raw encoded audio bytes for this chunk


@dataclass
class AudioChunk:
    """In-memory representation of a streamed audio chunk; field-for-field
    mirror of AudioChunkPayload, handed to the on_audio_chunk hook."""

    isStart: bool  # True only for the first chunk of a recording stream
    mimeType: str  # MIME type of the encoded audio -- set by the frontend recorder; not shown here
    elapsedTime: float  # elapsed recording time when the chunk was captured -- units not shown here; presumably ms, TODO confirm
    data: bytes  # raw encoded audio bytes for this chunk


Expand Down
2 changes: 1 addition & 1 deletion backend/pyproject.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[tool.poetry]
name = "chainlit"
version = "1.0.6"
version = "1.1.0"
keywords = ['LLM', 'Agents', 'gen ai', 'chat ui', 'chatbot ui', 'openai', 'copilot', 'langchain', 'conversational ai']
description = "Build Conversational AI."
authors = ["Chainlit"]
Expand Down
32 changes: 24 additions & 8 deletions cypress/e2e/llama_index_cb/.chainlit/config.toml
Original file line number Diff line number Diff line change
Expand Up @@ -28,18 +28,29 @@ unsafe_allow_html = false
# Process and display mathematical expressions. This can clash with "$" characters in messages.
latex = false

# Authorize users to upload files with messages
[features.multi_modal]
# Automatically tag threads with the current chat profile (if a chat profile is used)
auto_tag_thread = true

# Authorize users to spontaneously upload files with messages
[features.spontaneous_file_upload]
enabled = true
accept = ["*/*"]
max_files = 20
max_size_mb = 500

# Allows user to use speech to text
[features.speech_to_text]
enabled = false
# See all languages here https://github.com/JamesBrill/react-speech-recognition/blob/HEAD/docs/API.md#language-string
# language = "en-US"
[features.audio]
# Threshold for audio recording
min_decibels = -45
# Delay for the user to start speaking in MS
initial_silence_timeout = 3000
# Delay for the user to continue speaking in MS. If the user stops speaking for this duration, the recording will stop.
silence_timeout = 1500
# Above this duration (MS), the recording will forcefully stop.
max_duration = 15000
# Duration of the audio chunks in MS
chunk_duration = 1000
# Sample rate of the audio
sample_rate = 44100

[UI]
# Name of the app and chatbot.
Expand Down Expand Up @@ -74,6 +85,11 @@ hide_cot = false
# Specify a custom font url.
# custom_font = "https://fonts.googleapis.com/css2?family=Inter:wght@400;500;700&display=swap"

# Specify a custom build directory for the frontend.
# This can be used to customize the frontend code.
# Be careful: If this is a relative path, it should not start with a slash.
# custom_build = "./public/build"

# Override default MUI light theme. (Check theme.ts)
[UI.theme]
#font_family = "Inter, sans-serif"
Expand All @@ -98,4 +114,4 @@ hide_cot = false


[meta]
generated_by = "1.0.400"
generated_by = "1.0.504"

0 comments on commit 49d2c2a

Please sign in to comment.