deepset-ai · vblagoje · May 8, 2024 · May 8, 2024 · May 8, 2024 · May 8, 2024
@@ -86,7 +86,7 @@ def run(self, sources: List[Union[str, Path, ByteStream]]) -> Dict[str, List[Uni
  if isinstance(source, Path):
  mime_type = self._get_mime_type(source)
  elif isinstance(source, ByteStream):
- mime_type = source.meta.get("content_type", None)
+ mime_type = source.resolved_mime_type
- mime_type = source.resolved_mime_type
+ mime_type = source.mime_type
- mime_type = source.resolved_mime_type
+ mime_type = source.mime_type
  else:
  raise ValueError(f"Unsupported data source type: {type(source).__name__}")
 

@@ -1,6 +1,6 @@
 from dataclasses import dataclass, field
 from pathlib import Path
-from typing import Any, Dict, Optional
+from typing import Any, Dict, List, Optional
 
 
 @dataclass
@@ -12,6 +12,7 @@ class ByteStream:
  data: bytes
  meta: Dict[str, Any] = field(default_factory=dict, hash=False)
  mime_type: Optional[str] = field(default=None)
+ mime_type_resolution_priority: List[str] = field(default_factory=lambda: ["attribute", "meta"])
 
  def to_file(self, destination_path: Path):
  """
@@ -24,21 +25,36 @@ def to_file(self, destination_path: Path):
 
  @classmethod
  def from_file_path(
- cls, filepath: Path, mime_type: Optional[str] = None, meta: Optional[Dict[str, Any]] = None
+ cls,
+ filepath: Path,
+ mime_type: Optional[str] = None,
+ meta: Optional[Dict[str, Any]] = None,
+ mime_type_resolution_priority: Optional[List[str]] = None,
  ) -> "ByteStream":
  """
  Create a ByteStream from the contents read from a file.
 
  :param filepath: A valid path to a file.
  :param mime_type: The mime type of the file.
  :param meta: Additional metadata to be stored with the ByteStream.
+ :param mime_type_resolution_priority: Source lookup order for the MIME type; defaults to ["attribute", "meta"].
  """
  with open(filepath, "rb") as fd:
- return cls(data=fd.read(), mime_type=mime_type, meta=meta or {})
+ return cls(
+ data=fd.read(),
+ mime_type=mime_type,
+ meta=meta or {},
+ mime_type_resolution_priority=mime_type_resolution_priority or ["attribute", "meta"],
+ )
 
  @classmethod
  def from_string(
- cls, text: str, encoding: str = "utf-8", mime_type: Optional[str] = None, meta: Optional[Dict[str, Any]] = None
+ cls,
+ text: str,
+ encoding: str = "utf-8",
+ mime_type: Optional[str] = None,
+ meta: Optional[Dict[str, Any]] = None,
+ mime_type_resolution_priority: Optional[List[str]] = None,
  ) -> "ByteStream":
  """
  Create a ByteStream encoding a string.
@@ -47,8 +63,14 @@ def from_string(
  :param encoding: The encoding used to convert the string into bytes
  :param mime_type: The mime type of the file.
  :param meta: Additional metadata to be stored with the ByteStream.
+ :param mime_type_resolution_priority: The priority order of the mime type resolution
  """
- return cls(data=text.encode(encoding), mime_type=mime_type, meta=meta or {})
+ return cls(
+ data=text.encode(encoding),
+ mime_type=mime_type,
+ meta=meta or {},
+ mime_type_resolution_priority=mime_type_resolution_priority or ["attribute", "meta"],
+ )
 
  def to_string(self, encoding: str = "utf-8") -> str:
  """
@@ -59,3 +81,28 @@ def to_string(self, encoding: str = "utf-8") -> str:
  :raises: UnicodeDecodeError: If the ByteStream data cannot be decoded with the specified encoding.
  """
  return self.data.decode(encoding)
+
+    @property
+    def resolved_mime_type(self) -> Optional[str]:
+        """
+        Returns the MIME type of the ByteStream, resolved according to `mime_type_resolution_priority`.
+
+        Two sources are consulted, identified by the names used in the priority list:
+        - `"meta"`: the `content_type` entry of `meta`, used for resources fetched from the web.
+        - `"attribute"`: the `mime_type` attribute, used for local files.
+
+        The sources are tried in priority order (default `["attribute", "meta"]`) and the first non-empty
+        value wins. Names in the priority list that match neither source are skipped rather than raising.
+
+        This lets callers determine the MIME type of any `ByteStream`, regardless of how it was created,
+        without checking both sources of truth separately.
+
+        :return: The first non-empty MIME type found, otherwise `None`.
+        """
+        candidates = {"meta": self.meta.get("content_type", None), "attribute": self.mime_type}
+        for name in self.mime_type_resolution_priority:
+            # `.get` instead of indexing: an unrecognized source name must not crash a read-only property
+            resolved = candidates.get(name)
+            if resolved:
+                return resolved
+        return None
@@ -0,0 +1,4 @@
+---
+enhancements:
+ - |
+ Improved MIME type resolution in the ByteStream dataclass to allow priority-based MIME type determination. Users can now specify a priority order for resolving the MIME type of a ByteStream through the new mime_type_resolution_priority field, drawing on either the explicitly set mime_type attribute or the content_type entry in the ByteStream's metadata; the result is exposed through the new resolved_mime_type property. This enhancement simplifies the handling of MIME types, especially when dealing with ByteStreams originating from various sources.
@@ -68,3 +68,41 @@ def test_to_file(tmp_path, request):
  ByteStream(test_str.encode()).to_file(test_path)
  with open(test_path, "rb") as fd:
  assert fd.read().decode() == test_str
+
+
+def test_resolved_mime_type_priority():
+    """Check that `resolved_mime_type` honours an explicitly given `mime_type_resolution_priority`."""
+    payload = "Hello, world!".encode()
+
+    stream = ByteStream(
+        data=payload,
+        mime_type="application/octet-stream",
+        meta={"content_type": "text/plain"},
+        mime_type_resolution_priority=["meta", "attribute"],
+    )
+    # "meta" is listed first, so the content_type stored in meta wins
+    assert stream.resolved_mime_type == "text/plain"
+
+    # reversing the order on the same instance makes the mime_type attribute win
+    stream.mime_type_resolution_priority = ["attribute", "meta"]
+    assert stream.resolved_mime_type == "application/octet-stream"
+
+    unset = ByteStream(data=payload, mime_type=None, meta={}, mime_type_resolution_priority=["meta", "attribute"])
+    assert unset.resolved_mime_type is None
+
+
+def test_resolved_mime_type_no_priority():
+    """Check `resolved_mime_type` under the default priority, which is ["attribute", "meta"]."""
+    payload = "Hello, world!".encode()
+    attribute_type = "application/octet-stream"
+    meta_with_type = {"content_type": "text/plain"}
+
+    # (mime_type, meta, expected): the attribute wins when set, otherwise meta's content_type is used
+    cases = [
+        (attribute_type, meta_with_type, attribute_type),
+        (None, meta_with_type, "text/plain"),
+        (attribute_type, {}, attribute_type),
+    ]
+    for mime_type, meta, expected in cases:
+        assert ByteStream(data=payload, mime_type=mime_type, meta=meta).resolved_mime_type == expected
+    assert ByteStream(data=payload, mime_type=None, meta={}).resolved_mime_type is None