From c67804f0c610691953ed751f04c2221da12bcf2c Mon Sep 17 00:00:00 2001
From: Mish Ushakov <10400064+mishushakov@users.noreply.github.com>
Date: Tue, 16 Apr 2024 19:41:00 +0000
Subject: [PATCH 1/8] feat: Browserbase Web Reader
---
.../data_connectors/WebPageDemo.ipynb | 52 +++++++++++++++++++
.../llama_index/readers/web/__init__.py | 2 +
.../readers/web/browserbase_web/BUILD | 5 ++
.../readers/web/browserbase_web/README.md | 45 ++++++++++++++++
.../readers/web/browserbase_web/__init__.py | 0
.../readers/web/browserbase_web/base.py | 48 +++++++++++++++++
.../web/browserbase_web/requirements.txt | 1 +
7 files changed, 153 insertions(+)
create mode 100644 llama-index-integrations/readers/llama-index-readers-web/llama_index/readers/web/browserbase_web/BUILD
create mode 100644 llama-index-integrations/readers/llama-index-readers-web/llama_index/readers/web/browserbase_web/README.md
create mode 100644 llama-index-integrations/readers/llama-index-readers-web/llama_index/readers/web/browserbase_web/__init__.py
create mode 100644 llama-index-integrations/readers/llama-index-readers-web/llama_index/readers/web/browserbase_web/base.py
create mode 100644 llama-index-integrations/readers/llama-index-readers-web/llama_index/readers/web/browserbase_web/requirements.txt
diff --git a/docs/docs/examples/data_connectors/WebPageDemo.ipynb b/docs/docs/examples/data_connectors/WebPageDemo.ipynb
index a2c221bdaceb2..f631afe73c255 100644
--- a/docs/docs/examples/data_connectors/WebPageDemo.ipynb
+++ b/docs/docs/examples/data_connectors/WebPageDemo.ipynb
@@ -130,6 +130,58 @@
"display(Markdown(f\"{response}\"))"
]
},
+ {
+ "cell_type": "markdown",
+ "id": "005d14cd",
+ "metadata": {},
+ "source": [
+ "# Using Browserbase Reader 🅱️\n",
+ "\n",
+ "[Browserbase](https://browserbase.com) is a serverless platform for running headless browsers. It offers advanced debugging, session recordings, stealth mode, integrated proxies and captcha solving.\n",
+ "\n",
+ "## Installation and Setup\n",
+ "\n",
+ "- Get an API key from [browserbase.com](https://browserbase.com) and set it in environment variables (`BROWSERBASE_KEY`).\n",
+ "- Install the [Browserbase SDK](http://github.com/browserbase/python-sdk):"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "c74e6425",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "%pip install browserbase"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "c23d02bc",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "from llama_index.readers.web import BrowserbaseWebReader"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "7e71d347",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "reader = BrowserbaseWebReader()\n",
+ "docs = reader.load_data(\n",
+ " urls=[\n",
+ " \"https://example.com\",\n",
+ " ],\n",
+ " # Text mode\n",
+ " text_content=False\n",
+ ")"
+ ]
+ },
{
"cell_type": "markdown",
"id": "15f46387",
diff --git a/llama-index-integrations/readers/llama-index-readers-web/llama_index/readers/web/__init__.py b/llama-index-integrations/readers/llama-index-readers-web/llama_index/readers/web/__init__.py
index 3d23feffdb6b4..87cbcb5898c87 100644
--- a/llama-index-integrations/readers/llama-index-readers-web/llama_index/readers/web/__init__.py
+++ b/llama-index-integrations/readers/llama-index-readers-web/llama_index/readers/web/__init__.py
@@ -5,6 +5,7 @@
from llama_index.readers.web.beautiful_soup_web.base import (
BeautifulSoupWebReader,
)
+from llama_index.readers.web.browserbase_web.base import BrowserbaseWebReader
from llama_index.readers.web.firecrawl_web.base import FireCrawlWebReader
from llama_index.readers.web.knowledge_base.base import (
KnowledgeBaseWebReader,
@@ -42,6 +43,7 @@
__all__ = [
"AsyncWebPageReader",
"BeautifulSoupWebReader",
+ "BrowserbaseWebReader",
"FireCrawlWebReader",
"KnowledgeBaseWebReader",
"MainContentExtractorReader",
diff --git a/llama-index-integrations/readers/llama-index-readers-web/llama_index/readers/web/browserbase_web/BUILD b/llama-index-integrations/readers/llama-index-readers-web/llama_index/readers/web/browserbase_web/BUILD
new file mode 100644
index 0000000000000..8f515a7fcd9f5
--- /dev/null
+++ b/llama-index-integrations/readers/llama-index-readers-web/llama_index/readers/web/browserbase_web/BUILD
@@ -0,0 +1,5 @@
+python_sources()
+
+python_requirements(
+ name="reqs",
+)
diff --git a/llama-index-integrations/readers/llama-index-readers-web/llama_index/readers/web/browserbase_web/README.md b/llama-index-integrations/readers/llama-index-readers-web/llama_index/readers/web/browserbase_web/README.md
new file mode 100644
index 0000000000000..4fae9f7be623d
--- /dev/null
+++ b/llama-index-integrations/readers/llama-index-readers-web/llama_index/readers/web/browserbase_web/README.md
@@ -0,0 +1,45 @@
+# Browserbase Web Reader
+
+[Browserbase](https://browserbase.com) is a serverless platform for running headless browsers. It offers advanced debugging, session recordings, stealth mode, integrated proxies and captcha solving.
+
+## Installation and Setup
+
+- Get an API key from [browserbase.com](https://browserbase.com) and set it in environment variables (`BROWSERBASE_KEY`).
+- Install the [Browserbase SDK](http://github.com/browserbase/python-sdk):
+
+```python
+pip install browserbase
+```
+
+### Loading documents
+
+You can load webpages into LangChain using `BrowserbaseWebReader`. Optionally, you can set `text_content` parameter to convert the pages to text-only representation.
+
+```python
+from llama_index.readers.web import BrowserbaseWebReader
+
+
+reader = BrowserbaseWebReader()
+docs = reader.load_data(
+ urls=[
+ "https://example.com",
+ ],
+ # Text mode
+ text_content=False
+)
+```
+
+### Loading images
+
+You can also load screenshots of webpages (as bytes) for multi-modal models.
+
+```python
+from browserbase import Browserbase
+from base64 import b64encode
+
+browser = Browserbase()
+screenshot = browser.screenshot("https://browserbase.com")
+
+# Optional. Convert to base64
+img_encoded = b64encode(screenshot).decode()
+```
diff --git a/llama-index-integrations/readers/llama-index-readers-web/llama_index/readers/web/browserbase_web/__init__.py b/llama-index-integrations/readers/llama-index-readers-web/llama_index/readers/web/browserbase_web/__init__.py
new file mode 100644
index 0000000000000..e69de29bb2d1d
diff --git a/llama-index-integrations/readers/llama-index-readers-web/llama_index/readers/web/browserbase_web/base.py b/llama-index-integrations/readers/llama-index-readers-web/llama_index/readers/web/browserbase_web/base.py
new file mode 100644
index 0000000000000..b007510545c79
--- /dev/null
+++ b/llama-index-integrations/readers/llama-index-readers-web/llama_index/readers/web/browserbase_web/base.py
@@ -0,0 +1,48 @@
+import os
+import logging
+from typing import List
+from llama_index.core.readers.base import BaseReader
+from llama_index.core.schema import Document
+
+
+logger = logging.getLogger(__name__)
+
+
+class BrowserbaseWebReader(BaseReader):
+ """Browserbase Web Reader"""
+
+ def __init__(
+ self,
+ api_key: str = os.environ["BROWSERBASE_KEY"],
+ ) -> None:
+ try:
+ from browserbase import Browserbase
+ except ImportError:
+ raise ImportError(
+ "`browserbase` package not found, please run `pip install browserbase`"
+ )
+
+ self.browserbase = Browserbase(api_key=api_key)
+
+ def load_data(self, urls: List[str], text_content: bool = False) -> List[Document]:
+ """Load pages using Browserbase Web Reader"""
+
+ pages = self.browserbase.load_urls(urls, text_content)
+
+ documents = []
+ for i, page in enumerate(pages):
+ documents.append(
+ Document(
+ text=page,
+ metadata={
+ "url": urls[i],
+ },
+ )
+ )
+
+ return documents
+
+
+if __name__ == "__main__":
+ reader = BrowserbaseWebReader()
+ logger.info(reader.load_data(urls=["https://example.com"]))
diff --git a/llama-index-integrations/readers/llama-index-readers-web/llama_index/readers/web/browserbase_web/requirements.txt b/llama-index-integrations/readers/llama-index-readers-web/llama_index/readers/web/browserbase_web/requirements.txt
new file mode 100644
index 0000000000000..529e0a3ed6e8b
--- /dev/null
+++ b/llama-index-integrations/readers/llama-index-readers-web/llama_index/readers/web/browserbase_web/requirements.txt
@@ -0,0 +1 @@
+browserbase
From 94457dec62ac482b656bddc472cf45ab98e557d2 Mon Sep 17 00:00:00 2001
From: Mish Ushakov <10400064+mishushakov@users.noreply.github.com>
Date: Tue, 16 Apr 2024 19:55:12 +0000
Subject: [PATCH 2/8] updated browserbase docs
---
.../llama_index/readers/web/browserbase_web/README.md | 2 ++
1 file changed, 2 insertions(+)
diff --git a/llama-index-integrations/readers/llama-index-readers-web/llama_index/readers/web/browserbase_web/README.md b/llama-index-integrations/readers/llama-index-readers-web/llama_index/readers/web/browserbase_web/README.md
index 4fae9f7be623d..e96a4be00e8dd 100644
--- a/llama-index-integrations/readers/llama-index-readers-web/llama_index/readers/web/browserbase_web/README.md
+++ b/llama-index-integrations/readers/llama-index-readers-web/llama_index/readers/web/browserbase_web/README.md
@@ -11,6 +11,8 @@
pip install browserbase
```
+## Usage
+
### Loading documents
You can load webpages into LangChain using `BrowserbaseWebReader`. Optionally, you can set `text_content` parameter to convert the pages to text-only representation.
From 82f5aa7bf2a225ba271652fd131854f8bbde5268 Mon Sep 17 00:00:00 2001
From: Mish Ushakov <10400064+mishushakov@users.noreply.github.com>
Date: Wed, 17 Apr 2024 13:30:26 +0000
Subject: [PATCH 3/8] updated browserbase readme
---
.../llama_index/readers/web/browserbase_web/README.md | 6 +++---
1 file changed, 3 insertions(+), 3 deletions(-)
diff --git a/llama-index-integrations/readers/llama-index-readers-web/llama_index/readers/web/browserbase_web/README.md b/llama-index-integrations/readers/llama-index-readers-web/llama_index/readers/web/browserbase_web/README.md
index e96a4be00e8dd..5d6134b8cd4d5 100644
--- a/llama-index-integrations/readers/llama-index-readers-web/llama_index/readers/web/browserbase_web/README.md
+++ b/llama-index-integrations/readers/llama-index-readers-web/llama_index/readers/web/browserbase_web/README.md
@@ -7,7 +7,7 @@
- Get an API key from [browserbase.com](https://browserbase.com) and set it in environment variables (`BROWSERBASE_KEY`).
- Install the [Browserbase SDK](http://github.com/browserbase/python-sdk):
-```python
+```
pip install browserbase
```
@@ -15,7 +15,7 @@ pip install browserbase
### Loading documents
-You can load webpages into LangChain using `BrowserbaseWebReader`. Optionally, you can set `text_content` parameter to convert the pages to text-only representation.
+You can load webpages into LlamaIndex using `BrowserbaseWebReader`. Optionally, you can set `text_content` parameter to convert the pages to text-only representation.
```python
from llama_index.readers.web import BrowserbaseWebReader
@@ -27,7 +27,7 @@ docs = reader.load_data(
"https://example.com",
],
# Text mode
- text_content=False
+ text_content=False,
)
```
From 61439c24fe2723bfe848be16d50ec2f7115edec6 Mon Sep 17 00:00:00 2001
From: Mish Ushakov <10400064+mishushakov@users.noreply.github.com>
Date: Thu, 18 Apr 2024 14:29:44 +0000
Subject: [PATCH 4/8] added lazy_load_data to browserbase web reader
---
.../data_connectors/WebPageDemo.ipynb | 2 +-
.../readers/web/browserbase_web/base.py | 22 +++++++------------
2 files changed, 9 insertions(+), 15 deletions(-)
diff --git a/docs/docs/examples/data_connectors/WebPageDemo.ipynb b/docs/docs/examples/data_connectors/WebPageDemo.ipynb
index f631afe73c255..22119fd684a95 100644
--- a/docs/docs/examples/data_connectors/WebPageDemo.ipynb
+++ b/docs/docs/examples/data_connectors/WebPageDemo.ipynb
@@ -178,7 +178,7 @@
" \"https://example.com\",\n",
" ],\n",
" # Text mode\n",
- " text_content=False\n",
+ " text_content=False,\n",
")"
]
},
diff --git a/llama-index-integrations/readers/llama-index-readers-web/llama_index/readers/web/browserbase_web/base.py b/llama-index-integrations/readers/llama-index-readers-web/llama_index/readers/web/browserbase_web/base.py
index b007510545c79..463b3c3b9a876 100644
--- a/llama-index-integrations/readers/llama-index-readers-web/llama_index/readers/web/browserbase_web/base.py
+++ b/llama-index-integrations/readers/llama-index-readers-web/llama_index/readers/web/browserbase_web/base.py
@@ -1,6 +1,6 @@
import os
import logging
-from typing import List
+from typing import Iterator, List
from llama_index.core.readers.base import BaseReader
from llama_index.core.schema import Document
@@ -24,25 +24,19 @@ def __init__(
self.browserbase = Browserbase(api_key=api_key)
- def load_data(self, urls: List[str], text_content: bool = False) -> List[Document]:
+ def lazy_load_data(self, urls: List[str], text_content: bool = False) -> Iterator[Document]:
"""Load pages using Browserbase Web Reader"""
-
pages = self.browserbase.load_urls(urls, text_content)
- documents = []
for i, page in enumerate(pages):
- documents.append(
- Document(
- text=page,
- metadata={
- "url": urls[i],
- },
- )
+ yield Document(
+ text=page,
+ metadata={
+ "url": urls[i],
+ },
)
- return documents
-
if __name__ == "__main__":
reader = BrowserbaseWebReader()
- logger.info(reader.load_data(urls=["https://example.com"]))
+ logger.warning(reader.load_data(urls=["https://example.com"]))
From 0a4d8cfcec944d0d343139645a2b6af7b5739a0f Mon Sep 17 00:00:00 2001
From: Mish Ushakov <10400064+mishushakov@users.noreply.github.com>
Date: Thu, 18 Apr 2024 14:30:18 +0000
Subject: [PATCH 5/8] browserbase logger fix
---
.../llama_index/readers/web/browserbase_web/base.py | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/llama-index-integrations/readers/llama-index-readers-web/llama_index/readers/web/browserbase_web/base.py b/llama-index-integrations/readers/llama-index-readers-web/llama_index/readers/web/browserbase_web/base.py
index 463b3c3b9a876..088f5cecac25a 100644
--- a/llama-index-integrations/readers/llama-index-readers-web/llama_index/readers/web/browserbase_web/base.py
+++ b/llama-index-integrations/readers/llama-index-readers-web/llama_index/readers/web/browserbase_web/base.py
@@ -39,4 +39,4 @@ def lazy_load_data(self, urls: List[str], text_content: bool = False) -> Iterato
if __name__ == "__main__":
reader = BrowserbaseWebReader()
- logger.warning(reader.load_data(urls=["https://example.com"]))
+ logger.info(reader.load_data(urls=["https://example.com"]))
From d718034f4dbab7edc6b81313fa19ca9615d336a8 Mon Sep 17 00:00:00 2001
From: Mish Ushakov <10400064+mishushakov@users.noreply.github.com>
Date: Fri, 19 Apr 2024 11:13:06 +0000
Subject: [PATCH 6/8] updated browserbase integration
---
.../examples/data_connectors/WebPageDemo.ipynb | 2 +-
.../readers/web/browserbase_web/README.md | 2 +-
.../readers/web/browserbase_web/base.py | 16 ++++++++++------
3 files changed, 12 insertions(+), 8 deletions(-)
diff --git a/docs/docs/examples/data_connectors/WebPageDemo.ipynb b/docs/docs/examples/data_connectors/WebPageDemo.ipynb
index 22119fd684a95..0b1f036317aa2 100644
--- a/docs/docs/examples/data_connectors/WebPageDemo.ipynb
+++ b/docs/docs/examples/data_connectors/WebPageDemo.ipynb
@@ -141,7 +141,7 @@
"\n",
"## Installation and Setup\n",
"\n",
- "- Get an API key from [browserbase.com](https://browserbase.com) and set it in environment variables (`BROWSERBASE_KEY`).\n",
+ "- Get an API key from [browserbase.com](https://browserbase.com) and set it in environment variables (`BROWSERBASE_API_KEY`).\n",
"- Install the [Browserbase SDK](http://github.com/browserbase/python-sdk):"
]
},
diff --git a/llama-index-integrations/readers/llama-index-readers-web/llama_index/readers/web/browserbase_web/README.md b/llama-index-integrations/readers/llama-index-readers-web/llama_index/readers/web/browserbase_web/README.md
index 5d6134b8cd4d5..3985ccb9bad2e 100644
--- a/llama-index-integrations/readers/llama-index-readers-web/llama_index/readers/web/browserbase_web/README.md
+++ b/llama-index-integrations/readers/llama-index-readers-web/llama_index/readers/web/browserbase_web/README.md
@@ -4,7 +4,7 @@
## Installation and Setup
-- Get an API key from [browserbase.com](https://browserbase.com) and set it in environment variables (`BROWSERBASE_KEY`).
+- Get an API key from [browserbase.com](https://browserbase.com) and set it in environment variables (`BROWSERBASE_API_KEY`).
- Install the [Browserbase SDK](http://github.com/browserbase/python-sdk):
```
diff --git a/llama-index-integrations/readers/llama-index-readers-web/llama_index/readers/web/browserbase_web/base.py b/llama-index-integrations/readers/llama-index-readers-web/llama_index/readers/web/browserbase_web/base.py
index 088f5cecac25a..f6436389a8db9 100644
--- a/llama-index-integrations/readers/llama-index-readers-web/llama_index/readers/web/browserbase_web/base.py
+++ b/llama-index-integrations/readers/llama-index-readers-web/llama_index/readers/web/browserbase_web/base.py
@@ -1,6 +1,6 @@
import os
import logging
-from typing import Iterator, List
+from typing import Optional, Iterator, Sequence
from llama_index.core.readers.base import BaseReader
from llama_index.core.schema import Document
@@ -9,11 +9,14 @@
class BrowserbaseWebReader(BaseReader):
- """Browserbase Web Reader"""
+ """Load pre-rendered web pages using a headless browser hosted on Browserbase.
+ Depends on `browserbase` package.
+ Get your API key from https://browserbase.com
+ """
def __init__(
self,
- api_key: str = os.environ["BROWSERBASE_KEY"],
+ api_key: Optional[str] = None,
) -> None:
try:
from browserbase import Browserbase
@@ -22,10 +25,11 @@ def __init__(
"`browserbase` package not found, please run `pip install browserbase`"
)
+ api_key = api_key or os.environ["BROWSERBASE_API_KEY"]
self.browserbase = Browserbase(api_key=api_key)
- def lazy_load_data(self, urls: List[str], text_content: bool = False) -> Iterator[Document]:
- """Load pages using Browserbase Web Reader"""
+ def lazy_load_data(self, urls: Sequence[str], text_content: bool = False) -> Iterator[Document]:
+ """Load pages from URLs"""
pages = self.browserbase.load_urls(urls, text_content)
for i, page in enumerate(pages):
@@ -39,4 +43,4 @@ def lazy_load_data(self, urls: List[str], text_content: bool = False) -> Iterato
if __name__ == "__main__":
reader = BrowserbaseWebReader()
- logger.info(reader.load_data(urls=["https://example.com"]))
+ logger.warn(reader.load_data(urls=["https://example.com"]))
From a5b6ccbcc9bf45f5b5a59a4f1b8dec2e0732a4e4 Mon Sep 17 00:00:00 2001
From: Mish Ushakov <10400064+mishushakov@users.noreply.github.com>
Date: Fri, 19 Apr 2024 11:38:29 +0000
Subject: [PATCH 7/8] removed redundant line
---
.../llama_index/readers/web/browserbase_web/base.py | 3 +--
1 file changed, 1 insertion(+), 2 deletions(-)
diff --git a/llama-index-integrations/readers/llama-index-readers-web/llama_index/readers/web/browserbase_web/base.py b/llama-index-integrations/readers/llama-index-readers-web/llama_index/readers/web/browserbase_web/base.py
index f6436389a8db9..20087ea196eb0 100644
--- a/llama-index-integrations/readers/llama-index-readers-web/llama_index/readers/web/browserbase_web/base.py
+++ b/llama-index-integrations/readers/llama-index-readers-web/llama_index/readers/web/browserbase_web/base.py
@@ -25,7 +25,6 @@ def __init__(
"`browserbase` package not found, please run `pip install browserbase`"
)
- api_key = api_key or os.environ["BROWSERBASE_API_KEY"]
self.browserbase = Browserbase(api_key=api_key)
def lazy_load_data(self, urls: Sequence[str], text_content: bool = False) -> Iterator[Document]:
@@ -43,4 +42,4 @@ def lazy_load_data(self, urls: Sequence[str], text_content: bool = False) -> Ite
if __name__ == "__main__":
reader = BrowserbaseWebReader()
- logger.warn(reader.load_data(urls=["https://example.com"]))
+ logger.info(reader.load_data(urls=["https://example.com"]))
From 2f6fb674d6a00ead88115fb1e4893a7c551df738 Mon Sep 17 00:00:00 2001
From: Logan Markewich
Date: Wed, 1 May 2024 21:35:13 -0600
Subject: [PATCH 8/8] linting
---
.../readers/web/browserbase_web/base.py | 17 ++++++++++-------
1 file changed, 10 insertions(+), 7 deletions(-)
diff --git a/llama-index-integrations/readers/llama-index-readers-web/llama_index/readers/web/browserbase_web/base.py b/llama-index-integrations/readers/llama-index-readers-web/llama_index/readers/web/browserbase_web/base.py
index 20087ea196eb0..63ad0804730e2 100644
--- a/llama-index-integrations/readers/llama-index-readers-web/llama_index/readers/web/browserbase_web/base.py
+++ b/llama-index-integrations/readers/llama-index-readers-web/llama_index/readers/web/browserbase_web/base.py
@@ -1,4 +1,3 @@
-import os
import logging
from typing import Optional, Iterator, Sequence
from llama_index.core.readers.base import BaseReader
@@ -9,10 +8,12 @@
class BrowserbaseWebReader(BaseReader):
- """Load pre-rendered web pages using a headless browser hosted on Browserbase.
- Depends on `browserbase` package.
- Get your API key from https://browserbase.com
- """
+ """BrowserbaseWebReader.
+
+ Load pre-rendered web pages using a headless browser hosted on Browserbase.
+ Depends on `browserbase` package.
+ Get your API key from https://browserbase.com
+ """
def __init__(
self,
@@ -27,8 +28,10 @@ def __init__(
self.browserbase = Browserbase(api_key=api_key)
- def lazy_load_data(self, urls: Sequence[str], text_content: bool = False) -> Iterator[Document]:
- """Load pages from URLs"""
+ def lazy_load_data(
+ self, urls: Sequence[str], text_content: bool = False
+ ) -> Iterator[Document]:
+ """Load pages from URLs."""
pages = self.browserbase.load_urls(urls, text_content)
for i, page in enumerate(pages):