From c67804f0c610691953ed751f04c2221da12bcf2c Mon Sep 17 00:00:00 2001 From: Mish Ushakov <10400064+mishushakov@users.noreply.github.com> Date: Tue, 16 Apr 2024 19:41:00 +0000 Subject: [PATCH 1/8] feat: Browserbase Web Reader --- .../data_connectors/WebPageDemo.ipynb | 52 +++++++++++++++++++ .../llama_index/readers/web/__init__.py | 2 + .../readers/web/browserbase_web/BUILD | 5 ++ .../readers/web/browserbase_web/README.md | 45 ++++++++++++++++ .../readers/web/browserbase_web/__init__.py | 0 .../readers/web/browserbase_web/base.py | 48 +++++++++++++++++ .../web/browserbase_web/requirements.txt | 1 + 7 files changed, 153 insertions(+) create mode 100644 llama-index-integrations/readers/llama-index-readers-web/llama_index/readers/web/browserbase_web/BUILD create mode 100644 llama-index-integrations/readers/llama-index-readers-web/llama_index/readers/web/browserbase_web/README.md create mode 100644 llama-index-integrations/readers/llama-index-readers-web/llama_index/readers/web/browserbase_web/__init__.py create mode 100644 llama-index-integrations/readers/llama-index-readers-web/llama_index/readers/web/browserbase_web/base.py create mode 100644 llama-index-integrations/readers/llama-index-readers-web/llama_index/readers/web/browserbase_web/requirements.txt diff --git a/docs/docs/examples/data_connectors/WebPageDemo.ipynb b/docs/docs/examples/data_connectors/WebPageDemo.ipynb index a2c221bdaceb2..f631afe73c255 100644 --- a/docs/docs/examples/data_connectors/WebPageDemo.ipynb +++ b/docs/docs/examples/data_connectors/WebPageDemo.ipynb @@ -130,6 +130,58 @@ "display(Markdown(f\"{response}\"))" ] }, + { + "cell_type": "markdown", + "id": "005d14cd", + "metadata": {}, + "source": [ + "# Using Browserbase Reader 🅱️\n", + "\n", + "[Browserbase](https://browserbase.com) is a serverless platform for running headless browsers, it offers advanced debugging, session recordings, stealth mode, integrated proxies and captcha solving.\n", + "\n", + "## Installation and Setup\n", + "\n", + "- Get an API key from [browserbase.com](https://browserbase.com) and set it in environment variables (`BROWSERBASE_KEY`).\n", + "- Install the [Browserbase SDK](http://github.com/browserbase/python-sdk):" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c74e6425", + "metadata": {}, + "outputs": [], + "source": [ + "% pip install browserbase" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c23d02bc", + "metadata": {}, + "outputs": [], + "source": [ + "from llama_index.readers.web import BrowserbaseWebReader" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "7e71d347", + "metadata": {}, + "outputs": [], + "source": [ + "reader = BrowserbaseWebReader()\n", + "docs = reader.load_data(\n", + " urls=[\n", + " \"https://example.com\",\n", + " ],\n", + " # Text mode\n", + " text_content=False\n", + ")" + ] + }, { "cell_type": "markdown", "id": "15f46387", diff --git a/llama-index-integrations/readers/llama-index-readers-web/llama_index/readers/web/__init__.py b/llama-index-integrations/readers/llama-index-readers-web/llama_index/readers/web/__init__.py index 3d23feffdb6b4..87cbcb5898c87 100644 --- a/llama-index-integrations/readers/llama-index-readers-web/llama_index/readers/web/__init__.py +++ b/llama-index-integrations/readers/llama-index-readers-web/llama_index/readers/web/__init__.py @@ -5,6 +5,7 @@ from llama_index.readers.web.beautiful_soup_web.base import ( BeautifulSoupWebReader, ) +from llama_index.readers.web.browserbase.base import BrowserbaseWebReader from llama_index.readers.web.firecrawl_web.base import FireCrawlWebReader from llama_index.readers.web.knowledge_base.base import ( KnowledgeBaseWebReader, @@ -42,6 +43,7 @@ __all__ = [ "AsyncWebPageReader", "BeautifulSoupWebReader", + "BrowserbaseWebReader", "FireCrawlWebReader", "KnowledgeBaseWebReader", "MainContentExtractorReader", diff --git a/llama-index-integrations/readers/llama-index-readers-web/llama_index/readers/web/browserbase_web/BUILD b/llama-index-integrations/readers/llama-index-readers-web/llama_index/readers/web/browserbase_web/BUILD new file mode 100644 index 0000000000000..8f515a7fcd9f5 --- /dev/null +++ b/llama-index-integrations/readers/llama-index-readers-web/llama_index/readers/web/browserbase_web/BUILD @@ -0,0 +1,5 @@ +python_sources() + +python_requirements( + name="reqs", +) diff --git a/llama-index-integrations/readers/llama-index-readers-web/llama_index/readers/web/browserbase_web/README.md b/llama-index-integrations/readers/llama-index-readers-web/llama_index/readers/web/browserbase_web/README.md new file mode 100644 index 0000000000000..4fae9f7be623d --- /dev/null +++ b/llama-index-integrations/readers/llama-index-readers-web/llama_index/readers/web/browserbase_web/README.md @@ -0,0 +1,45 @@ +# Browserbase Web Reader + +[Browserbase](https://browserbase.com) is a serverless platform for running headless browsers, it offers advanced debugging, session recordings, stealth mode, integrated proxies and captcha solving. + +## Installation and Setup + +- Get an API key from [browserbase.com](https://browserbase.com) and set it in environment variables (`BROWSERBASE_KEY`). +- Install the [Browserbase SDK](http://github.com/browserbase/python-sdk): + +```python +pip install browserbase +``` + +### Loading documents + +You can load webpages into LangChain using `BrowserbaseWebReader`. Optionally, you can set `text_content` parameter to convert the pages to text-only representation. + +```python +from llama_index.readers.web import BrowserbaseWebReader + + +reader = BrowserbaseWebReader() +docs = reader.load_data( + urls=[ + "https://example.com", + ], + # Text mode + text_content=False +) +``` + +### Loading images + +You can also load screenshots of webpages (as bytes) for multi-modal models. + +```python +from browserbase import Browserbase +from base64 import b64encode + +browser = Browserbase() +screenshot = browser.screenshot("https://browserbase.com") + +# Optional. Convert to base64 +img_encoded = b64encode(screenshot).decode() +``` diff --git a/llama-index-integrations/readers/llama-index-readers-web/llama_index/readers/web/browserbase_web/__init__.py b/llama-index-integrations/readers/llama-index-readers-web/llama_index/readers/web/browserbase_web/__init__.py new file mode 100644 index 0000000000000..e69de29bb2d1d diff --git a/llama-index-integrations/readers/llama-index-readers-web/llama_index/readers/web/browserbase_web/base.py b/llama-index-integrations/readers/llama-index-readers-web/llama_index/readers/web/browserbase_web/base.py new file mode 100644 index 0000000000000..b007510545c79 --- /dev/null +++ b/llama-index-integrations/readers/llama-index-readers-web/llama_index/readers/web/browserbase_web/base.py @@ -0,0 +1,48 @@ +import os +import logging +from typing import List +from llama_index.core.readers.base import BaseReader +from llama_index.core.schema import Document + + +logger = logging.getLogger(__name__) + + +class BrowserbaseWebReader(BaseReader): + """Browserbase Web Reader""" + + def __init__( + self, + api_key: str = os.environ["BROWSERBASE_KEY"], + ) -> None: + try: + from browserbase import Browserbase + except ImportError: + raise ImportError( + "`browserbase` package not found, please run `pip install browserbase`" + ) + + self.browserbase = Browserbase(api_key=api_key) + + def load_data(self, urls: List[str], text_content: bool = False) -> List[Document]: + """Load pages using Browserbase Web Reader""" + + pages = self.browserbase.load_urls(urls, text_content) + + documents = [] + for i, page in enumerate(pages): + documents.append( + Document( + text=page, + metadata={ + "url": urls[i], + }, + ) + ) + + return documents + + +if __name__ == "__main__": + reader = BrowserbaseWebReader() + logger.info(reader.load_data(urls=["https://example.com"])) diff --git a/llama-index-integrations/readers/llama-index-readers-web/llama_index/readers/web/browserbase_web/requirements.txt b/llama-index-integrations/readers/llama-index-readers-web/llama_index/readers/web/browserbase_web/requirements.txt new file mode 100644 index 0000000000000..529e0a3ed6e8b --- /dev/null +++ b/llama-index-integrations/readers/llama-index-readers-web/llama_index/readers/web/browserbase_web/requirements.txt @@ -0,0 +1 @@ +browserbase From 94457dec62ac482b656bddc472cf45ab98e557d2 Mon Sep 17 00:00:00 2001 From: Mish Ushakov <10400064+mishushakov@users.noreply.github.com> Date: Tue, 16 Apr 2024 19:55:12 +0000 Subject: [PATCH 2/8] updated browserbase docs --- .../llama_index/readers/web/browserbase_web/README.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/llama-index-integrations/readers/llama-index-readers-web/llama_index/readers/web/browserbase_web/README.md b/llama-index-integrations/readers/llama-index-readers-web/llama_index/readers/web/browserbase_web/README.md index 4fae9f7be623d..e96a4be00e8dd 100644 --- a/llama-index-integrations/readers/llama-index-readers-web/llama_index/readers/web/browserbase_web/README.md +++ b/llama-index-integrations/readers/llama-index-readers-web/llama_index/readers/web/browserbase_web/README.md @@ -11,6 +11,8 @@ pip install browserbase ``` +## Usage + ### Loading documents You can load webpages into LangChain using `BrowserbaseWebReader`. Optionally, you can set `text_content` parameter to convert the pages to text-only representation. From 82f5aa7bf2a225ba271652fd131854f8bbde5268 Mon Sep 17 00:00:00 2001 From: Mish Ushakov <10400064+mishushakov@users.noreply.github.com> Date: Wed, 17 Apr 2024 13:30:26 +0000 Subject: [PATCH 3/8] updated browserbase readme --- .../llama_index/readers/web/browserbase_web/README.md | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/llama-index-integrations/readers/llama-index-readers-web/llama_index/readers/web/browserbase_web/README.md b/llama-index-integrations/readers/llama-index-readers-web/llama_index/readers/web/browserbase_web/README.md index e96a4be00e8dd..5d6134b8cd4d5 100644 --- a/llama-index-integrations/readers/llama-index-readers-web/llama_index/readers/web/browserbase_web/README.md +++ b/llama-index-integrations/readers/llama-index-readers-web/llama_index/readers/web/browserbase_web/README.md @@ -7,7 +7,7 @@ - Get an API key from [browserbase.com](https://browserbase.com) and set it in environment variables (`BROWSERBASE_KEY`). - Install the [Browserbase SDK](http://github.com/browserbase/python-sdk): -```python +``` pip install browserbase ``` @@ -15,7 +15,7 @@ pip install browserbase ### Loading documents -You can load webpages into LangChain using `BrowserbaseWebReader`. Optionally, you can set `text_content` parameter to convert the pages to text-only representation. +You can load webpages into LlamaIndex using `BrowserbaseWebReader`. Optionally, you can set `text_content` parameter to convert the pages to text-only representation. ```python from llama_index.readers.web import BrowserbaseWebReader @@ -27,7 +27,7 @@ docs = reader.load_data( "https://example.com", ], # Text mode - text_content=False + text_content=False, ) ``` From 61439c24fe2723bfe848be16d50ec2f7115edec6 Mon Sep 17 00:00:00 2001 From: Mish Ushakov <10400064+mishushakov@users.noreply.github.com> Date: Thu, 18 Apr 2024 14:29:44 +0000 Subject: [PATCH 4/8] added lazy_load_data to browserbase web reader --- .../data_connectors/WebPageDemo.ipynb | 2 +- .../readers/web/browserbase_web/base.py | 22 +++++++------------ 2 files changed, 9 insertions(+), 15 deletions(-) diff --git a/docs/docs/examples/data_connectors/WebPageDemo.ipynb b/docs/docs/examples/data_connectors/WebPageDemo.ipynb index f631afe73c255..22119fd684a95 100644 --- a/docs/docs/examples/data_connectors/WebPageDemo.ipynb +++ b/docs/docs/examples/data_connectors/WebPageDemo.ipynb @@ -178,7 +178,7 @@ " \"https://example.com\",\n", " ],\n", " # Text mode\n", - " text_content=False\n", + " text_content=False,\n", ")" ] }, diff --git a/llama-index-integrations/readers/llama-index-readers-web/llama_index/readers/web/browserbase_web/base.py b/llama-index-integrations/readers/llama-index-readers-web/llama_index/readers/web/browserbase_web/base.py index b007510545c79..463b3c3b9a876 100644 --- a/llama-index-integrations/readers/llama-index-readers-web/llama_index/readers/web/browserbase_web/base.py +++ b/llama-index-integrations/readers/llama-index-readers-web/llama_index/readers/web/browserbase_web/base.py @@ -1,6 +1,6 @@ import os import logging -from typing import List +from typing import Iterator, List from llama_index.core.readers.base import BaseReader from llama_index.core.schema import Document @@ -24,25 +24,19 @@ def __init__( self.browserbase = Browserbase(api_key=api_key) - def load_data(self, urls: List[str], text_content: bool = False) -> List[Document]: + def lazy_load_data(self, urls: List[str], text_content: bool = False) -> Iterator[Document]: """Load pages using Browserbase Web Reader""" - pages = self.browserbase.load_urls(urls, text_content) - documents = [] for i, page in enumerate(pages): - documents.append( - Document( - text=page, - metadata={ - "url": urls[i], - }, - ) + yield Document( + text=page, + metadata={ + "url": urls[i], + }, ) - return documents - if __name__ == "__main__": reader = BrowserbaseWebReader() - logger.info(reader.load_data(urls=["https://example.com"])) + logger.warning(reader.load_data(urls=["https://example.com"])) From 0a4d8cfcec944d0d343139645a2b6af7b5739a0f Mon Sep 17 00:00:00 2001 From: Mish Ushakov <10400064+mishushakov@users.noreply.github.com> Date: Thu, 18 Apr 2024 14:30:18 +0000 Subject: [PATCH 5/8] browserbase logger fix --- .../llama_index/readers/web/browserbase_web/base.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llama-index-integrations/readers/llama-index-readers-web/llama_index/readers/web/browserbase_web/base.py b/llama-index-integrations/readers/llama-index-readers-web/llama_index/readers/web/browserbase_web/base.py index 463b3c3b9a876..088f5cecac25a 100644 --- a/llama-index-integrations/readers/llama-index-readers-web/llama_index/readers/web/browserbase_web/base.py +++ b/llama-index-integrations/readers/llama-index-readers-web/llama_index/readers/web/browserbase_web/base.py @@ -39,4 +39,4 @@ def lazy_load_data(self, urls: List[str], text_content: bool = False) -> Iterato if __name__ == "__main__": reader = BrowserbaseWebReader() - logger.warning(reader.load_data(urls=["https://example.com"])) + logger.info(reader.load_data(urls=["https://example.com"])) From d718034f4dbab7edc6b81313fa19ca9615d336a8 Mon Sep 17 00:00:00 2001 From: Mish Ushakov <10400064+mishushakov@users.noreply.github.com> Date: Fri, 19 Apr 2024 11:13:06 +0000 Subject: [PATCH 6/8] updated browserbase integration --- .../examples/data_connectors/WebPageDemo.ipynb | 2 +- .../readers/web/browserbase_web/README.md | 2 +- .../readers/web/browserbase_web/base.py | 16 ++++++++++------ 3 files changed, 12 insertions(+), 8 deletions(-) diff --git a/docs/docs/examples/data_connectors/WebPageDemo.ipynb b/docs/docs/examples/data_connectors/WebPageDemo.ipynb index 22119fd684a95..0b1f036317aa2 100644 --- a/docs/docs/examples/data_connectors/WebPageDemo.ipynb +++ b/docs/docs/examples/data_connectors/WebPageDemo.ipynb @@ -141,7 +141,7 @@ "\n", "## Installation and Setup\n", "\n", - "- Get an API key from [browserbase.com](https://browserbase.com) and set it in environment variables (`BROWSERBASE_KEY`).\n", + "- Get an API key from [browserbase.com](https://browserbase.com) and set it in environment variables (`BROWSERBASE_API_KEY`).\n", "- Install the [Browserbase SDK](http://github.com/browserbase/python-sdk):" ] }, diff --git a/llama-index-integrations/readers/llama-index-readers-web/llama_index/readers/web/browserbase_web/README.md b/llama-index-integrations/readers/llama-index-readers-web/llama_index/readers/web/browserbase_web/README.md index 5d6134b8cd4d5..3985ccb9bad2e 100644 --- a/llama-index-integrations/readers/llama-index-readers-web/llama_index/readers/web/browserbase_web/README.md +++ b/llama-index-integrations/readers/llama-index-readers-web/llama_index/readers/web/browserbase_web/README.md @@ -4,7 +4,7 @@ ## Installation and Setup -- Get an API key from [browserbase.com](https://browserbase.com) and set it in environment variables (`BROWSERBASE_KEY`). +- Get an API key from [browserbase.com](https://browserbase.com) and set it in environment variables (`BROWSERBASE_API_KEY`). - Install the [Browserbase SDK](http://github.com/browserbase/python-sdk): ``` diff --git a/llama-index-integrations/readers/llama-index-readers-web/llama_index/readers/web/browserbase_web/base.py b/llama-index-integrations/readers/llama-index-readers-web/llama_index/readers/web/browserbase_web/base.py index 088f5cecac25a..f6436389a8db9 100644 --- a/llama-index-integrations/readers/llama-index-readers-web/llama_index/readers/web/browserbase_web/base.py +++ b/llama-index-integrations/readers/llama-index-readers-web/llama_index/readers/web/browserbase_web/base.py @@ -1,6 +1,6 @@ import os import logging -from typing import Iterator, List +from typing import Optional, Iterator, Sequence from llama_index.core.readers.base import BaseReader from llama_index.core.schema import Document @@ -9,11 +9,14 @@ class BrowserbaseWebReader(BaseReader): - """Browserbase Web Reader""" + """Load pre-rendered web pages using a headless browser hosted on Browserbase. + Depends on `browserbase` package. + Get your API key from https://browserbase.com + """ def __init__( self, - api_key: str = os.environ["BROWSERBASE_KEY"], + api_key: Optional[str] = None, ) -> None: try: from browserbase import Browserbase @@ -22,10 +25,11 @@ def __init__( "`browserbase` package not found, please run `pip install browserbase`" ) + api_key = api_key or os.environ["BROWSERBASE_API_KEY"] self.browserbase = Browserbase(api_key=api_key) - def lazy_load_data(self, urls: List[str], text_content: bool = False) -> Iterator[Document]: - """Load pages using Browserbase Web Reader""" + def lazy_load_data(self, urls: Sequence[str], text_content: bool = False) -> Iterator[Document]: + """Load pages from URLs""" pages = self.browserbase.load_urls(urls, text_content) for i, page in enumerate(pages): @@ -39,4 +43,4 @@ def lazy_load_data(self, urls: List[str], text_content: bool = False) -> Iterato if __name__ == "__main__": reader = BrowserbaseWebReader() - logger.info(reader.load_data(urls=["https://example.com"])) + logger.warn(reader.load_data(urls=["https://example.com"])) From a5b6ccbcc9bf45f5b5a59a4f1b8dec2e0732a4e4 Mon Sep 17 00:00:00 2001 From: Mish Ushakov <10400064+mishushakov@users.noreply.github.com> Date: Fri, 19 Apr 2024 11:38:29 +0000 Subject: [PATCH 7/8] removed redundant line --- .../llama_index/readers/web/browserbase_web/base.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/llama-index-integrations/readers/llama-index-readers-web/llama_index/readers/web/browserbase_web/base.py b/llama-index-integrations/readers/llama-index-readers-web/llama_index/readers/web/browserbase_web/base.py index f6436389a8db9..20087ea196eb0 100644 --- a/llama-index-integrations/readers/llama-index-readers-web/llama_index/readers/web/browserbase_web/base.py +++ b/llama-index-integrations/readers/llama-index-readers-web/llama_index/readers/web/browserbase_web/base.py @@ -25,7 +25,6 @@ def __init__( "`browserbase` package not found, please run `pip install browserbase`" ) - api_key = api_key or os.environ["BROWSERBASE_API_KEY"] self.browserbase = Browserbase(api_key=api_key) def lazy_load_data(self, urls: Sequence[str], text_content: bool = False) -> Iterator[Document]: @@ -43,4 +42,4 @@ def lazy_load_data(self, urls: Sequence[str], text_content: bool = False) -> Ite if __name__ == "__main__": reader = BrowserbaseWebReader() - logger.warn(reader.load_data(urls=["https://example.com"])) + logger.info(reader.load_data(urls=["https://example.com"])) From 2f6fb674d6a00ead88115fb1e4893a7c551df738 Mon Sep 17 00:00:00 2001 From: Logan Markewich Date: Wed, 1 May 2024 21:35:13 -0600 Subject: [PATCH 8/8] linting --- .../readers/web/browserbase_web/base.py | 17 ++++++++++------- 1 file changed, 10 insertions(+), 7 deletions(-) diff --git a/llama-index-integrations/readers/llama-index-readers-web/llama_index/readers/web/browserbase_web/base.py b/llama-index-integrations/readers/llama-index-readers-web/llama_index/readers/web/browserbase_web/base.py index 20087ea196eb0..63ad0804730e2 100644 --- a/llama-index-integrations/readers/llama-index-readers-web/llama_index/readers/web/browserbase_web/base.py +++ b/llama-index-integrations/readers/llama-index-readers-web/llama_index/readers/web/browserbase_web/base.py @@ -1,4 +1,3 @@ -import os import logging from typing import Optional, Iterator, Sequence from llama_index.core.readers.base import BaseReader @@ -9,10 +8,12 @@ class BrowserbaseWebReader(BaseReader): - """Load pre-rendered web pages using a headless browser hosted on Browserbase. - Depends on `browserbase` package. - Get your API key from https://browserbase.com - """ + """BrowserbaseWebReader. + + Load pre-rendered web pages using a headless browser hosted on Browserbase. + Depends on `browserbase` package. + Get your API key from https://browserbase.com + """ def __init__( self, @@ -27,8 +28,10 @@ def __init__( self.browserbase = Browserbase(api_key=api_key) - def lazy_load_data(self, urls: Sequence[str], text_content: bool = False) -> Iterator[Document]: - """Load pages from URLs""" + def lazy_load_data( + self, urls: Sequence[str], text_content: bool = False + ) -> Iterator[Document]: + """Load pages from URLs.""" pages = self.browserbase.load_urls(urls, text_content) for i, page in enumerate(pages):