feat: also retrieve webpage content from search results
Hanssen0 committed Dec 21, 2023
1 parent b9aecae · commit 1ca94c7
Showing 2 changed files with 42 additions and 19 deletions.
BotHandler.py (26 changes: 16 additions & 10 deletions)
@@ -155,7 +155,9 @@ async def send_message_async(
         if response_len == 0 and len(request_response.response_images) == 0:
             request_response.response = messages["empty_message"]
 
-        await _send_prepared_message_async(config, messages, request_response, end, plain_text)
+        await _send_prepared_message_async(
+            config, messages, request_response, end, plain_text
+        )
 
     # Error?
     except Exception as e:
@@ -316,15 +318,19 @@ async def parse_img(img_source: str):
     :return:
     """
     try:
-        res = requests.head(
-            img_source,
-            timeout=10,
-            headers={
-                "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) "
-                "AppleWebKit/537.36 (KHTML, like Gecko) "
-                "Chrome/91.4472.114 Safari/537.36"
-            },
-            allow_redirects=True,
+        loop = asyncio.get_event_loop()
+        res = await loop.run_in_executor(
+            None,
+            lambda: requests.head(
+                img_source,
+                timeout=10,
+                headers={
+                    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) "
+                    "AppleWebKit/537.36 (KHTML, like Gecko) "
+                    "Chrome/91.4472.114 Safari/537.36"
+                },
+                allow_redirects=True,
+            ),
         )
         content_type = res.headers.get("content-type")
         if not content_type.startswith("image"):
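The parse_img change above moves the blocking requests.head call into the default thread-pool executor so the asyncio event loop is not stalled while the HTTP request is in flight. A minimal standalone sketch of that pattern follows; the function name and example URL are illustrative, not part of the commit:

    import asyncio

    import requests


    async def fetch_content_type(url: str) -> str:
        # requests is synchronous, so run it in the default thread pool
        # executor instead of blocking the event loop directly.
        loop = asyncio.get_event_loop()
        res = await loop.run_in_executor(
            None,
            lambda: requests.head(url, timeout=10, allow_redirects=True),
        )
        return res.headers.get("content-type", "")


    # Hypothetical usage; the URL is only an example.
    print(asyncio.run(fetch_content_type("https://example.com")))

On recent Python versions, asyncio.get_running_loop() is the more idiomatic call from inside a coroutine; get_event_loop(), as used in the commit, returns the same running loop in this situation.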
GoogleAIModule.py (35 changes: 26 additions & 9 deletions)
@@ -1,5 +1,6 @@
 from datetime import datetime
 import re
+import asyncio
 import time
 import uuid
 import json
@@ -443,33 +444,49 @@ def __init__(
         self.msg_args = msg_args or []
 
 
-def _get_webpage_by_url(args):
+async def _get_webpage_by_url(args):
     try:
         url = args["url"]
         if not (schema := re.search(r"(.*)://", url)):
             url = "https://" + url
         elif (schema := schema.group(1)) not in ["https", "http"]:
             return {"error": f"Invalid url schema {schema}"}
 
-        header = requests.head(url, timeout=20, allow_redirects=True)
+        loop = asyncio.get_event_loop()
+        header = await loop.run_in_executor(
+            None, lambda: requests.head(url, timeout=20, allow_redirects=True)
+        )
         content_type = header.headers.get("content-type")
         if not content_type.startswith("text/html"):
             return {"error": f"Unsupported content type {content_type}"}
 
-        res = requests.get(url, timeout=20, allow_redirects=True)
+        res = await loop.run_in_executor(
+            None, lambda: requests.get(url, timeout=20, allow_redirects=True)
+        )
         document = Document(res.content)
         return {"webpage": markdownify(document.summary())}
     except Exception:
         return {"error": "Can not read the url"}
 
 
-def _search_on_google(args):
+async def _complete_google_result(res):
+    return {
+        "url": res.url,
+        "title": res.title,
+        "description": res.description,
+        "content": await _get_webpage_by_url({"url": res.url}),
+    }
+
+
+async def _search_on_google(args):
     try:
         return {
-            "results": [
-                {"url": res.url, "title": res.title, "description": res.description}
-                for res in googlesearch(args["keyword"], advanced=True)
-            ]
+            "results": await asyncio.gather(
+                *[
+                    _complete_google_result(res)
+                    for res in googlesearch(args["keyword"], advanced=True, num_results=3)
+                ]
+            )
         }
 
     except Exception:
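Taken together, the new helpers implement the feature in the commit title: each Google result is enriched with the page content, and asyncio.gather fetches the three results concurrently instead of one after another. The page body is isolated with readability's Document and converted to Markdown with markdownify. A rough standalone sketch of the same flow follows; the import aliases, helper names, and example query are assumptions, since the module's import section is not shown in the diff:

    import asyncio

    import requests
    from googlesearch import search as googlesearch  # googlesearch-python; assumed alias
    from markdownify import markdownify
    from readability import Document  # readability-lxml


    async def fetch_page_markdown(url: str) -> dict:
        # Blocking HTTP call moved to a worker thread, mirroring the commit.
        loop = asyncio.get_event_loop()
        res = await loop.run_in_executor(
            None, lambda: requests.get(url, timeout=20, allow_redirects=True)
        )
        # Document() isolates the main article body; markdownify() turns that
        # HTML fragment into Markdown for the model.
        return {"url": url, "content": markdownify(Document(res.content).summary())}


    async def search_and_fetch(keyword: str) -> list:
        # The search itself still runs synchronously here, as it does in the commit.
        results = googlesearch(keyword, advanced=True, num_results=3)
        # gather() runs the page fetches concurrently.
        return await asyncio.gather(*[fetch_page_markdown(r.url) for r in results])


    # Hypothetical usage; the query string is only an example.
    print(asyncio.run(search_and_fetch("python asyncio run_in_executor")))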
@@ -529,7 +546,7 @@ def _invoke_tool(function_call: FunctionCall):
     tool = next((t for t in TOOLS if t.name == function_call.name), None)
     if not tool:
         return {"error": "Function not found"}
-    return tool.handler(function_call.args)
+    return asyncio.run(tool.handler(function_call.args))
 
 
 def _get_tool_msg(function_call: FunctionCall):
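Because _invoke_tool remains a regular synchronous function, the commit bridges into the now-async tool handlers with asyncio.run, which creates a fresh event loop, runs the coroutine to completion, and closes the loop again. A small sketch of that bridge; the handler and argument names are illustrative:

    import asyncio


    async def get_webpage(args: dict) -> dict:
        # Stand-in for an async tool handler such as _get_webpage_by_url.
        return {"webpage": f"fetched {args['url']}"}


    def invoke_tool(handler, args: dict) -> dict:
        # asyncio.run only works when no event loop is already running in the
        # current thread; it owns the loop for the duration of the call.
        return asyncio.run(handler(args))


    print(invoke_tool(get_webpage, {"url": "https://example.com"}))

If _invoke_tool were ever called from inside an already-running event loop, asyncio.run would raise a RuntimeError; in that case the handler would need to be awaited or scheduled on the existing loop instead.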
