Skip to content

Commit

Permalink
Merge pull request #178 from openzim/fail_on_too_many_errors
Browse files Browse the repository at this point in the history
Fail scraper when there are too many errors while retrieving xblocks
  • Loading branch information
benoit74 committed Jul 14, 2023
2 parents c2e01c2 + 78f1650 commit 53db630
Show file tree
Hide file tree
Showing 17 changed files with 268 additions and 126 deletions.
4 changes: 4 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,3 +1,7 @@
# Unreleased

- scraper will fail when there are too many errors while retrieving xblocks

# 1.0.1

- fixed recursive paths and URLs in html_processor.py
Expand Down
8 changes: 4 additions & 4 deletions openedx2zim/annex.py
Original file line number Diff line number Diff line change
Expand Up @@ -355,14 +355,14 @@ def annex_wiki(self):
while page_to_visit:
url = page_to_visit.pop()
self.add_to_wiki_data(url)
content = self.scraper.instance_connection.get_page(url)
# Parse content page
if content:
try:
content = self.scraper.instance_connection.get_page(url)
# Parse content page
soup = BeautifulSoup(content, "lxml")
text = soup.find("div", attrs={"class": "wiki-article"})
if text: # If it's a page (and not a list of page)
self.update_wiki_page(soup, text, url, page_to_visit)
else:
except Exception:
self.wiki_data[url][
"text"
] = """<div><h1 class="page-header">Permission Denied</h1><p class="alert denied">Sorry, you don't have permission to view this page.</p></div>"""
Expand Down
14 changes: 14 additions & 0 deletions openedx2zim/entrypoint.py
Original file line number Diff line number Diff line change
Expand Up @@ -202,6 +202,20 @@ def main():
default=1,
)

parser.add_argument(
"--watcher-min-dl-count",
help="[dev] Minimum number of resources to have downloaded before considering to stop on errors",
type=int,
default=50,
)

parser.add_argument(
"--watcher-min-ratio",
help="Minimum ratio of resources processed successfully. If ratio is below this threshold, the scrapper will stop.",
type=float,
default=0.9,
)

parser.add_argument(
"--version",
help="Display scraper version and exit",
Expand Down
5 changes: 3 additions & 2 deletions openedx2zim/html_processor.py
Original file line number Diff line number Diff line change
Expand Up @@ -392,8 +392,9 @@ def download_iframes_from_html(
else:
# handle iframe recursively
iframe_url = prepare_url(src, netloc)
src_content = self.scraper.instance_connection.get_page(iframe_url)
if not src_content:
try:
src_content = self.scraper.instance_connection.get_page(iframe_url)
except Exception:
continue
path_recursive, netloc_recursive = self.get_path_and_netloc_to_send(
netloc, path_on_server, iframe_url
Expand Down
6 changes: 5 additions & 1 deletion openedx2zim/instance_connection.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,10 @@
logger = getLogger()


class GetResponseFailed(Exception):
    """Raised when a URL could not be fetched after exhausting all retry attempts."""


def get_response(url, post_data, headers, max_attempts=5):
req = urllib.request.Request(url, post_data, headers)
for attempt in range(max_attempts):
Expand All @@ -21,7 +25,7 @@ def get_response(url, post_data, headers, max_attempts=5):
else:
logger.debug(f"Error opening {url}: {exc}")
logger.error(f"Max attempts exceeded for {url}")
return {}
raise GetResponseFailed()


class InstanceConnection:
Expand Down
36 changes: 29 additions & 7 deletions openedx2zim/scraper.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@

import concurrent.futures
import datetime
import json
import locale
import os
import pathlib
Expand Down Expand Up @@ -64,6 +65,7 @@
from .xblocks_extractor.unavailable import Unavailable
from .xblocks_extractor.vertical import Vertical
from .xblocks_extractor.video import Video
from .xblocks_extractor.base_xblock import BaseXblock

XBLOCK_EXTRACTORS = {
"course": Course,
Expand Down Expand Up @@ -119,6 +121,8 @@ def __init__(
keep_build_dir,
debug,
threads,
watcher_min_dl_count,
watcher_min_ratio,
):

# video-encoding info
Expand Down Expand Up @@ -152,6 +156,10 @@ def __init__(
self.threads = threads
self.yt_downloader = YoutubeDownloader(threads=1)

# resource processing watcher
BaseXblock.watcher_min_dl_count = watcher_min_dl_count
BaseXblock.watcher_min_ratio = watcher_min_ratio

# authentication
self.email = email
self.password = password
Expand Down Expand Up @@ -230,6 +238,7 @@ def prepare_mooc_data(self):
self.instance_config["course_prefix"],
self.instance_url,
)
logger.debug(f"Course ID: {self.course_id}")
logger.info("Getting course info ...")
self.course_info = self.instance_connection.get_api_json(
"/api/courses/v1/courses/"
Expand Down Expand Up @@ -329,8 +338,9 @@ def get_book_list(self, book, output_path):
def annex_extra_page(self, tab_href, tab_org_path):
output_path = self.build_dir.joinpath(tab_org_path)
output_path.mkdir(parents=True, exist_ok=True)
page_content = self.instance_connection.get_page(self.instance_url + tab_href)
if not page_content:
try:
page_content = self.instance_connection.get_page(self.instance_url + tab_href)
except Exception:
logger.error(f"Failed to get page content for tab {tab_org_path}")
raise SystemExit(1)
soup_page = BeautifulSoup(page_content, "lxml")
Expand Down Expand Up @@ -403,8 +413,9 @@ def get_tab_path_and_name(self, tab_text, tab_href):

def get_course_tabs(self):
logger.info("Getting course tabs ...")
content = self.instance_connection.get_page(self.course_url)
if not content:
try:
content = self.instance_connection.get_page(self.course_url)
except Exception:
logger.error("Failed to get course tabs")
raise SystemExit(1)
soup = BeautifulSoup(content, "lxml")
Expand Down Expand Up @@ -508,8 +519,9 @@ def clean_content(html_article):

# get the course url and generate homepage
logger.info("Getting homepage ...")
content = self.instance_connection.get_page(self.course_url)
if not content:
try:
content = self.instance_connection.get_page(self.course_url)
except Exception:
logger.error("Error while getting homepage")
raise SystemExit(1)
self.build_dir.joinpath("home").mkdir(parents=True, exist_ok=True)
Expand Down Expand Up @@ -562,6 +574,15 @@ def clean_content(html_article):
]
concurrent.futures.wait(fs, return_when=concurrent.futures.ALL_COMPLETED)

if BaseXblock.too_many_failures():
logger.error("Stopping scrapper because too many errors occured while getting content")
if self.debug:
print("Xblock download failure details:", file=sys.stderr)
json.dump(BaseXblock.watcher.failed_xblocks, sys.stderr, indent=4)
return False

return True

def s3_credentials_ok(self):
logger.info("Testing S3 Optimization Cache credentials ...")
self.s3_storage = KiwixStorage(self.s3_url_with_credentials)
Expand Down Expand Up @@ -854,7 +875,8 @@ def run(self):
self.prepare_mooc_data()
self.parse_course_xblocks()
self.annex()
self.get_content()
if not self.get_content():
return
self.render()
if not self.no_zim:
self.fname = (
Expand Down
5 changes: 3 additions & 2 deletions openedx2zim/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -86,8 +86,9 @@ def download_and_convert_subtitles(output_path, subtitles, instance_connection):
subtitle_file = pathlib.Path(output_path).joinpath(f"{lang}.vtt")
if not subtitle_file.exists():
try:
raw_subtitle = instance_connection.get_page(subtitles[lang])
if not raw_subtitle:
try:
raw_subtitle = instance_connection.get_page(subtitles[lang])
except Exception:
logger.error(f"Subtitle fetch failed from {subtitles[lang]}")
continue
subtitle = html.unescape(
Expand Down
46 changes: 46 additions & 0 deletions openedx2zim/xblocks_extractor/base_xblock.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,27 @@
from multiprocessing import Lock
from slugify import slugify

from ..constants import getLogger


class ExtractionWatcher:
    """Shared counters tracking xblock extraction progress and failures.

    A single instance is shared (via ``BaseXblock.watcher``) by all xblock
    extractors so the scraper can decide whether too many downloads failed.

    Attributes:
        total_count: number of xblocks discovered (instances created).
        dl_count: number of download attempts started.
        success_count: number of downloads that completed without error.
        failed_xblocks: per-failure description dicts, for debug reporting.
    """

    def __init__(self):
        # Counters are instance attributes (not class attributes) so that
        # each watcher instance owns its own state; in particular the
        # failed_xblocks list must NOT be a class attribute, or every
        # instance would silently share (and mutate) the same list.
        self.total_count = 0
        self.dl_count = 0
        self.success_count = 0
        self.failed_xblocks = []


logger = getLogger()


class BaseXblock:

watcher = ExtractionWatcher()
watcher_min_dl_count = 0
watcher_min_ratio = 0

lock = Lock()

def __init__(
self, xblock_json, output_path, root_url, xblock_id, descendants, scraper
):
Expand All @@ -18,7 +38,33 @@ def __init__(
# make xblock output directory
self.output_path.mkdir(parents=True, exist_ok=True)

self.watcher.total_count += 1

@classmethod
def too_many_failures(cls):
    """Return True when the download success ratio fell below the threshold.

    Only trips after more than ``watcher_min_dl_count`` downloads have been
    attempted, so a few early failures do not abort the whole scrape.
    """
    attempted = cls.watcher.dl_count
    if attempted <= cls.watcher_min_dl_count:
        # not enough samples yet to judge the failure ratio
        return False
    return (cls.watcher.success_count / attempted) < cls.watcher_min_ratio

def download(self, instance_connection):
    """Download this xblock's content, updating the shared watcher counters.

    Acts as a circuit breaker: once too many downloads have failed, new
    downloads are skipped entirely so worker threads wind down instead of
    issuing more doomed requests. Failures are recorded (with the error
    message, for the debug report) rather than propagated.
    """
    if BaseXblock.too_many_failures():
        # circuit breaker tripped: stop issuing new downloads
        return
    with self.lock:
        self.watcher.dl_count += 1
        # lazy %-style args: formatting is skipped when DEBUG is disabled
        logger.debug(
            "Downloading resource %s of %s (%s success so far)",
            self.watcher.dl_count,
            self.watcher.total_count,
            self.watcher.success_count,
        )
    try:
        self.download_inner(instance_connection=instance_connection)
    except Exception as exc:
        # record the failure with context instead of an empty description,
        # so the debug JSON dump of failed xblocks is actually useful
        self.add_failed({"error": str(exc)})
        return
    with self.lock:
        self.watcher.success_count += 1

def add_failed(self, description):
    """Record this xblock as failed in the shared watcher (thread-safe).

    Note: mutates *description* in place, tagging it with the xblock id
    and the concrete extractor class name before appending it.
    """
    with self.lock:
        description.update(
            {
                "xblock_id": self.xblock_id,
                "class": type(self).__name__,
            }
        )
        self.watcher.failed_xblocks.append(description)

def download_inner(self, instance_connection):
    """Overridable hook performing the actual download work.

    The base implementation is a deliberate no-op; concrete xblock
    extractor subclasses override it. Returns None.
    """

def render(self):
Expand Down
9 changes: 6 additions & 3 deletions openedx2zim/xblocks_extractor/discussion.py
Original file line number Diff line number Diff line change
Expand Up @@ -72,10 +72,13 @@ def get_extra_content(self, soup):
)
)

def download(self, instance_connection):
def download_inner(self, instance_connection):
if self.scraper.forum:
content = instance_connection.get_page(self.xblock_json["student_view_url"])
if not content:
url = self.xblock_json["student_view_url"]
try:
content = instance_connection.get_page(url)
except Exception:
self.add_failed({"url": url})
return
soup = BeautifulSoup(content, "lxml")

Expand Down
9 changes: 6 additions & 3 deletions openedx2zim/xblocks_extractor/drag_and_drop_v2.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,9 +20,12 @@ def __init__(
# extra vars
self.content = None

def download(self, instance_connection):
content = instance_connection.get_page(self.xblock_json["student_view_url"])
if not content:
def download_inner(self, instance_connection):
url = self.xblock_json["student_view_url"]
try:
content = instance_connection.get_page(url)
except Exception:
self.add_failed({"url": url})
return
soup = BeautifulSoup(content, "lxml")
self.content = json.loads(
Expand Down
9 changes: 6 additions & 3 deletions openedx2zim/xblocks_extractor/free_text_response.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,9 +15,12 @@ def __init__(
# extra vars
self.html = ""

def download(self, instance_connection):
content = instance_connection.get_page(self.xblock_json["student_view_url"])
if not content:
def download_inner(self, instance_connection):
url = self.xblock_json["student_view_url"]
try:
content = instance_connection.get_page(url)
except Exception:
self.add_failed({"url": url})
return
soup = BeautifulSoup(content, "lxml")
html_content = soup.find("div", attrs={"class": "edx-notes-wrapper"})
Expand Down
9 changes: 6 additions & 3 deletions openedx2zim/xblocks_extractor/html.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,9 +15,12 @@ def __init__(
self.is_video = False # check this
self.html = ""

def download(self, instance_connection):
content = instance_connection.get_page(self.xblock_json["student_view_url"])
if not content:
def download_inner(self, instance_connection):
url = self.xblock_json["student_view_url"]
try:
content = instance_connection.get_page(url)
except Exception:
self.add_failed({"url": url})
return
soup = BeautifulSoup(content, "lxml")
html_content = soup.find("div", attrs={"class": "xblock"})
Expand Down
15 changes: 9 additions & 6 deletions openedx2zim/xblocks_extractor/libcast.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,9 +15,12 @@ def __init__(
# extra vars
self.subs = []

def download(self, instance_connection):
content = instance_connection.get_page(self.xblock_json["student_view_url"])
if not content:
def download_inner(self, instance_connection):
url = self.xblock_json["student_view_url"]
try:
content = instance_connection.get_page(url)
except Exception:
self.add_failed({"url": url})
return
soup = BeautifulSoup(content, "lxml")
url = str(soup.find("video").find("source")["src"])
Expand All @@ -44,9 +47,9 @@ def download(self, instance_connection):
else:
video_path = self.output_path.joinpath("video.mp4")
if not video_path.exists():
self.scraper.download_file(
prepare_url(url, self.scraper.instance_url), video_path
)
prepared_url = prepare_url(url, self.scraper.instance_url)
if not self.scraper.download_file(prepared_url, video_path):
self.add_failed({"url": prepared_url})

def render(self):
return jinja(
Expand Down
8 changes: 5 additions & 3 deletions openedx2zim/xblocks_extractor/lti.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,14 +13,16 @@ def __init__(
xblock_json, relative_path, root_url, xblock_id, descendants, scraper
)

def download(self, instance_connection):
def download_inner(self, instance_connection):
# IMPROUVEMENT LTI can be lot of content type ? Here pdf
url = (
self.xblock_json["lms_web_url"].replace("/jump_to/", "/xblock/")
+ "/handler/preview_handler"
)
content = instance_connection.get_page(url)
if not content:
try:
content = instance_connection.get_page(url)
except Exception:
self.add_failed({"url": url})
return
soup = BeautifulSoup(content, "lxml")
content_url = soup.find("form")
Expand Down
9 changes: 6 additions & 3 deletions openedx2zim/xblocks_extractor/problem.py
Original file line number Diff line number Diff line change
Expand Up @@ -200,12 +200,15 @@ def clean_problem_content(self, soup):
for span in soup.find_all("span", attrs={"class": "sr"}):
span.decompose()

def download(self, instance_connection):
def download_inner(self, instance_connection):
""" download the problem xblock content from the instance """

# try to fetch content
content = instance_connection.get_page(self.xblock_json["student_view_url"])
if not content:
url = self.xblock_json["student_view_url"]
try:
content = instance_connection.get_page(url)
except Exception:
self.add_failed({"url": url})
return
raw_soup = BeautifulSoup(content, "lxml")
self.xmodule_handler = str(
Expand Down
Loading

0 comments on commit 53db630

Please sign in to comment.