Skip to content

Commit

Permalink
Merge pull request #178 from openzim/fail_on_too_many_errors
Browse files Browse the repository at this point in the history
Fail scraper when there are too many errors while retrieving xblocks
  • Loading branch information
benoit74 committed Jul 14, 2023
2 parents c2e01c2 + 78f1650 commit 53db630
Show file tree
Hide file tree
Showing 17 changed files with 268 additions and 126 deletions.
4 changes: 4 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,3 +1,7 @@
# Unreleased

- scraper will fail when there are too many errors while retrieving xblocks

# 1.0.1

- fixed recursive paths and URLs in html_processor.py
Expand Down
8 changes: 4 additions & 4 deletions openedx2zim/annex.py
Original file line number Diff line number Diff line change
Expand Up @@ -355,14 +355,14 @@ def annex_wiki(self):
while page_to_visit:
url = page_to_visit.pop()
self.add_to_wiki_data(url)
content = self.scraper.instance_connection.get_page(url)
# Parse content page
if content:
try:
content = self.scraper.instance_connection.get_page(url)
# Parse content page
soup = BeautifulSoup(content, "lxml")
text = soup.find("div", attrs={"class": "wiki-article"})
if text: # If it's a page (and not a list of page)
self.update_wiki_page(soup, text, url, page_to_visit)
else:
except Exception:
self.wiki_data[url][
"text"
] = """<div><h1 class="page-header">Permission Denied</h1><p class="alert denied">Sorry, you don't have permission to view this page.</p></div>"""
Expand Down
14 changes: 14 additions & 0 deletions openedx2zim/entrypoint.py
Original file line number Diff line number Diff line change
Expand Up @@ -202,6 +202,20 @@ def main():
default=1,
)

parser.add_argument(
"--watcher-min-dl-count",
help="[dev] Minimum number of resources to have downloaded before considering to stop on errors",
type=int,
default=50,
)

parser.add_argument(
"--watcher-min-ratio",
help="Minimum ratio of resources processed successfully. If ratio is below this threshold, the scrapper will stop.",
type=float,
default=0.9,
)

parser.add_argument(
"--version",
help="Display scraper version and exit",
Expand Down
5 changes: 3 additions & 2 deletions openedx2zim/html_processor.py
Original file line number Diff line number Diff line change
Expand Up @@ -392,8 +392,9 @@ def download_iframes_from_html(
else:
# handle iframe recursively
iframe_url = prepare_url(src, netloc)
src_content = self.scraper.instance_connection.get_page(iframe_url)
if not src_content:
try:
src_content = self.scraper.instance_connection.get_page(iframe_url)
except Exception:
continue
path_recursive, netloc_recursive = self.get_path_and_netloc_to_send(
netloc, path_on_server, iframe_url
Expand Down
6 changes: 5 additions & 1 deletion openedx2zim/instance_connection.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,10 @@
logger = getLogger()


class GetResponseFailed(Exception):
    """Raised when a URL could not be fetched after exhausting all retry attempts."""


def get_response(url, post_data, headers, max_attempts=5):
req = urllib.request.Request(url, post_data, headers)
for attempt in range(max_attempts):
Expand All @@ -21,7 +25,7 @@ def get_response(url, post_data, headers, max_attempts=5):
else:
logger.debug(f"Error opening {url}: {exc}")
logger.error(f"Max attempts exceeded for {url}")
return {}
raise GetResponseFailed()


class InstanceConnection:
Expand Down
36 changes: 29 additions & 7 deletions openedx2zim/scraper.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@

import concurrent.futures
import datetime
import json
import locale
import os
import pathlib
Expand Down Expand Up @@ -64,6 +65,7 @@
from .xblocks_extractor.unavailable import Unavailable
from .xblocks_extractor.vertical import Vertical
from .xblocks_extractor.video import Video
from .xblocks_extractor.base_xblock import BaseXblock

XBLOCK_EXTRACTORS = {
"course": Course,
Expand Down Expand Up @@ -119,6 +121,8 @@ def __init__(
keep_build_dir,
debug,
threads,
watcher_min_dl_count,
watcher_min_ratio,
):

# video-encoding info
Expand Down Expand Up @@ -152,6 +156,10 @@ def __init__(
self.threads = threads
self.yt_downloader = YoutubeDownloader(threads=1)

# resource processing watcher
BaseXblock.watcher_min_dl_count = watcher_min_dl_count
BaseXblock.watcher_min_ratio = watcher_min_ratio

# authentication
self.email = email
self.password = password
Expand Down Expand Up @@ -230,6 +238,7 @@ def prepare_mooc_data(self):
self.instance_config["course_prefix"],
self.instance_url,
)
logger.debug(f"Course ID: {self.course_id}")
logger.info("Getting course info ...")
self.course_info = self.instance_connection.get_api_json(
"/api/courses/v1/courses/"
Expand Down Expand Up @@ -329,8 +338,9 @@ def get_book_list(self, book, output_path):
def annex_extra_page(self, tab_href, tab_org_path):
output_path = self.build_dir.joinpath(tab_org_path)
output_path.mkdir(parents=True, exist_ok=True)
page_content = self.instance_connection.get_page(self.instance_url + tab_href)
if not page_content:
try:
page_content = self.instance_connection.get_page(self.instance_url + tab_href)
except Exception:
logger.error(f"Failed to get page content for tab {tab_org_path}")
raise SystemExit(1)
soup_page = BeautifulSoup(page_content, "lxml")
Expand Down Expand Up @@ -403,8 +413,9 @@ def get_tab_path_and_name(self, tab_text, tab_href):

def get_course_tabs(self):
logger.info("Getting course tabs ...")
content = self.instance_connection.get_page(self.course_url)
if not content:
try:
content = self.instance_connection.get_page(self.course_url)
except Exception:
logger.error("Failed to get course tabs")
raise SystemExit(1)
soup = BeautifulSoup(content, "lxml")
Expand Down Expand Up @@ -508,8 +519,9 @@ def clean_content(html_article):

# get the course url and generate homepage
logger.info("Getting homepage ...")
content = self.instance_connection.get_page(self.course_url)
if not content:
try:
content = self.instance_connection.get_page(self.course_url)
except Exception:
logger.error("Error while getting homepage")
raise SystemExit(1)
self.build_dir.joinpath("home").mkdir(parents=True, exist_ok=True)
Expand Down Expand Up @@ -562,6 +574,15 @@ def clean_content(html_article):
]
concurrent.futures.wait(fs, return_when=concurrent.futures.ALL_COMPLETED)

if BaseXblock.too_many_failures():
logger.error("Stopping scrapper because too many errors occured while getting content")
if self.debug:
print("Xblock download failure details:", file=sys.stderr)
json.dump(BaseXblock.watcher.failed_xblocks, sys.stderr, indent=4)
return False

return True

def s3_credentials_ok(self):
logger.info("Testing S3 Optimization Cache credentials ...")
self.s3_storage = KiwixStorage(self.s3_url_with_credentials)
Expand Down Expand Up @@ -854,7 +875,8 @@ def run(self):
self.prepare_mooc_data()
self.parse_course_xblocks()
self.annex()
self.get_content()
if not self.get_content():
return
self.render()
if not self.no_zim:
self.fname = (
Expand Down
5 changes: 3 additions & 2 deletions openedx2zim/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -86,8 +86,9 @@ def download_and_convert_subtitles(output_path, subtitles, instance_connection):
subtitle_file = pathlib.Path(output_path).joinpath(f"{lang}.vtt")
if not subtitle_file.exists():
try:
raw_subtitle = instance_connection.get_page(subtitles[lang])
if not raw_subtitle:
try:
raw_subtitle = instance_connection.get_page(subtitles[lang])
except Exception:
logger.error(f"Subtitle fetch failed from {subtitles[lang]}")
continue
subtitle = html.unescape(
Expand Down
46 changes: 46 additions & 0 deletions openedx2zim/xblocks_extractor/base_xblock.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,27 @@
from multiprocessing import Lock
from slugify import slugify

from ..constants import getLogger


class ExtractionWatcher:
    """Shared counters tracking xblock extraction progress and failures.

    A single instance is shared (via ``BaseXblock.watcher``) by all xblock
    extractors so the scraper can decide whether too many downloads failed.

    Attributes:
        total_count: number of xblocks discovered (instances created).
        dl_count: number of download attempts started.
        success_count: number of downloads that completed without error.
        failed_xblocks: per-failure description dicts, for debug reporting.
    """

    def __init__(self):
        # Counters are instance attributes (not class attributes) so that
        # each watcher instance owns its own state; in particular the
        # failed_xblocks list must NOT be a class attribute, or every
        # instance would silently share (and mutate) the same list.
        self.total_count = 0
        self.dl_count = 0
        self.success_count = 0
        self.failed_xblocks = []


logger = getLogger()


class BaseXblock:

watcher = ExtractionWatcher()
watcher_min_dl_count = 0
watcher_min_ratio = 0

lock = Lock()

def __init__(
self, xblock_json, output_path, root_url, xblock_id, descendants, scraper
):
Expand All @@ -18,7 +38,33 @@ def __init__(
# make xblock output directory
self.output_path.mkdir(parents=True, exist_ok=True)

self.watcher.total_count += 1

@classmethod
def too_many_failures(cls):
    """Return True when the download success ratio fell below the threshold.

    Only trips after more than ``watcher_min_dl_count`` downloads have been
    attempted, so a few early failures do not abort the whole scrape.
    """
    attempted = cls.watcher.dl_count
    if attempted <= cls.watcher_min_dl_count:
        # not enough samples yet to judge the failure ratio
        return False
    return (cls.watcher.success_count / attempted) < cls.watcher_min_ratio

def download(self, instance_connection):
    """Download this xblock's content, updating the shared watcher counters.

    Acts as a circuit breaker: once too many downloads have failed, new
    downloads are skipped entirely so worker threads wind down instead of
    issuing more doomed requests. Failures are recorded (with the error
    message, for the debug report) rather than propagated.
    """
    if BaseXblock.too_many_failures():
        # circuit breaker tripped: stop issuing new downloads
        return
    with self.lock:
        self.watcher.dl_count += 1
        # lazy %-style args: formatting is skipped when DEBUG is disabled
        logger.debug(
            "Downloading resource %s of %s (%s success so far)",
            self.watcher.dl_count,
            self.watcher.total_count,
            self.watcher.success_count,
        )
    try:
        self.download_inner(instance_connection=instance_connection)
    except Exception as exc:
        # record the failure with context instead of an empty description,
        # so the debug JSON dump of failed xblocks is actually useful
        self.add_failed({"error": str(exc)})
        return
    with self.lock:
        self.watcher.success_count += 1

def add_failed(self, description):
    """Record this xblock as failed in the shared watcher (thread-safe).

    Note: mutates *description* in place, tagging it with the xblock id
    and the concrete extractor class name before appending it.
    """
    with self.lock:
        description.update(
            {
                "xblock_id": self.xblock_id,
                "class": type(self).__name__,
            }
        )
        self.watcher.failed_xblocks.append(description)

def download_inner(self, instance_connection):
    """Overridable hook performing the actual download work.

    The base implementation is a deliberate no-op; concrete xblock
    extractor subclasses override it. Returns None.
    """

def render(self):
Expand Down
9 changes: 6 additions & 3 deletions openedx2zim/xblocks_extractor/discussion.py
Original file line number Diff line number Diff line change
Expand Up @@ -72,10 +72,13 @@ def get_extra_content(self, soup):
)
)

def download(self, instance_connection):
def download_inner(self, instance_connection):
if self.scraper.forum:
content = instance_connection.get_page(self.xblock_json["student_view_url"])
if not content:
url = self.xblock_json["student_view_url"]
try:
content = instance_connection.get_page(url)
except Exception:
self.add_failed({"url": url})
return
soup = BeautifulSoup(content, "lxml")

Expand Down
9 changes: 6 additions & 3 deletions openedx2zim/xblocks_extractor/drag_and_drop_v2.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,9 +20,12 @@ def __init__(
# extra vars
self.content = None

def download(self, instance_connection):
content = instance_connection.get_page(self.xblock_json["student_view_url"])
if not content:
def download_inner(self, instance_connection):
url = self.xblock_json["student_view_url"]
try:
content = instance_connection.get_page(url)
except Exception:
self.add_failed({"url": url})
return
soup = BeautifulSoup(content, "lxml")
self.content = json.loads(
Expand Down
9 changes: 6 additions & 3 deletions openedx2zim/xblocks_extractor/free_text_response.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,9 +15,12 @@ def __init__(
# extra vars
self.html = ""

def download(self, instance_connection):
content = instance_connection.get_page(self.xblock_json["student_view_url"])
if not content:
def download_inner(self, instance_connection):
url = self.xblock_json["student_view_url"]
try:
content = instance_connection.get_page(url)
except Exception:
self.add_failed({"url": url})
return
soup = BeautifulSoup(content, "lxml")
html_content = soup.find("div", attrs={"class": "edx-notes-wrapper"})
Expand Down
9 changes: 6 additions & 3 deletions openedx2zim/xblocks_extractor/html.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,9 +15,12 @@ def __init__(
self.is_video = False # check this
self.html = ""

def download(self, instance_connection):
content = instance_connection.get_page(self.xblock_json["student_view_url"])
if not content:
def download_inner(self, instance_connection):
url = self.xblock_json["student_view_url"]
try:
content = instance_connection.get_page(url)
except Exception:
self.add_failed({"url": url})
return
soup = BeautifulSoup(content, "lxml")
html_content = soup.find("div", attrs={"class": "xblock"})
Expand Down
15 changes: 9 additions & 6 deletions openedx2zim/xblocks_extractor/libcast.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,9 +15,12 @@ def __init__(
# extra vars
self.subs = []

def download(self, instance_connection):
content = instance_connection.get_page(self.xblock_json["student_view_url"])
if not content:
def download_inner(self, instance_connection):
url = self.xblock_json["student_view_url"]
try:
content = instance_connection.get_page(url)
except Exception:
self.add_failed({"url": url})
return
soup = BeautifulSoup(content, "lxml")
url = str(soup.find("video").find("source")["src"])
Expand All @@ -44,9 +47,9 @@ def download(self, instance_connection):
else:
video_path = self.output_path.joinpath("video.mp4")
if not video_path.exists():
self.scraper.download_file(
prepare_url(url, self.scraper.instance_url), video_path
)
prepared_url = prepare_url(url, self.scraper.instance_url)
if not self.scraper.download_file(prepared_url, video_path):
self.add_failed({"url": prepared_url})

def render(self):
return jinja(
Expand Down
8 changes: 5 additions & 3 deletions openedx2zim/xblocks_extractor/lti.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,14 +13,16 @@ def __init__(
xblock_json, relative_path, root_url, xblock_id, descendants, scraper
)

def download(self, instance_connection):
def download_inner(self, instance_connection):
# IMPROUVEMENT LTI can be lot of content type ? Here pdf
url = (
self.xblock_json["lms_web_url"].replace("/jump_to/", "/xblock/")
+ "/handler/preview_handler"
)
content = instance_connection.get_page(url)
if not content:
try:
content = instance_connection.get_page(url)
except Exception:
self.add_failed({"url": url})
return
soup = BeautifulSoup(content, "lxml")
content_url = soup.find("form")
Expand Down
9 changes: 6 additions & 3 deletions openedx2zim/xblocks_extractor/problem.py
Original file line number Diff line number Diff line change
Expand Up @@ -200,12 +200,15 @@ def clean_problem_content(self, soup):
for span in soup.find_all("span", attrs={"class": "sr"}):
span.decompose()

def download(self, instance_connection):
def download_inner(self, instance_connection):
""" download the problem xblock content from the instance """

# try to fetch content
content = instance_connection.get_page(self.xblock_json["student_view_url"])
if not content:
url = self.xblock_json["student_view_url"]
try:
content = instance_connection.get_page(url)
except Exception:
self.add_failed({"url": url})
return
raw_soup = BeautifulSoup(content, "lxml")
self.xmodule_handler = str(
Expand Down
Loading

0 comments on commit 53db630

Please sign in to comment.