Add support for xuetangx.com #412

Open: wants to merge 6 commits into master
README.md (1 change: 1 addition & 0 deletions)
@@ -114,6 +114,7 @@ These are the current supported sites:
- [France Université Numérique](https://www.france-universite-numerique-mooc.fr/)
- [GW Online SEAS](http://openedx.seas.gwu.edu/) - George Washington University
- [GW Online Open](http://mooc.online.gwu.edu/) - George Washington University
- [Xuetangx (学堂在线)](http://www.xuetangx.com/)

This is the full [list of sites powered by Open edX][sites]. Not all of them
are supported at the moment; we welcome you to contribute support for them
edx_dl/edx_dl.py (137 changes: 122 additions & 15 deletions)
@@ -14,13 +14,14 @@
import pickle
import re
import sys
import math

from functools import partial
from multiprocessing.dummy import Pool as ThreadPool

from six.moves.http_cookiejar import CookieJar
from six.moves.urllib.error import HTTPError, URLError
from six.moves.urllib.parse import urlencode
from six.moves.urllib.parse import urlencode, quote
from six.moves.urllib.request import (
urlopen,
build_opener,
@@ -93,19 +94,25 @@
'bits':{
'url':'http://any-learn.bits-pilani.ac.in',
'courseware-selector': ('nav', {'aria-label': 'Course Navigation'}),
},
'xuetangx': {
'url': 'http://www.xuetangx.com',
'courseware-selector': None,
}
}
SITE_NAME = 'edx'
BASE_URL = OPENEDX_SITES[SITE_NAME]['url']
EDX_HOMEPAGE = BASE_URL + '/login_ajax'
LOGIN_API = BASE_URL + '/login_ajax'
DASHBOARD = BASE_URL + '/dashboard'
COURSEWARE_SEL = OPENEDX_SITES[SITE_NAME]['courseware-selector']


def change_openedx_site(site_name):
"""
Changes the openedx website for the given one via the key
"""
global SITE_NAME
global BASE_URL
global EDX_HOMEPAGE
global LOGIN_API
@@ -117,11 +124,15 @@ def change_openedx_site(site_name):
logging.error("OpenEdX platform should be one of: %s", ', '.join(sites))
sys.exit(ExitCode.UNKNOWN_PLATFORM)

SITE_NAME = site_name
BASE_URL = OPENEDX_SITES[SITE_NAME]['url']
EDX_HOMEPAGE = BASE_URL + '/login_ajax'
LOGIN_API = BASE_URL + '/login_ajax'
if site_name == 'xuetangx':
DASHBOARD = BASE_URL + '/api/web/courses/mycourses?format=json'
else:
DASHBOARD = BASE_URL + '/dashboard'
COURSEWARE_SEL = OPENEDX_SITES[SITE_NAME]['courseware-selector']
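
# Hedged usage example: switching to Xuetangx points DASHBOARD at the
# JSON API rather than the HTML dashboard (values follow from the code
# above):
#
#     change_openedx_site('xuetangx')
#     # DASHBOARD is now
#     # 'http://www.xuetangx.com/api/web/courses/mycourses?format=json'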


def _display_courses(courses):
@@ -135,10 +146,67 @@ def _display_courses(courses):
logging.info(' %s', course.url)


def get_courses_info_xuetangx(url, headers):
"""
Extracts the courses information from the dashboard.

This function is re-implemented for http://www.xuetangx.com because
Xuetangx serves course listings through a JSON REST API rather than the
HTML dashboard used by other Open edX sites.
def fetch_and_parse(base_url, param):
"""
Fetches the JSON API and returns the total count together with a list
of dicts for the results on the current page.

:param base_url: the URL of the API.
:param param: query parameters, represented by a list of tuples.
:return: a (total, results) tuple; (0, []) on failure.
"""
url = base_url + '?' + urlencode(param)
page = get_page_contents(url, headers)
try:
d = json.loads(page)
total = d['total']
results = d['results']
# ValueError also covers json.JSONDecodeError, which only exists
# on Python 3.5+; Python 2's json raises plain ValueError
except (ValueError, KeyError):
total = 0
results = []
return total, results

logging.info('Extracting course information from JSON API.')

api_url = BASE_URL + '/api/web/courses/mycourses'
query_params = [
[('type', 'started'), ('format', 'json')],
[('type', 'ended'), ('format', 'json')]
]
# use the default page size and fetch page by page, in case the API
# enforces a hard limit on the number of results per request
page_size = 10

courses = []
page_extractor = get_page_extractor(url)

for param in query_params:
total, results = fetch_and_parse(api_url, param)
page_count = int(math.ceil(1.0 * total / page_size))
for i in range(page_count):
if i:
# the first page was already fetched above; later pages need
# an explicit offset
new_param = param + [('offset', i * page_size)]
_, results = fetch_and_parse(api_url, new_param)
courses += page_extractor.extract_courses(results, BASE_URL)

return courses
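
# A minimal sketch of the pagination arithmetic above, assuming the
# API's {'total': ..., 'results': [...]} response shape; illustrative
# only, not called anywhere in edx-dl:
def _pagination_offsets_example(total, page_size=10):
    """E.g. total=23, page_size=10 -> [0, 10, 20]."""
    page_count = int(math.ceil(1.0 * total / page_size))
    return [i * page_size for i in range(page_count)]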


def get_courses_info(url, headers):
"""
Extracts the courses information from the dashboard.
"""
if SITE_NAME == 'xuetangx':
return get_courses_info_xuetangx(url, headers)

logging.info('Extracting course information from dashboard.')

page = get_page_contents(url, headers)
@@ -310,6 +378,14 @@ def parse_args():
default=False,
help='list available sections')

parser.add_argument('--quality',
dest='quality',
action='store',
choices=['high', 'standard'],
default='high',
help='quality of video to download; works for xuetangx'
' only')

parser.add_argument('--youtube-dl-options',
dest='youtube_dl_options',
action='store',
@@ -437,6 +513,9 @@ def extract_units(url, headers, file_formats):

page = get_page_contents(url, headers)
page_extractor = get_page_extractor(url)
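# only extractors that expose set_headers() (currently the Xuetangx
# one) need the session headers for follow-up API calls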
set_headers = getattr(page_extractor, 'set_headers', None)
if callable(set_headers):
set_headers(headers)
units = page_extractor.extract_units_from_html(page, BASE_URL, file_formats)

return units
@@ -666,27 +745,45 @@ def _build_subtitles_downloads(video, target_dir, filename_prefix, headers):
return downloads


def _build_url_downloads(urls, target_dir, filename_prefix, args,
is_video=False):
"""
Builds a dict {url: filename} for the given urls
If it is a youtube url it uses the valid template for youtube-dl
otherwise just takes the name of the file from the url
"""
if SITE_NAME == 'xuetangx' and is_video and urls:
# on Xuetangx, the URLs of high-quality videos sort
# lexicographically after the standard-quality ones
# ('quality20' > 'quality10')
urls = [max(urls)] if args.quality == 'high' else [min(urls)]
downloads = {url:
_build_filename_from_url(url, target_dir, filename_prefix,
is_video=is_video)
for url in urls}
return downloads
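
# A small self-checking example of the quality trick above; the URLs are
# made up but follow the 'quality10' < 'quality20' naming that the code
# relies on (illustrative only, not called by edx-dl):
def _quality_pick_example():
    urls = ['http://example.com/v-quality10.mp4',
            'http://example.com/v-quality20.mp4']
    assert max(urls).endswith('quality20.mp4')  # 'high'
    assert min(urls).endswith('quality10.mp4')  # 'standard'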


def _build_filename_from_url(url, target_dir, filename_prefix, is_video=False,
video_counter=[0]):
"""
Builds the appropriate filename for the given args
"""
# video file names on Xuetangx are not human-readable, so number the
# videos instead; the mutable default `video_counter` deliberately
# persists across calls to keep the count running
if is_video:
video_counter[0] += 1

if is_youtube_url(url):
filename_template = filename_prefix + "-%(title)s-%(id)s.%(ext)s"
filename = os.path.join(target_dir, filename_template)
else:
if SITE_NAME == 'xuetangx' and is_video:
original_filename = 'video_%05d.mp4' % video_counter[0]
else:
original_filename = url.rsplit('/', 1)[1]
# remove special characters that may cause problems under Windows
original_filename = ''.join(c for c in original_filename
if c not in ';/?:@&=+$,')
filename = os.path.join(target_dir,
filename_prefix + '-' + original_filename)

@@ -697,6 +794,8 @@ def download_url(url, filename, headers, args):
"""
Downloads the given url in filename.
"""
# percent-encode any non-ASCII characters in the URL, keeping the
# reserved characters intact
url = quote(url, safe=';/?:@&=+$,')
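# e.g. on Python 3, a made-up URL such as u'http://example.com/课件.mp4'
# becomes 'http://example.com/%E8%AF%BE%E4%BB%B6.mp4'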

if is_youtube_url(url):
download_youtube_url(url, filename, headers, args)
@@ -779,13 +878,15 @@ def skip_or_download(downloads, headers, args, f=download_url):
def download_video(video, args, target_dir, filename_prefix, headers):
if args.prefer_cdn_videos or video.video_youtube_url is None:
mp4_downloads = _build_url_downloads(video.mp4_urls, target_dir,
filename_prefix, args,
is_video=True)
skip_or_download(mp4_downloads, headers, args)
else:
if video.video_youtube_url is not None:
youtube_downloads = _build_url_downloads([video.video_youtube_url],
target_dir,
filename_prefix, args,
is_video=True)
skip_or_download(youtube_downloads, headers, args)

# the behavior with subtitles is different, since the subtitles don't know
@@ -813,7 +914,7 @@ def download_unit(unit, args, target_dir, filename_prefix, headers):
download_video(video, args, target_dir, new_prefix, headers)

res_downloads = _build_url_downloads(unit.resources_urls, target_dir,
filename_prefix, args)
skip_or_download(res_downloads, headers, args)


@@ -827,13 +928,19 @@ def download(args, selections, all_units, headers):
# notice that we could iterate over all_units, but we prefer to do it over
# sections/subsections to add correct prefixes and show nicer information.

# course names on Xuetangx may contain Chinese characters
preserve_non_ascii = (SITE_NAME == 'xuetangx')

for selected_course, selected_sections in selections.items():
coursename = directory_name(selected_course.name,
minimal_change=preserve_non_ascii)
for selected_section in selected_sections:
section_dirname = "%02d-%s" % (selected_section.position,
selected_section.name)
target_dir = os.path.join(args.output_dir, coursename,
clean_filename(section_dirname,
minimal_change=preserve_non_ascii))
mkdir_p(target_dir)
counter = 0
for subsection in selected_section.subsections:
edx_dl/parsing.py (77 changes: 76 additions & 1 deletion)
@@ -5,13 +5,15 @@
"""
import re
import json
import logging

from datetime import timedelta, datetime

from six.moves import html_parser
from bs4 import BeautifulSoup as BeautifulSoup_

from .common import Course, Section, SubSection, Unit, Video
from .utils import get_page_contents, remove_blanks


# Force use of bs4 with html.parser
@@ -188,7 +190,9 @@ def extract_resources_urls(self, text, BASE_URL, file_formats):
youtube_links = re_youtube_links.findall(text)
resources_urls += youtube_links

# strip any stray whitespace the HTML extraction may have left
# in the URLs
return list(map(remove_blanks, resources_urls))

def extract_sections_from_html(self, page, BASE_URL):
"""
@@ -408,6 +412,75 @@ def _make_subsections(section_soup):
return sections


class XuetangxPageExtractor(ClassicEdXPageExtractor):

def __init__(self):
self.headers = None
self.base_url = None

def set_headers(self, headers):
"""Sets the headers necessary for accessing the video URL API"""
self.headers = headers

def extract_courses(self, results, BASE_URL):
"""
Extract courses from a list of dicts.
"""
courses = []

for result in results:
try:
course_id = result['id']
course_name = result['name']
course_url = BASE_URL + result['info_link']
# Xuetangx allows accessing materials for all archived courses,
# so it's safe to mark all courses as 'Started'.
course_state = 'Started'
except KeyError:
continue
courses.append(Course(id=course_id,
name=course_name,
url=course_url,
state=course_state))

return courses
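
# Hedged example of a single entry in `results`; the field names are the
# ones read above, the values are invented:
#
#     {"id": 42, "name": "Example Course", "info_link": "/courses/demo/about"}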

def extract_units_from_html(self, page, BASE_URL, file_formats):
self.base_url = BASE_URL
return ClassicEdXPageExtractor.extract_units_from_html(self, page,
BASE_URL,
file_formats)

def extract_mp4_urls(self, text):
"""
Looks for available links to the mp4 version of the videos
"""
# Xuetangx does not embed video URLs directly in the page; instead,
# the page carries a video id that the "videoid2source" API resolves
# into the actual URLs.
m = re.search('(?<=data-ccsource=&#39;).+?(?=&#39;)', text)
if not m:
return []

video_id = m.group(0)
if not self.base_url:
logging.debug('Base URL unset; please set self.base_url before '
'calling extract_mp4_urls')
return []
video_src_url = self.base_url + '/videoid2source/' + video_id
video_src_json = get_page_contents(video_src_url, self.headers)
try:
sources = json.loads(video_src_json)['sources']
except (ValueError, KeyError):
return []

mp4_urls = []
for quality in sources:
if sources[quality]:
mp4_urls.append(sources[quality][0])
return mp4_urls
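
# A minimal sketch of the videoid2source payload the loop above expects
# (quality keys and single-element lists are inferred from this parsing
# code, not from API documentation); illustrative only:
def _sources_payload_example():
    payload = ('{"sources": {"quality10": ["http://example.com/std.mp4"],'
               ' "quality20": ["http://example.com/high.mp4"]}}')
    sources = json.loads(payload)['sources']
    return [sources[quality][0] for quality in sources if sources[quality]]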


def get_page_extractor(url):
"""
Factory method for page extractors.
@@ -423,6 +496,8 @@ def get_page_extractor(url):
url.startswith('https://www.fun-mooc.fr')
):
return CurrentEdXPageExtractor()
elif 'xuetangx.com' in url:
return XuetangxPageExtractor()
else:
return ClassicEdXPageExtractor()
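
# Hedged usage sketch (the course URL is made up): the URL alone decides
# which extractor is returned, so Xuetangx pages get the API-based one:
#
#     extractor = get_page_extractor('http://www.xuetangx.com/courses/x/y')
#     # -> a XuetangxPageExtractor instance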
