mirror of
https://github.com/coursera-dl/coursera-dl.git
synced 2026-01-23 02:35:37 +00:00
Now specialization names can be passed and they will be expanded: corresponding child classes will be downloaded.
239 lines
9 KiB
Python
239 lines
9 KiB
Python
"""
|
|
This module contains implementation for extractors. Extractors know how
|
|
to parse the site of a MOOC platform and return a list of modules to download.
|
|
Usually they do not download heavy content, except when necessary
|
|
to parse course syllabus.
|
|
"""
|
|
|
|
import abc
|
|
import json
|
|
import logging
|
|
|
|
from .api import (CourseraOnDemand, OnDemandCourseMaterialItemsV1,
|
|
ModulesV1, LessonsV1, ItemsV2)
|
|
from .define import OPENCOURSE_ONDEMAND_COURSE_MATERIALS_V2
|
|
from .network import get_page
|
|
from .utils import is_debug_run, spit_json
|
|
|
|
|
|
class PlatformExtractor(abc.ABCMeta(str('_PlatformExtractorBase'),
                                    (object,), {})):
    """
    Base class for platform extractors.

    The metaclass is applied by inheriting from a throwaway class created
    with abc.ABCMeta, because a ``__metaclass__`` class attribute is only
    honoured by Python 2 and is silently ignored by Python 3. This form
    works on both.

    get_modules is intentionally not decorated with abc.abstractmethod, so
    instantiating this class directly keeps working as it did before.
    """

    def get_modules(self):
        """
        Get course modules.

        The base implementation does nothing and returns None; subclasses
        provide the real behaviour.
        """
        pass
|
|
|
|
|
|
class CourseraExtractor(PlatformExtractor):
    """
    Extractor for Coursera on-demand courses.

    Downloads the course materials listing of a class and turns it into a
    nested list of modules, sections and lectures together with the links
    extracted for every lecture.
    """

    def __init__(self, session):
        """
        @param session: Session object that is handed to the network and
            API helpers used by this extractor.
        """
        # Set to True after the first notebook extraction, so that
        # notebooks are requested at most once per extractor instance.
        self._notebook_downloaded = False
        self._session = session

    def list_courses(self):
        """
        List enrolled courses.

        @return: List of enrolled courses.
        @rtype: [str]
        """
        course = CourseraOnDemand(session=self._session,
                                  course_id=None,
                                  course_name=None)
        return course.list_courses()

    def get_modules(self, class_name,
                    reverse=False, unrestricted_filenames=False,
                    subtitle_language='en', video_resolution=None,
                    download_quizzes=False, mathjax_cdn_url=None,
                    download_notebooks=False):
        """
        Get course modules of an on-demand class.

        Downloads the syllabus page of the class and parses it; all
        optional arguments are passed through unchanged to
        _parse_on_demand_syllabus.

        @param class_name: Name of the class, used to build the syllabus
            URL and passed on as the course name.

        @return: Tuple of (bool, list), where bool indicates whether
            there was at least one error while parsing syllabus, the list
            is a list of parsed modules.
        @rtype: (bool, list)
        """
        page = self._get_on_demand_syllabus(class_name)
        error_occurred, modules = self._parse_on_demand_syllabus(
            class_name,
            page, reverse, unrestricted_filenames,
            subtitle_language, video_resolution,
            download_quizzes, mathjax_cdn_url, download_notebooks)

        return error_occurred, modules

    def _get_on_demand_syllabus(self, class_name):
        """
        Get the on-demand course listing webpage.

        @param class_name: Name of the class, substituted into the course
            materials URL template.

        @return: Downloaded page content as returned by get_page.
        """
        url = OPENCOURSE_ONDEMAND_COURSE_MATERIALS_V2.format(
            class_name=class_name)
        page = get_page(self._session, url)
        logging.debug('Downloaded %s (%d bytes)', url, len(page))

        return page

    def _parse_on_demand_syllabus(self, course_name, page, reverse=False,
                                  unrestricted_filenames=False,
                                  subtitle_language='en',
                                  video_resolution=None,
                                  download_quizzes=False,
                                  mathjax_cdn_url=None,
                                  download_notebooks=False
                                  ):
        """
        Parse a Coursera on-demand course listing/syllabus page.

        @param course_name: Name of the course; passed to the API helpers
            and used as prefix of the debug dump file names.
        @param page: JSON text of the syllabus page.
        @param reverse: When true, the list of parsed modules is reversed
            (the "Resources" entry, if any, is still appended last).
        @param unrestricted_filenames: Passed to CourseraOnDemand.
        @param subtitle_language: Passed to the lecture link extraction.
        @param video_resolution: Passed to the lecture link extraction.
        @param download_quizzes: When false, items of type quiz, exam and
            programming are not extracted.
        @param mathjax_cdn_url: Passed to CourseraOnDemand.
        @param download_notebooks: When false, notebook items are not
            extracted.

        @return: Tuple of (bool, list), where bool indicates whether
            there was at least one error while parsing syllabus, the list
            is a list of parsed modules. Every module is a tuple
            (module slug, lessons), every lesson is a tuple
            (section slug, lectures) and every lecture is a tuple
            (lecture slug, links).
        @rtype: (bool, list)
        """
        dom = json.loads(page)
        class_id = dom['elements'][0]['id']

        logging.info('Parsing syllabus of on-demand course (id=%s). '
                     'This may take some time, please be patient ...',
                     class_id)
        modules = []

        linked = dom['linked']
        json_modules = linked['onDemandCourseMaterialItems.v2']
        course = CourseraOnDemand(
            session=self._session, course_id=class_id,
            course_name=course_name,
            unrestricted_filenames=unrestricted_filenames,
            mathjax_cdn_url=mathjax_cdn_url)
        course.obtain_user_id()
        ondemand_material_items = OnDemandCourseMaterialItemsV1.create(
            session=self._session, course_name=course_name)

        if is_debug_run():
            spit_json(dom, '%s-syllabus-raw.json' % course_name)
            spit_json(json_modules, '%s-material-items-v2.json' % course_name)
            spit_json(ondemand_material_items._items,
                      '%s-course-material-items.json' % course_name)

        error_occurred = False

        all_modules = ModulesV1.from_json(
            linked['onDemandCourseMaterialModules.v1'])
        all_lessons = LessonsV1.from_json(
            linked['onDemandCourseMaterialLessons.v1'])
        all_items = ItemsV2.from_json(json_modules)

        for module in all_modules:
            logging.info('Processing module %s', module.slug)
            lessons = []
            for section in module.children(all_lessons):
                logging.info('Processing section %s', section.slug)
                lectures = []
                available_lectures = section.children(all_items)

                # Certain modules may be empty-looking programming
                # assignments e.g. in data-structures, algorithms-on-graphs
                # ondemand courses
                if not available_lectures:
                    fallback = ondemand_material_items.get(section.id)
                    if fallback is not None:
                        available_lectures = [fallback]

                for lecture in available_lectures:
                    links = self._extract_lecture_links(
                        course, class_id, lecture,
                        subtitle_language, video_resolution,
                        download_quizzes, download_notebooks)

                    # Empty dictionary means there were no data
                    # None means an error occurred
                    if links is None:
                        error_occurred = True
                    elif links:
                        lectures.append((lecture.slug, links))

                if lectures:
                    lessons.append((section.slug, lectures))

            if lessons:
                modules.append((module.slug, lessons))

        if modules and reverse:
            modules.reverse()

        # Processing resources section
        resources_failed, references = self._parse_resources(course)
        if resources_failed:
            error_occurred = True

        if references:
            modules.append(("Resources", references))

        return error_occurred, modules

    def _extract_lecture_links(self, course, class_id, lecture,
                               subtitle_language, video_resolution,
                               download_quizzes, download_notebooks):
        """
        Extract links of a single syllabus item according to its type.

        @return: Whatever the matching course.extract_links_from_* call
            returns. An empty dictionary is returned when the item type is
            unsupported or when extraction of this type is switched off,
            so that the caller neither records the item nor flags an error.
        """
        typename = lecture.type_name

        logging.info('Processing lecture %s (%s)',
                     lecture.slug, typename)

        if typename == 'lecture':
            return course.extract_links_from_lecture(
                class_id,
                lecture.id, subtitle_language,
                video_resolution)

        if typename == 'supplement':
            return course.extract_links_from_supplement(lecture.id)

        if typename == 'phasedPeer':
            return course.extract_links_from_peer_assignment(lecture.id)

        if typename in ('gradedProgramming', 'ungradedProgramming'):
            return course.extract_links_from_programming(lecture.id)

        if typename == 'quiz':
            if download_quizzes:
                return course.extract_links_from_quiz(lecture.id)
            return {}

        if typename == 'exam':
            if download_quizzes:
                return course.extract_links_from_exam(lecture.id)
            return {}

        if typename == 'programming':
            if download_quizzes:
                return course.extract_links_from_programming_immediate_instructions(
                    lecture.id)
            return {}

        if typename == 'notebook':
            if not download_notebooks or self._notebook_downloaded:
                return {}
            logging.warning(
                'According to notebooks platform, content will be downloaded first')
            links = course.extract_links_from_notebook(lecture.id)
            # Only mark as done after the extraction call has returned.
            self._notebook_downloaded = True
            return links

        logging.info(
            'Unsupported typename "%s" in lecture "%s" (lecture id "%s")',
            typename, lecture.slug, lecture.id)
        return {}

    def _parse_resources(self, course):
        """
        Extract links of the course resources (references) section.

        @return: Tuple of (bool, list), where bool tells whether link
            extraction failed (returned None) for at least one resource
            and the list holds (resource slug, [('', links)]) tuples for
            every resource that produced links.
        @rtype: (bool, list)
        """
        error_occurred = False
        references = []

        json_references = course.extract_references_poll()
        if not json_references:
            return error_occurred, references

        logging.info('Processing resources')
        for json_reference in json_references:
            reference_slug = json_reference['slug']
            logging.info('Processing resource %s',
                         reference_slug)

            links = course.extract_links_from_reference(
                json_reference['shortId'])
            if links is None:
                error_occurred = True
            elif links:
                references.append((reference_slug, [('', links)]))

        return error_occurred, references
|