From 2ea2e7aa62e175f96531d26151fa941f6ee14f58 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=AD=99=E4=B8=9A=E5=86=9B?= Date: Fri, 10 Mar 2017 01:47:48 +0800 Subject: [PATCH 01/87] improve download subtiles. add support to download certern language's subtile --- coursera/filtering.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/coursera/filtering.py b/coursera/filtering.py index d9ed5d5..14f4116 100644 --- a/coursera/filtering.py +++ b/coursera/filtering.py @@ -94,15 +94,16 @@ def find_resources_to_get(lecture, file_formats, resource_filter, ignored_format logging.info("The following file formats will be ignored: " + ",".join(ignored_formats)) for fmt, resources in iteritems(lecture): - fmt0 = fmt if '.' in fmt: - fmt = fmt.split('.')[1] + short_fmt = fmt.split('.')[1] + else: + short_fmt = None - if fmt in ignored_formats: + if fmt in ignored_formats or short_fmt in ignored_formats: continue - if fmt in file_formats or 'all' in file_formats: + if fmt in file_formats or short_fmt in file_formats or 'all' in file_formats: for r in resources: if resource_filter and r[1] and not re.search(resource_filter, r[1]): logging.debug('Skipping b/c of rf: %s %s', From eec99f64a480dbc4cf34d4ca0cc471e189ff44cd Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=AD=99=E4=B8=9A=E5=86=9B?= Date: Fri, 10 Mar 2017 14:35:53 +0800 Subject: [PATCH 02/87] add support to download certain subtitle --- coursera/filtering.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/coursera/filtering.py b/coursera/filtering.py index 14f4116..4331831 100644 --- a/coursera/filtering.py +++ b/coursera/filtering.py @@ -95,15 +95,15 @@ def find_resources_to_get(lecture, file_formats, resource_filter, ignored_format for fmt, resources in iteritems(lecture): fmt0 = fmt + + short_fmt = None if '.' 
in fmt: short_fmt = fmt.split('.')[1] - else: - short_fmt = None - if fmt in ignored_formats or short_fmt in ignored_formats: + if fmt in ignored_formats or (short_fmt != None and short_fmt in ignored_formats) : continue - if fmt in file_formats or short_fmt in file_formats or 'all' in file_formats: + if fmt in file_formats or (short_fmt != None and short_fmt in file_formats) or 'all' in file_formats: for r in resources: if resource_filter and r[1] and not re.search(resource_filter, r[1]): logging.debug('Skipping b/c of rf: %s %s', From b7f24a772420e10fe477122431ee1916b95c36ed Mon Sep 17 00:00:00 2001 From: Yuri Bochkarev Date: Sun, 19 Mar 2017 15:41:43 +0300 Subject: [PATCH 03/87] Mention Windows proxy support in the README.md related #205 #594 --- README.md | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/README.md b/README.md index 28c3d70..3207408 100644 --- a/README.md +++ b/README.md @@ -21,6 +21,7 @@ - [Resuming downloads](#resuming-downloads) - [Troubleshooting](#troubleshooting) - [Found 0 sections and 0 lectures on this page](#found-0-sections-and-0-lectures-on-this-page) + - [Windows: Proxy support](#windows-proxy-support) - [Windows: Failed to create process](#windows-failed-to-create-process) - [SSLError: Errno 1 _ssl.c:504: error:14094410:SSL routines:SSL3_READ_BYTES:sslv3 alert handshake failure](#sslerror-errno-1-_sslc504-error14094410ssl-routinesssl3_read_bytessslv3-alert-handshake-failure) - [Reporting issues](#reporting-issues) @@ -398,6 +399,18 @@ Alternatively you may want to try this Chrome extension: https://chrome.google.c If none of the above works for you, there is nothing we can do. 
+## Windows: proxy support + +If you're on Windows behind a proxy, set up the environment variables +before running the script as follows: + +``` +set HTTP_PROXY=http://host:port +set HTTPS_PROXY=http://host:port +``` + +Related discussion: [#205](https://github.com/coursera-dl/coursera-dl/issues/205) + ## Windows: Failed to create process In `C:\Users\\AppData\Local\Programs\Python\Python35-32\Scripts` From f66e13f66820a61b609f38c119b34e9ce11885b4 Mon Sep 17 00:00:00 2001 From: Tony Yang Date: Tue, 21 Mar 2017 14:55:11 +0800 Subject: [PATCH 04/87] change course URL in README --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 3207408..083ef02 100644 --- a/README.md +++ b/README.md @@ -317,7 +317,7 @@ one of the following actions solve your problem: * Make sure the class name you are using corresponds to the resource name used in the URL for that class: - `https://class.coursera.org//class/index` + `https://www.coursera.org/learn//home/welcome` * Have you tried to clean the cached cookies/credentials with the `--clear-cache` option? 
From f37bc44f510f69e158f814876eaa48ef67713604 Mon Sep 17 00:00:00 2001 From: "Gautam krishna.R" Date: Mon, 1 May 2017 15:02:42 +0530 Subject: [PATCH 05/87] fixes repository misclassifying --- .gitattributes | 1 + 1 file changed, 1 insertion(+) create mode 100644 .gitattributes diff --git a/.gitattributes b/.gitattributes new file mode 100644 index 0000000..3ff2dd9 --- /dev/null +++ b/.gitattributes @@ -0,0 +1 @@ +tests/* linguist-vendored From 19103f2718d7d4e5aff1ac8a840a265116cf40f3 Mon Sep 17 00:00:00 2001 From: "Gautam krishna.R" Date: Mon, 1 May 2017 15:10:04 +0530 Subject: [PATCH 06/87] Update .gitattributes --- .gitattributes | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.gitattributes b/.gitattributes index 3ff2dd9..a76c228 100644 --- a/.gitattributes +++ b/.gitattributes @@ -1 +1 @@ -tests/* linguist-vendored +coursera/test/* linguist-vendored From 08b8ad44c2404586b1c4d3258971e1e630f15ab8 Mon Sep 17 00:00:00 2001 From: ifaint Date: Sat, 13 May 2017 10:04:01 +0800 Subject: [PATCH 07/87] 1. enable multiple subtitles and transcripts with alternatives. 1. enable downloading resources blocks. 1. enable file to store parameters. 1. added unit-test. 
--- README.md | 29 ++- coursera/api.py | 227 +++++++++++++++--- coursera/commandline.py | 30 ++- coursera/coursera_dl.py | 4 +- coursera/define.py | 12 +- coursera/extractors.py | 39 ++- .../fixtures/json/references-poll-output.json | 24 ++ .../fixtures/json/references-poll-reply.json | 47 ++++ ...diate-instructions-empty-instructions.json | 18 ++ ...mmediate-instructions-no-instructions.json | 6 + ...ming-immediate-instructions-one-asset.json | 18 ++ .../fixtures/json/video-output-1-all.json | 16 ++ .../test/fixtures/json/video-output-1-en.json | 4 + .../test/fixtures/json/video-output-1.json | 6 + .../test/fixtures/json/video-output-2.json | 6 + .../test/fixtures/json/video-reply-1.json | 47 ++++ .../test/fixtures/json/video-reply-2.json | 77 ++++++ coursera/test/test_api.py | 164 ++++++++++++- requirements.txt | 1 + tox.ini | 1 + 20 files changed, 730 insertions(+), 46 deletions(-) create mode 100644 coursera/test/fixtures/json/references-poll-output.json create mode 100644 coursera/test/fixtures/json/references-poll-reply.json create mode 100644 coursera/test/fixtures/json/supplement-programming-immediate-instructions-empty-instructions.json create mode 100644 coursera/test/fixtures/json/supplement-programming-immediate-instructions-no-instructions.json create mode 100644 coursera/test/fixtures/json/supplement-programming-immediate-instructions-one-asset.json create mode 100644 coursera/test/fixtures/json/video-output-1-all.json create mode 100644 coursera/test/fixtures/json/video-output-1-en.json create mode 100644 coursera/test/fixtures/json/video-output-1.json create mode 100644 coursera/test/fixtures/json/video-output-2.json create mode 100644 coursera/test/fixtures/json/video-reply-1.json create mode 100644 coursera/test/fixtures/json/video-reply-2.json diff --git a/README.md b/README.md index 083ef02..2ed7098 100644 --- a/README.md +++ b/README.md @@ -70,6 +70,7 @@ I've downloaded many other good videos such as those from Khan Academy. certain resources. 
* File format extension filter to grab resource types you want. * Login credentials accepted on command-line or from `.netrc` file. + * Default arguments loaded from `coursera-dl.conf` file. * Core functionality tested on Linux, Mac and Windows. # Disclaimer @@ -278,6 +279,23 @@ instead. This is especially convenient, as typing usernames (email addresses) and passwords directly on the command line can get tiresome (even more if you happened to choose a "strong" password). +Alternatively, if you want to store your preferred parameters (which might +also include your username and password), create a file named `coursera-dl.conf` +where the script is supposed to be executed, with the following format: + + --username + --password + --subtitle-language en,zh-CN|zh-TW + --download-quizzes True + #--mathjax-cdn https://cdn.bootcss.com/mathjax/2.7.1/MathJax.js + # more other parameters + +Parameter which is stored in the file will be overriden if it is again specifed +in your commandline script + +**Note:** In `coursera-dl.conf`, all the parameters should not be wrapped +with quotes. + ## Resuming downloads In default mode when you interrupt the download process by pressing @@ -341,7 +359,7 @@ one of the following actions solve your problem: * If results show 0 sections, you most likely have provided invalid credentials (username and/or password in the command line or in your - `.netrc` file). + `.netrc` file or in your `coursera-dl.conf` file). * For courses that have not started yet, but have had a previous iteration sometimes a preview is available, containing all the classes from the last @@ -456,6 +474,15 @@ If you still have the problem, please read the following issues for more ideas o This is also worth reading: https://urllib3.readthedocs.io/en/latest/security.html#insecureplatformwarning +## Use an alternative cdn url for `MathJax.js` + +When saving a course page, we enabled `MathJax` rendering for math equations, by +injecting `MathJax.js` in the header. 
The script is using a cdn service provided +by [mathjax.org](https://cdn.mathjax.org/mathjax/latest/MathJax.js). However, that +url is not accessible in some countries/regions, you can provide a +`--mathjax-cdn ` parameter to specify the `MathJax.js` file that is +accessible in your region. + # Reporting issues Before reporting any issue please follow the steps below: diff --git a/coursera/api.py b/coursera/api.py index e22c32d..a680b1a 100644 --- a/coursera/api.py +++ b/coursera/api.py @@ -25,12 +25,18 @@ from .define import (OPENCOURSE_SUPPLEMENT_URL, OPENCOURSE_ONDEMAND_COURSE_MATERIALS, OPENCOURSE_VIDEO_URL, OPENCOURSE_MEMBERSHIPS, + OPENCOURSE_REFERENCES_POLL_URL, + OPENCOURSE_REFERENCE_ITEM_URL, + OPENCOURSE_PROGRAMMING_IMMEDIATE_INSTRUCTIOINS_URL, + POST_OPENCOURSE_API_QUIZ_SESSION, POST_OPENCOURSE_API_QUIZ_SESSION_GET_STATE, POST_OPENCOURSE_ONDEMAND_EXAM_SESSIONS, POST_OPENCOURSE_ONDEMAND_EXAM_SESSIONS_GET_STATE, - INSTRUCTIONS_HTML_INJECTION, + INSTRUCTIONS_HTML_INJECTION_PRE, + INSTRUCTIONS_HTML_MATHJAX_URL, + INSTRUCTIONS_HTML_INJECTION_AFTER, IN_MEMORY_EXTENSION, IN_MEMORY_MARKER) @@ -135,9 +141,12 @@ class QuizExamToMarkupConverter(object): class MarkupToHTMLConverter(object): - def __init__(self, session): + def __init__(self, session, mathjax_cdn_url=None): self._session = session self._asset_retriever = AssetRetriever(session) + if not mathjax_cdn_url: + mathjax_cdn_url = INSTRUCTIONS_HTML_MATHJAX_URL + self._mathjax_cdn_url = mathjax_cdn_url def __call__(self, markup): """ @@ -170,7 +179,11 @@ class MarkupToHTMLConverter(object): soup.insert(0, meta) # 1. Inject basic CSS style - css_soup = BeautifulSoup(INSTRUCTIONS_HTML_INJECTION) + css = "".join([ + INSTRUCTIONS_HTML_INJECTION_PRE, + self._mathjax_cdn_url, + INSTRUCTIONS_HTML_INJECTION_AFTER]) + css_soup = BeautifulSoup(css) soup.append(css_soup) # 2. Replace with

@@ -386,7 +399,8 @@ class CourseraOnDemand(object): """ def __init__(self, session, course_id, course_name, - unrestricted_filenames=False): + unrestricted_filenames=False, + mathjax_cdn_url=None): """ Initialize Coursera OnDemand API. @@ -409,7 +423,7 @@ class CourseraOnDemand(object): self._user_id = None self._quiz_to_markup = QuizExamToMarkupConverter(session) - self._markup_to_html = MarkupToHTMLConverter(session) + self._markup_to_html = MarkupToHTMLConverter(session, mathjax_cdn_url=mathjax_cdn_url) self._asset_retriever = AssetRetriever(session) def obtain_user_id(self): @@ -721,29 +735,11 @@ class CourseraOnDemand(object): video_url = sources[0]['formatSources']['video/mp4'] video_content['mp4'] = video_url - # subtitles and transcripts - subtitle_nodes = [ - ('subtitles', 'srt', 'subtitle'), - ('subtitlesTxt', 'txt', 'transcript'), - ] - for (subtitle_node, subtitle_extension, subtitle_description) in subtitle_nodes: - logging.debug('Gathering %s URLs for video_id <%s>.', subtitle_description, video_id) - subtitles = dom.get(subtitle_node) - if subtitles is not None: - if subtitle_language == 'all': - for current_subtitle_language in subtitles: - video_content[current_subtitle_language + '.' + subtitle_extension] = make_coursera_absolute_url(subtitles.get(current_subtitle_language)) - else: - if subtitle_language != 'en' and subtitle_language not in subtitles: - logging.warning("%s unavailable in '%s' language for video " - "with video id: [%s], falling back to 'en' " - "%s", subtitle_description.capitalize(), subtitle_language, video_id, subtitle_description) - subtitle_language = 'en' + subtitle_link = self._extract_subtitles_from_video_dom( + dom, subtitle_language, video_id) - subtitle_url = subtitles.get(subtitle_language) - if subtitle_url is not None: - # some subtitle urls are relative! - video_content[subtitle_language + '.' 
+ subtitle_extension] = make_coursera_absolute_url(subtitle_url) + for key, value in iteritems(subtitle_link): + video_content[key] = value lecture_video_content = {} for key, value in iteritems(video_content): @@ -751,6 +747,102 @@ class CourseraOnDemand(object): return lecture_video_content + def _extract_subtitles_from_video_dom(self, video_dom, + subtitle_language, video_id): + # subtitles and transcripts + subtitle_nodes = [ + ('subtitles', 'srt', 'subtitle'), + ('subtitlesTxt', 'txt', 'transcript'), + ] + subtitle_set_download = set() + subtitle_set_nonexist = set() + subtitle_links = {} + for (subtitle_node, subtitle_extension, subtitle_description) \ + in subtitle_nodes: + logging.debug('Gathering %s URLs for video_id <%s>.', + subtitle_description, video_id) + subtitles = video_dom.get(subtitle_node) + download_all_subtitle = False + if subtitles is not None: + subtitles_set = set(subtitles) + requested_subtitle_list = [s.strip() for s in + subtitle_language.split(",")] + for language_with_alts in requested_subtitle_list: + if download_all_subtitle: + break + grouped_language_list = [l.strip() for l in + language_with_alts.split("|")] + for language in grouped_language_list: + if language == "all": + download_all_subtitle = True + break + elif language in subtitles_set: + subtitle_set_download.update([language]) + break + else: + subtitle_set_nonexist.update([language]) + + if download_all_subtitle and subtitles is not None: + subtitle_set_download = set(subtitles) + + if not download_all_subtitle and subtitle_set_nonexist: + logging.warning("%s unavailable in '%s' language for video " + "with video id: [%s]," + "%s", subtitle_description.capitalize(), + ", ".join(subtitle_set_nonexist), video_id, + subtitle_description) + if not subtitle_set_download: + logging.warning("%s all requested subtitles are unavaliable," + "with video id: [%s], falling back to 'en' " + "%s", subtitle_description.capitalize(), + video_id, + subtitle_description) + 
subtitle_set_download = set(['en']) + + for current_subtitle_language in subtitle_set_download: + subtitle_url = subtitles.get(current_subtitle_language) + if subtitle_url is not None: + # some subtitle urls are relative! + subtitle_links[ + "%s.%s" % (current_subtitle_language, subtitle_extension) + ] = make_coursera_absolute_url(subtitle_url) + return subtitle_links + + def extract_links_from_programming_immediate_instructions(self, element_id): + """ + Return a dictionary with links to supplement files (pdf, csv, zip, + ipynb, html and so on) extracted from graded programming assignment. + + @param element_id: Element ID to extract files from. + @type element_id: str + + @return: @see CourseraOnDemand._extract_links_from_text + """ + logging.debug('Extracting links from programming immediate ' + 'instructions for element_id <%s>.', element_id) + + try: + # Assignment text (instructions) contains asset tags which describe + # supplementary files. + text = ''.join( + self._extract_programming_immediate_instructions_text(element_id)) + if not text: + return {} + + supplement_links = self._extract_links_from_text(text) + instructions = (IN_MEMORY_MARKER + self._markup_to_html(text), + 'instructions') + extend_supplement_links( + supplement_links, {IN_MEMORY_EXTENSION: [instructions]}) + return supplement_links + except requests.exceptions.HTTPError as exception: + logging.error('Could not download programming assignment %s: %s', + element_id, exception) + if is_debug_run(): + logging.exception('Could not download programming assignment %s: %s', + element_id, exception) + return None + def extract_links_from_programming(self, element_id): """ Return a dictionary with links to supplement files (pdf, csv, zip, @@ -876,6 +968,87 @@ class CourseraOnDemand(object): 'url': element['url'].strip()} for element in dom['elements']] + def extract_references_poll(self): + try: + dom = get_page(self._session, + OPENCOURSE_REFERENCES_POLL_URL.format( + course_id=self._course_id), 
+ json=True + ) + logging.info('Downloaded resource poll (%d bytes)', len(dom)) + return dom['elements'] + + except requests.exceptions.HTTPError as exception: + logging.error('Could not download resource section: %s', + exception) + if is_debug_run(): + logging.exception('Could not download resource section: %s', + exception) + return None + + def extract_links_from_reference(self, short_id): + """ + Return a dictionary with supplement files (pdf, csv, zip, ipynb, html + and so on) extracted from supplement page. + + @return: @see CourseraOnDemand._extract_links_from_text + """ + logging.debug('Gathering resource URLs for short_id <%s>.', short_id) + + try: + dom = get_page(self._session, OPENCOURSE_REFERENCE_ITEM_URL, + json=True, + course_id=self._course_id, + short_id=short_id) + + resource_content = {} + + # Supplement content has structure as follows: + # 'linked' { + # 'openCourseAssets.v1' [ { + # 'definition' { + # 'value' + + for asset in dom['linked']['openCourseAssets.v1']: + value = asset['definition']['value'] + # Supplement lecture types are known to contain both tags + # and tags (depending on the course), so we extract + # both of them. + extend_supplement_links( + resource_content, self._extract_links_from_text(value)) + + instructions = (IN_MEMORY_MARKER + self._markup_to_html(value), + 'resources') + extend_supplement_links( + resource_content, {IN_MEMORY_EXTENSION: [instructions]}) + + return resource_content + except requests.exceptions.HTTPError as exception: + logging.error('Could not download supplement %s: %s', + short_id, exception) + if is_debug_run(): + logging.exception('Could not download supplement %s: %s', + short_id, exception) + return None + + def _extract_programming_immediate_instructions_text(self, element_id): + """ + Extract assignment text (instructions). + + @param element_id: Element id to extract assignment instructions from. + @type element_id: str + + @return: List of assignment text (instructions). 
+ @rtype: [str] + """ + dom = get_page(self._session, OPENCOURSE_PROGRAMMING_IMMEDIATE_INSTRUCTIOINS_URL, + json=True, + course_id=self._course_id, + element_id=element_id) + + return [element['assignmentInstructions']['definition']['value'] + for element in dom['elements']] + def _extract_assignment_text(self, element_id): """ Extract assignment text (instructions). diff --git a/coursera/commandline.py b/coursera/commandline.py index dc9a2bb..5b47196 100644 --- a/coursera/commandline.py +++ b/coursera/commandline.py @@ -6,13 +6,15 @@ handling. The primary candidate is argument parser. import os import sys import logging -import argparse +import configargparse as argparse from coursera import __version__ from .credentials import get_credentials, CredentialsError, keyring from .utils import decode_input +LOCAL_CONF_FILE_NAME = 'coursera-dl.conf' + def class_name_arg_required(args): """ @@ -33,8 +35,14 @@ def parse_args(args=None): Parse the arguments/options passed to the program on the command line. """ - parser = argparse.ArgumentParser( - description='Download Coursera.org lecture material and resources.') + parse_kwargs = { + "description": 'Download Coursera.org lecture material and resources.' + } + + conf_file_path = os.path.join(os.getcwd(), LOCAL_CONF_FILE_NAME) + if os.path.isfile(conf_file_path): + parse_kwargs["default_config_files"] = [conf_file_path] + parser = argparse.ArgParser(**parse_kwargs) # Basic options group_basic = parser.add_argument_group('Basic options') @@ -93,7 +101,15 @@ def parse_args(args=None): action='store', default='all', help='Choose language to download subtitles and transcripts. (Default: all)' - 'Use special value "all" to download all available.') + 'Use special value "all" to download all available.' + 'To download subtitles and transcripts of multiple languages,' + 'use comma(s) (without spaces) to seperate the names of the languages, i.e., "en,zh-CN".' 
+ 'To download subtitles and transcripts of alternative language(s) ' + 'if only the current language is not available,' + 'put an "|" for each of the alternative languages after ' + 'the current language, i.e., "en|fr,zh-CN|zh-TW|de", and make sure the parameter are wrapped with ' + 'quotes when "|" presents.' + ) # Selection of material to download group_material = parser.add_argument_group('Selection of material to download') @@ -316,6 +332,12 @@ def parse_args(args=None): default=False, help='generate M3U playlists for course weeks') + group_adv_misc.add_argument('--mathjax-cdn', + dest='mathjax_cdn_url', + default='https://cdn.mathjax.org/mathjax/latest/MathJax.js', + help='the cdn address of MathJax.js' + ) + # Debug options group_debug = parser.add_argument_group('Debugging options') diff --git a/coursera/coursera_dl.py b/coursera/coursera_dl.py index 124868e..941793d 100644 --- a/coursera/coursera_dl.py +++ b/coursera/coursera_dl.py @@ -135,7 +135,9 @@ def download_on_demand_class(args, class_name): args.unrestricted_filenames, args.subtitle_language, args.video_resolution, - args.download_quizzes) + args.download_quizzes, + args.mathjax_cdn_url + ) if is_debug_run or args.cache_syllabus(): with open(cached_syllabus_filename, 'w') as file_object: diff --git a/coursera/define.py b/coursera/define.py index 1a72241..7b3fbe5 100644 --- a/coursera/define.py +++ b/coursera/define.py @@ -67,6 +67,12 @@ OPENCOURSE_SUPPLEMENT_URL = 'https://www.coursera.org/api/onDemandSupplements.v1 '{course_id}~{element_id}?includes=asset&fields=openCourseAssets.v1%28typeName%29,openCourseAssets.v1%28definition%29' OPENCOURSE_PROGRAMMING_ASSIGNMENTS_URL = \ 'https://www.coursera.org/api/onDemandProgrammingLearnerAssignments.v1/{course_id}~{element_id}?fields=submissionLearnerSchema' +OPENCOURSE_PROGRAMMING_IMMEDIATE_INSTRUCTIOINS_URL = \ + 'https://www.coursera.org/api/onDemandProgrammingImmediateInstructions.v1/{course_id}~{element_id}' +OPENCOURSE_REFERENCES_POLL_URL = \ + 
"https://www.coursera.org/api/onDemandReferences.v1/?courseId={course_id}&q=courseListed&fields=name%2CshortId%2Cslug%2Ccontent&includes=assets" +OPENCOURSE_REFERENCE_ITEM_URL = \ + "https://www.coursera.org/api/onDemandReferences.v1/?courseId={course_id}&q=shortId&shortId={short_id}&fields=name%2CshortId%2Cslug%2Ccontent&includes=assets" # These are ids that are present in tag in assignment text: # @@ -772,7 +778,7 @@ FORMAT_MAX_LENGTH = 20 TITLE_MAX_LENGTH = 200 #: CSS that is usen to prettify instructions -INSTRUCTIONS_HTML_INJECTION = ''' +INSTRUCTIONS_HTML_INJECTION_PRE = ''' ''' + +# The following url is the root url (tree) for a Coursera Course +OPENCOURSE_NOTEBOOK_DESCRIPTIONS = "https://hub.coursera-notebooks.org/hub/coursera_login?token={authId}&next=/" +OPENCOURSE_NOTEBOOK_LAUNCHES = "https://www.coursera.org/api/onDemandNotebookWorkspaceLaunches.v1/?fields=authorizationId%2CcontentPath%2CuseLegacySystem" +OPENCOURSE_NOTEBOOK_TREE = "https://hub.coursera-notebooks.org/user/{jupId}/api/contents/{path}?type=directory&_={timestamp}" +OPENCOURSE_NOTEBOOK_DOWNLOAD = "https://hub.coursera-notebooks.org/user/{jupId}/files/{path}?download=1" diff --git a/coursera/extractors.py b/coursera/extractors.py index 420373f..6e6d631 100644 --- a/coursera/extractors.py +++ b/coursera/extractors.py @@ -29,7 +29,7 @@ class PlatformExtractor(object): class CourseraExtractor(PlatformExtractor): def __init__(self, session, username, password): login(session, username, password) - + self._notebook_downloaded = False self._session = session def list_courses(self): @@ -47,13 +47,14 @@ class CourseraExtractor(PlatformExtractor): def get_modules(self, class_name, reverse=False, unrestricted_filenames=False, subtitle_language='en', video_resolution=None, - download_quizzes=False, mathjax_cdn_url=None): + download_quizzes=False, mathjax_cdn_url=None, + download_notebooks=False): page = self._get_on_demand_syllabus(class_name) error_occured, modules = self._parse_on_demand_syllabus( 
page, reverse, unrestricted_filenames, subtitle_language, video_resolution, - download_quizzes, mathjax_cdn_url) + download_quizzes, mathjax_cdn_url, download_notebooks) return error_occured, modules def _get_on_demand_syllabus(self, class_name): @@ -72,7 +73,8 @@ class CourseraExtractor(PlatformExtractor): subtitle_language='en', video_resolution=None, download_quizzes=False, - mathjax_cdn_url=None + mathjax_cdn_url=None, + download_notebooks=False ): """ Parse a Coursera on-demand course listing/syllabus page. @@ -145,8 +147,7 @@ class CourseraExtractor(PlatformExtractor): video_resolution, assets) elif typename == 'supplement': - links = course.extract_links_from_supplement( - lecture['id']) + links = course.extract_links_from_supplement(lecture['id']) elif typename in ('gradedProgramming', 'ungradedProgramming'): links = course.extract_links_from_programming(lecture['id']) @@ -162,7 +163,12 @@ class CourseraExtractor(PlatformExtractor): elif typename == 'programming': if download_quizzes: links = course.extract_links_from_programming_immediate_instructions(lecture['id']) - + + elif typename == 'notebook': + if download_notebooks and self._notebook_downloaded == False: + logging.warning('According to notebooks platform, content will be downloaded first') + links = course.extract_links_from_notebook(lecture['id']) + self._notebook_downloaded = True else: logging.info('Unsupported typename "%s" in lecture "%s"', typename, lecture_slug) From 788f9539fb4ee0da7767867b2e979e11b8e9ee10 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mois=C3=A9s=20Lodeiro?= Date: Mon, 20 Nov 2017 11:27:52 +0000 Subject: [PATCH 15/87] Added info message when skipping file --- coursera/api.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/coursera/api.py b/coursera/api.py index 6376709..de8679b 100644 --- a/coursera/api.py +++ b/coursera/api.py @@ -498,6 +498,9 @@ class CourseraOnDemand(object): logging.info('Downloading {} into {}'.format(tail, head)) with open(self._course_name + 
"/notebook/" + head + "/" + tail, 'wb+') as f: f.write(r.content) + else: + logging.info('Skipping {}... (file exists)'.format(tail)) + if not str(extension[1:]) in supplement_links: supplement_links[str(extension[1:])] = [] @@ -520,6 +523,8 @@ class CourseraOnDemand(object): logging.info('Downloading Jupyter {} into {}'.format(tail, head)) with open(self._course_name + "/notebook/" + head + "/" + tail, 'wb+') as f: f.write(r.content) + else: + logging.info('Skipping {}... (file exists)'.format(tail)) if not "ipynb" in supplement_links: supplement_links["ipynb"] = [] From 27fae191840befdb0d10c269c6fe648b98a88719 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Rog=C3=A9rio=20Brito?= Date: Wed, 29 Nov 2017 22:44:08 -0200 Subject: [PATCH 16/87] travis: Remove Python 3.3 from build matrix. This closes #632. Thanks @PrabhanshuAttri for the sharp eye. --- .travis.yml | 2 -- 1 file changed, 2 deletions(-) diff --git a/.travis.yml b/.travis.yml index 6ae0053..6c13a24 100644 --- a/.travis.yml +++ b/.travis.yml @@ -2,14 +2,12 @@ language: python python: - "2.6" - "2.7" - - "3.3" - "3.4" - "3.5" - "3.6" - "pypy" matrix: allow_failures: - - python: "3.3" - python: "pypy" # command to install dependencies install: From e16d9c1ae39441ce396004763574d5474e5fc872 Mon Sep 17 00:00:00 2001 From: orlandocr Date: Mon, 8 Jan 2018 20:06:19 -0600 Subject: [PATCH 17/87] Fixed --download-quizzes usage Fixed --download-quizzes parameter usage description --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 28c43ef..599ff0f 100644 --- a/README.md +++ b/README.md @@ -287,7 +287,7 @@ where the script is supposed to be executed, with the following format: --username --password --subtitle-language en,zh-CN|zh-TW - --download-quizzes True + --download-quizzes #--mathjax-cdn https://cdn.bootcss.com/mathjax/2.7.1/MathJax.js # more other parameters From 2250ea6238fdad261f4bc8200d7d487ffd954153 Mon Sep 17 00:00:00 2001 From: SCaffrey Date: Mon, 15 Jan 
2018 22:13:08 +0800 Subject: [PATCH 18/87] Update README.md ref: https://github.com/googlehosts/hosts/blob/master/hosts-files/hosts#L163 --- README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 28c43ef..c563a50 100644 --- a/README.md +++ b/README.md @@ -403,9 +403,9 @@ one of the following actions solve your problem: ## China issues If you are from China and you're having problems downloading videos, -adding "52.84.246.72 d3c33hcgiwev3.cloudfront.net" in the hosts file +adding "52.84.167.78 d3c33hcgiwev3.cloudfront.net" in the hosts file (/etc/hosts) and freshing DNS with "ipconfig/flushdns" may work -(see this [comment](https://github.com/coursera-dl/coursera-dl/issues/606#issuecomment-305698809)). +(see https://github.com/googlehosts/hosts for more info). ## Found 0 sections and 0 lectures on this page From 9cf1af597917c65a42b604ef7437986854a48f18 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Rog=C3=A9rio=20Brito?= Date: Mon, 19 Feb 2018 22:45:02 -0300 Subject: [PATCH 19/87] CHANGELOG: Update with info of new release. --- CHANGELOG.md | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 5526c73..8806246 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,11 @@ # Change Log +## 0.10.0 (2018-02-19) + +Features: + - Support Coursera Notebooks (option: `--download-notebooks`) + - Add hints in the documentation for users in China + ## 0.9.0 (2017-05-25) Features: From 761c7fb1880ee9769a17abf9f900de9263d23c61 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Rog=C3=A9rio=20Brito?= Date: Mon, 19 Feb 2018 22:45:31 -0300 Subject: [PATCH 20/87] coursera: Update version number. 
[ci skip] --- coursera/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/coursera/__init__.py b/coursera/__init__.py index e4e49b3..9d1bb72 100644 --- a/coursera/__init__.py +++ b/coursera/__init__.py @@ -1 +1 @@ -__version__ = '0.9.0' +__version__ = '0.10.0' From b4ebc526ac81caf88dde936a14cef3e1b461a5a3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Rog=C3=A9rio=20Brito?= Date: Mon, 19 Feb 2018 22:53:17 -0300 Subject: [PATCH 21/87] README: Remove unavailable badge. --- README.md | 1 - 1 file changed, 1 deletion(-) diff --git a/README.md b/README.md index 919314b..0ae240f 100644 --- a/README.md +++ b/README.md @@ -4,7 +4,6 @@ [![Build status](https://ci.appveyor.com/api/projects/status/3hru0ycv5fbny5k8/branch/master?svg=true)](https://ci.appveyor.com/project/balta2ar/coursera-dl/branch/master) [![Coverage Status](https://coveralls.io/repos/coursera-dl/coursera-dl/badge.svg)](https://coveralls.io/r/coursera-dl/coursera-dl) [![Latest version on PyPI](https://img.shields.io/pypi/v/coursera-dl.svg)](https://pypi.python.org/pypi/coursera-dl) -[![Downloads from PyPI](https://img.shields.io/pypi/dm/coursera-dl.svg)](https://pypi.python.org/pypi/coursera-dl) [![Code Climate](https://codeclimate.com/github/coursera-dl/coursera-dl/badges/gpa.svg)](https://codeclimate.com/github/coursera-dl/coursera-dl) - [Introduction](#introduction) From 6e933dd0a115bd4c5a0b8e35860693547af369df Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Rog=C3=A9rio=20Brito?= Date: Mon, 19 Feb 2018 22:54:32 -0300 Subject: [PATCH 22/87] setup.py: Remove support for Python 3.3 and add for Python 3.6. 
--- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index 19c0be5..f489078 100644 --- a/setup.py +++ b/setup.py @@ -75,9 +75,9 @@ trove_classifiers = [ 'Programming Language :: Python :: 2.6', 'Programming Language :: Python :: 2.7', 'Programming Language :: Python :: 3', - 'Programming Language :: Python :: 3.3', 'Programming Language :: Python :: 3.4', 'Programming Language :: Python :: 3.5', + 'Programming Language :: Python :: 3.6', 'Programming Language :: Python :: Implementation :: CPython', 'Programming Language :: Python :: Implementation :: PyPy', 'Programming Language :: Python', From c484e66a45cde3a8422e3d8e9967852cc35ddec3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Rog=C3=A9rio=20Brito?= Date: Mon, 19 Feb 2018 22:57:20 -0300 Subject: [PATCH 23/87] README: Adjust Python 3 versions. --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 0ae240f..5560929 100644 --- a/README.md +++ b/README.md @@ -94,7 +94,7 @@ relevant excerpt: `coursera-dl` requires Python 2 or Python 3 and a free Coursera account enrolled in the class of interest. (As of February of 2016, we test automatically the execution of the program with Python versions 2.6, 2.7, -Pypy, 3.2, 3.3, 3.4, and 3.5). +Pypy, 3.4, 3.5, and 3.6). **Note:** We *strongly* recommend that you use a Python 3 interpreter (3.4 or later). From 2e265ef24e99d2e026e9d7a31de9d15edc92ec45 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Rog=C3=A9rio=20Brito?= Date: Mon, 19 Feb 2018 22:59:04 -0300 Subject: [PATCH 24/87] README: Remove dead bitdeli badge. --- README.md | 2 -- 1 file changed, 2 deletions(-) diff --git a/README.md b/README.md index 5560929..96736a3 100644 --- a/README.md +++ b/README.md @@ -587,5 +587,3 @@ geemail dotcom (twitter: [@jplehmann][12]). 
[issue213]: https://github.com/coursera-dl/coursera-dl/issues/213 [issue500]: https://github.com/coursera-dl/coursera-dl/issues/500 [pipinstallerbug]: http://stackoverflow.com/questions/31808180/installing-pyinstaller-via-pip-leads-to-failed-to-create-process - -[![Bitdeli Badge](https://d2weczhvl823v0.cloudfront.net/coursera-dl/coursera-dl/trend.png)](https://bitdeli.com/free "Bitdeli Badge") From 360aec5f2724a77d718b82617671a35cfc4529af Mon Sep 17 00:00:00 2001 From: OPSXCQ Date: Fri, 3 Nov 2017 21:36:59 -0200 Subject: [PATCH 25/87] dockerfile added --- Dockerfile | 22 ++++++++++++++++++++++ README.md | 13 +++++++++++++ 2 files changed, 35 insertions(+) create mode 100644 Dockerfile diff --git a/Dockerfile b/Dockerfile new file mode 100644 index 0000000..65915b4 --- /dev/null +++ b/Dockerfile @@ -0,0 +1,22 @@ +FROM python:3.4-slim + +LABEL maintainer "opsxcq@strm.sh" + +WORKDIR /src +COPY requirements.txt /src + +COPY requirements-dev.txt /src + +RUN apt-get update && \ + apt-get install -y --no-install-recommends gcc g++ libssl-dev && \ + rm -rf /var/lib/apt/lists/* && \ + pip install -r requirements.txt && \ + pip install -r requirements-dev.txt && \ + apt-get purge -y --auto-remove gcc g++ libssl-dev + +COPY . /src +RUN python setup.py install + +WORKDIR /courses +ENTRYPOINT ["coursera-dl"] +CMD ["--help"] diff --git a/README.md b/README.md index 96736a3..1a2f8a4 100644 --- a/README.md +++ b/README.md @@ -214,6 +214,19 @@ your own, please check that the versions of your modules are at least those listed in the `requirements.txt` file (and, `requirements-dev.txt` file, if applicable). +## Docker + +If you prefer you can run this software inside Docker: + +``` +docker run --rm -it \ + -v "$(pwd):/courses" \ + strm/coursera-dl \ + -u -p +``` + +The actual working dir for coursera-dl is /courses, all courses will be downloaded there if you don't specify otherwise. 
+ ## Windows `python -m pip install coursera-dl` From 4326937e1207f5ed0e4024e1f5f50234bc124840 Mon Sep 17 00:00:00 2001 From: OPSXCQ Date: Fri, 2 Mar 2018 18:36:46 +0000 Subject: [PATCH 26/87] Removed old Dockerfile and related files. Below is a list of reasons for the change: * Is better to use an official python image instead * UBUNTU:14 is very old, better use a more recent image * The old Docker image used to clone this repo, that isn't necessary. * Old Docker image wasn't ephemeral enough * For layer optimization, is better to add the dependencies before installing the software * Old image didn't set an entrypoint --- deploy/.netrc | 1 - deploy/Dockerfile | 14 -------------- deploy/README.md | 10 ---------- deploy/build.sh | 7 ------- deploy/download.sh | 15 --------------- 5 files changed, 47 deletions(-) delete mode 100644 deploy/.netrc delete mode 100644 deploy/Dockerfile delete mode 100644 deploy/README.md delete mode 100755 deploy/build.sh delete mode 100755 deploy/download.sh diff --git a/deploy/.netrc b/deploy/.netrc deleted file mode 100644 index bd0c698..0000000 --- a/deploy/.netrc +++ /dev/null @@ -1 +0,0 @@ -machine coursera-dl login password diff --git a/deploy/Dockerfile b/deploy/Dockerfile deleted file mode 100644 index 3a73264..0000000 --- a/deploy/Dockerfile +++ /dev/null @@ -1,14 +0,0 @@ -FROM ubuntu:14.04 -MAINTAINER Dmitry Senin - -RUN apt-get update -RUN DEBIAN_FRONTEND=noninteractive apt-get install -y git build-essential libssl-dev libffi-dev -RUN DEBIAN_FRONTEND=noninteractive apt-get install -y python-pip python-dev -RUN pip install ndg-httpsclient - -COPY .netrc /root/.netrc -RUN chmod 0600 /root/.netrc - -RUN cd /root && git clone https://github.com/coursera-dl/coursera.git -RUN cd /root/coursera && pip install -r requirements.txt -RUN cd /usr/bin && ln -s /root/coursera/coursera-dl coursera-dl diff --git a/deploy/README.md b/deploy/README.md deleted file mode 100644 index 30725b9..0000000 --- a/deploy/README.md +++ /dev/null @@ -1,10
+0,0 @@ -# How to launch the container - -1. [optional] Insert your username and password in the `.netrc` file if you - plan to use the `-n` optionof `coursera-dl` (edit template in this - directory). -2. Build Docker image: - `./build.sh` -3. Run Docker container to download courses A, B and C: - `./download.sh A B C` -4. All courses will be downloaded in directory `~/courses` diff --git a/deploy/build.sh b/deploy/build.sh deleted file mode 100755 index 55c1661..0000000 --- a/deploy/build.sh +++ /dev/null @@ -1,7 +0,0 @@ -#!/bin/sh - -if groups | grep -q "docker" ; then - docker build --tag coursera-img --rm . -else - sudo docker build --tag coursera-img --rm . -fi diff --git a/deploy/download.sh b/deploy/download.sh deleted file mode 100755 index bcda3b9..0000000 --- a/deploy/download.sh +++ /dev/null @@ -1,15 +0,0 @@ -#!/bin/sh - -COURSES=$* - -if [ ! -e ~/courses ]; then - mkdir ~/courses -fi - -if groups | grep -q "docker" ; then - docker run --rm --name coursera -v ~/courses:/courses coursera-img \ - coursera-dl -n --path /courses $COURSES -else - sudo docker run --rm --name coursera -v ~/courses:/courses coursera-img \ - coursera-dl -n --path /courses $COURSES -fi From acfa6c5fce19b5cb797a00633ce32908ef086d02 Mon Sep 17 00:00:00 2001 From: Yuri Bochkarev Date: Sun, 25 Mar 2018 16:10:10 +0300 Subject: [PATCH 27/87] Fix style and whitespace --- coursera/api.py | 39 ++++++++++++++++++--------------------- coursera/test/test_api.py | 2 +- 2 files changed, 19 insertions(+), 22 deletions(-) diff --git a/coursera/api.py b/coursera/api.py index de8679b..85770a5 100644 --- a/coursera/api.py +++ b/coursera/api.py @@ -5,6 +5,7 @@ downloader. 
""" import os +import re import json import base64 import logging @@ -468,7 +469,7 @@ class CourseraOnDemand(object): supplement_links = {} - url = url.format(**kwargs) + url = url.format(**kwargs) reply = get_page( self._session, url, @@ -478,21 +479,21 @@ class CourseraOnDemand(object): headers = self._auth_headers_with_json() for content in reply['content']: - + if content['type'] == 'directory': a = self._get_notebook_folder(OPENCOURSE_NOTEBOOK_TREE, jupyterId, jupId=jupyterId, path=content['path'], timestamp=int(time.time())) supplement_links.update(a) - + elif content['type'] == 'file': tmpUrl = OPENCOURSE_NOTEBOOK_DOWNLOAD.format(path=content['path'], jupId=jupyterId, timestamp=int(time.time())) filename, extension = os.path.splitext(clean_url(tmpUrl)) - + head, tail = os.path.split(content['path']) - + if os.path.isdir(self._course_name + "/notebook/" + head + "/") == False: logging.info('Creating [{}] directories...'.format(head)) os.makedirs(self._course_name + "/notebook/" + head + "/") - + r = requests.get(tmpUrl.replace(" ", "%20"), cookies=self._session.cookies) if os.path.exists(self._course_name + "/notebook/" + head + "/" + tail) == False: logging.info('Downloading {} into {}'.format(tail, head)) @@ -504,20 +505,19 @@ class CourseraOnDemand(object): if not str(extension[1:]) in supplement_links: supplement_links[str(extension[1:])] = [] - - supplement_links[str(extension[1:])].append((tmpUrl.replace(" ", "%20"), filename)) + supplement_links[str(extension[1:])].append((tmpUrl.replace(" ", "%20"), filename)) elif content['type'] == 'notebook': tmpUrl = OPENCOURSE_NOTEBOOK_DOWNLOAD.format(path=content['path'], jupId=jupyterId, timestamp=int(time.time())) filename, extension = os.path.splitext(clean_url(tmpUrl)) - + head, tail = os.path.split(content['path']) - + if os.path.isdir(self._course_name + "/notebook/" + head + "/") == False: logging.info('Creating [{}] directories...'.format(head)) os.makedirs(self._course_name + "/notebook/" + head + "/") 
- + r = requests.get(tmpUrl.replace(" ", "%20"), cookies=self._session.cookies) if os.path.exists(self._course_name + "/notebook/" + head + "/" + tail) == False: logging.info('Downloading Jupyter {} into {}'.format(tail, head)) @@ -528,18 +528,16 @@ class CourseraOnDemand(object): if not "ipynb" in supplement_links: supplement_links["ipynb"] = [] - + supplement_links["ipynb"].append((tmpUrl.replace(" ", "%20"), filename)) else: logging.info('Unsupported typename {} in notebook'.format(content['type'])) - + return supplement_links - def _get_notebook_json(self, notebook_id, authorizationId): - - import re, time + headers = self._auth_headers_with_json() reply = get_page( self._session, @@ -553,22 +551,21 @@ class CourseraOnDemand(object): if len(jupyterId) == 0: logging.error('Could not download notebook %s', notebook_id) return None - + jupyterId = jupyterId[0] newReq = requests.Session() req = newReq.get(OPENCOURSE_NOTEBOOK_TREE.format(jupId=jupyterId, path="/", timestamp=int(time.time())), headers=headers) - + return self._get_notebook_folder(OPENCOURSE_NOTEBOOK_TREE, jupyterId, jupId=jupyterId, path="/", timestamp=int(time.time())) - def extract_links_from_notebook(self, notebook_id): - try: + try: authorizationId = self._extract_notebook_text(notebook_id) ret = self._get_notebook_json(notebook_id, authorizationId) return ret - except requests.exceptions.HTTPError as exception: + except requests.exceptions.HTTPError as exception: logging.error('Could not download notebook %s: %s', notebook_id, exception) if is_debug_run(): logging.exception('Could not download notebook %s: %s', notebook_id, exception) diff --git a/coursera/test/test_api.py b/coursera/test/test_api.py index 23096ed..fbd445d 100644 --- a/coursera/test/test_api.py +++ b/coursera/test/test_api.py @@ -117,7 +117,7 @@ def test_extract_links_from_reference_http_error(get_page, course): @patch('coursera.api.get_page') def test_extract_links_from_programming_immediate_instructions_http_error( - get_page, 
course): + get_page, course): """ This test checks that downloader skips locked programming immediate instructions instead of throwing an error. (Locked == returning 403 error code) From 26cf38cee3275767fd706d981b999b65f7bb25df Mon Sep 17 00:00:00 2001 From: Yuri Bochkarev Date: Sun, 25 Mar 2018 16:13:27 +0300 Subject: [PATCH 28/87] Add support for "phasedPeer" typename (peer assignment instructions) The structure is very similar to all other text instructions but peer assignment instructions should be retrieved using the following API method: onDemandPeerAssignmentInstructions.v1 fix #650 --- coursera/api.py | 69 +++++++++++++++++++++++++++ coursera/define.py | 104 +++++++++++++++++++++++++++++++++++++++++ coursera/extractors.py | 11 +++-- 3 files changed, 181 insertions(+), 3 deletions(-) diff --git a/coursera/api.py b/coursera/api.py index 85770a5..a8859e0 100644 --- a/coursera/api.py +++ b/coursera/api.py @@ -31,6 +31,7 @@ from .define import (OPENCOURSE_SUPPLEMENT_URL, OPENCOURSE_REFERENCES_POLL_URL, OPENCOURSE_REFERENCE_ITEM_URL, OPENCOURSE_PROGRAMMING_IMMEDIATE_INSTRUCTIOINS_URL, + OPENCOURSE_PEER_ASSIGNMENT_INSTRUCTIONS, # New feature, Notebook (Python Jupyter) OPENCOURSE_NOTEBOOK_DESCRIPTIONS, @@ -992,6 +993,39 @@ class CourseraOnDemand(object): element_id, exception) return None + def extract_links_from_peer_assignment(self, element_id): + """ + Return a dictionary with links to supplement files (pdf, csv, zip, + ipynb, html and so on) extracted from peer assignment. + + @param element_id: Element ID to extract files from. + @type element_id: str + + @return: @see CourseraOnDemand._extract_links_from_text + """ + logging.debug('Gathering supplement URLs for element_id <%s>.', element_id) + + try: + # Assignment text (instructions) contains asset tags which describe + # supplementary files. 
+ text = ''.join(self._extract_peer_assignment_text(element_id)) + if not text: + return {} + + supplement_links = self._extract_links_from_text(text) + instructions = (IN_MEMORY_MARKER + self._markup_to_html(text), + 'peer_assignment_instructions') + extend_supplement_links( + supplement_links, {IN_MEMORY_EXTENSION: [instructions]}) + return supplement_links + except requests.exceptions.HTTPError as exception: + logging.error('Could not download peer assignment %s: %s', + element_id, exception) + if is_debug_run(): + logging.exception('Could not download peer assignment %s: %s', + element_id, exception) + return None + def extract_links_from_supplement(self, element_id): """ Return a dictionary with supplement files (pdf, csv, zip, ipynb, html @@ -1209,6 +1243,41 @@ class CourseraOnDemand(object): ['assignmentInstructions']['definition']['value'] for element in dom['elements']] + def _extract_peer_assignment_text(self, element_id): + """ + Extract peer assignment text (instructions). + + @param element_id: Element id to extract peer assignment instructions from. + @type element_id: str + + @return: List of peer assignment text (instructions). + @rtype: [str] + """ + dom = get_page(self._session, OPENCOURSE_PEER_ASSIGNMENT_INSTRUCTIONS, + json=True, + user_id=self._user_id, + course_id=self._course_id, + element_id=element_id) + + result = [] + + for element in dom['elements']: + # There is only one section with Instructions + if 'introduction' in element['instructions']: + result.append(element['instructions']['introduction']['definition']['value']) + + # But there may be multiple sections in Sections + for section in element['instructions'].get('sections', []): + section_value = section['content']['definition']['value'] + section_title = section.get('title') + if section_title is not None: + # If section title is present, put it in the beginning of + # section value as if it was there. 
+ section_value = ('%s' % section_title) + section_value + result.append(section_value) + + return result + def _extract_links_from_text(self, text): """ Extract supplement links from the html text. Links may be provided diff --git a/coursera/define.py b/coursera/define.py index 1fc9d17..ba9ff66 100644 --- a/coursera/define.py +++ b/coursera/define.py @@ -185,6 +185,110 @@ ABOUT_URL = ('https://api.coursera.org/api/catalog.v1/courses?' AUTH_REDIRECT_URL = ('https://class.coursera.org/{class_name}' '/auth/auth_redirector?type=login&subtype=normal') +# Sample URL: +# +# https://www.coursera.org/api/onDemandPeerAssignmentInstructions.v1/?q=latest&userId=4958&courseId=RcnRZHHtEeWxvQr3acyajw&itemId=2yTvX&includes=gradingMetadata%2CreviewSchemas%2CsubmissionSchemas&fields=instructions%2ConDemandPeerAssignmentGradingMetadata.v1(requiredAuthoredReviewCount%2CisMentorGraded%2CassignmentDetails)%2ConDemandPeerReviewSchemas.v1(reviewSchema)%2ConDemandPeerSubmissionSchemas.v1(submissionSchema) +# +# Sample response: +# +# { +# "elements": [ +# { +# "instructions": { +# "introduction": { +# "typeName": "cml", +# "definition": { +# "dtdId": "assess/1", +# "value": "Ваше первое задание заключается в установке Python и библиотек.." +# } +# }, +# "sections": [ +# { +# "typeId": "unknown", +# "title": "Review criteria", +# "content": { +# "typeName": "cml", +# "definition": { +# "dtdId": "assess/1", +# "value": "В результате работы вы установите на компьютер Python и библиотеки, необходимые для дальнейшего прохождения курса.." 
+# } +# } +# } +# ] +# }, +# "id": "4958~RcnRZHHtEeWxvQr3acyajw~2yTvX~8x7Qhs66EeW2Tw715xhIPQ@13" +# } +# ], +# "paging": {}, +# "linked": { +# "onDemandPeerSubmissionSchemas.v1": [ +# { +# "submissionSchema": { +# "parts": [ +# { +# "details": { +# "typeName": "fileUpload", +# "definition": { +# "required": false +# } +# }, +# "id": "_fcfP3bPT5W4pkfkshmUAQ", +# "prompt": { +# "typeName": "cml", +# "definition": { +# "dtdId": "assess/1", +# "value": "Загрузите скриншот №1." +# } +# } +# }, +# { +# "details": { +# "typeName": "fileUpload", +# "definition": { +# "required": false +# } +# }, +# "id": "92ea4b4e-3492-41eb-ee32-2624ee807bd3", +# "prompt": { +# "typeName": "cml", +# "definition": { +# "dtdId": "assess/1", +# "value": "Загрузите скриншот №2." +# } +# } +# } +# ] +# }, +# "id": "4958~RcnRZHHtEeWxvQr3acyajw~2yTvX~8x7Qhs66EeW2Tw715xhIPQ@13" +# } +# ], +# "onDemandPeerAssignmentGradingMetadata.v1": [ +# { +# "assignmentDetails": { +# "typeName": "phased", +# "definition": { +# "receivedReviewCutoffs": { +# "count": 3 +# }, +# "passingFraction": 0.8 +# } +# }, +# "requiredAuthoredReviewCount": 3, +# "isMentorGraded": false, +# "id": "4958~RcnRZHHtEeWxvQr3acyajw~2yTvX~8x7Qhs66EeW2Tw715xhIPQ@13" +# } +# ], +# "onDemandPeerReviewSchemas.v1": [] +# } +# } +# +# This URL is used to retrieve "phasedPeer" typename instructions' contents +OPENCOURSE_PEER_ASSIGNMENT_INSTRUCTIONS = ( + 'https://www.coursera.org/api/onDemandPeerAssignmentInstructions.v1/?' 
+ 'q=latest&userId={user_id}&courseId={course_id}&itemId={element_id}&' + 'includes=gradingMetadata%2CreviewSchemas%2CsubmissionSchemas&' + 'fields=instructions%2ConDemandPeerAssignmentGradingMetadata.v1(requiredAuthoredReviewCount%2CisMentorGraded%2CassignmentDetails)%2ConDemandPeerReviewSchemas.v1(reviewSchema)%2ConDemandPeerSubmissionSchemas.v1(submissionSchema)') + #POST_OPENCOURSE_API_QUIZ_SESSION = 'https://www.coursera.org/api/opencourse.v1/user/4958/course/text-mining/item/7OQHc/quiz/session' # Sample response: # diff --git a/coursera/extractors.py b/coursera/extractors.py index 6e6d631..7d23b96 100644 --- a/coursera/extractors.py +++ b/coursera/extractors.py @@ -149,6 +149,9 @@ class CourseraExtractor(PlatformExtractor): elif typename == 'supplement': links = course.extract_links_from_supplement(lecture['id']) + elif typename == 'phasedPeer': + links = course.extract_links_from_peer_assignment(lecture['id']) + elif typename in ('gradedProgramming', 'ungradedProgramming'): links = course.extract_links_from_programming(lecture['id']) @@ -163,15 +166,17 @@ class CourseraExtractor(PlatformExtractor): elif typename == 'programming': if download_quizzes: links = course.extract_links_from_programming_immediate_instructions(lecture['id']) - + elif typename == 'notebook': if download_notebooks and self._notebook_downloaded == False: logging.warning('According to notebooks platform, content will be downloaded first') links = course.extract_links_from_notebook(lecture['id']) self._notebook_downloaded = True + else: - logging.info('Unsupported typename "%s" in lecture "%s"', - typename, lecture_slug) + logging.info( + 'Unsupported typename "%s" in lecture "%s" (lecture id "%s")', + typename, lecture_slug, lecture['id']) continue if links is None: From 1ed4490b5b0667d97e209ddd20631649d48637b7 Mon Sep 17 00:00:00 2001 From: Yuri Bochkarev Date: Sun, 25 Mar 2018 16:17:34 +0300 Subject: [PATCH 29/87] Add tests for "phasedPeer" typename (peer assignment instructions) --- 
coursera/test/test_api.py | 24 ++++++++++++++++++++++-- coursera/test/utils.py | 34 ++++++++++++++++++++++++++++++++++ 2 files changed, 56 insertions(+), 2 deletions(-) diff --git a/coursera/test/test_api.py b/coursera/test/test_api.py index fbd445d..063d274 100644 --- a/coursera/test/test_api.py +++ b/coursera/test/test_api.py @@ -10,7 +10,7 @@ from mock import patch, Mock from coursera import api from coursera import define -from coursera.test.utils import slurp_fixture +from coursera.test.utils import slurp_fixture, links_to_plain_text from coursera.utils import BeautifulSoup from requests.exceptions import HTTPError @@ -139,9 +139,28 @@ def test_ondemand_programming_supplement_no_instructions(get_page, course): assert {} == output +@patch('coursera.api.get_page') +@pytest.mark.parametrize( + "input_filename,expected_output", [ + ('peer-assignment-instructions-all.json', 'intro Review criteria section'), + ('peer-assignment-instructions-no-title.json', 'intro section'), + ('peer-assignment-instructions-only-introduction.json', 'intro'), + ('peer-assignment-instructions-only-sections.json', 'Review criteria section'), + ('peer-assignment-no-instructions.json', ''), + ] +) +def test_ondemand_from_peer_assgnment_instructions( + get_page, course, input_filename, expected_output): + instructions = slurp_fixture('json/%s' % input_filename) + get_page.return_value = json.loads(instructions) + + output = course.extract_links_from_peer_assignment('0') + assert expected_output == links_to_plain_text(output) + + @patch('coursera.api.get_page') def test_ondemand_from_programming_immediate_instructions_no_instructions( - get_page, course): + get_page, course): no_instructions = slurp_fixture( 'json/supplement-programming-immediate-instructions-no-instructions.json') get_page.return_value = json.loads(no_instructions) @@ -149,6 +168,7 @@ def test_ondemand_from_programming_immediate_instructions_no_instructions( output = 
course.extract_links_from_programming_immediate_instructions('0') assert {} == output + @patch('coursera.api.get_page') def test_ondemand_programming_supplement_empty_instructions(get_page, course): empty_instructions = slurp_fixture('json/supplement-programming-empty-instructions.json') diff --git a/coursera/test/utils.py b/coursera/test/utils.py index cc6805e..0e8e1a2 100644 --- a/coursera/test/utils.py +++ b/coursera/test/utils.py @@ -2,9 +2,43 @@ Helper functions that are only used in tests. """ import os +import re from io import open +from six import iteritems + +from coursera.define import IN_MEMORY_MARKER +from coursera.utils import BeautifulSoup + def slurp_fixture(path): return open(os.path.join(os.path.dirname(__file__), "fixtures", path), encoding='utf8').read() + + +def links_to_plain_text(links): + """ + Converts extracted links into text and cleans up extra whitespace. Only HTML + sections are converted. This is a helper to be used in tests. + + @param links: Links obtained from such methods as extract_links_from_peer_assignment. + @type links: @see CourseraOnDemand._extract_links_from_text + + @return: HTML converted to plain text with extra space removed. 
+ @rtype: str + """ + result = [] + for filetype, contents in iteritems(links): + if filetype != 'html': + continue + + for content, _prefix in contents: + if content.startswith(IN_MEMORY_MARKER): + content = content[len(IN_MEMORY_MARKER):] + + soup = BeautifulSoup(content) + [script.extract() for script in soup(["script", "style"])] + text = re.sub(r'[ \t\r\n]+', ' ', soup.get_text()).strip() + result.append(text) + + return ''.join(result) From fda7e337c3f0307b1c2fba628b9f51000585cd66 Mon Sep 17 00:00:00 2001 From: Yuri Bochkarev Date: Sun, 25 Mar 2018 16:18:01 +0300 Subject: [PATCH 30/87] Add fixtures to test "phasedPeer" --- .../peer-assignment-instructions-all.json | 29 +++++++++++++++++++ ...peer-assignment-instructions-no-title.json | 28 ++++++++++++++++++ ...gnment-instructions-only-introduction.json | 16 ++++++++++ ...assignment-instructions-only-sections.json | 22 ++++++++++++++ .../json/peer-assignment-no-instructions.json | 4 +++ 5 files changed, 99 insertions(+) create mode 100644 coursera/test/fixtures/json/peer-assignment-instructions-all.json create mode 100644 coursera/test/fixtures/json/peer-assignment-instructions-no-title.json create mode 100644 coursera/test/fixtures/json/peer-assignment-instructions-only-introduction.json create mode 100644 coursera/test/fixtures/json/peer-assignment-instructions-only-sections.json create mode 100644 coursera/test/fixtures/json/peer-assignment-no-instructions.json diff --git a/coursera/test/fixtures/json/peer-assignment-instructions-all.json b/coursera/test/fixtures/json/peer-assignment-instructions-all.json new file mode 100644 index 0000000..70d9f5c --- /dev/null +++ b/coursera/test/fixtures/json/peer-assignment-instructions-all.json @@ -0,0 +1,29 @@ +{ + "elements": [ + { + "instructions": { + "introduction": { + "typeName": "cml", + "definition": { + "dtdId": "assess/1", + "value": "intro" + } + }, + "sections": [ + { + "typeId": "unknown", + "title": "Review criteria", + "content": { + "typeName": "cml", 
+ "definition": { + "dtdId": "assess/1", + "value": "section" + } + } + } + ] + }, + "id": "4958~RcnRZHHtEeWxvQr3acyajw~2yTvX~8x7Qhs66EeW2Tw715xhIPQ@13" + } + ] +} diff --git a/coursera/test/fixtures/json/peer-assignment-instructions-no-title.json b/coursera/test/fixtures/json/peer-assignment-instructions-no-title.json new file mode 100644 index 0000000..f210263 --- /dev/null +++ b/coursera/test/fixtures/json/peer-assignment-instructions-no-title.json @@ -0,0 +1,28 @@ +{ + "elements": [ + { + "instructions": { + "introduction": { + "typeName": "cml", + "definition": { + "dtdId": "assess/1", + "value": "intro" + } + }, + "sections": [ + { + "typeId": "unknown", + "content": { + "typeName": "cml", + "definition": { + "dtdId": "assess/1", + "value": "section" + } + } + } + ] + }, + "id": "4958~RcnRZHHtEeWxvQr3acyajw~2yTvX~8x7Qhs66EeW2Tw715xhIPQ@13" + } + ] +} diff --git a/coursera/test/fixtures/json/peer-assignment-instructions-only-introduction.json b/coursera/test/fixtures/json/peer-assignment-instructions-only-introduction.json new file mode 100644 index 0000000..7a186c4 --- /dev/null +++ b/coursera/test/fixtures/json/peer-assignment-instructions-only-introduction.json @@ -0,0 +1,16 @@ +{ + "elements": [ + { + "instructions": { + "introduction": { + "typeName": "cml", + "definition": { + "dtdId": "assess/1", + "value": "intro" + } + } + }, + "id": "4958~RcnRZHHtEeWxvQr3acyajw~2yTvX~8x7Qhs66EeW2Tw715xhIPQ@13" + } + ] +} diff --git a/coursera/test/fixtures/json/peer-assignment-instructions-only-sections.json b/coursera/test/fixtures/json/peer-assignment-instructions-only-sections.json new file mode 100644 index 0000000..7cd735c --- /dev/null +++ b/coursera/test/fixtures/json/peer-assignment-instructions-only-sections.json @@ -0,0 +1,22 @@ +{ + "elements": [ + { + "instructions": { + "sections": [ + { + "typeId": "unknown", + "title": "Review criteria", + "content": { + "typeName": "cml", + "definition": { + "dtdId": "assess/1", + "value": "section" + } + } + } + ] + 
}, + "id": "4958~RcnRZHHtEeWxvQr3acyajw~2yTvX~8x7Qhs66EeW2Tw715xhIPQ@13" + } + ] +} diff --git a/coursera/test/fixtures/json/peer-assignment-no-instructions.json b/coursera/test/fixtures/json/peer-assignment-no-instructions.json new file mode 100644 index 0000000..9764791 --- /dev/null +++ b/coursera/test/fixtures/json/peer-assignment-no-instructions.json @@ -0,0 +1,4 @@ +{ + "elements": [ + ] +} From 7d6d0909abec759a1205250944df48c297aff346 Mon Sep 17 00:00:00 2001 From: Yuri Bochkarev Date: Sun, 8 Apr 2018 21:47:37 +0300 Subject: [PATCH 31/87] Mention how to configure timeouts for an external downloader (aria2c) ref #453 ref #626 --- README.md | 25 +++++++++++++++++++++++++ 1 file changed, 25 insertions(+) diff --git a/README.md b/README.md index 96736a3..5236470 100644 --- a/README.md +++ b/README.md @@ -20,6 +20,7 @@ - [Resuming downloads](#resuming-downloads) - [Troubleshooting](#troubleshooting) - [China issues](#china-issues) + - [Download timeouts](#download-timeouts) - [Found 0 sections and 0 lectures on this page](#found-0-sections-and-0-lectures-on-this-page) - [Windows: Proxy support](#windows-proxy-support) - [Windows: Failed to create process](#windows-failed-to-create-process) @@ -424,6 +425,30 @@ Alternatively you may want to try this Chrome extension: https://chrome.google.c If none of the above works for you, there is nothing we can do. +## Download timeouts + +Coursera-dl supports external downloaders but note that they are only used to +download materials after the syllabus has been parsed, e.g. videos, PDFs, some +handouts and additional files (syllabus is always downloaded using the internal +downloader). If you experience problems with downloading such materials, you may +want to start using external downloader and configure its timeout values. For +example, you can use aria2c downloader by passing `--aria2` option: + +``` +coursera-dl -n --path .
--aria2 +``` + +And put this into aria2c's configuration file `~/.aria2/aria2.conf` to reduce +timeouts: + +``` +connect-timeout=2 +timeout=2 +bt-stop-timeout=1 +``` + +Timeout configuration for internal downloader is not supported. + ## Windows: proxy support If you're on Windows behind a proxy, set up the environment variables From 564c741755fd0f6880650caa88b9fbeb5845d6c6 Mon Sep 17 00:00:00 2001 From: NoUrEdDiN Date: Mon, 7 May 2018 10:11:48 +0200 Subject: [PATCH 32/87] improve and use clean_filename `clean_filename` wasn't used; it's now used before making directories or write files under the notebook folder. `clean_filename` is improved a little, to handle more Windows-specific edge cases. And the existing tests were updated accordingly. --- coursera/api.py | 2 ++ coursera/test/test_utils.py | 4 ++-- coursera/utils.py | 13 ++++++++++++- 3 files changed, 16 insertions(+), 3 deletions(-) diff --git a/coursera/api.py b/coursera/api.py index a8859e0..e9c5000 100644 --- a/coursera/api.py +++ b/coursera/api.py @@ -490,6 +490,8 @@ class CourseraOnDemand(object): filename, extension = os.path.splitext(clean_url(tmpUrl)) head, tail = os.path.split(content['path']) + head = '/'.join([clean_filename(dir, minimal_change=True) for dir in head.split('/')]) + tail = clean_filename(tail, minimal_change=True) if os.path.isdir(self._course_name + "/notebook/" + head + "/") == False: logging.info('Creating [{}] directories...'.format(head)) diff --git a/coursera/test/test_utils.py b/coursera/test/test_utils.py index ec198d5..8724519 100644 --- a/coursera/test/test_utils.py +++ b/coursera/test/test_utils.py @@ -34,7 +34,7 @@ from coursera.utils import total_seconds, is_course_complete ('Week 3: Data and Abstraction', 'Week_3-_Data_and_Abstraction'), ('  (Week 1) BRANDING: Marketing Strategy and Brand Positioning', 'Week_1_BRANDING-__Marketing_Strategy_and_Brand_Positioning'), - ('test & " adfas', 'test___adfas'), + ('test & " adfas', 'test__-_adfas'), # `"` were changed first to 
`-` (' ', ''), ('☂℮﹩т ω☤☂ℌ Ṳᾔ☤ḉ◎ⅾε', '__') ] @@ -54,7 +54,7 @@ def test_clean_filename(unclean, clean): 'Week 3- Data and Abstraction'), ('  (Week 1) BRANDING: Marketing Strategy and Brand Positioning', '  (Week 1) BRANDING- Marketing Strategy and Brand Positioning'), - ('test & " adfas', 'test & " adfas'), + ('test & " adfas', 'test & - adfas'), # `"` are forbidden on Windows (' ', u'\xa0'), ('☂℮﹩т ω☤☂ℌ Ṳᾔ☤ḉ◎ⅾε', '☂℮﹩т ω☤☂ℌ Ṳᾔ☤ḉ◎ⅾε') ] diff --git a/coursera/utils.py b/coursera/utils.py index 6fd4814..87c115e 100644 --- a/coursera/utils.py +++ b/coursera/utils.py @@ -106,13 +106,24 @@ def clean_filename(s, minimal_change=False): s = unquote_plus(s) # Strip forbidden characters + # https://msdn.microsoft.com/en-us/library/windows/desktop/aa365247(v=vs.85).aspx s = ( s.replace(':', '-') .replace('/', '-') + .replace('<', '-') + .replace('>', '-') + .replace('"', '-') + .replace('\\', '-') + .replace('|', '-') + .replace('?', '-') + .replace('*', '-') .replace('\x00', '-') - .replace('\n', '') + .replace('\n', ' ') ) + # Remove trailing dots and spaces; forbidden on Windows + s = s.rstrip(' .') + if minimal_change: return s From b01bde501e3e4a5159ea2f19d2118b50edca2bb6 Mon Sep 17 00:00:00 2001 From: Dharmanshu Saini <36137804+dharmanshu24@users.noreply.github.com> Date: Thu, 31 May 2018 21:00:17 +0530 Subject: [PATCH 33/87] Update Readme.md to run without -p field --- README.md | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/README.md b/README.md index 5236470..1487b68 100644 --- a/README.md +++ b/README.md @@ -247,6 +247,10 @@ credentials (e.g. email address and password or a `~/.netrc` file), the class names, as well as any additional parameters: General: coursera-dl -u -p modelthinking-004 + +If you don't want to type your password in command line as plain text, you can use the script without `-p` option. In this case you will be prompted for password once the script is run. 
+ + Without -p field: coursera-dl -u modelthinking-004 Multiple classes: coursera-dl -u -p saas historyofrock1-001 algo-2012-002 Filter by section name: coursera-dl -u -p -sf "Chapter_Four" crypto-004 Filter by lecture name: coursera-dl -u -p -lf "3.1_" ml-2012-002 From 32e95d0d1cd9ccaf2b0e72de9fb9fc908ef7e02d Mon Sep 17 00:00:00 2001 From: TheGoddessInari Date: Fri, 1 Jun 2018 19:25:10 -0700 Subject: [PATCH 34/87] Switch to API subdomain for API URL defines. Started hitting errors today, switch from www.coursera.org/api/ to api.coursera.org/api/ to fix. --- coursera/define.py | 54 +++++++++++++++++++++++----------------------- 1 file changed, 27 insertions(+), 27 deletions(-) diff --git a/coursera/define.py b/coursera/define.py index ba9ff66..a50272a 100644 --- a/coursera/define.py +++ b/coursera/define.py @@ -13,11 +13,11 @@ HTTP_FORBIDDEN = 403 COURSERA_URL = 'https://www.coursera.org' AUTH_URL = 'https://accounts.coursera.org/api/v1/login' -AUTH_URL_V3 = 'https://www.coursera.org/api/login/v3' +AUTH_URL_V3 = 'https://api.coursera.org/api/login/v3' CLASS_URL = 'https://class.coursera.org/{class_name}' # The following link is left just for illustative purposes: -# https://www.coursera.org/api/courses.v1?fields=display%2CpartnerIds%2CphotoUrl%2CstartDate%2Cpartners.v1(homeLink%2Cname)&includes=partnerIds&q=watchlist&start=0 +# https://api.coursera.org/api/courses.v1?fields=display%2CpartnerIds%2CphotoUrl%2CstartDate%2Cpartners.v1(homeLink%2Cname)&includes=partnerIds&q=watchlist&start=0 # Reply is as follows: # { # "elements": [ @@ -34,10 +34,10 @@ CLASS_URL = 'https://class.coursera.org/{class_name}' # }, # "linked": {} # } -OPENCOURSE_LIST_COURSES = 'https://www.coursera.org/api/courses.v1?q=watchlist&start={start}' +OPENCOURSE_LIST_COURSES = 'https://api.coursera.org/api/courses.v1?q=watchlist&start={start}' # The following link is left just for illustative purposes: -# 
https://www.coursera.org/api/memberships.v1?fields=courseId,enrolledTimestamp,grade,id,lastAccessedTimestamp,onDemandSessionMembershipIds,onDemandSessionMemberships,role,v1SessionId,vc,vcMembershipId,courses.v1(courseStatus,display,partnerIds,photoUrl,specializations,startDate,v1Details,v2Details),partners.v1(homeLink,name),v1Details.v1(sessionIds),v1Sessions.v1(active,certificatesReleased,dbEndDate,durationString,hasSigTrack,startDay,startMonth,startYear),v2Details.v1(onDemandSessions,plannedLaunchDate,sessionsEnabledAt),specializations.v1(logo,name,partnerIds,shortName)&includes=courseId,onDemandSessionMemberships,vcMembershipId,courses.v1(partnerIds,specializations,v1Details,v2Details),v1Details.v1(sessionIds),v2Details.v1(onDemandSessions),specializations.v1(partnerIds)&q=me&showHidden=true&filter=current,preEnrolled +# https://api.coursera.org/api/memberships.v1?fields=courseId,enrolledTimestamp,grade,id,lastAccessedTimestamp,onDemandSessionMembershipIds,onDemandSessionMemberships,role,v1SessionId,vc,vcMembershipId,courses.v1(courseStatus,display,partnerIds,photoUrl,specializations,startDate,v1Details,v2Details),partners.v1(homeLink,name),v1Details.v1(sessionIds),v1Sessions.v1(active,certificatesReleased,dbEndDate,durationString,hasSigTrack,startDay,startMonth,startYear),v2Details.v1(onDemandSessions,plannedLaunchDate,sessionsEnabledAt),specializations.v1(logo,name,partnerIds,shortName)&includes=courseId,onDemandSessionMemberships,vcMembershipId,courses.v1(partnerIds,specializations,v1Details,v2Details),v1Details.v1(sessionIds),v2Details.v1(onDemandSessions),specializations.v1(partnerIds)&q=me&showHidden=true&filter=current,preEnrolled # Sample reply: # { # "elements": [ @@ -60,19 +60,19 @@ OPENCOURSE_LIST_COURSES = 'https://www.coursera.org/api/courses.v1?q=watchlist&s # ] # } # } -OPENCOURSE_MEMBERSHIPS = 'https://www.coursera.org/api/memberships.v1?includes=courseId,courses.v1&q=me&showHidden=true&filter=current,preEnrolled' -OPENCOURSE_CONTENT_URL = 
'https://www.coursera.org/api/opencourse.v1/course/{class_name}?showLockedItems=true' -OPENCOURSE_VIDEO_URL = 'https://www.coursera.org/api/opencourse.v1/video/{video_id}' -OPENCOURSE_SUPPLEMENT_URL = 'https://www.coursera.org/api/onDemandSupplements.v1/'\ +OPENCOURSE_MEMBERSHIPS = 'https://api.coursera.org/api/memberships.v1?includes=courseId,courses.v1&q=me&showHidden=true&filter=current,preEnrolled' +OPENCOURSE_CONTENT_URL = 'https://api.coursera.org/api/opencourse.v1/course/{class_name}?showLockedItems=true' +OPENCOURSE_VIDEO_URL = 'https://api.coursera.org/api/opencourse.v1/video/{video_id}' +OPENCOURSE_SUPPLEMENT_URL = 'https://api.coursera.org/api/onDemandSupplements.v1/'\ '{course_id}~{element_id}?includes=asset&fields=openCourseAssets.v1%28typeName%29,openCourseAssets.v1%28definition%29' OPENCOURSE_PROGRAMMING_ASSIGNMENTS_URL = \ - 'https://www.coursera.org/api/onDemandProgrammingLearnerAssignments.v1/{course_id}~{element_id}?fields=submissionLearnerSchema' + 'https://api.coursera.org/api/onDemandProgrammingLearnerAssignments.v1/{course_id}~{element_id}?fields=submissionLearnerSchema' OPENCOURSE_PROGRAMMING_IMMEDIATE_INSTRUCTIOINS_URL = \ - 'https://www.coursera.org/api/onDemandProgrammingImmediateInstructions.v1/{course_id}~{element_id}' + 'https://api.coursera.org/api/onDemandProgrammingImmediateInstructions.v1/{course_id}~{element_id}' OPENCOURSE_REFERENCES_POLL_URL = \ - "https://www.coursera.org/api/onDemandReferences.v1/?courseId={course_id}&q=courseListed&fields=name%2CshortId%2Cslug%2Ccontent&includes=assets" + "https://api.coursera.org/api/onDemandReferences.v1/?courseId={course_id}&q=courseListed&fields=name%2CshortId%2Cslug%2Ccontent&includes=assets" OPENCOURSE_REFERENCE_ITEM_URL = \ - "https://www.coursera.org/api/onDemandReferences.v1/?courseId={course_id}&q=shortId&shortId={short_id}&fields=name%2CshortId%2Cslug%2Ccontent&includes=assets" + 
"https://api.coursera.org/api/onDemandReferences.v1/?courseId={course_id}&q=shortId&shortId={short_id}&fields=name%2CshortId%2Cslug%2Ccontent&includes=assets" # These are ids that are present in tag in assignment text: # @@ -95,7 +95,7 @@ OPENCOURSE_REFERENCE_ITEM_URL = \ # "linked": null # } OPENCOURSE_ASSET_URL = \ - 'https://www.coursera.org/api/assetUrls.v1?ids={ids}' + 'https://api.coursera.org/api/assetUrls.v1?ids={ids}' # These ids are provided in lecture json: # @@ -143,7 +143,7 @@ OPENCOURSE_ASSET_URL = \ # "linked": null # } OPENCOURSE_ASSETS_URL = \ - 'https://www.coursera.org/api/openCourseAssets.v1/{id}' + 'https://api.coursera.org/api/openCourseAssets.v1/{id}' # These asset ids are ids returned from OPENCOURSE_ASSETS_URL request: # See example above. @@ -166,10 +166,10 @@ OPENCOURSE_ASSETS_URL = \ # "linked": null # } OPENCOURSE_API_ASSETS_V1_URL = \ - 'https://www.coursera.org/api/assets.v1?ids={id}' + 'https://api.coursera.org/api/assets.v1?ids={id}' OPENCOURSE_ONDEMAND_COURSE_MATERIALS = \ - 'https://www.coursera.org/api/onDemandCourseMaterials.v1/?'\ + 'https://api.coursera.org/api/onDemandCourseMaterials.v1/?'\ 'q=slug&slug={class_name}&includes=moduleIds%2ClessonIds%2CpassableItemGroups%2CpassableItemGroupChoices%2CpassableLessonElements%2CitemIds%2Ctracks'\ '&fields=moduleIds%2ConDemandCourseMaterialModules.v1(name%2Cslug%2Cdescription%2CtimeCommitment%2ClessonIds%2Coptional)%2ConDemandCourseMaterialLessons.v1(name%2Cslug%2CtimeCommitment%2CelementIds%2Coptional%2CtrackId)%2ConDemandCourseMaterialPassableItemGroups.v1(requiredPassedCount%2CpassableItemGroupChoiceIds%2CtrackId)%2ConDemandCourseMaterialPassableItemGroupChoices.v1(name%2Cdescription%2CitemIds)%2ConDemandCourseMaterialPassableLessonElements.v1(gradingWeight)%2ConDemandCourseMaterialItems.v1(name%2Cslug%2CtimeCommitment%2Ccontent%2CisLocked%2ClockableByItem%2CitemLockedReasonCode%2CtrackId)%2ConDemandCourseMaterialTracks.v1(passablesCount)'\ '&showLockedItems=true' @@ -187,7 +187,7 
@@ AUTH_REDIRECT_URL = ('https://class.coursera.org/{class_name}' # Sample URL: # -# https://www.coursera.org/api/onDemandPeerAssignmentInstructions.v1/?q=latest&userId=4958&courseId=RcnRZHHtEeWxvQr3acyajw&itemId=2yTvX&includes=gradingMetadata%2CreviewSchemas%2CsubmissionSchemas&fields=instructions%2ConDemandPeerAssignmentGradingMetadata.v1(requiredAuthoredReviewCount%2CisMentorGraded%2CassignmentDetails)%2ConDemandPeerReviewSchemas.v1(reviewSchema)%2ConDemandPeerSubmissionSchemas.v1(submissionSchema) +# https://api.coursera.org/api/onDemandPeerAssignmentInstructions.v1/?q=latest&userId=4958&courseId=RcnRZHHtEeWxvQr3acyajw&itemId=2yTvX&includes=gradingMetadata%2CreviewSchemas%2CsubmissionSchemas&fields=instructions%2ConDemandPeerAssignmentGradingMetadata.v1(requiredAuthoredReviewCount%2CisMentorGraded%2CassignmentDetails)%2ConDemandPeerReviewSchemas.v1(reviewSchema)%2ConDemandPeerSubmissionSchemas.v1(submissionSchema) # # Sample response: # @@ -284,12 +284,12 @@ AUTH_REDIRECT_URL = ('https://class.coursera.org/{class_name}' # # This URL is used to retrieve "phasedPeer" typename instructions' contents OPENCOURSE_PEER_ASSIGNMENT_INSTRUCTIONS = ( - 'https://www.coursera.org/api/onDemandPeerAssignmentInstructions.v1/?' + 'https://api.coursera.org/api/onDemandPeerAssignmentInstructions.v1/?' 
'q=latest&userId={user_id}&courseId={course_id}&itemId={element_id}&' 'includes=gradingMetadata%2CreviewSchemas%2CsubmissionSchemas&' 'fields=instructions%2ConDemandPeerAssignmentGradingMetadata.v1(requiredAuthoredReviewCount%2CisMentorGraded%2CassignmentDetails)%2ConDemandPeerReviewSchemas.v1(reviewSchema)%2ConDemandPeerSubmissionSchemas.v1(submissionSchema)') -#POST_OPENCOURSE_API_QUIZ_SESSION = 'https://www.coursera.org/api/opencourse.v1/user/4958/course/text-mining/item/7OQHc/quiz/session' +#POST_OPENCOURSE_API_QUIZ_SESSION = 'https://api.coursera.org/api/opencourse.v1/user/4958/course/text-mining/item/7OQHc/quiz/session' # Sample response: # # { @@ -305,9 +305,9 @@ OPENCOURSE_PEER_ASSIGNMENT_INSTRUCTIONS = ( # "progressState": "Started" # } # } -POST_OPENCOURSE_API_QUIZ_SESSION = 'https://www.coursera.org/api/opencourse.v1/user/{user_id}/course/{class_name}/item/{quiz_id}/quiz/session' +POST_OPENCOURSE_API_QUIZ_SESSION = 'https://api.coursera.org/api/opencourse.v1/user/{user_id}/course/{class_name}/item/{quiz_id}/quiz/session' -#POST_OPENCOURSE_API_QUIZ_SESSION_GET_STATE = 'https://www.coursera.org/api/opencourse.v1/user/4958/course/text-mining/item/7OQHc/quiz/session/opencourse~bVgqTevEEeWvGQrWsIkLlw:4958:BiNDdOvPEeWAkwqbKEEh3w@13:1468773901987@1/action/getState?autoEnroll=false' +#POST_OPENCOURSE_API_QUIZ_SESSION_GET_STATE = 'https://api.coursera.org/api/opencourse.v1/user/4958/course/text-mining/item/7OQHc/quiz/session/opencourse~bVgqTevEEeWvGQrWsIkLlw:4958:BiNDdOvPEeWAkwqbKEEh3w@13:1468773901987@1/action/getState?autoEnroll=false' # Sample response: # # { @@ -389,9 +389,9 @@ POST_OPENCOURSE_API_QUIZ_SESSION = 'https://www.coursera.org/api/opencourse.v1/u # } # } # -POST_OPENCOURSE_API_QUIZ_SESSION_GET_STATE = 'https://www.coursera.org/api/opencourse.v1/user/{user_id}/course/{class_name}/item/{quiz_id}/quiz/session/{session_id}/action/getState?autoEnroll=false' +POST_OPENCOURSE_API_QUIZ_SESSION_GET_STATE = 
'https://api.coursera.org/api/opencourse.v1/user/{user_id}/course/{class_name}/item/{quiz_id}/quiz/session/{session_id}/action/getState?autoEnroll=false' -#POST_OPENCOURSE_ONDEMAND_EXAM_SESSIONS = 'https://www.coursera.org/api/onDemandExamSessions.v1/-N44X0IJEeWpogr5ZO8qxQ~YV0W4~10!~1467462079068/actions?includes=gradingAttempts' +#POST_OPENCOURSE_ONDEMAND_EXAM_SESSIONS = 'https://api.coursera.org/api/onDemandExamSessions.v1/-N44X0IJEeWpogr5ZO8qxQ~YV0W4~10!~1467462079068/actions?includes=gradingAttempts' # Sample response: # # { @@ -532,14 +532,14 @@ POST_OPENCOURSE_API_QUIZ_SESSION_GET_STATE = 'https://www.coursera.org/api/openc # Request payload: # {"courseId":"-N44X0IJEeWpogr5ZO8qxQ","itemId":"YV0W4"} # -#POST_OPENCOURSE_ONDEMAND_EXAM_SESSIONS = 'https://www.coursera.org/api/onDemandExamSessions.v1/-N44X0IJEeWpogr5ZO8qxQ~YV0W4~10!~1467462079068/actions?includes=gradingAttempts' +#POST_OPENCOURSE_ONDEMAND_EXAM_SESSIONS = 'https://api.coursera.org/api/onDemandExamSessions.v1/-N44X0IJEeWpogr5ZO8qxQ~YV0W4~10!~1467462079068/actions?includes=gradingAttempts' # Response for this request is empty. Result (session_id) should be taken # either from Location header or from X-Coursera-Id header. 
# # Request payload: # {"courseId":"-N44X0IJEeWpogr5ZO8qxQ","itemId":"YV0W4"} -POST_OPENCOURSE_ONDEMAND_EXAM_SESSIONS = 'https://www.coursera.org/api/onDemandExamSessions.v1' +POST_OPENCOURSE_ONDEMAND_EXAM_SESSIONS = 'https://api.coursera.org/api/onDemandExamSessions.v1' # Sample response: # { @@ -851,7 +851,7 @@ POST_OPENCOURSE_ONDEMAND_EXAM_SESSIONS = 'https://www.coursera.org/api/onDemandE # # Request payload: # {"name":"getState","argument":[]} -POST_OPENCOURSE_ONDEMAND_EXAM_SESSIONS_GET_STATE = 'https://www.coursera.org/api/onDemandExamSessions.v1/{session_id}/actions?includes=gradingAttempts' +POST_OPENCOURSE_ONDEMAND_EXAM_SESSIONS_GET_STATE = 'https://api.coursera.org/api/onDemandExamSessions.v1/{session_id}/actions?includes=gradingAttempts' ROOT = os.path.abspath(os.path.join(os.path.dirname(__file__), '..')) # define a per-user cache folder @@ -940,6 +940,6 @@ INSTRUCTIONS_HTML_INJECTION_AFTER ='''?config=TeX-AMS-MML_HTMLorMML"> # The following url is the root url (tree) for a Coursera Course OPENCOURSE_NOTEBOOK_DESCRIPTIONS = "https://hub.coursera-notebooks.org/hub/coursera_login?token={authId}&next=/" -OPENCOURSE_NOTEBOOK_LAUNCHES = "https://www.coursera.org/api/onDemandNotebookWorkspaceLaunches.v1/?fields=authorizationId%2CcontentPath%2CuseLegacySystem" +OPENCOURSE_NOTEBOOK_LAUNCHES = "https://api.coursera.org/api/onDemandNotebookWorkspaceLaunches.v1/?fields=authorizationId%2CcontentPath%2CuseLegacySystem" OPENCOURSE_NOTEBOOK_TREE = "https://hub.coursera-notebooks.org/user/{jupId}/api/contents/{path}?type=directory&_={timestamp}" OPENCOURSE_NOTEBOOK_DOWNLOAD = "https://hub.coursera-notebooks.org/user/{jupId}/files/{path}?download=1" From 362c21db55aeb8c2ae37c26c7b411f7699f03a07 Mon Sep 17 00:00:00 2001 From: Yuri Bochkarev Date: Sat, 2 Jun 2018 19:15:57 +0300 Subject: [PATCH 35/87] Update CHANGELOG.md --- CHANGELOG.md | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 8806246..3a0d8c0 100644 --- a/CHANGELOG.md 
+++ b/CHANGELOG.md @@ -1,5 +1,14 @@ # Change Log +## 0.11.0 (2018-06-02) + +Features: + - Add support for "peer assignment" section (#650) + +Bugfixes: + - Switched to api.coursera.org subdomain for API requests (#660) + + ## 0.10.0 (2018-02-19) Features: From dd983468c8308c504d254f2ed9e880eb732bd295 Mon Sep 17 00:00:00 2001 From: Yuri Bochkarev Date: Sat, 2 Jun 2018 19:16:54 +0300 Subject: [PATCH 36/87] Bump version (0.10.0 -> 0.11.0) Features: - Add support for "peer assignment" section (#650) Bugfixes: - Switched to api.coursera.org subdomain for API requests (#660) --- coursera/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/coursera/__init__.py b/coursera/__init__.py index 9d1bb72..f323a57 100644 --- a/coursera/__init__.py +++ b/coursera/__init__.py @@ -1 +1 @@ -__version__ = '0.10.0' +__version__ = '0.11.0' From 82722d80c62b346d901f07253f28cfa9d9cdc3cd Mon Sep 17 00:00:00 2001 From: TheGoddessInari Date: Sat, 2 Jun 2018 12:58:57 -0700 Subject: [PATCH 37/87] setup.py: Match file.open encoding with the source code encoding. Hit encoding errors on Windows after 0.11, easiest solution is to match encodings. 
--- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index f489078..bdb6775 100644 --- a/setup.py +++ b/setup.py @@ -48,7 +48,7 @@ def read_file(filename, alt=None): lines = None try: - with open(filename) as f: + with open(filename, encoding='utf-8') as f: lines = f.read() except IOError: lines = [] if alt is None else alt From 699a9e03f322ff70d150908f04d1e4ef42eb6e7d Mon Sep 17 00:00:00 2001 From: Yuri Bochkarev Date: Sat, 2 Jun 2018 23:14:47 +0300 Subject: [PATCH 38/87] Add more files to ignore to MANIFEST.in --- MANIFEST.in | 30 ++++++++++++++++++++++++++++++ 1 file changed, 30 insertions(+) diff --git a/MANIFEST.in b/MANIFEST.in index a6c2ce8..c4b71c1 100644 --- a/MANIFEST.in +++ b/MANIFEST.in @@ -1,3 +1,33 @@ include requirements*.txt include CONTRIBUTING.md include LICENSE + +exclude .coveragerc +exclude .ctags +exclude .gitattributes +exclude .github/ISSUE_TEMPLATE.md +exclude .github/PULL_REQUEST_TEMPLATE.md +exclude .gitignore +exclude .travis.yml +exclude AUTHORS.md +exclude CHANGELOG.md +exclude README.md +exclude appveyor.yml +exclude appveyor/install.ps1 +exclude appveyor/run_with_env.cmd +exclude assets/hat-logo.svg +exclude coursera-dl +exclude coursera-dl.bat +exclude deploy/.netrc +exclude deploy/Dockerfile +exclude deploy/README.md +exclude deploy/build.sh +exclude deploy/download.sh +exclude fabfile.py +exclude tox.ini + +prune appveyor/ +prune assets/ +prune deploy/ +prune coursera/test/ +prune .github/ From 6dccacd464eb6d0965589fa156b8d83ea831228b Mon Sep 17 00:00:00 2001 From: Yuri Bochkarev Date: Sat, 2 Jun 2018 23:16:29 +0300 Subject: [PATCH 39/87] Update CHANGELOG.md --- CHANGELOG.md | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 3a0d8c0..3828bcc 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,10 @@ # Change Log +## 0.11.1 (2018-06-02) + +Bugfixes: + - Specify utf-8 encoding in setup.py to fix installation on Windows (#662) + ## 
0.11.0 (2018-06-02) Features: @@ -8,7 +13,6 @@ Features: Bugfixes: - Switched to api.coursera.org subdomain for API requests (#660) - ## 0.10.0 (2018-02-19) Features: From bb62038650a0b95fc09ac69d30197c15e6ff9d12 Mon Sep 17 00:00:00 2001 From: Yuri Bochkarev Date: Sat, 2 Jun 2018 23:17:18 +0300 Subject: [PATCH 40/87] Bump version (0.11.0 -> 0.11.1) Bugfixes: - Specify utf-8 encoding in setup.py to fix installation on Windows (#662) --- coursera/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/coursera/__init__.py b/coursera/__init__.py index f323a57..ae4865c 100644 --- a/coursera/__init__.py +++ b/coursera/__init__.py @@ -1 +1 @@ -__version__ = '0.11.0' +__version__ = '0.11.1' From c98e83702ee20827bb1f8f3f436c2eca59e99c7e Mon Sep 17 00:00:00 2001 From: TheGoddessInari Date: Sat, 2 Jun 2018 19:10:08 -0700 Subject: [PATCH 41/87] Followup, change to api.coursera.org in the definition and tests. I wish I had time to track down the real URL that's preventing TLS negotiation, but this at least gets coursera-dl working with its own downloader again. 
--- coursera/define.py | 2 +- .../fixtures/json/video-output-1-all.json | 28 +++++++++---------- .../test/fixtures/json/video-output-1-en.json | 4 +-- .../test/fixtures/json/video-output-1.json | 8 +++--- .../test/fixtures/json/video-output-2.json | 8 +++--- coursera/test/test_workflow.py | 6 ++-- 6 files changed, 28 insertions(+), 28 deletions(-) diff --git a/coursera/define.py b/coursera/define.py index a50272a..e2ec105 100644 --- a/coursera/define.py +++ b/coursera/define.py @@ -11,7 +11,7 @@ import tempfile HTTP_FORBIDDEN = 403 -COURSERA_URL = 'https://www.coursera.org' +COURSERA_URL = 'https://api.coursera.org' AUTH_URL = 'https://accounts.coursera.org/api/v1/login' AUTH_URL_V3 = 'https://api.coursera.org/api/login/v3' CLASS_URL = 'https://class.coursera.org/{class_name}' diff --git a/coursera/test/fixtures/json/video-output-1-all.json b/coursera/test/fixtures/json/video-output-1-all.json index e8df433..1e04337 100644 --- a/coursera/test/fixtures/json/video-output-1-all.json +++ b/coursera/test/fixtures/json/video-output-1-all.json @@ -1,16 +1,16 @@ { - "zh-CN.txt": "https://www.coursera.org/api/subtitleAssetProxy.v1/UKEuZoMQRcChLmaDEMXAsA?expiry=1495238400000&hmac=eNyKwEu_aMQtn7bg0mUj6uIyVZvjahFSE5x2CrbOXOU&fileExtension=txt", - "en.srt": "https://www.coursera.org/api/subtitleAssetProxy.v1/GgGZN65HQkyBmTeuR2JMsw?expiry=1495238400000&hmac=afqFhv9FWfxxEeSka8PCA4ihiyX3g2Z6K4jWJPFlcdo&fileExtension=srt", - "zh-CN.srt": "https://www.coursera.org/api/subtitleAssetProxy.v1/UKEuZoMQRcChLmaDEMXAsA?expiry=1495238400000&hmac=nmGzGoF4oNLv28ZDLUtX5dF4xPXUABgym76XMs4UzDE&fileExtension=srt", - "en.txt": "https://www.coursera.org/api/subtitleAssetProxy.v1/GgGZN65HQkyBmTeuR2JMsw?expiry=1495238400000&hmac=2Z37WW5Rc7GoT0eft1vdK0HX5imBqoZTKULMTiZ2EjM&fileExtension=txt", - "hi.srt": "https://www.coursera.org/api/subtitleAssetProxy.v1/v2OWSJUVSqCjlkiVFRqgng?expiry=1495238400000&hmac=qk--Ptsc4w3u6c-5BFPO9vhjyczMHzlSqUOQskjbfZ0&fileExtension=srt", - "es.srt": 
"https://www.coursera.org/api/subtitleAssetProxy.v1/jtYTHsSQToaWEx7EkJ6G4A?expiry=1495238400000&hmac=Ts5QKzu0jwhUafwsaHk7RKoQJK26d4_bzrX2M6iuRaQ&fileExtension=srt", - "pl.srt": "https://www.coursera.org/api/subtitleAssetProxy.v1/RGtowSWPQxSraMElj3MUbA?expiry=1495238400000&hmac=mcaMPGeK3J7Fn9RRwnuVFnHkyr1COFnLXYKVkUbyfSg&fileExtension=srt", - "ja.srt": "https://www.coursera.org/api/subtitleAssetProxy.v1/758f7ykrRcWfH-8pK3XFHw?expiry=1495238400000&hmac=huh5qtCJVj4rEJnsJ6D7MJdCcqN-s9cMd-M6xlSicLc&fileExtension=srt", - "pt-BR.srt": "https://www.coursera.org/api/subtitleAssetProxy.v1/1kRk9rXlSSeEZPa15aknhQ?expiry=1495238400000&hmac=XYyDJ71d9gl3HOqNplyJeEr7Wd2UhU3DhT-9w_Yudzs&fileExtension=srt", - "hi.txt": "https://www.coursera.org/api/subtitleAssetProxy.v1/v2OWSJUVSqCjlkiVFRqgng?expiry=1495238400000&hmac=earWLk_RUi3K5UpZfEVOlBgOcpSE9efXz2njRKu31rQ&fileExtension=txt", - "es.txt": "https://www.coursera.org/api/subtitleAssetProxy.v1/jtYTHsSQToaWEx7EkJ6G4A?expiry=1495238400000&hmac=sd6_C14J-qEkvvbqNTgI8W5eUCvOKwW6RzHcz8yF2Jk&fileExtension=txt", - "pl.txt": "https://www.coursera.org/api/subtitleAssetProxy.v1/RGtowSWPQxSraMElj3MUbA?expiry=1495238400000&hmac=sFwO_BWNlhZEDHsXYkFlnOEtHBIX8lSsVGIOLIHeZZ0&fileExtension=txt", - "ja.txt": "https://www.coursera.org/api/subtitleAssetProxy.v1/758f7ykrRcWfH-8pK3XFHw?expiry=1495238400000&hmac=WMhDBDbF6SiBuvRwg_QEkglLSK36bj8_5y6kZ9z94YY&fileExtension=txt", - "pt-BR.txt": "https://www.coursera.org/api/subtitleAssetProxy.v1/1kRk9rXlSSeEZPa15aknhQ?expiry=1495238400000&hmac=uQaL2V2AJ_Wp5dlCZH1HeyTU_AQo9VdJ2cphUhG8yxk&fileExtension=txt" + "zh-CN.txt": "https://api.coursera.org/api/subtitleAssetProxy.v1/UKEuZoMQRcChLmaDEMXAsA?expiry=1495238400000&hmac=eNyKwEu_aMQtn7bg0mUj6uIyVZvjahFSE5x2CrbOXOU&fileExtension=txt", + "en.srt": "https://api.coursera.org/api/subtitleAssetProxy.v1/GgGZN65HQkyBmTeuR2JMsw?expiry=1495238400000&hmac=afqFhv9FWfxxEeSka8PCA4ihiyX3g2Z6K4jWJPFlcdo&fileExtension=srt", + "zh-CN.srt": 
"https://api.coursera.org/api/subtitleAssetProxy.v1/UKEuZoMQRcChLmaDEMXAsA?expiry=1495238400000&hmac=nmGzGoF4oNLv28ZDLUtX5dF4xPXUABgym76XMs4UzDE&fileExtension=srt", + "en.txt": "https://api.coursera.org/api/subtitleAssetProxy.v1/GgGZN65HQkyBmTeuR2JMsw?expiry=1495238400000&hmac=2Z37WW5Rc7GoT0eft1vdK0HX5imBqoZTKULMTiZ2EjM&fileExtension=txt", + "hi.srt": "https://api.coursera.org/api/subtitleAssetProxy.v1/v2OWSJUVSqCjlkiVFRqgng?expiry=1495238400000&hmac=qk--Ptsc4w3u6c-5BFPO9vhjyczMHzlSqUOQskjbfZ0&fileExtension=srt", + "es.srt": "https://api.coursera.org/api/subtitleAssetProxy.v1/jtYTHsSQToaWEx7EkJ6G4A?expiry=1495238400000&hmac=Ts5QKzu0jwhUafwsaHk7RKoQJK26d4_bzrX2M6iuRaQ&fileExtension=srt", + "pl.srt": "https://api.coursera.org/api/subtitleAssetProxy.v1/RGtowSWPQxSraMElj3MUbA?expiry=1495238400000&hmac=mcaMPGeK3J7Fn9RRwnuVFnHkyr1COFnLXYKVkUbyfSg&fileExtension=srt", + "ja.srt": "https://api.coursera.org/api/subtitleAssetProxy.v1/758f7ykrRcWfH-8pK3XFHw?expiry=1495238400000&hmac=huh5qtCJVj4rEJnsJ6D7MJdCcqN-s9cMd-M6xlSicLc&fileExtension=srt", + "pt-BR.srt": "https://api.coursera.org/api/subtitleAssetProxy.v1/1kRk9rXlSSeEZPa15aknhQ?expiry=1495238400000&hmac=XYyDJ71d9gl3HOqNplyJeEr7Wd2UhU3DhT-9w_Yudzs&fileExtension=srt", + "hi.txt": "https://api.coursera.org/api/subtitleAssetProxy.v1/v2OWSJUVSqCjlkiVFRqgng?expiry=1495238400000&hmac=earWLk_RUi3K5UpZfEVOlBgOcpSE9efXz2njRKu31rQ&fileExtension=txt", + "es.txt": "https://api.coursera.org/api/subtitleAssetProxy.v1/jtYTHsSQToaWEx7EkJ6G4A?expiry=1495238400000&hmac=sd6_C14J-qEkvvbqNTgI8W5eUCvOKwW6RzHcz8yF2Jk&fileExtension=txt", + "pl.txt": "https://api.coursera.org/api/subtitleAssetProxy.v1/RGtowSWPQxSraMElj3MUbA?expiry=1495238400000&hmac=sFwO_BWNlhZEDHsXYkFlnOEtHBIX8lSsVGIOLIHeZZ0&fileExtension=txt", + "ja.txt": "https://api.coursera.org/api/subtitleAssetProxy.v1/758f7ykrRcWfH-8pK3XFHw?expiry=1495238400000&hmac=WMhDBDbF6SiBuvRwg_QEkglLSK36bj8_5y6kZ9z94YY&fileExtension=txt", + "pt-BR.txt": 
"https://api.coursera.org/api/subtitleAssetProxy.v1/1kRk9rXlSSeEZPa15aknhQ?expiry=1495238400000&hmac=uQaL2V2AJ_Wp5dlCZH1HeyTU_AQo9VdJ2cphUhG8yxk&fileExtension=txt" } \ No newline at end of file diff --git a/coursera/test/fixtures/json/video-output-1-en.json b/coursera/test/fixtures/json/video-output-1-en.json index 76d8cca..e2cb7cb 100644 --- a/coursera/test/fixtures/json/video-output-1-en.json +++ b/coursera/test/fixtures/json/video-output-1-en.json @@ -1,4 +1,4 @@ { - "en.srt": "https://www.coursera.org/api/subtitleAssetProxy.v1/GgGZN65HQkyBmTeuR2JMsw?expiry=1495238400000&hmac=afqFhv9FWfxxEeSka8PCA4ihiyX3g2Z6K4jWJPFlcdo&fileExtension=srt", - "en.txt": "https://www.coursera.org/api/subtitleAssetProxy.v1/GgGZN65HQkyBmTeuR2JMsw?expiry=1495238400000&hmac=2Z37WW5Rc7GoT0eft1vdK0HX5imBqoZTKULMTiZ2EjM&fileExtension=txt" + "en.srt": "https://api.coursera.org/api/subtitleAssetProxy.v1/GgGZN65HQkyBmTeuR2JMsw?expiry=1495238400000&hmac=afqFhv9FWfxxEeSka8PCA4ihiyX3g2Z6K4jWJPFlcdo&fileExtension=srt", + "en.txt": "https://api.coursera.org/api/subtitleAssetProxy.v1/GgGZN65HQkyBmTeuR2JMsw?expiry=1495238400000&hmac=2Z37WW5Rc7GoT0eft1vdK0HX5imBqoZTKULMTiZ2EjM&fileExtension=txt" } \ No newline at end of file diff --git a/coursera/test/fixtures/json/video-output-1.json b/coursera/test/fixtures/json/video-output-1.json index 86b0591..315a1af 100644 --- a/coursera/test/fixtures/json/video-output-1.json +++ b/coursera/test/fixtures/json/video-output-1.json @@ -1,6 +1,6 @@ { - "zh-CN.txt": "https://www.coursera.org/api/subtitleAssetProxy.v1/UKEuZoMQRcChLmaDEMXAsA?expiry=1495238400000&hmac=eNyKwEu_aMQtn7bg0mUj6uIyVZvjahFSE5x2CrbOXOU&fileExtension=txt", - "en.srt": "https://www.coursera.org/api/subtitleAssetProxy.v1/GgGZN65HQkyBmTeuR2JMsw?expiry=1495238400000&hmac=afqFhv9FWfxxEeSka8PCA4ihiyX3g2Z6K4jWJPFlcdo&fileExtension=srt", - "zh-CN.srt": 
"https://www.coursera.org/api/subtitleAssetProxy.v1/UKEuZoMQRcChLmaDEMXAsA?expiry=1495238400000&hmac=nmGzGoF4oNLv28ZDLUtX5dF4xPXUABgym76XMs4UzDE&fileExtension=srt", - "en.txt": "https://www.coursera.org/api/subtitleAssetProxy.v1/GgGZN65HQkyBmTeuR2JMsw?expiry=1495238400000&hmac=2Z37WW5Rc7GoT0eft1vdK0HX5imBqoZTKULMTiZ2EjM&fileExtension=txt" + "zh-CN.txt": "https://api.coursera.org/api/subtitleAssetProxy.v1/UKEuZoMQRcChLmaDEMXAsA?expiry=1495238400000&hmac=eNyKwEu_aMQtn7bg0mUj6uIyVZvjahFSE5x2CrbOXOU&fileExtension=txt", + "en.srt": "https://api.coursera.org/api/subtitleAssetProxy.v1/GgGZN65HQkyBmTeuR2JMsw?expiry=1495238400000&hmac=afqFhv9FWfxxEeSka8PCA4ihiyX3g2Z6K4jWJPFlcdo&fileExtension=srt", + "zh-CN.srt": "https://api.coursera.org/api/subtitleAssetProxy.v1/UKEuZoMQRcChLmaDEMXAsA?expiry=1495238400000&hmac=nmGzGoF4oNLv28ZDLUtX5dF4xPXUABgym76XMs4UzDE&fileExtension=srt", + "en.txt": "https://api.coursera.org/api/subtitleAssetProxy.v1/GgGZN65HQkyBmTeuR2JMsw?expiry=1495238400000&hmac=2Z37WW5Rc7GoT0eft1vdK0HX5imBqoZTKULMTiZ2EjM&fileExtension=txt" } \ No newline at end of file diff --git a/coursera/test/fixtures/json/video-output-2.json b/coursera/test/fixtures/json/video-output-2.json index f264b10..0b29d8c 100644 --- a/coursera/test/fixtures/json/video-output-2.json +++ b/coursera/test/fixtures/json/video-output-2.json @@ -1,6 +1,6 @@ { - "zh-TW.srt": "https://www.coursera.org/api/subtitleAssetProxy.v1/j8femXUVQaGH3pl1FYGh-Q?expiry=1495238400000&hmac=-sOeJbk_bICP9OMfbtkjLuwUAIZZcjGasIMk8JO6n0Q&fileExtension=srt", - "en.txt": "https://www.coursera.org/api/subtitleAssetProxy.v1/r3LdPY_CTUqy3T2Pwu1KVQ?expiry=1495238400000&hmac=xhMK0SSslbfwxl-vzjAXy-bd_iQQTY9iAIrNP4QHxq4&fileExtension=txt", - "en.srt": "https://www.coursera.org/api/subtitleAssetProxy.v1/r3LdPY_CTUqy3T2Pwu1KVQ?expiry=1495238400000&hmac=nO6NGCExQ5FO0aFFnr_YVXtd_lVW4JQaT34WS9tJi6c&fileExtension=srt", - "zh-TW.txt": 
"https://www.coursera.org/api/subtitleAssetProxy.v1/j8femXUVQaGH3pl1FYGh-Q?expiry=1495238400000&hmac=O9DKhZW6bOsI7ncNZIZPBMXmsreSrgulhGf3eyTCULo&fileExtension=txt" + "zh-TW.srt": "https://api.coursera.org/api/subtitleAssetProxy.v1/j8femXUVQaGH3pl1FYGh-Q?expiry=1495238400000&hmac=-sOeJbk_bICP9OMfbtkjLuwUAIZZcjGasIMk8JO6n0Q&fileExtension=srt", + "en.txt": "https://api.coursera.org/api/subtitleAssetProxy.v1/r3LdPY_CTUqy3T2Pwu1KVQ?expiry=1495238400000&hmac=xhMK0SSslbfwxl-vzjAXy-bd_iQQTY9iAIrNP4QHxq4&fileExtension=txt", + "en.srt": "https://api.coursera.org/api/subtitleAssetProxy.v1/r3LdPY_CTUqy3T2Pwu1KVQ?expiry=1495238400000&hmac=nO6NGCExQ5FO0aFFnr_YVXtd_lVW4JQaT34WS9tJi6c&fileExtension=srt", + "zh-TW.txt": "https://api.coursera.org/api/subtitleAssetProxy.v1/j8femXUVQaGH3pl1FYGh-Q?expiry=1495238400000&hmac=O9DKhZW6bOsI7ncNZIZPBMXmsreSrgulhGf3eyTCULo&fileExtension=txt" } \ No newline at end of file diff --git a/coursera/test/test_workflow.py b/coursera/test/test_workflow.py index 1c36224..99567b6 100644 --- a/coursera/test/test_workflow.py +++ b/coursera/test/test_workflow.py @@ -37,7 +37,7 @@ class MockedFailingDownloader(Downloader): raise self._exception_to_throw -TEST_URL = "https://www.coursera.org/api/test-url" +TEST_URL = "https://api.coursera.org/api/test-url" def make_test_modules(): @@ -110,7 +110,7 @@ def test_iter_modules(): (0, '01_section1'), (0, normpath('test_class/01_section1/01_module1')), (0, 'lecture1', 'en.txt', 'title'), - ('en.txt', 'https://www.coursera.org/api/test-url', 'title') + ('en.txt', 'https://api.coursera.org/api/test-url', 'title') ] collected_output = [] @@ -138,7 +138,7 @@ def test_walk_modules(): (0, '01_section1', 0, normpath('test_class/01_section1/01_module1'), 0, 'lecture1', normpath('test_class/01_section1/01_module1/01_lecture1_title.en.txt'), - 'https://www.coursera.org/api/test-url')] + 'https://api.coursera.org/api/test-url')] collected_output = [] for module, section, lecture, resource in _walk_modules( From 
bff4f4f9539c2c21189977d72d7ffb9085fb9588 Mon Sep 17 00:00:00 2001 From: Yuri Bochkarev Date: Sun, 3 Jun 2018 11:24:34 +0300 Subject: [PATCH 42/87] Use TLSv1.2 instead of v1.0 (fix #661, #663) --- coursera/cookies.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/coursera/cookies.py b/coursera/cookies.py index 36fa1df..6beac2a 100644 --- a/coursera/cookies.py +++ b/coursera/cookies.py @@ -53,6 +53,7 @@ def __fixed_init__(self, version, name, value, rest, rfc2109=False) + cookielib.Cookie.__init__ = __fixed_init__ @@ -170,7 +171,8 @@ def down_the_wabbit_hole(session, class_name): try: r.raise_for_status() except requests.exceptions.HTTPError as e: - raise AuthenticationFailed('Cannot login on class.coursera.org: %s' % e) + raise AuthenticationFailed( + 'Cannot login on class.coursera.org: %s' % e) logging.debug('Exiting "deep" authentication.') @@ -375,8 +377,9 @@ class TLSAdapter(HTTPAdapter): A customized HTTP Adapter which uses TLS v1.2 for encrypted connections. 
""" + def init_poolmanager(self, connections, maxsize, block=False): self.poolmanager = PoolManager(num_pools=connections, maxsize=maxsize, block=block, - ssl_version=ssl.PROTOCOL_TLSv1) + ssl_version=ssl.PROTOCOL_TLSv1_2) From 2d3191997e71b7177e7acbdd5b2e86d6055bcb16 Mon Sep 17 00:00:00 2001 From: Yuri Bochkarev Date: Sun, 3 Jun 2018 11:28:08 +0300 Subject: [PATCH 43/87] Update CHANGELOG.md --- CHANGELOG.md | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 3828bcc..83f6b90 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,11 @@ # Change Log +## 0.11.2 (2018-06-03) + +Bugfixes: + - Use TLS v1.2 instead of v1.0 + - Switched to api.coursera.org subdomain for subtitles requests (#664) + ## 0.11.1 (2018-06-02) Bugfixes: From 0ac9765f817653453498c08e932c206c8c917203 Mon Sep 17 00:00:00 2001 From: Yuri Bochkarev Date: Sun, 3 Jun 2018 11:29:40 +0300 Subject: [PATCH 44/87] Bump version (0.11.1 -> 0.11.2) Bugfixes: - Use TLS v1.2 instead of v1.0 - Switched to api.coursera.org subdomain for subtitles requests (#664) --- coursera/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/coursera/__init__.py b/coursera/__init__.py index ae4865c..2b3823f 100644 --- a/coursera/__init__.py +++ b/coursera/__init__.py @@ -1 +1 @@ -__version__ = '0.11.1' +__version__ = '0.11.2' From ca21f41582eaa6549ddf09c069ed54731644d056 Mon Sep 17 00:00:00 2001 From: Yuri Bochkarev Date: Sat, 9 Jun 2018 21:15:19 +0300 Subject: [PATCH 45/87] Update appveyor.yml according to the error message recommendations They say it should be like this: ERROR: To modify pip, please run the following command: c:\python35-x64\python.exe -m pip install --disable-pip-version-check --user --upgrade pip --- appveyor.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/appveyor.yml b/appveyor.yml index 452468e..9b3a1d4 100644 --- a/appveyor.yml +++ b/appveyor.yml @@ -83,7 +83,7 @@ install: # Upgrade to the latest version of pip to avoid
it displaying warnings # about it being out of date. - - "pip install --disable-pip-version-check --user --upgrade pip" + - "python -m pip install --disable-pip-version-check --user --upgrade pip" # Install requirements - "%CMD_IN_ENV% pip install -r requirements.txt" From 0667dd45da52debe1d6da39e42ddc4fa44082854 Mon Sep 17 00:00:00 2001 From: Yuri Bochkarev Date: Sat, 9 Jun 2018 21:33:38 +0300 Subject: [PATCH 46/87] Use io.open in setup.py for compatibility with Python 2.7 --- setup.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/setup.py b/setup.py index bdb6775..f3e2ff6 100644 --- a/setup.py +++ b/setup.py @@ -10,6 +10,8 @@ from __future__ import print_function import os.path import subprocess import sys +# For compatibility with Python2.7 +from io import open from setuptools import setup @@ -100,7 +102,8 @@ setup( description='Script for downloading Coursera.org videos and naming them.', long_description=long_description, - keywords=['coursera-dl', 'coursera', 'download', 'education', 'MOOCs', 'video'], + keywords=['coursera-dl', 'coursera', + 'download', 'education', 'MOOCs', 'video'], classifiers=trove_classifiers, packages=["coursera"], From c0ae84d12a70fa11f271ccd6f74735ea84645349 Mon Sep 17 00:00:00 2001 From: Yuri Bochkarev Date: Sat, 9 Jun 2018 22:02:01 +0300 Subject: [PATCH 47/87] Update .gitignore --- .gitignore | 1 + 1 file changed, 1 insertion(+) diff --git a/.gitignore b/.gitignore index 1726811..648ae3d 100644 --- a/.gitignore +++ b/.gitignore @@ -21,3 +21,4 @@ venv3 .python-version .ipynb_checkpoints .ropeproject +.mypy_cache From ce6f94022f099a097aac8a2feb0ee5cf0c7e0d10 Mon Sep 17 00:00:00 2001 From: Yuri Bochkarev Date: Sat, 9 Jun 2018 22:03:00 +0300 Subject: [PATCH 48/87] Add py36 to tox and a note to myself to remember to activate pyenv before using tox --- tox.ini | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/tox.ini b/tox.ini index d2870c4..bb378d5 100644 --- a/tox.ini +++ b/tox.ini @@ 
-1,5 +1,5 @@ [tox] -envlist = py26,py27,py33,py34,py35 +envlist = py26,py27,py33,py34,py35,py36 [testenv] downloadcache = .tox/_download/ @@ -21,3 +21,12 @@ commands = py.test -v --junitxml={envlogdir}/result.xml coursera/test # {opts} is remove to prevent passing option "--download-cache" to pip # which is already gone. install_command = pip install {packages} + +# Notes for developers. Depending on your system configuration, +# you may find this bash function useful to run before running tox: +# +# activate_pyenv () { +# export PYENV_ROOT="$HOME/.pyenv" +# export PATH="$PYENV_ROOT/bin:$PATH" +# eval "$(pyenv init -)" +# } From fa8cb2fbbd76e628b506bc46470dff82ea9bdb5d Mon Sep 17 00:00:00 2001 From: Yuri Bochkarev Date: Sat, 9 Jun 2018 22:07:28 +0300 Subject: [PATCH 49/87] Use old pip upgrade command only for Python 2.6 in appveyor.yml --- appveyor.yml | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/appveyor.yml b/appveyor.yml index 9b3a1d4..97ce6e7 100644 --- a/appveyor.yml +++ b/appveyor.yml @@ -16,6 +16,8 @@ environment: # /E:ON and /V:ON options are not enabled in the batch script intepreter # See: http://stackoverflow.com/a/13751649/163740 CMD_IN_ENV: "cmd /E:ON /V:ON /C .\\appveyor\\run_with_env.cmd" + # This command works for most versions of Python in AppVeyor except of Python 2.6 + PIP_UPGRADE_CMD: "python -m pip install --disable-pip-version-check --user --upgrade pip" #PANDOC_URL: "https://github.com/jgm/pandoc/releases/download/1.17.1/pandoc-1.17.1-1-windows.msi" #PANDOC_MSI: "C:\\pandoc.msi" @@ -27,6 +29,8 @@ environment: - PYTHON: "C:\\Python26" PYTHON_VERSION: "2.6.x" # currently 2.6.6 PYTHON_ARCH: "32" + # For Python 2.6 we are using old version of pip upgrade command + PIP_UPGRADE_CMD: "pip install --disable-pip-version-check --user --upgrade pip" - PYTHON: "C:\\Python26-x64" PYTHON_VERSION: "2.6.x" # currently 2.6.6 @@ -83,7 +87,7 @@ install: # Upgrade to the latest version of pip to avoid it displaying warnings # about it 
being out of date. - - "python -m pip install --disable-pip-version-check --user --upgrade pip" + - "%PIP_UPGRADE_CMD%" # Install requirements - "%CMD_IN_ENV% pip install -r requirements.txt" From 0327015be985d04cb244e14a8228119589d8bb6b Mon Sep 17 00:00:00 2001 From: Yuri Bochkarev Date: Sat, 9 Jun 2018 22:24:03 +0300 Subject: [PATCH 50/87] Drop Python 2.6 from matrix build in appveyor I think it's time we bury this stewardess already. Python 2.6 is not supported even by core Python team. --- appveyor.yml | 14 +------------- 1 file changed, 1 insertion(+), 13 deletions(-) diff --git a/appveyor.yml b/appveyor.yml index 97ce6e7..91a5d44 100644 --- a/appveyor.yml +++ b/appveyor.yml @@ -16,8 +16,6 @@ environment: # /E:ON and /V:ON options are not enabled in the batch script intepreter # See: http://stackoverflow.com/a/13751649/163740 CMD_IN_ENV: "cmd /E:ON /V:ON /C .\\appveyor\\run_with_env.cmd" - # This command works for most versions of Python in AppVeyor except of Python 2.6 - PIP_UPGRADE_CMD: "python -m pip install --disable-pip-version-check --user --upgrade pip" #PANDOC_URL: "https://github.com/jgm/pandoc/releases/download/1.17.1/pandoc-1.17.1-1-windows.msi" #PANDOC_MSI: "C:\\pandoc.msi" @@ -26,16 +24,6 @@ environment: # a later point release. # See: http://www.appveyor.com/docs/installed-software#python - - PYTHON: "C:\\Python26" - PYTHON_VERSION: "2.6.x" # currently 2.6.6 - PYTHON_ARCH: "32" - # For Python 2.6 we are using old version of pip upgrade command - PIP_UPGRADE_CMD: "pip install --disable-pip-version-check --user --upgrade pip" - - - PYTHON: "C:\\Python26-x64" - PYTHON_VERSION: "2.6.x" # currently 2.6.6 - PYTHON_ARCH: "64" - - PYTHON: "C:\\Python27" PYTHON_VERSION: "2.7.x" # currently 2.7.11 PYTHON_ARCH: "32" @@ -87,7 +75,7 @@ install: # Upgrade to the latest version of pip to avoid it displaying warnings # about it being out of date. 
- - "%PIP_UPGRADE_CMD%" + - "python -m pip install --disable-pip-version-check --user --upgrade pip" # Install requirements - "%CMD_IN_ENV% pip install -r requirements.txt" From 88832628b76cd3109533762ad3e847ec0766762a Mon Sep 17 00:00:00 2001 From: Yuri Bochkarev Date: Sat, 9 Jun 2018 22:38:59 +0300 Subject: [PATCH 51/87] Add Python 3.6 to appveyor --- appveyor.yml | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/appveyor.yml b/appveyor.yml index 91a5d44..f9ea741 100644 --- a/appveyor.yml +++ b/appveyor.yml @@ -56,6 +56,14 @@ environment: PYTHON_VERSION: "3.5.x" # currently 3.5.1 PYTHON_ARCH: "64" + - PYTHON: "C:\\Python36" + PYTHON_VERSION: "3.6.x" # currently 3.6.? + PYTHON_ARCH: "32" + + - PYTHON: "C:\\Python36-x64" + PYTHON_VERSION: "3.6.x" # currently 3.6.? + PYTHON_ARCH: "64" + init: - "ECHO %PYTHON%" - ps: "ls C:/Python*" From de2ba5bdce577823dc7ccb3bccdcc76d266e7f39 Mon Sep 17 00:00:00 2001 From: Yuri Bochkarev Date: Sat, 9 Jun 2018 23:06:23 +0300 Subject: [PATCH 52/87] Add comment about split('/') in _get_notebook_folder --- coursera/api.py | 38 +++++++++++++++++++++++++++----------- 1 file changed, 27 insertions(+), 11 deletions(-) diff --git a/coursera/api.py b/coursera/api.py index e9c5000..330b560 100644 --- a/coursera/api.py +++ b/coursera/api.py @@ -482,22 +482,33 @@ class CourseraOnDemand(object): for content in reply['content']: if content['type'] == 'directory': - a = self._get_notebook_folder(OPENCOURSE_NOTEBOOK_TREE, jupyterId, jupId=jupyterId, path=content['path'], timestamp=int(time.time())) + a = self._get_notebook_folder( + OPENCOURSE_NOTEBOOK_TREE, jupyterId, jupId=jupyterId, path=content['path'], timestamp=int(time.time())) supplement_links.update(a) elif content['type'] == 'file': - tmpUrl = OPENCOURSE_NOTEBOOK_DOWNLOAD.format(path=content['path'], jupId=jupyterId, timestamp=int(time.time())) + tmpUrl = OPENCOURSE_NOTEBOOK_DOWNLOAD.format( + path=content['path'], jupId=jupyterId, timestamp=int(time.time())) filename, extension = 
os.path.splitext(clean_url(tmpUrl)) head, tail = os.path.split(content['path']) - head = '/'.join([clean_filename(dir, minimal_change=True) for dir in head.split('/')]) + # '/' in the following line is for a reason: + # @noureddin says: "I split head using split('/') not + # os.path.split() because it's seems to me that it comes from a + # web page, so the separator will always be /, so using the + # native path splitting function is not the most portable way to + # do it." + # Original pull request: https://github.com/coursera-dl/coursera-dl/pull/654 + head = '/'.join([clean_filename(dir, minimal_change=True) + for dir in head.split('/')]) tail = clean_filename(tail, minimal_change=True) if os.path.isdir(self._course_name + "/notebook/" + head + "/") == False: logging.info('Creating [{}] directories...'.format(head)) os.makedirs(self._course_name + "/notebook/" + head + "/") - r = requests.get(tmpUrl.replace(" ", "%20"), cookies=self._session.cookies) + r = requests.get(tmpUrl.replace(" ", "%20"), + cookies=self._session.cookies) if os.path.exists(self._course_name + "/notebook/" + head + "/" + tail) == False: logging.info('Downloading {} into {}'.format(tail, head)) with open(self._course_name + "/notebook/" + head + "/" + tail, 'wb+') as f: @@ -505,14 +516,15 @@ class CourseraOnDemand(object): else: logging.info('Skipping {}... 
(file exists)'.format(tail)) - if not str(extension[1:]) in supplement_links: supplement_links[str(extension[1:])] = [] - supplement_links[str(extension[1:])].append((tmpUrl.replace(" ", "%20"), filename)) + supplement_links[str(extension[1:])].append( + (tmpUrl.replace(" ", "%20"), filename)) elif content['type'] == 'notebook': - tmpUrl = OPENCOURSE_NOTEBOOK_DOWNLOAD.format(path=content['path'], jupId=jupyterId, timestamp=int(time.time())) + tmpUrl = OPENCOURSE_NOTEBOOK_DOWNLOAD.format( + path=content['path'], jupId=jupyterId, timestamp=int(time.time())) filename, extension = os.path.splitext(clean_url(tmpUrl)) head, tail = os.path.split(content['path']) @@ -521,9 +533,11 @@ class CourseraOnDemand(object): logging.info('Creating [{}] directories...'.format(head)) os.makedirs(self._course_name + "/notebook/" + head + "/") - r = requests.get(tmpUrl.replace(" ", "%20"), cookies=self._session.cookies) + r = requests.get(tmpUrl.replace(" ", "%20"), + cookies=self._session.cookies) if os.path.exists(self._course_name + "/notebook/" + head + "/" + tail) == False: - logging.info('Downloading Jupyter {} into {}'.format(tail, head)) + logging.info( + 'Downloading Jupyter {} into {}'.format(tail, head)) with open(self._course_name + "/notebook/" + head + "/" + tail, 'wb+') as f: f.write(r.content) else: @@ -532,10 +546,12 @@ class CourseraOnDemand(object): if not "ipynb" in supplement_links: supplement_links["ipynb"] = [] - supplement_links["ipynb"].append((tmpUrl.replace(" ", "%20"), filename)) + supplement_links["ipynb"].append( + (tmpUrl.replace(" ", "%20"), filename)) else: - logging.info('Unsupported typename {} in notebook'.format(content['type'])) + logging.info( + 'Unsupported typename {} in notebook'.format(content['type'])) return supplement_links From 45824ef4b84b20159c5ded0b6bfdf9fd1b96e15d Mon Sep 17 00:00:00 2001 From: Yuri Bochkarev Date: Sat, 9 Jun 2018 23:07:05 +0300 Subject: [PATCH 53/87] autopep8 --- coursera/api.py | 75 
++++++++++++++++++++++++++++++++----------------- 1 file changed, 49 insertions(+), 26 deletions(-) diff --git a/coursera/api.py b/coursera/api.py index 330b560..657e4b5 100644 --- a/coursera/api.py +++ b/coursera/api.py @@ -127,7 +127,8 @@ class QuizExamToMarkupConverter(object): result = ['

'] for option in options: - option_text = unescape_html(option['display']['definition']['value']) + option_text = unescape_html( + option['display']['definition']['value']) # We need to replace with so that answer text # stays on the same line with checkbox/radio button @@ -238,7 +239,8 @@ class MarkupToHTMLConverter(object): asset = self._asset_retriever[image['assetid']] if asset.data is not None: encoded64 = base64.b64encode(asset.data).decode() - image['src'] = 'data:%s;base64,%s' % (asset.content_type, encoded64) + image['src'] = 'data:%s;base64,%s' % ( + asset.content_type, encoded64) def _convert_markup_audios(self, soup): """ @@ -264,9 +266,11 @@ class MarkupToHTMLConverter(object): asset = self._asset_retriever[audio['id']] if asset.data is not None: encoded64 = base64.b64encode(asset.data).decode() - data_string = 'data:%s;base64,%s' % (asset.content_type, encoded64) + data_string = 'data:%s;base64,%s' % ( + asset.content_type, encoded64) - source_tag = soup.new_tag('source', src=data_string, type=asset.content_type) + source_tag = soup.new_tag( + 'source', src=data_string, type=asset.content_type) controls_tag = soup.new_tag('audio', controls="") controls_tag.string = 'Your browser does not support the audio element.' @@ -278,6 +282,7 @@ class OnDemandCourseMaterialItems(object): """ Helper class that allows accessing lecture JSONs by lesson IDs. """ + def __init__(self, items): """ Initialization. Build a map from lessonId to Lecture (item) @@ -347,6 +352,7 @@ class Asset(namedtuple('Asset', 'id name type_name url content_type data')): This class contains information about an asset. """ __slots__ = () + def __repr__(self): return 'Asset(id="%s", name="%s", type_name="%s", url="%s", content_type="%s", data="<...>")' % ( self.id, self.name, self.type_name, self.url, self.content_type) @@ -356,6 +362,7 @@ class AssetRetriever(object): """ This class helps download assets by their ID. 
""" + def __init__(self, session): self._session = session self._asset_mapping = {} @@ -372,7 +379,8 @@ class AssetRetriever(object): id=','.join(asset_ids)) # Create a map "asset_id => asset" for easier access - asset_map = dict((asset['id'], asset) for asset in asset_list['elements']) + asset_map = dict((asset['id'], asset) + for asset in asset_list['elements']) for asset_id in asset_ids: # Download each asset @@ -434,7 +442,8 @@ class CourseraOnDemand(object): self._user_id = None self._quiz_to_markup = QuizExamToMarkupConverter(session) - self._markup_to_html = MarkupToHTMLConverter(session, mathjax_cdn_url=mathjax_cdn_url) + self._markup_to_html = MarkupToHTMLConverter( + session, mathjax_cdn_url=mathjax_cdn_url) self._asset_retriever = AssetRetriever(session) def obtain_user_id(self): @@ -463,7 +472,8 @@ class CourseraOnDemand(object): except requests.exceptions.HTTPError as exception: logging.error('Could not download exam %s: %s', exam_id, exception) if is_debug_run(): - logging.exception('Could not download exam %s: %s', exam_id, exception) + logging.exception( + 'Could not download exam %s: %s', exam_id, exception) return None def _get_notebook_folder(self, url, jupyterId, **kwargs): @@ -574,7 +584,8 @@ class CourseraOnDemand(object): jupyterId = jupyterId[0] newReq = requests.Session() - req = newReq.get(OPENCOURSE_NOTEBOOK_TREE.format(jupId=jupyterId, path="/", timestamp=int(time.time())), headers=headers) + req = newReq.get(OPENCOURSE_NOTEBOOK_TREE.format( + jupId=jupyterId, path="/", timestamp=int(time.time())), headers=headers) return self._get_notebook_folder(OPENCOURSE_NOTEBOOK_TREE, jupyterId, jupId=jupyterId, path="/", timestamp=int(time.time())) @@ -585,9 +596,11 @@ class CourseraOnDemand(object): ret = self._get_notebook_json(notebook_id, authorizationId) return ret except requests.exceptions.HTTPError as exception: - logging.error('Could not download notebook %s: %s', notebook_id, exception) + logging.error('Could not download notebook %s: 
%s', + notebook_id, exception) if is_debug_run(): - logging.exception('Could not download notebook %s: %s', notebook_id, exception) + logging.exception( + 'Could not download notebook %s: %s', notebook_id, exception) return None def extract_links_from_quiz(self, quiz_id): @@ -598,7 +611,8 @@ class CourseraOnDemand(object): except requests.exceptions.HTTPError as exception: logging.error('Could not download quiz %s: %s', quiz_id, exception) if is_debug_run(): - logging.exception('Could not download quiz %s: %s', quiz_id, exception) + logging.exception( + 'Could not download quiz %s: %s', quiz_id, exception) return None def _convert_quiz_json_to_links(self, quiz_json, filename_suffix): @@ -653,7 +667,7 @@ class CourseraOnDemand(object): def _get_quiz_session_id(self, quiz_id): headers = self._auth_headers_with_json() - data = {"contentRequestBody":[]} + data = {"contentRequestBody": []} reply = get_page(self._session, POST_OPENCOURSE_API_QUIZ_SESSION, json=True, @@ -706,9 +720,11 @@ class CourseraOnDemand(object): return links except requests.exceptions.HTTPError as exception: - logging.error('Could not download lecture %s: %s', video_id, exception) + logging.error('Could not download lecture %s: %s', + video_id, exception) if is_debug_run(): - logging.exception('Could not download lecture %s: %s', video_id, exception) + logging.exception( + 'Could not download lecture %s: %s', video_id, exception) return None def _normalize_assets(self, assets): @@ -871,7 +887,7 @@ class CourseraOnDemand(object): video_content['mp4'] = video_url subtitle_link = self._extract_subtitles_from_video_dom( - dom, subtitle_language, video_id) + dom, subtitle_language, video_id) for key, value in iteritems(subtitle_link): video_content[key] = value @@ -939,7 +955,8 @@ class CourseraOnDemand(object): if subtitle_url is not None: # some subtitle urls are relative! 
subtitle_links[ - "%s.%s" % (current_subtitle_language, subtitle_extension) + "%s.%s" % (current_subtitle_language, + subtitle_extension) ] = make_coursera_absolute_url(subtitle_url) return subtitle_links @@ -988,7 +1005,8 @@ class CourseraOnDemand(object): @return: @see CourseraOnDemand._extract_links_from_text """ - logging.debug('Gathering supplement URLs for element_id <%s>.', element_id) + logging.debug( + 'Gathering supplement URLs for element_id <%s>.', element_id) try: # Assignment text (instructions) contains asset tags which describe @@ -1021,7 +1039,8 @@ class CourseraOnDemand(object): @return: @see CourseraOnDemand._extract_links_from_text """ - logging.debug('Gathering supplement URLs for element_id <%s>.', element_id) + logging.debug( + 'Gathering supplement URLs for element_id <%s>.', element_id) try: # Assignment text (instructions) contains asset tags which describe @@ -1051,13 +1070,14 @@ class CourseraOnDemand(object): @return: @see CourseraOnDemand._extract_links_from_text """ - logging.debug('Gathering supplement URLs for element_id <%s>.', element_id) + logging.debug( + 'Gathering supplement URLs for element_id <%s>.', element_id) try: dom = get_page(self._session, OPENCOURSE_SUPPLEMENT_URL, - json=True, - course_id=self._course_id, - element_id=element_id) + json=True, + course_id=self._course_id, + element_id=element_id) supplement_content = {} @@ -1228,7 +1248,8 @@ class CourseraOnDemand(object): @rtype: [str] """ headers = self._auth_headers_with_json() - data = {'courseId': self._course_id, 'learnerId': self._user_id, 'itemId': element_id} + data = {'courseId': self._course_id, + 'learnerId': self._user_id, 'itemId': element_id} dom = get_page(self._session, OPENCOURSE_NOTEBOOK_LAUNCHES, post=True, json=True, @@ -1237,7 +1258,7 @@ class CourseraOnDemand(object): headers=headers, element_id=element_id, data=json.dumps(data) - ) + ) # Return authorization id. 
This id changes on each request return dom['elements'][0]['authorizationId'] @@ -1282,7 +1303,8 @@ class CourseraOnDemand(object): for element in dom['elements']: # There is only one section with Instructions if 'introduction' in element['instructions']: - result.append(element['instructions']['introduction']['definition']['value']) + result.append(element['instructions'] + ['introduction']['definition']['value']) # But there may be multiple sections in Sections for section in element['instructions'].get('sections', []): @@ -1291,7 +1313,8 @@ class CourseraOnDemand(object): if section_title is not None: # If section title is present, put it in the beginning of # section value as if it was there. - section_value = ('%s' % section_title) + section_value + section_value = ('%s' % + section_title) + section_value result.append(section_value) return result From ef3268677e4d66a779d3aece67a97fda803de5a2 Mon Sep 17 00:00:00 2001 From: Yuri Bochkarev Date: Sun, 10 Jun 2018 01:33:39 +0300 Subject: [PATCH 54/87] Update Dockerfile to use Python3.6 and install from PyPI package Also update README and CONTRIBUTING guides with the information on how to build and run Docker image. --- CONTRIBUTING.md | 18 ++++++++++++++++++ Dockerfile | 15 ++++----------- README.md | 25 ++++++++++++++++++------- 3 files changed, 40 insertions(+), 18 deletions(-) diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index cca172d..004a1ee 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -241,3 +241,21 @@ DRAFT I think this is required for PyPI description to look nice. 7. `python setup.py sdist` to build the package 8. `twine upload dist/coursera-dl-0.6.1.tar.gz` to deploy the package. + +## Docker + +Build new Docker image from PyPI package: + +``` +docker build --tag courseradl/courseradl --build-arg VERSION=0.11.2 . 
+``` + +Run the image: +``` +docker run --rm -it -v "$(pwd):/courses" -v "$HOME/.netrc:/netrc" courseradl -n /netrc -- google-machine-learning +``` + +Publish the image: +``` +docker push courseradl/courseradl +``` diff --git a/Dockerfile b/Dockerfile index 65915b4..cef9e3c 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,21 +1,14 @@ -FROM python:3.4-slim +FROM python:3.6-slim -LABEL maintainer "opsxcq@strm.sh" - -WORKDIR /src -COPY requirements.txt /src - -COPY requirements-dev.txt /src +LABEL maintainer "https://github.com/coursera-dl/" RUN apt-get update && \ apt-get install -y --no-install-recommends gcc g++ libssl-dev && \ rm -rf /var/lib/apt/lists/* && \ - pip install -r requirements.txt && \ - pip install -r requirements-dev.txt && \ apt-get purge -y --auto-remove gcc g++ libssl-dev -COPY . /src -RUN python setup.py install +ARG VERSION +RUN pip install coursera-dl==$VERSION WORKDIR /courses ENTRYPOINT ["coursera-dl"] diff --git a/README.md b/README.md index 349b8b8..4f2e55c 100644 --- a/README.md +++ b/README.md @@ -220,26 +220,37 @@ applicable). If you prefer you can run this software inside Docker: ``` -docker run --rm -it \ - -v "$(pwd):/courses" \ - strm/coursera-dl \ - -u -p +docker run --rm -it -v \ + "$(pwd):/courses" \ + courseradl/courseradl -u -p ``` -The actual working dir for coursera-dl is /courses, all courses will be downloaded there if you don't specify otherwise. +Or using netrc file: + +``` +docker run --rm -it \ + -v "$(pwd):/courses" -v "$HOME/.netrc:/netrc" \ + courseradl/courseradl -n /netrc +``` + +The actual working dir for coursera-dl is /courses, all courses will be +downloaded there if you don't specify otherwise. ## Windows `python -m pip install coursera-dl` -Be sure that the Python install path is added to the PATH system environment variables. This can be found in Control Panel > System > Advanced System Settings > Environment Variables. +Be sure that the Python install path is added to the PATH system environment +variables. 
This can be found in Control Panel > System > Advanced System +Settings > Environment Variables. ``` Example: C:\Python35\Scripts\;C:\Python35\; ``` -Or if you have restricted installation permissions and you've installed Python under AppData, add this to your PATH. +Or if you have restricted installation permissions and you've installed Python +under AppData, add this to your PATH. ``` Example: From 98e4c141065c67dd6d64041b14057e92f36cd496 Mon Sep 17 00:00:00 2001 From: Yuri Bochkarev Date: Mon, 11 Jun 2018 18:14:48 +0300 Subject: [PATCH 55/87] autopep8 + trailing spaces --- README.md | 6 +++--- coursera/commandline.py | 14 ++++++++------ 2 files changed, 11 insertions(+), 9 deletions(-) diff --git a/README.md b/README.md index 4f2e55c..083b7a6 100644 --- a/README.md +++ b/README.md @@ -242,7 +242,7 @@ downloaded there if you don't specify otherwise. Be sure that the Python install path is added to the PATH system environment variables. This can be found in Control Panel > System > Advanced System -Settings > Environment Variables. +Settings > Environment Variables. ``` Example: @@ -409,7 +409,7 @@ one of the following actions solve your problem: * You get an error when using `-n` to specify that you want to use a `.netrc` file and, * You want the script to use your default netrc file and, - * You get a message saying `coursera-dl: error: too few arguments` + * You get a message saying `coursera-dl: error: too few arguments` Then you should specify `--` as an argument after `-n`, that is, `-n --` or change the order in which you pass the arguments to the script, so that @@ -539,7 +539,7 @@ https://urllib3.readthedocs.io/en/latest/security.html#insecureplatformwarning When saving a course page, we enabled `MathJax` rendering for math equations, by injecting `MathJax.js` in the header. The script is using a cdn service provided by [mathjax.org](https://cdn.mathjax.org/mathjax/latest/MathJax.js). 
However, that -url is not accessible in some countries/regions, you can provide a +url is not accessible in some countries/regions, you can provide a `--mathjax-cdn ` parameter to specify the `MathJax.js` file that is accessible in your region. diff --git a/coursera/commandline.py b/coursera/commandline.py index 2ea69b1..bb17504 100644 --- a/coursera/commandline.py +++ b/coursera/commandline.py @@ -112,7 +112,8 @@ def parse_args(args=None): ) # Selection of material to download - group_material = parser.add_argument_group('Selection of material to download') + group_material = parser.add_argument_group( + 'Selection of material to download') group_material.add_argument('--only-syllabus', dest='only_syllabus', @@ -289,7 +290,8 @@ def parse_args(args=None): help='Do not limit filenames to be ASCII-only') # Advanced authentication - group_adv_auth = parser.add_argument_group('Advanced authentication options') + group_adv_auth = parser.add_argument_group( + 'Advanced authentication options') group_adv_auth.add_argument('-c', '--cookies_file', @@ -323,7 +325,8 @@ def parse_args(args=None): help='clear cached cookies') # Advanced miscellaneous options - group_adv_misc = parser.add_argument_group('Advanced miscellaneous options') + group_adv_misc = parser.add_argument_group( + 'Advanced miscellaneous options') group_adv_misc.add_argument('--hook', dest='hooks', @@ -416,7 +419,8 @@ def parse_args(args=None): # check arguments if args.use_keyring and args.password: - logging.warning('--keyring and --password cannot be specified together') + logging.warning( + '--keyring and --password cannot be specified together') args.use_keyring = False if args.use_keyring and not keyring: @@ -437,5 +441,3 @@ def parse_args(args=None): sys.exit(1) return args - - From ad61d52a5b999b8b8d823119826a9502401c052e Mon Sep 17 00:00:00 2001 From: Yuri Bochkarev Date: Mon, 11 Jun 2018 18:16:09 +0300 Subject: [PATCH 56/87] Mention that username is email address in command-line options fix #625 --- 
coursera/commandline.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/coursera/commandline.py b/coursera/commandline.py index bb17504..4c5508f 100644 --- a/coursera/commandline.py +++ b/coursera/commandline.py @@ -57,7 +57,7 @@ def parse_args(args=None): dest='username', action='store', default=None, - help='coursera username') + help='username (email) that you use to login to Coursera') group_basic.add_argument('-p', '--password', From e788aed7982f1e80f69f304dc29f9a2886d8f371 Mon Sep 17 00:00:00 2001 From: Yuri Bochkarev Date: Tue, 12 Jun 2018 20:14:19 +0300 Subject: [PATCH 57/87] Update TOC in README.md --- README.md | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) diff --git a/README.md b/README.md index 083b7a6..8d72afc 100644 --- a/README.md +++ b/README.md @@ -6,6 +6,9 @@ [![Latest version on PyPI](https://img.shields.io/pypi/v/coursera-dl.svg)](https://pypi.python.org/pypi/coursera-dl) [![Code Climate](https://codeclimate.com/github/coursera-dl/coursera-dl/badges/gpa.svg)](https://codeclimate.com/github/coursera-dl/coursera-dl) + + +- [Coursera Downloader](#coursera-downloader) - [Introduction](#introduction) - [Features](#features) - [Disclaimer](#disclaimer) @@ -13,23 +16,28 @@ - [Recommended installation method for all Operating Systems](#recommended-installation-method-for-all-operating-systems) - [Alternative ways of installing missing dependencies](#alternative-ways-of-installing-missing-dependencies) - [Alternative installation method for Unix systems](#alternative-installation-method-for-unix-systems) + - [ArchLinux](#archlinux) - [Installing dependencies on your own](#installing-dependencies-on-your-own) + - [Docker](#docker) - [Windows](#windows) - [Create an account with Coursera](#create-an-account-with-coursera) - [Running the script](#running-the-script) - [Resuming downloads](#resuming-downloads) - [Troubleshooting](#troubleshooting) - [China issues](#china-issues) - - [Download 
timeouts](#download-timeouts) - [Found 0 sections and 0 lectures on this page](#found-0-sections-and-0-lectures-on-this-page) - - [Windows: Proxy support](#windows-proxy-support) + - [Download timeouts](#download-timeouts) + - [Windows: proxy support](#windows-proxy-support) - [Windows: Failed to create process](#windows-failed-to-create-process) - - [SSLError: Errno 1 _ssl.c:504: error:14094410:SSL routines:SSL3_READ_BYTES:sslv3 alert handshake failure](#sslerror-errno-1-_sslc504-error14094410ssl-routinesssl3_read_bytessslv3-alert-handshake-failure) + - [SSLError: [Errno 1] _ssl.c:504: error:14094410:SSL routines:SSL3_READ_BYTES:sslv3 alert handshake failure](#sslerror-errno-1-_sslc504-error14094410ssl-routinesssl3_read_bytessslv3-alert-handshake-failure) + - [Alternative CDN for `MathJax.js`](#alternative-cdn-for-mathjaxjs) - [Reporting issues](#reporting-issues) - [Filing an issue/Reporting a bug](#filing-an-issuereporting-a-bug) - [Feedback](#feedback) - [Contact](#contact) + + # Introduction [Coursera][1] is arguably the leader in *massive open online courses* (MOOC) @@ -534,7 +542,7 @@ If you still have the problem, please read the following issues for more ideas o This is also worth reading: https://urllib3.readthedocs.io/en/latest/security.html#insecureplatformwarning -## Use an alternative cdn url for `MathJax.js` +## Alternative CDN for `MathJax.js` When saving a course page, we enabled `MathJax` rendering for math equations, by injecting `MathJax.js` in the header. 
The script is using a cdn service provided From 5bc5cd9c77e8f0572d5230f6ac9896c1172ef660 Mon Sep 17 00:00:00 2001 From: John Doe Date: Sat, 16 Jun 2018 23:42:36 -0700 Subject: [PATCH 58/87] Typo (#671) --- coursera/api.py | 6 +++--- coursera/commandline.py | 4 ++-- coursera/cookies.py | 10 +++++----- coursera/coursera_dl.py | 16 ++++++++-------- coursera/define.py | 4 ++-- coursera/extractors.py | 14 +++++++------- coursera/test/test_api.py | 12 ++++++------ coursera/test/test_parsing.py | 4 ++-- 8 files changed, 35 insertions(+), 35 deletions(-) diff --git a/coursera/api.py b/coursera/api.py index 657e4b5..4fc6613 100644 --- a/coursera/api.py +++ b/coursera/api.py @@ -52,7 +52,7 @@ from .define import (OPENCOURSE_SUPPLEMENT_URL, IN_MEMORY_MARKER) -from .cookies import prepape_auth_headers +from .cookies import prepare_auth_headers class QuizExamToMarkupConverter(object): @@ -681,7 +681,7 @@ class CourseraOnDemand(object): return reply['contentResponseBody']['session']['id'] def _auth_headers_with_json(self): - headers = prepape_auth_headers(self._session, include_cauth=True) + headers = prepare_auth_headers(self._session, include_cauth=True) headers.update({ 'Content-Type': 'application/json; charset=UTF-8' }) @@ -943,7 +943,7 @@ class CourseraOnDemand(object): ", ".join(subtitle_set_nonexist), video_id, subtitle_description) if not subtitle_set_download: - logging.warning("%s all requested subtitles are unavaliable," + logging.warning("%s all requested subtitles are unavailable," "with video id: [%s], falling back to 'en' " "%s", subtitle_description.capitalize(), video_id, diff --git a/coursera/commandline.py b/coursera/commandline.py index 4c5508f..a09e77a 100644 --- a/coursera/commandline.py +++ b/coursera/commandline.py @@ -103,7 +103,7 @@ def parse_args(args=None): help='Choose language to download subtitles and transcripts. (Default: all)' 'Use special value "all" to download all available.' 
'To download subtitles and transcripts of multiple languages,' - 'use comma(s) (without spaces) to seperate the names of the languages, i.e., "en,zh-CN".' + 'use comma(s) (without spaces) to separate the names of the languages, i.e., "en,zh-CN".' 'To download subtitles and transcripts of alternative language(s) ' 'if only the current language is not available,' 'put an "|" for each of the alternative languages after ' @@ -132,7 +132,7 @@ def parse_args(args=None): dest='download_notebooks', action='store_true', default=False, - help='download Python Jupyther Notebooks. (Default: False)') + help='download Python Jupyter Notebooks. (Default: False)') group_material.add_argument('--about', # FIXME: should be --about-course dest='about', diff --git a/coursera/cookies.py b/coursera/cookies.py index 6beac2a..a66f126 100644 --- a/coursera/cookies.py +++ b/coursera/cookies.py @@ -69,15 +69,15 @@ class AuthenticationFailed(BaseException): """ -def prepape_auth_headers(session, include_cauth=False): +def prepare_auth_headers(session, include_cauth=False): """ - This function prepapes headers with CSRF/CAUTH tokens that can + This function prepares headers with CSRF/CAUTH tokens that can be used in POST requests such as login/get_quiz. @param session: Requests session. @type session: requests.Session - @param include_cauth: Flag that indicates whethe CAUTH cookies should be + @param include_cauth: Flag that indicates whether CAUTH cookies should be included as well. @type include_cauth: bool @@ -133,7 +133,7 @@ def login(session, username, password, class_name=None): logging.error(e) raise ClassNotFound(class_name) - headers = prepape_auth_headers(session, include_cauth=False) + headers = prepare_auth_headers(session, include_cauth=False) data = { 'email': username, @@ -355,7 +355,7 @@ def get_cookies_for_class(session, class_name, Get the cookies for the given class. 
We do not validate the cookies if they are loaded from a cookies file - because this is intented for debugging purposes or if the coursera + because this is intended for debugging purposes or if the coursera authentication process has changed. """ if cookies_file: diff --git a/coursera/coursera_dl.py b/coursera/coursera_dl.py index 4c20689..a883b0c 100644 --- a/coursera/coursera_dl.py +++ b/coursera/coursera_dl.py @@ -115,12 +115,12 @@ def download_on_demand_class(args, class_name): Download all requested resources from the on-demand class given in class_name. @return: Tuple of (bool, bool), where the first bool indicates whether - errors occured while parsing syllabus, the second bool indicaters + errors occurred while parsing syllabus, the second bool indicates whether the course appears to be completed. @rtype: (bool, bool) """ - error_occured = False + error_occurred = False session = get_session() extractor = CourseraExtractor(session, args.username, args.password) @@ -129,7 +129,7 @@ def download_on_demand_class(args, class_name): with open(cached_syllabus_filename) as syllabus_file: modules = json.load(syllabus_file) else: - error_occured, modules = extractor.get_modules( + error_occurred, modules = extractor.get_modules( class_name, args.reverse, args.unrestricted_filenames, @@ -145,7 +145,7 @@ def download_on_demand_class(args, class_name): json.dump(modules, file_object, indent=4) if args.only_syllabus: - return error_occured, False + return error_occurred, False downloader = get_downloader(session, class_name, args) downloader_wrapper = ParallelDownloader(downloader, args.jobs) \ @@ -177,7 +177,7 @@ def download_on_demand_class(args, class_name): if course_downloader.failed_urls: print_failed_urls(course_downloader.failed_urls) - return error_occured, completed + return error_occurred, completed def print_skipped_urls(skipped_urls): @@ -205,7 +205,7 @@ def download_class(args, class_name): Try to download on-demand class. 
@return: Tuple of (bool, bool), where the first bool indicates whether - errors occured while parsing syllabus, the second bool indicaters + errors occurred while parsing syllabus, the second bool indicaters whether the course appears to be completed. @rtype: (bool, bool) """ @@ -235,10 +235,10 @@ def main(): try: logging.info('Downloading class: %s (%d / %d)', class_name, class_index + 1, len(args.class_names)) - error_occured, completed = download_class(args, class_name) + error_occurred, completed = download_class(args, class_name) if completed: completed_classes.append(class_name) - if error_occured: + if error_occurred: classes_with_errors.append(class_name) except requests.exceptions.HTTPError as e: logging.error('HTTPError %s', e) diff --git a/coursera/define.py b/coursera/define.py index e2ec105..a586392 100644 --- a/coursera/define.py +++ b/coursera/define.py @@ -16,7 +16,7 @@ AUTH_URL = 'https://accounts.coursera.org/api/v1/login' AUTH_URL_V3 = 'https://api.coursera.org/api/login/v3' CLASS_URL = 'https://class.coursera.org/{class_name}' -# The following link is left just for illustative purposes: +# The following link is left just for illustrative purposes: # https://api.coursera.org/api/courses.v1?fields=display%2CpartnerIds%2CphotoUrl%2CstartDate%2Cpartners.v1(homeLink%2Cname)&includes=partnerIds&q=watchlist&start=0 # Reply is as follows: # { @@ -36,7 +36,7 @@ CLASS_URL = 'https://class.coursera.org/{class_name}' # } OPENCOURSE_LIST_COURSES = 'https://api.coursera.org/api/courses.v1?q=watchlist&start={start}' -# The following link is left just for illustative purposes: +# The following link is left just for illustrative purposes: # 
https://api.coursera.org/api/memberships.v1?fields=courseId,enrolledTimestamp,grade,id,lastAccessedTimestamp,onDemandSessionMembershipIds,onDemandSessionMemberships,role,v1SessionId,vc,vcMembershipId,courses.v1(courseStatus,display,partnerIds,photoUrl,specializations,startDate,v1Details,v2Details),partners.v1(homeLink,name),v1Details.v1(sessionIds),v1Sessions.v1(active,certificatesReleased,dbEndDate,durationString,hasSigTrack,startDay,startMonth,startYear),v2Details.v1(onDemandSessions,plannedLaunchDate,sessionsEnabledAt),specializations.v1(logo,name,partnerIds,shortName)&includes=courseId,onDemandSessionMemberships,vcMembershipId,courses.v1(partnerIds,specializations,v1Details,v2Details),v1Details.v1(sessionIds),v2Details.v1(onDemandSessions),specializations.v1(partnerIds)&q=me&showHidden=true&filter=current,preEnrolled # Sample reply: # { diff --git a/coursera/extractors.py b/coursera/extractors.py index 7d23b96..46f3683 100644 --- a/coursera/extractors.py +++ b/coursera/extractors.py @@ -51,11 +51,11 @@ class CourseraExtractor(PlatformExtractor): download_notebooks=False): page = self._get_on_demand_syllabus(class_name) - error_occured, modules = self._parse_on_demand_syllabus( + error_occurred, modules = self._parse_on_demand_syllabus( page, reverse, unrestricted_filenames, subtitle_language, video_resolution, download_quizzes, mathjax_cdn_url, download_notebooks) - return error_occured, modules + return error_occurred, modules def _get_on_demand_syllabus(self, class_name): """ @@ -107,7 +107,7 @@ class CourseraExtractor(PlatformExtractor): with open('%s-course-material-items.json' % course_name, 'w') as file_object: json.dump(ondemand_material_items._items, file_object, indent=4) - error_occured = False + error_occurred = False for module in json_modules: module_slug = module['slug'] @@ -135,7 +135,7 @@ class CourseraExtractor(PlatformExtractor): logging.info('Processing lecture %s (%s)', lecture_slug, typename) # Empty dictionary means there were no data - # 
None means an error occured + # None means an error occurred links = {} if typename == 'lecture': @@ -180,7 +180,7 @@ class CourseraExtractor(PlatformExtractor): continue if links is None: - error_occured = True + error_occurred = True elif links: lectures.append((lecture_slug, links)) @@ -206,7 +206,7 @@ class CourseraExtractor(PlatformExtractor): links = course.extract_links_from_reference(json_reference['shortId']) if links is None: - error_occured = True + error_occurred = True elif links: reference.append(('', links)) @@ -216,4 +216,4 @@ class CourseraExtractor(PlatformExtractor): if references: modules.append(("Resources", references)) - return error_occured, modules + return error_occurred, modules diff --git a/coursera/test/test_api.py b/coursera/test/test_api.py index 063d274..6f50723 100644 --- a/coursera/test/test_api.py +++ b/coursera/test/test_api.py @@ -149,7 +149,7 @@ def test_ondemand_programming_supplement_no_instructions(get_page, course): ('peer-assignment-no-instructions.json', ''), ] ) -def test_ondemand_from_peer_assgnment_instructions( +def test_ondemand_from_peer_assignment_instructions( get_page, course, input_filename, expected_output): instructions = slurp_fixture('json/%s' % input_filename) get_page.return_value = json.loads(instructions) @@ -176,7 +176,7 @@ def test_ondemand_programming_supplement_empty_instructions(get_page, course): output = course.extract_links_from_programming('0') # Make sure that SOME html content has been extracted, but remove - # it immeditely because it's a hassle to properly prepare test input + # it immediately because it's a hassle to properly prepare test input # for it. FIXME later. 
assert 'html' in output del output['html'] @@ -193,7 +193,7 @@ def test_ondemand_programming_immediate_instructions_empty_instructions( output = course.extract_links_from_programming_immediate_instructions('0') # Make sure that SOME html content has been extracted, but remove - # it immeditely because it's a hassle to properly prepare test input + # it immediately because it's a hassle to properly prepare test input # for it. FIXME later. assert 'html' in output del output['html'] @@ -214,7 +214,7 @@ def test_ondemand_programming_supplement_one_asset(get_page, course): output = course.extract_links_from_programming('0') # Make sure that SOME html content has been extracted, but remove - # it immeditely because it's a hassle to properly prepare test input + # it immediately because it's a hassle to properly prepare test input # for it. FIXME later. assert 'html' in output del output['html'] @@ -249,7 +249,7 @@ def test_ondemand_programming_immediate_instructions_one_asset(get_page, course) output = course.extract_links_from_programming_immediate_instructions('0') # Make sure that SOME html content has been extracted, but remove - # it immeditely because it's a hassle to properly prepare test input + # it immediately because it's a hassle to properly prepare test input # for it. FIXME later. assert 'html' in output del output['html'] @@ -269,7 +269,7 @@ def test_ondemand_programming_supplement_three_assets(get_page, course): output = json.loads(json.dumps(output)) # Make sure that SOME html content has been extracted, but remove - # it immeditely because it's a hassle to properly prepare test input + # it immediately because it's a hassle to properly prepare test input # for it. FIXME later. 
assert 'html' in output del output['html'] diff --git a/coursera/test/test_parsing.py b/coursera/test/test_parsing.py index 385e506..4eac879 100644 --- a/coursera/test/test_parsing.py +++ b/coursera/test/test_parsing.py @@ -65,7 +65,7 @@ def test_that_we_parse_and_write_json_correctly(get_page, json_path): def get_old_style_video(monkeypatch): pytest.skip() """ - Mock some methods that would, otherwise, create repeateadly many web + Mock some methods that would, otherwise, create repeatedly many web requests. More specifically, we mock: @@ -139,7 +139,7 @@ def test_get_on_demand_supplement_url_accumulates_assets(mocked): output = course.extract_links_from_supplement('element_id') # Make sure that SOME html content has been extracted, but remove - # it immeditely because it's a hassle to properly prepare test input + # it immediately because it's a hassle to properly prepare test input # for it. FIXME later. assert 'html' in output del output['html'] From d17027da41efb418384d70cc484390613ec5f850 Mon Sep 17 00:00:00 2001 From: John Doe Date: Sun, 17 Jun 2018 10:36:53 -0700 Subject: [PATCH 59/87] Typo (#672) --- README.md | 8 ++++---- coursera/cookies.py | 4 ++-- coursera/coursera_dl.py | 2 +- 3 files changed, 7 insertions(+), 7 deletions(-) diff --git a/README.md b/README.md index 8d72afc..a0059f8 100644 --- a/README.md +++ b/README.md @@ -120,7 +120,7 @@ particular courses that you want to use with `coursera-dl`. 
## Recommended installation method for all Operating Systems -From a command line (preferrably, from a virtual environment), simply issue +From a command line (preferably, from a virtual environment), simply issue the command: pip install coursera-dl @@ -327,7 +327,7 @@ where the script is supposed to be executed, with the following format: #--mathjax-cdn https://cdn.bootcss.com/mathjax/2.7.1/MathJax.js # more other parameters -Parameter which is stored in the file will be overriden if it is again specifed +Parameter which is stored in the file will be overriden if it is again specified in your commandline script **Note:** In `coursera-dl.conf`, all the parameters should not be wrapped @@ -337,11 +337,11 @@ with quotes. In default mode when you interrupt the download process by pressing CTRL+C, partially downloaded files will be deleted from your disk and -you have to start the download process from the begining. If your +you have to start the download process from the beginning. If your download was interrupted by something other than KeyboardInterrupt (CTRL+C) like sudden system crash, partially downloaded files will remain on your disk and the next time you start the process again, -these files will be discraded from download list!, therefore it's your +these files will be discarded from download list!, therefore it's your job to delete them manually before next start. For this reason we added an option called `--resume` which continues your downloads from where they stopped: diff --git a/coursera/cookies.py b/coursera/cookies.py index a66f126..f12ed8f 100644 --- a/coursera/cookies.py +++ b/coursera/cookies.py @@ -25,7 +25,7 @@ from .utils import mkdir_p, random_string # Monkey patch cookielib.Cookie.__init__. # Reason: The expires value may be a decimal string, # but the Cookie class uses int() ... 
-__orginal_init__ = cookielib.Cookie.__init__ +__original_init__ = cookielib.Cookie.__init__ def __fixed_init__(self, version, name, value, @@ -41,7 +41,7 @@ def __fixed_init__(self, version, name, value, rfc2109=False): if expires is not None: expires = float(expires) - __orginal_init__(self, version, name, value, + __original_init__(self, version, name, value, port, port_specified, domain, domain_specified, domain_initial_dot, path, path_specified, diff --git a/coursera/coursera_dl.py b/coursera/coursera_dl.py index a883b0c..eb019ce 100644 --- a/coursera/coursera_dl.py +++ b/coursera/coursera_dl.py @@ -205,7 +205,7 @@ def download_class(args, class_name): Try to download on-demand class. @return: Tuple of (bool, bool), where the first bool indicates whether - errors occurred while parsing syllabus, the second bool indicaters + errors occurred while parsing syllabus, the second bool indicates whether the course appears to be completed. @rtype: (bool, bool) """ From b0d1cc0cffbfb5516c0febe8ed74e0c09278c3ca Mon Sep 17 00:00:00 2001 From: Richard Decal Date: Wed, 20 Jun 2018 11:27:17 -0700 Subject: [PATCH 60/87] better error msg for incorrect .netrc permissions (#674) --- coursera/credentials.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/coursera/credentials.py b/coursera/credentials.py index d6a36cd..aacd1d1 100644 --- a/coursera/credentials.py +++ b/coursera/credentials.py @@ -134,7 +134,8 @@ def authenticate_through_netrc(path=None): error_messages = '\n'.join(str(e) for e in errors) raise CredentialsError( - 'Did not find valid netrc file:\n' + error_messages) + 'Did not find valid netrc file:\n' + error_messages + + '\nPlease run this command: chmod og-rw ~/.netrc') def get_credentials(username=None, password=None, netrc=None, use_keyring=False): From 3df019a6611d687842849546f6ed57fafd23bb28 Mon Sep 17 00:00:00 2001 From: Yuri Bochkarev Date: Sun, 24 Jun 2018 00:09:31 +0300 Subject: [PATCH 61/87] Move to newer API for syllabus and 
lecture retrieval ref #665 ref #673 ref #634 --- coursera/api.py | 208 +++++++++++++++++++++++++++++++------- coursera/coursera_dl.py | 9 +- coursera/define.py | 42 ++++++-- coursera/extractors.py | 136 ++++++++++++++----------- coursera/test/test_api.py | 2 +- coursera/utils.py | 15 ++- requirements.txt | 1 + 7 files changed, 305 insertions(+), 108 deletions(-) diff --git a/coursera/api.py b/coursera/api.py index 657e4b5..d1832ba 100644 --- a/coursera/api.py +++ b/coursera/api.py @@ -12,9 +12,11 @@ import logging import time import requests import urllib -from collections import namedtuple + +from collections import namedtuple, OrderedDict from six import iterkeys, iteritems from six.moves.urllib_parse import quote_plus +import attr from .utils import (BeautifulSoup, make_coursera_absolute_url, extend_supplement_links, clean_url, clean_filename, @@ -26,7 +28,10 @@ from .define import (OPENCOURSE_SUPPLEMENT_URL, OPENCOURSE_ASSETS_URL, OPENCOURSE_API_ASSETS_V1_URL, OPENCOURSE_ONDEMAND_COURSE_MATERIALS, - OPENCOURSE_VIDEO_URL, + OPENCOURSE_ONDEMAND_COURSE_MATERIALS_V2, + OPENCOURSE_ONDEMAND_COURSES_V1, + OPENCOURSE_ONDEMAND_LECTURE_VIDEOS_URL, + OPENCOURSE_ONDEMAND_LECTURE_ASSETS_URL, OPENCOURSE_MEMBERSHIPS, OPENCOURSE_REFERENCES_POLL_URL, OPENCOURSE_REFERENCE_ITEM_URL, @@ -278,7 +283,7 @@ class MarkupToHTMLConverter(object): audio.insert_after(controls_tag) -class OnDemandCourseMaterialItems(object): +class OnDemandCourseMaterialItemsV1(object): """ Helper class that allows accessing lecture JSONs by lesson IDs. 
""" @@ -312,7 +317,7 @@ class OnDemandCourseMaterialItems(object): dom = get_page(session, OPENCOURSE_ONDEMAND_COURSE_MATERIALS, json=True, class_name=course_name) - return OnDemandCourseMaterialItems( + return OnDemandCourseMaterialItemsV1( dom['linked']['onDemandCourseMaterialItems.v1']) def get(self, lesson_id): @@ -408,6 +413,134 @@ class AssetRetriever(object): return result +@attr.s +class ModuleV1(object): + name = attr.ib() + id = attr.ib() + slug = attr.ib() + child_ids = attr.ib() + + def children(self, all_children): + return [all_children[child] for child in self.child_ids] + + +@attr.s +class ModulesV1(object): + children = attr.ib() + + @staticmethod + def from_json(data): + return ModulesV1(OrderedDict( + (item['id'], + ModuleV1(item['name'], + item['id'], + item['slug'], + item['lessonIds'])) + for item in data + )) + + def __getitem__(self, key): + return self.children[key] + + def __iter__(self): + return iter(self.children.values()) + + +@attr.s +class LessonV1(object): + name = attr.ib() + id = attr.ib() + slug = attr.ib() + child_ids = attr.ib() + + def children(self, all_children): + return [all_children[child] for child in self.child_ids] + + +@attr.s +class LessonsV1(object): + children = attr.ib() + + @staticmethod + def from_json(data): + return LessonsV1(OrderedDict( + (item['id'], + LessonV1(item['name'], + item['id'], + item['slug'], + item['itemIds'])) + for item in data + )) + + def __getitem__(self, key): + return self.children[key] + + +@attr.s +class ItemV2(object): + name = attr.ib() + id = attr.ib() + slug = attr.ib() + type_name = attr.ib() + lesson_id = attr.ib() + module_id = attr.ib() + + +@attr.s +class ItemsV2(object): + children = attr.ib() + + @staticmethod + def from_json(data): + return ItemsV2({ + item['id']: + ItemV2(item['name'], + item['id'], + item['slug'], + item['contentSummary']['typeName'], + item['lessonId'], + item['moduleId']) + for item in data + }) + + def __getitem__(self, key): + return 
self.children[key] + + +@attr.s +class VideoV1(object): + resolution = attr.ib() + mp4_video_url = attr.ib() + + +@attr.s +class VideosV1(object): + children = attr.ib() + + @staticmethod + def from_json(data): + + videos = [VideoV1(resolution, links['mp4VideoUrl']) + for resolution, links + in data['sources']['byResolution'].items()] + videos.sort(key=lambda video: video.resolution, reverse=True) + + videos = OrderedDict( + (video.resolution, video) + for video in videos + ) + return VideosV1(videos) + + def __contains__(self, key): + return key in self.children + + def __getitem__(self, key): + return self.children[key] + + def get_best(self): + return next(iter(self.children.values())) + + class CourseraOnDemand(object): """ This is a class that provides a friendly interface to extract certain @@ -687,9 +820,9 @@ class CourseraOnDemand(object): }) return headers - def extract_links_from_lecture(self, + def extract_links_from_lecture(self, course_id, video_id, subtitle_language='en', - resolution='540p', assets=None): + resolution='540p'): """ Return the download URLs of on-demand course video. @@ -702,18 +835,13 @@ class CourseraOnDemand(object): @param resolution: Preferred video resolution. @type resolution: str - @param assets: List of assets that may present in the video. - @type assets: [str] - @return: @see CourseraOnDemand._extract_links_from_text """ - if assets is None: - assets = [] - try: links = self._extract_videos_and_subtitles_from_lecture( - video_id, subtitle_language, resolution) + course_id, video_id, subtitle_language, resolution) + assets = self._get_lecture_asset_ids(course_id, video_id) assets = self._normalize_assets(assets) extend_supplement_links( links, self._extract_links_from_lecture_assets(assets)) @@ -727,6 +855,17 @@ class CourseraOnDemand(object): 'Could not download lecture %s: %s', video_id, exception) return None + def _get_lecture_asset_ids(self, course_id, video_id): + """ + Obtain a list of asset ids from a lecture. 
+ """ + dom = get_page(self._session, OPENCOURSE_ONDEMAND_LECTURE_ASSETS_URL, + json=True, course_id=course_id, video_id=video_id) + # Note that we extract here "id", not definition -> assetId, as it + # be extracted later. + return [asset['id'] + for asset in dom['linked']['openCourseAssets.v1']] + def _normalize_assets(self, assets): """ Perform asset normalization. For some reason, assets that are sometimes @@ -850,41 +989,34 @@ class CourseraOnDemand(object): return urls def _extract_videos_and_subtitles_from_lecture(self, + course_id, video_id, subtitle_language='en', resolution='540p'): - dom = get_page(self._session, OPENCOURSE_VIDEO_URL, - json=True, - video_id=video_id) - logging.debug('Parsing JSON for video_id <%s>.', video_id) + + dom = get_page(self._session, OPENCOURSE_ONDEMAND_LECTURE_VIDEOS_URL, + json=True, + course_id=course_id, + video_id=video_id) + dom = dom['linked']['onDemandVideos.v1'][0] + + videos = VideosV1.from_json(dom) video_content = {} - # videos - logging.debug('Gathering video URLs for video_id <%s>.', video_id) - sources = dom['sources'] - sources.sort(key=lambda src: src['resolution']) - sources.reverse() - - # Try to select resolution requested by the user. - filtered_sources = [source - for source in sources - if source['resolution'] == resolution] - - if len(filtered_sources) == 0: - # We will just use the 'vanilla' version of sources here, instead of - # filtered_sources. - logging.warning('Requested resolution %s not available for <%s>. ' - 'Downloading highest resolution available instead.', - resolution, video_id) - else: + if resolution in videos: + source = videos[resolution] logging.debug('Proceeding with download of resolution %s of <%s>.', resolution, video_id) - sources = filtered_sources + else: + source = videos.get_best() + logging.warning( + 'Requested resolution %s not available for <%s>. 
' + 'Downloading highest resolution (%s) available instead.', + resolution, video_id, source.resolution) - video_url = sources[0]['formatSources']['video/mp4'] - video_content['mp4'] = video_url + video_content['mp4'] = source.mp4_video_url subtitle_link = self._extract_subtitles_from_video_dom( dom, subtitle_language, video_id) diff --git a/coursera/coursera_dl.py b/coursera/coursera_dl.py index 4c20689..a389106 100644 --- a/coursera/coursera_dl.py +++ b/coursera/coursera_dl.py @@ -67,7 +67,8 @@ from .workflow import CourseraDownloader from .parallel import ConsecutiveDownloader, ParallelDownloader from .utils import (clean_filename, get_anchor_format, mkdir_p, fix_url, print_ssl_error_message, - decode_input, BeautifulSoup, is_debug_run) + decode_input, BeautifulSoup, is_debug_run, + spit_json, slurp_json) from .network import get_page, get_page_and_url from .commandline import parse_args @@ -126,8 +127,7 @@ def download_on_demand_class(args, class_name): cached_syllabus_filename = '%s-syllabus-parsed.json' % class_name if args.cache_syllabus and os.path.isfile(cached_syllabus_filename): - with open(cached_syllabus_filename) as syllabus_file: - modules = json.load(syllabus_file) + modules = slurp_json(cached_syllabus_filename) else: error_occured, modules = extractor.get_modules( class_name, @@ -141,8 +141,7 @@ def download_on_demand_class(args, class_name): ) if is_debug_run or args.cache_syllabus(): - with open(cached_syllabus_filename, 'w') as file_object: - json.dump(modules, file_object, indent=4) + spit_json(modules, cached_syllabus_filename) if args.only_syllabus: return error_occured, False diff --git a/coursera/define.py b/coursera/define.py index e2ec105..ff8c51e 100644 --- a/coursera/define.py +++ b/coursera/define.py @@ -61,8 +61,10 @@ OPENCOURSE_LIST_COURSES = 'https://api.coursera.org/api/courses.v1?q=watchlist&s # } # } OPENCOURSE_MEMBERSHIPS = 
'https://api.coursera.org/api/memberships.v1?includes=courseId,courses.v1&q=me&showHidden=true&filter=current,preEnrolled' -OPENCOURSE_CONTENT_URL = 'https://api.coursera.org/api/opencourse.v1/course/{class_name}?showLockedItems=true' -OPENCOURSE_VIDEO_URL = 'https://api.coursera.org/api/opencourse.v1/video/{video_id}' +OPENCOURSE_ONDEMAND_LECTURE_VIDEOS_URL = \ + 'https://api.coursera.org/api/onDemandLectureVideos.v1/'\ + '{course_id}~{video_id}?includes=video&'\ + 'fields=onDemandVideos.v1(sources%2Csubtitles%2CsubtitlesVtt%2CsubtitlesTxt)' OPENCOURSE_SUPPLEMENT_URL = 'https://api.coursera.org/api/onDemandSupplements.v1/'\ '{course_id}~{element_id}?includes=asset&fields=openCourseAssets.v1%28typeName%29,openCourseAssets.v1%28definition%29' OPENCOURSE_PROGRAMMING_ASSIGNMENTS_URL = \ @@ -97,6 +99,23 @@ OPENCOURSE_REFERENCE_ITEM_URL = \ OPENCOURSE_ASSET_URL = \ 'https://api.coursera.org/api/assetUrls.v1?ids={ids}' +# Sample response: +# "linked": { +# "openCourseAssets.v1": [ +# { +# "typeName": "asset", +# "definition": { +# "assetId": "fytYX5rYEeedWRLokafKRg", +# "name": "Lecture slides" +# }, +# "id": "j6g7VZrYEeeUVgpv-dYMig" +# } +# ] +# } +OPENCOURSE_ONDEMAND_LECTURE_ASSETS_URL = \ + 'https://api.coursera.org/api/onDemandLectureAssets.v1/'\ + '{course_id}~{video_id}/?includes=openCourseAssets' + # These ids are provided in lecture json: # # { @@ -170,9 +189,20 @@ OPENCOURSE_API_ASSETS_V1_URL = \ OPENCOURSE_ONDEMAND_COURSE_MATERIALS = \ 'https://api.coursera.org/api/onDemandCourseMaterials.v1/?'\ - 'q=slug&slug={class_name}&includes=moduleIds%2ClessonIds%2CpassableItemGroups%2CpassableItemGroupChoices%2CpassableLessonElements%2CitemIds%2Ctracks'\ - 
'&fields=moduleIds%2ConDemandCourseMaterialModules.v1(name%2Cslug%2Cdescription%2CtimeCommitment%2ClessonIds%2Coptional)%2ConDemandCourseMaterialLessons.v1(name%2Cslug%2CtimeCommitment%2CelementIds%2Coptional%2CtrackId)%2ConDemandCourseMaterialPassableItemGroups.v1(requiredPassedCount%2CpassableItemGroupChoiceIds%2CtrackId)%2ConDemandCourseMaterialPassableItemGroupChoices.v1(name%2Cdescription%2CitemIds)%2ConDemandCourseMaterialPassableLessonElements.v1(gradingWeight)%2ConDemandCourseMaterialItems.v1(name%2Cslug%2CtimeCommitment%2Ccontent%2CisLocked%2ClockableByItem%2CitemLockedReasonCode%2CtrackId)%2ConDemandCourseMaterialTracks.v1(passablesCount)'\ - '&showLockedItems=true' + 'q=slug&slug={class_name}&includes=moduleIds%2ClessonIds%2CpassableItemGroups%2CpassableItemGroupChoices%2CpassableLessonElements%2CitemIds%2Ctracks'\ + '&fields=moduleIds%2ConDemandCourseMaterialModules.v1(name%2Cslug%2Cdescription%2CtimeCommitment%2ClessonIds%2Coptional)%2ConDemandCourseMaterialLessons.v1(name%2Cslug%2CtimeCommitment%2CelementIds%2Coptional%2CtrackId)%2ConDemandCourseMaterialPassableItemGroups.v1(requiredPassedCount%2CpassableItemGroupChoiceIds%2CtrackId)%2ConDemandCourseMaterialPassableItemGroupChoices.v1(name%2Cdescription%2CitemIds)%2ConDemandCourseMaterialPassableLessonElements.v1(gradingWeight)%2ConDemandCourseMaterialItems.v1(name%2Cslug%2CtimeCommitment%2Ccontent%2CisLocked%2ClockableByItem%2CitemLockedReasonCode%2CtrackId)%2ConDemandCourseMaterialTracks.v1(passablesCount)'\ + '&showLockedItems=true' + +OPENCOURSE_ONDEMAND_COURSE_MATERIALS_V2 = \ + 'https://api.coursera.org/api/onDemandCourseMaterials.v2/?q=slug&slug={class_name}'\ + '&includes=modules%2Clessons%2CpassableItemGroups%2CpassableItemGroupChoices%2CpassableLessonElements%2Citems%2Ctracks%2CgradePolicy&'\ + 
'&fields=moduleIds%2ConDemandCourseMaterialModules.v1(name%2Cslug%2Cdescription%2CtimeCommitment%2ClessonIds%2Coptional%2ClearningObjectives)%2ConDemandCourseMaterialLessons.v1(name%2Cslug%2CtimeCommitment%2CelementIds%2Coptional%2CtrackId)%2ConDemandCourseMaterialPassableItemGroups.v1(requiredPassedCount%2CpassableItemGroupChoiceIds%2CtrackId)%2ConDemandCourseMaterialPassableItemGroupChoices.v1(name%2Cdescription%2CitemIds)%2ConDemandCourseMaterialPassableLessonElements.v1(gradingWeight%2CisRequiredForPassing)%2ConDemandCourseMaterialItems.v2(name%2Cslug%2CtimeCommitment%2CcontentSummary%2CisLocked%2ClockableByItem%2CitemLockedReasonCode%2CtrackId%2ClockedStatus%2CitemLockSummary)%2ConDemandCourseMaterialTracks.v1(passablesCount)'\ + '&showLockedItems=true' + +OPENCOURSE_ONDEMAND_COURSES_V1 = \ + 'https://api.coursera.org/api/onDemandCourses.v1?q=slug&slug={class_name}&'\ + 'includes=instructorIds%2CpartnerIds%2C_links&'\ + 'fields=brandingImage%2CcertificatePurchaseEnabledAt%2Cpartners.v1(squareLogo%2CrectangularLogo)%2Cinstructors.v1(fullName)%2CoverridePartnerLogos%2CsessionsEnabledAt%2CdomainTypes%2CpremiumExperienceVariant%2CisRestrictedMembership' ABOUT_URL = ('https://api.coursera.org/api/catalog.v1/courses?' 'fields=largeIcon,photo,previewLink,shortDescription,smallIcon,' @@ -924,7 +954,7 @@ pre {