diff --git a/coursera/coursera_dl.py b/coursera/coursera_dl.py index d812baf..df835c9 100644 --- a/coursera/coursera_dl.py +++ b/coursera/coursera_dl.py @@ -42,16 +42,11 @@ For further documentation and examples, visit the project's home at: """ -import datetime -import glob import json import logging import os import re import shutil -import subprocess -import time -import codecs from distutils.version import LooseVersion as V @@ -60,23 +55,19 @@ from distutils.version import LooseVersion as V # We may, perhaps, want to move these elsewhere. import bs4 import six -from six import iteritems import requests from .cookies import ( AuthenticationFailed, ClassNotFound, - get_cookies_for_class, make_cookie_values, login, TLSAdapter) -from .define import (CLASS_URL, ABOUT_URL, PATH_CACHE, - OPENCOURSE_CONTENT_URL, IN_MEMORY_MARKER, - FORMAT_MAX_LENGTH, TITLE_MAX_LENGTH) + get_cookies_for_class, make_cookie_values, TLSAdapter) +from .define import (CLASS_URL, ABOUT_URL, PATH_CACHE) from .downloaders import get_downloader +from .workflow import CourseraDownloader from .utils import (clean_filename, get_anchor_format, mkdir_p, fix_url, - print_ssl_error_message, normalize_path, + print_ssl_error_message, decode_input, BeautifulSoup, is_debug_run) from .network import get_page, get_page_and_url -from .api import CourseraOnDemand, OnDemandCourseMaterialItems -from .filter import skip_format_url from .commandline import parse_args from .extractors import CourseraExtractor @@ -323,333 +314,6 @@ def download_about(session, class_name, path='', overwrite=False, about_file.write(json_data) return element - -def is_course_complete(last_update): - """ - Determine is the course is likely to have been terminated or not. - - We return True if the timestamp given by last_update is 30 days or older - than today's date. Otherwise, we return True. - - The intended use case for this is to detect if a given courses has not - seen any update in the last 30 days or more. 
Otherwise, we return True, - since it is probably too soon to declare the course complete. - """ - rv = False - if last_update >= 0: - delta = time.time() - last_update - max_delta = total_seconds(datetime.timedelta(days=30)) - if delta > max_delta: - rv = True - return rv - - -def format_section(num, section, class_name, verbose_dirs): - sec = '%02d_%s' % (num, section) - if verbose_dirs: - sec = class_name.upper() + '_' + sec - return sec - - -def format_resource(num, name, title, fmt): - if title: - title = '_' + title - return '%02d_%s%s.%s' % (num, name, title, fmt) - - -def format_combine_number_resource(secnum, lecnum, lecname, title, fmt): - if title: - title = '_' + title - return '%02d_%02d_%s%s.%s' % (secnum, lecnum, lecname, title, fmt) - - -def find_resources_to_get(lecture, file_formats, resource_filter, ignored_formats=None): - """ - Select formats to download. - """ - resources_to_get = [] - - if ignored_formats is None: - ignored_formats = [] - - if len(ignored_formats): - logging.info("The following file formats will be ignored: " + ",".join(ignored_formats)) - - for fmt, resources in iteritems(lecture): - - fmt0 = fmt - if '.' in fmt: - fmt = fmt.split('.')[1] - - if fmt in ignored_formats: - continue - - if fmt in file_formats or 'all' in file_formats: - for r in resources: - if resource_filter and r[1] and not re.search(resource_filter, r[1]): - logging.debug('Skipping b/c of rf: %s %s', - resource_filter, r[1]) - continue - resources_to_get.append((fmt0, r[0], r[1])) - else: - logging.debug( - 'Skipping b/c format %s not in %s', fmt, file_formats) - - return resources_to_get - - -def create_m3u_playlist(section_dir): - """ - Create M3U playlist with contents of `section_dir`/*.mp4. The playlist - will be created in that directory. - - @param section_dir: Path where to scan for *.mp4 files. 
- @type section_dir: str - """ - path_to_return = os.getcwd() - - for (_path, subdirs, files) in os.walk(section_dir): - os.chdir(_path) - globbed_videos = sorted(glob.glob("*.mp4")) - m3u_name = os.path.split(_path)[1] + ".m3u" - - if len(globbed_videos): - with open(m3u_name, "w") as m3u: - for video in globbed_videos: - m3u.write(video + "\n") - os.chdir(path_to_return) - os.chdir(path_to_return) - - -def handle_resource(downloader, - lecture_filename, - fmt, - url, - overwrite, - resume, - skip_download, - section_dir, - skipped_urls, - last_update): - """ - Handle resource. This function builds up resource file name and - downloads it if necessary. - - @param downloader: Resource downloader instance. - @type downloader: downloaders.Downloader - - @param fmt: Format of the resource (pdf, csv, etc) - @type fmt: str - - @param url: URL of the resource. - @type url: str - - @param overwrite: Flag that indicates whether files should be overwritten. - @type overwrite: bool - - @param resume: Flag that indicates whether download should be resumed. - @type resume: bool - - @param skip_download: Flag that indicates whether download should be skipped. - @type skip_download: bool - - @param section_dir: Path to current section directory. - @type section_dir: str - - @param skipped_urls: List of skipped urls to update. - @type skipped_urls: None or list - - @param last_update: Latest mtime across files. - @type last_update: timestamp - - @return: Updated latest mtime. 
- @rtype: timestamp - """ - # Decide whether we need to download it - if overwrite or not os.path.exists(lecture_filename) or resume: - if not skip_download: - if url.startswith(IN_MEMORY_MARKER): - page_content = url[len(IN_MEMORY_MARKER):] - logging.info('Saving page contents to: %s', lecture_filename) - with codecs.open(lecture_filename, 'w', 'utf-8') as file_object: - file_object.write(page_content) - else: - if skipped_urls is not None and skip_format_url(fmt, url): - skipped_urls.append(url) - else: - logging.info('Downloading: %s', lecture_filename) - downloader.download(url, lecture_filename, resume=resume) - else: - open(lecture_filename, 'w').close() # touch - last_update = time.time() - else: - logging.info('%s already downloaded', lecture_filename) - # if this file hasn't been modified in a long time, - # record that time - last_update = max(last_update, os.path.getmtime(lecture_filename)) - - return last_update - - -def get_lecture_filename(combined_section_lectures_nums, - section_dir, - secnum, - lecnum, - lecname, - title, - fmt): - """ - Prepare a destination lecture filename. - - @param combined_section_lectures_nums: Flag that indicates whether - section lectures should have combined numbering. - @type combined_section_lectures_nums: bool - - @param section_dir: Path to current section directory. - @type section_dir: str - - @param secnum: Section number. - @type secnum: int - - @param lecnum: Lecture number. - @type lecnum: int - - @param lecname: Lecture name. - @type lecname: str - - @param title: Resource title. - @type title: str - - @param fmt: Format of the resource (pdf, csv, etc) - @type fmt: str - - @return: Lecture file name. - @rtype: str - """ - # FIXME: this is a quick and dirty solution to Filename too long - # problem. We need to think of a more general way to solve this - # issue. 
- fmt = fmt[:FORMAT_MAX_LENGTH] - title = title[:TITLE_MAX_LENGTH] - - # Format lecture file name - if combined_section_lectures_nums: - lecture_filename = os.path.join( - section_dir, - format_combine_number_resource( - secnum + 1, lecnum + 1, lecname, title, fmt)) - else: - lecture_filename = os.path.join( - section_dir, format_resource(lecnum + 1, lecname, title, fmt)) - - return lecture_filename - - -def download_lectures(downloader, - class_name, - sections, - file_formats, - overwrite=False, - skip_download=False, - section_filter=None, - lecture_filter=None, - resource_filter=None, - path='', - verbose_dirs=False, - preview=False, - combined_section_lectures_nums=False, - hooks=None, - playlist=False, - unrestricted_filenames=False, - ignored_formats=None, - resume=False, - skipped_urls=None, - failed_urls=None, - video_resolution='540p'): - """ - Download lecture resources described by sections. - - Returns True if the class appears completed, False otherwise. - """ - last_update = -1 - - for (secnum, (section, lectures)) in enumerate(sections): - if section_filter and not re.search(section_filter, section): - logging.debug('Skipping b/c of sf: %s %s', section_filter, - section) - continue - - section_dir = os.path.join( - path, class_name, - format_section(secnum + 1, section, class_name, verbose_dirs)) - for (lecnum, (lecname, lecture)) in enumerate(lectures): - if lecture_filter and not re.search(lecture_filter, - lecname): - logging.debug('Skipping b/c of lf: %s %s', lecture_filter, - lecname) - continue - - if not os.path.exists(section_dir): - mkdir_p(normalize_path(section_dir)) - - resources_to_get = find_resources_to_get(lecture, - file_formats, - resource_filter, - ignored_formats) - - # write lecture resources - for fmt, url, title in resources_to_get: - lecture_filename = get_lecture_filename( - combined_section_lectures_nums, - section_dir, secnum, lecnum, lecname, title, fmt) - - lecture_filename = normalize_path(lecture_filename) - - try: - 
last_update = handle_resource( - downloader, lecture_filename, fmt, url, - overwrite, resume, skip_download, - section_dir, skipped_urls, last_update) - except requests.exceptions.RequestException as e: - logging.error('The following error has occurred while ' - 'downloading URL %s: %s', url, str(e)) - if failed_urls is None: - logging.info('If you want to ignore HTTP errors, ' - 'please use "--ignore-http-errors" option') - raise - else: - failed_urls.append(url) - - # After fetching resources, create a playlist in M3U format with the - # videos downloaded. - if playlist: - create_m3u_playlist(section_dir) - - if hooks: - for hook in hooks: - logging.info('Running hook %s for section %s.', hook, section_dir) - os.chdir(section_dir) - subprocess.call(hook) - - # if we haven't updated any files in 1 month, we're probably - # done with this course - rv = is_course_complete(last_update) - if rv: - logging.info('COURSE PROBABLY COMPLETE: ' + class_name) - - return rv - - -def total_seconds(td): - """ - Compute total seconds for a timedelta. - - Added for backward compatibility, pre 2.7. - """ - return (td.microseconds + - (td.seconds + td.days * 24 * 3600) * 10 ** 6) // 10 ** 6 - - def download_old_style_class(args, class_name): """ Download all requested resources from the class given in class_name. @@ -742,25 +406,9 @@ def download_on_demand_class(args, class_name): Returns True if the class appears completed. 
""" - ignored_formats = [] - if args.ignore_formats: - ignored_formats = args.ignore_formats.split(",") - session = get_session() extractor = CourseraExtractor(session, args.username, args.password) - # login(session, args.username, args.password) - - # get the syllabus listing - # page = get_on_demand_syllabus(session, class_name) - - # parse it - # modules = parse_on_demand_syllabus(session, page, - # args.reverse, - # args.unrestricted_filenames, - # args.subtitle_language, - # args.video_resolution) - modules = extractor.get_modules(class_name, args.reverse, args.unrestricted_filenames, @@ -775,46 +423,29 @@ def download_on_demand_class(args, class_name): # obtain the resources - skipped_urls = [] - failed_urls = [] + ignored_formats = [] + if args.ignore_formats: + ignored_formats = args.ignore_formats.split(",") - completed = True - for idx, module in enumerate(modules): - module_name = '%02d_%s' % (idx + 1, module[0]) - sections = module[1] + course_downloader = CourseraDownloader( + downloader, + commandline_args=args, + class_name=class_name, + path=args.path, + ignored_formats=ignored_formats, + disable_url_skipping=args.disable_url_skipping + ) - result = download_lectures( - downloader, - module_name, - sections, - args.file_formats, - args.overwrite, - args.skip_download, - args.section_filter, - args.lecture_filter, - args.resource_filter, - os.path.join(args.path, class_name), - args.verbose_dirs, - args.preview, - args.combined_section_lectures_nums, - args.hooks, - args.playlist, - args.unrestricted_filenames, - ignored_formats, - args.resume, - None if args.disable_url_skipping else skipped_urls, - failed_urls - ) - completed = completed and result + completed = course_downloader.download_modules(modules) # Print skipped URLs if any - if skipped_urls: - print_skipped_urls(skipped_urls) + if course_downloader.skipped_urls: + print_skipped_urls(course_downloader.skipped_urls) # Print failed URLs if any # FIXME: should we set non-zero exit code if we 
have failed URLs? - if failed_urls: - print_failed_urls(failed_urls) + if course_downloader.failed_urls: + print_failed_urls(course_downloader.failed_urls) return completed diff --git a/coursera/downloaders.py b/coursera/downloaders.py index ccd3ef2..5da439b 100644 --- a/coursera/downloaders.py +++ b/coursera/downloaders.py @@ -12,13 +12,17 @@ from __future__ import print_function import logging import math import os -import requests import subprocess import sys import time +import requests + from six import iteritems +# +# Below are file downloaders, they are wrappers for external downloaders. +# class Downloader(object): """ diff --git a/coursera/extractors.py b/coursera/extractors.py index 58c8b73..9dec927 100644 --- a/coursera/extractors.py +++ b/coursera/extractors.py @@ -65,7 +65,7 @@ class CourseraExtractor(PlatformExtractor): course_name = dom['slug'] logging.info('Parsing syllabus of on-demand course. ' - 'This may take some time, please be patient ...') + 'This may take some time, please be patient ...') modules = [] json_modules = dom['courseMaterial']['elements'] course = CourseraOnDemand(session=self._session, course_id=dom['id'], diff --git a/coursera/filter.py b/coursera/filter.py index 18eb6e4..d9ed5d5 100644 --- a/coursera/filter.py +++ b/coursera/filter.py @@ -3,7 +3,9 @@ This module contains filtering functions. """ import re +import logging +from six import iteritems from six.moves.urllib_parse import urlparse @@ -77,3 +79,38 @@ def skip_format_url(format_, url): # Do not skip return False + + +def find_resources_to_get(lecture, file_formats, resource_filter, ignored_formats=None): + """ + Select formats to download. + """ + resources_to_get = [] + + if ignored_formats is None: + ignored_formats = [] + + if len(ignored_formats): + logging.info("The following file formats will be ignored: " + ",".join(ignored_formats)) + + for fmt, resources in iteritems(lecture): + + fmt0 = fmt + if '.' 
in fmt: + fmt = fmt.split('.')[1] + + if fmt in ignored_formats: + continue + + if fmt in file_formats or 'all' in file_formats: + for r in resources: + if resource_filter and r[1] and not re.search(resource_filter, r[1]): + logging.debug('Skipping b/c of rf: %s %s', + resource_filter, r[1]) + continue + resources_to_get.append((fmt0, r[0], r[1])) + else: + logging.debug( + 'Skipping b/c format %s not in %s', fmt, file_formats) + + return resources_to_get diff --git a/coursera/formatting.py b/coursera/formatting.py new file mode 100644 index 0000000..98c08ec --- /dev/null +++ b/coursera/formatting.py @@ -0,0 +1,76 @@ +import os + +from .define import FORMAT_MAX_LENGTH, TITLE_MAX_LENGTH + + +def format_section(num, section, class_name, verbose_dirs): + sec = '%02d_%s' % (num, section) + if verbose_dirs: + sec = class_name.upper() + '_' + sec + return sec + + +def format_resource(num, name, title, fmt): + if title: + title = '_' + title + return '%02d_%s%s.%s' % (num, name, title, fmt) + + +def format_combine_number_resource(secnum, lecnum, lecname, title, fmt): + if title: + title = '_' + title + return '%02d_%02d_%s%s.%s' % (secnum, lecnum, lecname, title, fmt) + + +def get_lecture_filename(combined_section_lectures_nums, + section_dir, + secnum, + lecnum, + lecname, + title, + fmt): + """ + Prepare a destination lecture filename. + + @param combined_section_lectures_nums: Flag that indicates whether + section lectures should have combined numbering. + @type combined_section_lectures_nums: bool + + @param section_dir: Path to current section directory. + @type section_dir: str + + @param secnum: Section number. + @type secnum: int + + @param lecnum: Lecture number. + @type lecnum: int + + @param lecname: Lecture name. + @type lecname: str + + @param title: Resource title. + @type title: str + + @param fmt: Format of the resource (pdf, csv, etc) + @type fmt: str + + @return: Lecture file name. 
+ @rtype: str + """ + # FIXME: this is a quick and dirty solution to Filename too long + # problem. We need to think of a more general way to solve this + # issue. + fmt = fmt[:FORMAT_MAX_LENGTH] + title = title[:TITLE_MAX_LENGTH] + + # Format lecture file name + if combined_section_lectures_nums: + lecture_filename = os.path.join( + section_dir, + format_combine_number_resource( + secnum + 1, lecnum + 1, lecname, title, fmt)) + else: + lecture_filename = os.path.join( + section_dir, format_resource(lecnum + 1, lecname, title, fmt)) + + return lecture_filename diff --git a/coursera/playlist.py b/coursera/playlist.py new file mode 100644 index 0000000..f273595 --- /dev/null +++ b/coursera/playlist.py @@ -0,0 +1,26 @@ +import glob +import os + + +def create_m3u_playlist(section_dir): + """ + Create M3U playlist with contents of `section_dir`/*.mp4. The playlist + will be created in that directory. + + @param section_dir: Path where to scan for *.mp4 files. + @type section_dir: str + """ + path_to_return = os.getcwd() + + for (_path, subdirs, files) in os.walk(section_dir): + os.chdir(_path) + globbed_videos = sorted(glob.glob("*.mp4")) + m3u_name = os.path.split(_path)[1] + ".m3u" + + if len(globbed_videos): + with open(m3u_name, "w") as m3u: + for video in globbed_videos: + m3u.write(video + "\n") + os.chdir(path_to_return) + os.chdir(path_to_return) + diff --git a/coursera/test/test_downloaders.py b/coursera/test/test_downloaders.py index d8d006d..0f65867 100644 --- a/coursera/test/test_downloaders.py +++ b/coursera/test/test_downloaders.py @@ -6,6 +6,7 @@ Test the downloaders. 
from coursera import downloaders from coursera import coursera_dl +from coursera.filter import find_resources_to_get import pytest @@ -21,7 +22,7 @@ def sample_bag(): def test_collect_all_resources(sample_bag): - res = coursera_dl.find_resources_to_get(sample_bag, 'all', None) + res = find_resources_to_get(sample_bag, 'all', None) assert [('mp4', 'h://url1/lc1.mp4', 'video'), ('pdf', 'h://url2/lc2.pdf', 'slides'), @@ -29,13 +30,13 @@ def test_collect_all_resources(sample_bag): def test_collect_only_pdfs(sample_bag): - res = coursera_dl.find_resources_to_get(sample_bag, 'pdf', None) + res = find_resources_to_get(sample_bag, 'pdf', None) assert [('pdf', 'h://url2/lc2.pdf', 'slides')] == sorted(res) def test_collect_with_filtering(sample_bag): - res = coursera_dl.find_resources_to_get(sample_bag, 'all', 'de') + res = find_resources_to_get(sample_bag, 'all', 'de') res = sorted(res) assert [('mp4', 'h://url1/lc1.mp4', 'video'), diff --git a/coursera/test/test_utils.py b/coursera/test/test_utils.py index 725f5e2..028d1f9 100644 --- a/coursera/test/test_utils.py +++ b/coursera/test/test_utils.py @@ -19,6 +19,9 @@ from coursera import coursera_dl from coursera import api from coursera.test.utils import slurp_fixture +from coursera.formatting import (format_section, format_resource, + format_combine_number_resource) +from coursera.utils import total_seconds, is_course_complete @pytest.mark.parametrize( @@ -98,32 +101,32 @@ def test_fix_url_removes_spaces(): def test_format_combine_resource_works_correctly(): - rv = coursera_dl.format_combine_number_resource(5, 4, "Moving_the_furniture", 'The_Basics', "mp4") + rv = format_combine_number_resource(5, 4, "Moving_the_furniture", 'The_Basics', "mp4") assert '05_04_Moving_the_furniture_The_Basics.mp4' == rv def test_format_combine_resource_works_correctly_without_title(): - rv = coursera_dl.format_combine_number_resource(5, 1, "Introduction", '', "mp4") + rv = format_combine_number_resource(5, 1, "Introduction", '', "mp4") assert 
'05_01_Introduction.mp4' == rv def test_format_resource_works_correctly(): - rv = coursera_dl.format_resource(2, "Washing", "Dishes", "mp9") + rv = format_resource(2, "Washing", "Dishes", "mp9") assert '02_Washing_Dishes.mp9' == rv def test_format_resource_works_correctly_without_title(): - rv = coursera_dl.format_resource(1, "Introduction", '', "mp2") + rv = format_resource(1, "Introduction", '', "mp2") assert '01_Introduction.mp2' == rv def test_format_section_works_correctly(): - rv = coursera_dl.format_section(9, 'bob', 'WEAVING', False) + rv = format_section(9, 'bob', 'WEAVING', False) assert '09_bob' == rv def test_format_section_works_correctly_with_verbose(): - rv = coursera_dl.format_section(9, 'bill', 'WEAVING', True) + rv = format_section(9, 'bill', 'WEAVING', True) assert 'WEAVING_09_bill' == rv @@ -146,25 +149,25 @@ def test_decode_input(): def test_total_seconds(): - ts = coursera_dl.total_seconds(datetime.timedelta(days=30)) + ts = total_seconds(datetime.timedelta(days=30)) assert ts == 2592000 def test_is_course_complete_should_give_false_if_there_was_recent_update(): - delta = coursera_dl.total_seconds(datetime.timedelta(days=29)) + delta = total_seconds(datetime.timedelta(days=29)) tm = time() - delta - rv = coursera_dl.is_course_complete(tm) + rv = is_course_complete(tm) assert rv is False def test_is_course_complete_should_give_true_if_there_was_no_recent_update(): - delta = coursera_dl.total_seconds(datetime.timedelta(days=31)) + delta = total_seconds(datetime.timedelta(days=31)) tm = time() - delta - rv = coursera_dl.is_course_complete(tm) + rv = is_course_complete(tm) assert rv is True diff --git a/coursera/utils.py b/coursera/utils.py index 9642946..edbb270 100644 --- a/coursera/utils.py +++ b/coursera/utils.py @@ -4,24 +4,22 @@ This module provides utility functions that are used within the script. 
""" -import errno import os -import random import re -import string import sys +import time +import errno +import random +import string import logging +import datetime + -import six from bs4 import BeautifulSoup as BeautifulSoup_ -# Force us of bs4 with html5lib -BeautifulSoup = lambda page: BeautifulSoup_(page, 'html5lib') - -from .define import COURSERA_URL, WINDOWS_UNC_PREFIX - -from six.moves import html_parser +import six from six import iteritems +from six.moves import html_parser from six.moves.urllib.parse import ParseResult from six.moves.urllib_parse import unquote_plus @@ -39,6 +37,11 @@ else: from string import letters as string_ascii_letters from string import digits as string_digits +from .define import COURSERA_URL, WINDOWS_UNC_PREFIX + +# Force use of bs4 with html5lib +BeautifulSoup = lambda page: BeautifulSoup_(page, 'html5lib') + if six.PY2: def decode_input(x): @@ -184,6 +187,36 @@ def fix_url(url): return url +def is_course_complete(last_update): + """ + Determine if the course is likely to have been terminated or not. + + We return True if the timestamp given by last_update is 30 days or older + than today's date. Otherwise, we return False. + + The intended use case for this is to detect if a given course has not + seen any update in the last 30 days or more. Otherwise, we return False, + since it is probably too soon to declare the course complete. + """ + rv = False + if last_update >= 0: + delta = time.time() - last_update + max_delta = total_seconds(datetime.timedelta(days=30)) + if delta > max_delta: + rv = True + return rv + + +def total_seconds(td): + """ + Compute total seconds for a timedelta. + + Added for backward compatibility, pre 2.7. 
+ """ + return (td.microseconds + + (td.seconds + td.days * 24 * 3600) * 10 ** 6) // 10 ** 6 + + def make_coursera_absolute_url(url): """ If given url is relative adds coursera netloc, diff --git a/coursera/workflow.py b/coursera/workflow.py new file mode 100644 index 0000000..3f67520 --- /dev/null +++ b/coursera/workflow.py @@ -0,0 +1,213 @@ +import os +import re +import abc +import time +import codecs +import logging +import subprocess + +import requests + +from .formatting import format_section, get_lecture_filename +from .playlist import create_m3u_playlist +from .utils import is_course_complete, mkdir_p, normalize_path +from .filter import find_resources_to_get, skip_format_url +from .define import IN_MEMORY_MARKER + + +def handle_resource(downloader, + lecture_filename, + fmt, + url, + overwrite, + resume, + skip_download, + skipped_urls, + last_update): + """ + Handle resource. This function builds up resource file name and + downloads it if necessary. + + @param downloader: Resource downloader instance. + @type downloader: downloaders.Downloader + + @param fmt: Format of the resource (pdf, csv, etc) + @type fmt: str + + @param url: URL of the resource. + @type url: str + + @param overwrite: Flag that indicates whether files should be overwritten. + @type overwrite: bool + + @param resume: Flag that indicates whether download should be resumed. + @type resume: bool + + @param skip_download: Flag that indicates whether download should be skipped. + @type skip_download: bool + + @param skipped_urls: List of skipped urls to update. + @type skipped_urls: None or list + + @param last_update: Latest mtime across files. + @type last_update: timestamp + + @return: Updated latest mtime. 
+ @rtype: timestamp + """ + # Decide whether we need to download it + if overwrite or not os.path.exists(lecture_filename) or resume: + if not skip_download: + if url.startswith(IN_MEMORY_MARKER): + page_content = url[len(IN_MEMORY_MARKER):] + logging.info('Saving page contents to: %s', lecture_filename) + with codecs.open(lecture_filename, 'w', 'utf-8') as file_object: + file_object.write(page_content) + else: + if skipped_urls is not None and skip_format_url(fmt, url): + skipped_urls.append(url) + else: + logging.info('Downloading: %s', lecture_filename) + downloader.download(url, lecture_filename, resume=resume) + else: + open(lecture_filename, 'w').close() # touch + last_update = time.time() + else: + logging.info('%s already downloaded', lecture_filename) + # if this file hasn't been modified in a long time, + # record that time + last_update = max(last_update, os.path.getmtime(lecture_filename)) + + return last_update + + +class CourseDownloader(object): + __metaclass__ = abc.ABCMeta + + def __init__(self): + pass + + @abc.abstractmethod + def download_modules(self, modules): + pass + + +class CourseraDownloader(CourseDownloader): + def __init__(self, + downloader, + commandline_args, + class_name, + path='', + ignored_formats=None, + disable_url_skipping=False): + super(CourseraDownloader, self).__init__() + + self._downloader = downloader + self._args = commandline_args + self._class_name = class_name + self._path = path + self._ignored_formats = ignored_formats + self._disable_url_skipping = disable_url_skipping + + self.skipped_urls = None if disable_url_skipping else [] + self.failed_urls = [] + + self._last_update = -1 + + def download_modules(self, modules): + completed = True + for idx, module in enumerate(modules): + module_name = '%02d_%s' % (idx + 1, module[0]) + sections = module[1] + result = self._download_sections(module_name, sections) + completed = completed and result + return completed + + def _download_sections(self, module_name, 
sections): + """ + Download lecture resources described by sections. + + Returns True if the class appears completed, False otherwise. + """ + self._last_update = -1 + + section_filter = self._args.section_filter + verbose_dirs = self._args.verbose_dirs + hooks = self._args.hooks + playlist = self._args.playlist + + for (secnum, (section, lectures)) in enumerate(sections): + if section_filter and not re.search(section_filter, section): + logging.debug('Skipping b/c of sf: %s %s', section_filter, + section) + continue + + section_dir = os.path.join( + self._path, self._class_name, module_name, + format_section(secnum + 1, section, + self._class_name, verbose_dirs)) + + self._download_lectures(lectures, secnum, section_dir) + + # After fetching resources, create a playlist in M3U format with the + # videos downloaded. + if playlist: + create_m3u_playlist(section_dir) + + if hooks: + original_dir = os.getcwd() + for hook in hooks: + logging.info('Running hook %s for section %s.', hook, section_dir) + os.chdir(section_dir) + subprocess.call(hook) + os.chdir(original_dir) + + # if we haven't updated any files in 1 month, we're probably + # done with this course + is_complete = is_course_complete(self._last_update) + if is_complete: + logging.info('COURSE PROBABLY COMPLETE: ' + self._class_name) + + return is_complete + + def _download_lectures(self, lectures, secnum, section_dir): + lecture_filter = self._args.lecture_filter + file_formats = self._args.file_formats + resource_filter = self._args.resource_filter + combined_section_lectures_nums = self._args.combined_section_lectures_nums + overwrite = self._args.overwrite + resume = self._args.resume + skip_download = self._args.skip_download + + for (lecnum, (lecname, lecture)) in enumerate(lectures): + if lecture_filter and not re.search(lecture_filter, + lecname): + logging.debug('Skipping b/c of lf: %s %s', lecture_filter, + lecname) + continue + + if not os.path.exists(section_dir): + 
mkdir_p(normalize_path(section_dir)) + + resources_to_get = find_resources_to_get(lecture, + file_formats, + resource_filter, + self._ignored_formats) + + # write lecture resources + for fmt, url, title in resources_to_get: + lecture_filename = get_lecture_filename( + combined_section_lectures_nums, + section_dir, secnum, lecnum, lecname, title, fmt) + + lecture_filename = normalize_path(lecture_filename) + + try: + self._last_update = handle_resource( + self._downloader, lecture_filename, fmt, url, + overwrite, resume, skip_download, + self.skipped_urls, self._last_update) + except requests.exceptions.RequestException as e: + logging.error('The following error has occurred while ' + 'downloading URL %s: %s', url, str(e)) + self.failed_urls.append(url)