mirror of
https://github.com/coursera-dl/coursera-dl.git
synced 2026-01-23 02:35:37 +00:00
Moves things around, decompose into modules (WIP)
This commit is contained in:
parent
afb6384e9c
commit
f308e12e53
10 changed files with 438 additions and 415 deletions
|
|
@ -42,16 +42,11 @@ For further documentation and examples, visit the project's home at:
|
|||
"""
|
||||
|
||||
|
||||
import datetime
|
||||
import glob
|
||||
import json
|
||||
import logging
|
||||
import os
|
||||
import re
|
||||
import shutil
|
||||
import subprocess
|
||||
import time
|
||||
import codecs
|
||||
|
||||
from distutils.version import LooseVersion as V
|
||||
|
||||
|
|
@ -60,23 +55,19 @@ from distutils.version import LooseVersion as V
|
|||
# We may, perhaps, want to move these elsewhere.
|
||||
import bs4
|
||||
import six
|
||||
from six import iteritems
|
||||
import requests
|
||||
|
||||
from .cookies import (
|
||||
AuthenticationFailed, ClassNotFound,
|
||||
get_cookies_for_class, make_cookie_values, login, TLSAdapter)
|
||||
from .define import (CLASS_URL, ABOUT_URL, PATH_CACHE,
|
||||
OPENCOURSE_CONTENT_URL, IN_MEMORY_MARKER,
|
||||
FORMAT_MAX_LENGTH, TITLE_MAX_LENGTH)
|
||||
get_cookies_for_class, make_cookie_values, TLSAdapter)
|
||||
from .define import (CLASS_URL, ABOUT_URL, PATH_CACHE)
|
||||
from .downloaders import get_downloader
|
||||
from .workflow import CourseraDownloader
|
||||
from .utils import (clean_filename, get_anchor_format, mkdir_p, fix_url,
|
||||
print_ssl_error_message, normalize_path,
|
||||
print_ssl_error_message,
|
||||
decode_input, BeautifulSoup, is_debug_run)
|
||||
|
||||
from .network import get_page, get_page_and_url
|
||||
from .api import CourseraOnDemand, OnDemandCourseMaterialItems
|
||||
from .filter import skip_format_url
|
||||
from .commandline import parse_args
|
||||
from .extractors import CourseraExtractor
|
||||
|
||||
|
|
@ -323,333 +314,6 @@ def download_about(session, class_name, path='', overwrite=False,
|
|||
about_file.write(json_data)
|
||||
return element
|
||||
|
||||
|
||||
def is_course_complete(last_update):
    """
    Determine whether the course is likely to have been terminated.

    @param last_update: Timestamp (seconds since epoch) of the most recent
        file modification seen for this course, or a negative value when
        nothing has been downloaded yet.
    @type last_update: int or float

    @return: True if last_update is 30 days or older than now, suggesting
        the course has seen no updates in a month and is probably complete.
        False otherwise (including a negative last_update, since it is then
        too soon to declare the course complete).
    @rtype: bool
    """
    # A negative timestamp means "unknown"; never declare such a course done.
    if last_update < 0:
        return False

    # timedelta.total_seconds() exists since Python 2.7, so the local
    # total_seconds() backport helper is not needed here.
    max_delta = datetime.timedelta(days=30).total_seconds()
    return (time.time() - last_update) > max_delta
|
||||
|
||||
|
||||
def format_section(num, section, class_name, verbose_dirs):
    """Build a section directory name such as '01_intro'.

    When verbose_dirs is set, the upper-cased class name is prepended,
    e.g. 'CLASSNAME_01_intro'.
    """
    section_name = '%02d_%s' % (num, section)
    if not verbose_dirs:
        return section_name
    return '%s_%s' % (class_name.upper(), section_name)
|
||||
|
||||
|
||||
def format_resource(num, name, title, fmt):
    """Build a resource file name like '01_name_title.fmt'.

    The '_title' part is omitted when title is empty.
    """
    title_part = '_' + title if title else title
    return '%02d_%s%s.%s' % (num, name, title_part, fmt)
|
||||
|
||||
|
||||
def format_combine_number_resource(secnum, lecnum, lecname, title, fmt):
    """Build a file name like '05_04_lecname_title.fmt' with combined
    section/lecture numbering.

    The '_title' part is omitted when title is empty.
    """
    title_part = '_' + title if title else title
    return '%02d_%02d_%s%s.%s' % (secnum, lecnum, lecname, title_part, fmt)
|
||||
|
||||
|
||||
def find_resources_to_get(lecture, file_formats, resource_filter, ignored_formats=None):
    """
    Select formats to download.

    @param lecture: Mapping of format -> list of (url, title) pairs.
    @type lecture: dict

    @param file_formats: Formats to accept; the literal 'all' accepts any.
    @type file_formats: iterable of str (or str)

    @param resource_filter: Regex matched against resource titles; a
        resource with a non-empty title that does not match is skipped.
        May be None to disable title filtering.
    @type resource_filter: str or None

    @param ignored_formats: Formats to skip entirely, or None.
    @type ignored_formats: list or None

    @return: List of (format, url, title) tuples to download.
    @rtype: list
    """
    if ignored_formats is None:
        ignored_formats = []

    if ignored_formats:
        # Lazy %-style args avoid building the message when INFO is off.
        logging.info('The following file formats will be ignored: %s',
                     ','.join(ignored_formats))

    resources_to_get = []

    # dict.items() works on both Python 2 and 3; six.iteritems is not needed.
    for fmt, resources in lecture.items():
        # Format keys may look like 'txt.en'; compare on the extension part,
        # but remember the full key so callers see the original format.
        fmt0 = fmt
        if '.' in fmt:
            fmt = fmt.split('.')[1]

        if fmt in ignored_formats:
            continue

        if fmt not in file_formats and 'all' not in file_formats:
            logging.debug('Skipping b/c format %s not in %s', fmt, file_formats)
            continue

        for res in resources:
            url, title = res[0], res[1]
            if resource_filter and title and not re.search(resource_filter, title):
                logging.debug('Skipping b/c of rf: %s %s', resource_filter, title)
                continue
            resources_to_get.append((fmt0, url, title))

    return resources_to_get
|
||||
|
||||
|
||||
def create_m3u_playlist(section_dir):
    """
    Create M3U playlists with the *.mp4 files found under `section_dir`.

    For each walked directory that contains videos, a playlist named after
    the directory is written inside it, listing the video file names (no
    directory prefix) in sorted order.  Directories without *.mp4 files
    get no playlist.

    @param section_dir: Path where to scan for *.mp4 files.
    @type section_dir: str
    """
    for dirpath, _subdirs, _files in os.walk(section_dir):
        # Glob inside dirpath directly instead of chdir-ing into it, so the
        # process working directory is never mutated (the old implementation
        # chdir'd around, which is fragile and not thread/reentrancy-safe).
        videos = sorted(glob.glob(os.path.join(dirpath, '*.mp4')))
        if not videos:
            continue

        m3u_name = os.path.split(dirpath)[1] + '.m3u'
        with open(os.path.join(dirpath, m3u_name), 'w') as m3u:
            for video in videos:
                m3u.write(os.path.basename(video) + '\n')
|
||||
|
||||
|
||||
def handle_resource(downloader,
                    lecture_filename,
                    fmt,
                    url,
                    overwrite,
                    resume,
                    skip_download,
                    section_dir,
                    skipped_urls,
                    last_update):
    """
    Handle one resource: decide whether to download it, then do so.

    @param downloader: Resource downloader instance.
    @type downloader: downloaders.Downloader

    @param lecture_filename: Destination file name.
    @type lecture_filename: str

    @param fmt: Format of the resource (pdf, csv, etc)
    @type fmt: str

    @param url: URL of the resource (may carry in-memory page contents).
    @type url: str

    @param overwrite: Flag that indicates whether files should be overwritten.
    @type overwrite: bool

    @param resume: Flag that indicates whether download should be resumed.
    @type resume: bool

    @param skip_download: Flag that indicates whether download should be skipped.
    @type skip_download: bool

    @param section_dir: Path to current section directory.
    @type section_dir: str

    @param skipped_urls: List of skipped urls to update, or None to
        disable URL skipping.
    @type skipped_urls: None or list

    @param last_update: Latest mtime across files.
    @type last_update: timestamp

    @return: Updated latest mtime.
    @rtype: timestamp
    """
    # Nothing to do when the file is present and neither overwrite nor
    # resume was requested; just fold its mtime into the running maximum.
    if os.path.exists(lecture_filename) and not (overwrite or resume):
        logging.info('%s already downloaded', lecture_filename)
        return max(last_update, os.path.getmtime(lecture_filename))

    if skip_download:
        # Only touch the file and record "now" as the latest update.
        open(lecture_filename, 'w').close()
        return time.time()

    if url.startswith(IN_MEMORY_MARKER):
        # The "URL" actually carries the page contents inline.
        page_content = url[len(IN_MEMORY_MARKER):]
        logging.info('Saving page contents to: %s', lecture_filename)
        with codecs.open(lecture_filename, 'w', 'utf-8') as file_object:
            file_object.write(page_content)
        return last_update

    if skipped_urls is not None and skip_format_url(fmt, url):
        skipped_urls.append(url)
        return last_update

    logging.info('Downloading: %s', lecture_filename)
    downloader.download(url, lecture_filename, resume=resume)
    return last_update
|
||||
|
||||
|
||||
def get_lecture_filename(combined_section_lectures_nums,
                         section_dir,
                         secnum,
                         lecnum,
                         lecname,
                         title,
                         fmt):
    """
    Prepare a destination lecture filename.

    @param combined_section_lectures_nums: Flag that indicates whether
        section lectures should have combined numbering.
    @type combined_section_lectures_nums: bool

    @param section_dir: Path to current section directory.
    @type section_dir: str

    @param secnum: Section number.
    @type secnum: int

    @param lecnum: Lecture number.
    @type lecnum: int

    @param lecname: Lecture name.
    @type lecname: str

    @param title: Resource title.
    @type title: str

    @param fmt: Format of the resource (pdf, csv, etc)
    @type fmt: str

    @return: Lecture file name.
    @rtype: str
    """
    # FIXME: quick-and-dirty workaround for "Filename too long" errors:
    # truncate the format and title components to configured maximums.
    fmt = fmt[:FORMAT_MAX_LENGTH]
    title = title[:TITLE_MAX_LENGTH]

    if combined_section_lectures_nums:
        basename = format_combine_number_resource(
            secnum + 1, lecnum + 1, lecname, title, fmt)
    else:
        basename = format_resource(lecnum + 1, lecname, title, fmt)

    return os.path.join(section_dir, basename)
|
||||
|
||||
|
||||
def download_lectures(downloader,
                      class_name,
                      sections,
                      file_formats,
                      overwrite=False,
                      skip_download=False,
                      section_filter=None,
                      lecture_filter=None,
                      resource_filter=None,
                      path='',
                      verbose_dirs=False,
                      preview=False,
                      combined_section_lectures_nums=False,
                      hooks=None,
                      playlist=False,
                      unrestricted_filenames=False,
                      ignored_formats=None,
                      resume=False,
                      skipped_urls=None,
                      failed_urls=None,
                      video_resolution='540p'):
    """
    Download lecture resources described by sections.

    @param sections: Iterable of (section_name, lectures) pairs, where
        lectures is an iterable of (lecture_name, resources) pairs.
    @param skipped_urls: List updated in place with skipped URLs, or None
        to disable URL skipping.
    @param failed_urls: List updated in place with URLs that failed with
        an HTTP error; when None, the first HTTP error is re-raised.

    @return: True if the class appears completed, False otherwise.
    @rtype: bool
    """
    last_update = -1

    for secnum, (section, lectures) in enumerate(sections):
        if section_filter and not re.search(section_filter, section):
            logging.debug('Skipping b/c of sf: %s %s', section_filter, section)
            continue

        section_dir = os.path.join(
            path, class_name,
            format_section(secnum + 1, section, class_name, verbose_dirs))

        for lecnum, (lecname, lecture) in enumerate(lectures):
            if lecture_filter and not re.search(lecture_filter, lecname):
                logging.debug('Skipping b/c of lf: %s %s', lecture_filter,
                              lecname)
                continue

            if not os.path.exists(section_dir):
                mkdir_p(normalize_path(section_dir))

            resources_to_get = find_resources_to_get(lecture,
                                                     file_formats,
                                                     resource_filter,
                                                     ignored_formats)

            # write lecture resources
            for fmt, url, title in resources_to_get:
                lecture_filename = normalize_path(get_lecture_filename(
                    combined_section_lectures_nums,
                    section_dir, secnum, lecnum, lecname, title, fmt))

                try:
                    last_update = handle_resource(
                        downloader, lecture_filename, fmt, url,
                        overwrite, resume, skip_download,
                        section_dir, skipped_urls, last_update)
                except requests.exceptions.RequestException as e:
                    logging.error('The following error has occurred while '
                                  'downloading URL %s: %s', url, str(e))
                    if failed_urls is None:
                        logging.info('If you want to ignore HTTP errors, '
                                     'please use "--ignore-http-errors" option')
                        raise
                    failed_urls.append(url)

        # After fetching resources, create a playlist in M3U format with the
        # videos downloaded.
        if playlist:
            create_m3u_playlist(section_dir)

        if hooks:
            # BUG FIX: restore the working directory after running hooks.
            # Previously the process stayed chdir'd into section_dir, which
            # broke any later work done via relative paths (e.g. the next
            # section's section_dir when `path` is relative).
            original_dir = os.getcwd()
            try:
                for hook in hooks:
                    logging.info('Running hook %s for section %s.',
                                 hook, section_dir)
                    os.chdir(section_dir)
                    subprocess.call(hook)
            finally:
                os.chdir(original_dir)

    # if we haven't updated any files in 1 month, we're probably
    # done with this course
    rv = is_course_complete(last_update)
    if rv:
        logging.info('COURSE PROBABLY COMPLETE: %s', class_name)

    return rv
|
||||
|
||||
|
||||
def total_seconds(td):
    """
    Compute total seconds for a timedelta.

    Added for backward compatibility, pre 2.7.
    """
    whole_seconds = td.seconds + td.days * 24 * 3600
    in_microseconds = td.microseconds + whole_seconds * 10 ** 6
    return in_microseconds // 10 ** 6
|
||||
|
||||
|
||||
def download_old_style_class(args, class_name):
|
||||
"""
|
||||
Download all requested resources from the class given in class_name.
|
||||
|
|
@ -742,25 +406,9 @@ def download_on_demand_class(args, class_name):
|
|||
Returns True if the class appears completed.
|
||||
"""
|
||||
|
||||
ignored_formats = []
|
||||
if args.ignore_formats:
|
||||
ignored_formats = args.ignore_formats.split(",")
|
||||
|
||||
session = get_session()
|
||||
extractor = CourseraExtractor(session, args.username, args.password)
|
||||
|
||||
# login(session, args.username, args.password)
|
||||
|
||||
# get the syllabus listing
|
||||
# page = get_on_demand_syllabus(session, class_name)
|
||||
|
||||
# parse it
|
||||
# modules = parse_on_demand_syllabus(session, page,
|
||||
# args.reverse,
|
||||
# args.unrestricted_filenames,
|
||||
# args.subtitle_language,
|
||||
# args.video_resolution)
|
||||
|
||||
modules = extractor.get_modules(class_name,
|
||||
args.reverse,
|
||||
args.unrestricted_filenames,
|
||||
|
|
@ -775,46 +423,29 @@ def download_on_demand_class(args, class_name):
|
|||
|
||||
# obtain the resources
|
||||
|
||||
skipped_urls = []
|
||||
failed_urls = []
|
||||
ignored_formats = []
|
||||
if args.ignore_formats:
|
||||
ignored_formats = args.ignore_formats.split(",")
|
||||
|
||||
completed = True
|
||||
for idx, module in enumerate(modules):
|
||||
module_name = '%02d_%s' % (idx + 1, module[0])
|
||||
sections = module[1]
|
||||
course_downloader = CourseraDownloader(
|
||||
downloader,
|
||||
commandline_args=args,
|
||||
class_name=class_name,
|
||||
path=args.path,
|
||||
ignored_formats=ignored_formats,
|
||||
disable_url_skipping=args.disable_url_skipping
|
||||
)
|
||||
|
||||
result = download_lectures(
|
||||
downloader,
|
||||
module_name,
|
||||
sections,
|
||||
args.file_formats,
|
||||
args.overwrite,
|
||||
args.skip_download,
|
||||
args.section_filter,
|
||||
args.lecture_filter,
|
||||
args.resource_filter,
|
||||
os.path.join(args.path, class_name),
|
||||
args.verbose_dirs,
|
||||
args.preview,
|
||||
args.combined_section_lectures_nums,
|
||||
args.hooks,
|
||||
args.playlist,
|
||||
args.unrestricted_filenames,
|
||||
ignored_formats,
|
||||
args.resume,
|
||||
None if args.disable_url_skipping else skipped_urls,
|
||||
failed_urls
|
||||
)
|
||||
completed = completed and result
|
||||
completed = course_downloader.download_modules(modules)
|
||||
|
||||
# Print skipped URLs if any
|
||||
if skipped_urls:
|
||||
print_skipped_urls(skipped_urls)
|
||||
if course_downloader.skipped_urls:
|
||||
print_skipped_urls(course_downloader.skipped_urls)
|
||||
|
||||
# Print failed URLs if any
|
||||
# FIXME: should we set non-zero exit code if we have failed URLs?
|
||||
if failed_urls:
|
||||
print_failed_urls(failed_urls)
|
||||
if course_downloader.failed_urls:
|
||||
print_failed_urls(course_downloader.failed_urls)
|
||||
|
||||
return completed
|
||||
|
||||
|
|
|
|||
|
|
@ -12,13 +12,17 @@ from __future__ import print_function
|
|||
import logging
|
||||
import math
|
||||
import os
|
||||
import requests
|
||||
import subprocess
|
||||
import sys
|
||||
import time
|
||||
|
||||
import requests
|
||||
|
||||
from six import iteritems
|
||||
|
||||
#
|
||||
# Below are file downloaders, they are wrappers for external downloaders.
|
||||
#
|
||||
|
||||
class Downloader(object):
|
||||
"""
|
||||
|
|
|
|||
|
|
@ -65,7 +65,7 @@ class CourseraExtractor(PlatformExtractor):
|
|||
course_name = dom['slug']
|
||||
|
||||
logging.info('Parsing syllabus of on-demand course. '
|
||||
'This may take some time, please be patient ...')
|
||||
'This may take some time, please be patient ...')
|
||||
modules = []
|
||||
json_modules = dom['courseMaterial']['elements']
|
||||
course = CourseraOnDemand(session=self._session, course_id=dom['id'],
|
||||
|
|
|
|||
|
|
@ -3,7 +3,9 @@ This module contains filtering functions.
|
|||
"""
|
||||
|
||||
import re
|
||||
import logging
|
||||
|
||||
from six import iteritems
|
||||
from six.moves.urllib_parse import urlparse
|
||||
|
||||
|
||||
|
|
@ -77,3 +79,38 @@ def skip_format_url(format_, url):
|
|||
|
||||
# Do not skip
|
||||
return False
|
||||
|
||||
|
||||
def find_resources_to_get(lecture, file_formats, resource_filter, ignored_formats=None):
    """
    Select formats to download.

    @param lecture: Mapping of format -> list of (url, title) pairs.
    @type lecture: dict

    @param file_formats: Formats to accept; the literal 'all' accepts any.
    @type file_formats: iterable of str (or str)

    @param resource_filter: Regex matched against resource titles; a
        resource with a non-empty title that does not match is skipped.
        May be None to disable title filtering.
    @type resource_filter: str or None

    @param ignored_formats: Formats to skip entirely, or None.
    @type ignored_formats: list or None

    @return: List of (format, url, title) tuples to download.
    @rtype: list
    """
    if ignored_formats is None:
        ignored_formats = []

    if ignored_formats:
        # Lazy %-style args avoid building the message when INFO is off.
        logging.info('The following file formats will be ignored: %s',
                     ','.join(ignored_formats))

    resources_to_get = []

    # dict.items() works on both Python 2 and 3; six.iteritems is not needed.
    for fmt, resources in lecture.items():
        # Format keys may look like 'txt.en'; compare on the extension part,
        # but remember the full key so callers see the original format.
        fmt0 = fmt
        if '.' in fmt:
            fmt = fmt.split('.')[1]

        if fmt in ignored_formats:
            continue

        if fmt not in file_formats and 'all' not in file_formats:
            logging.debug('Skipping b/c format %s not in %s', fmt, file_formats)
            continue

        for res in resources:
            url, title = res[0], res[1]
            if resource_filter and title and not re.search(resource_filter, title):
                logging.debug('Skipping b/c of rf: %s %s', resource_filter, title)
                continue
            resources_to_get.append((fmt0, url, title))

    return resources_to_get
|
||||
|
|
|
|||
76
coursera/formatting.py
Normal file
76
coursera/formatting.py
Normal file
|
|
@ -0,0 +1,76 @@
|
|||
import os
|
||||
|
||||
from .define import FORMAT_MAX_LENGTH, TITLE_MAX_LENGTH
|
||||
|
||||
|
||||
def format_section(num, section, class_name, verbose_dirs):
    """Build a section directory name such as '01_intro'.

    When verbose_dirs is set, the upper-cased class name is prepended,
    e.g. 'CLASSNAME_01_intro'.
    """
    section_name = '%02d_%s' % (num, section)
    if not verbose_dirs:
        return section_name
    return '%s_%s' % (class_name.upper(), section_name)
|
||||
|
||||
|
||||
def format_resource(num, name, title, fmt):
    """Build a resource file name like '01_name_title.fmt'.

    The '_title' part is omitted when title is empty.
    """
    title_part = '_' + title if title else title
    return '%02d_%s%s.%s' % (num, name, title_part, fmt)
|
||||
|
||||
|
||||
def format_combine_number_resource(secnum, lecnum, lecname, title, fmt):
    """Build a file name like '05_04_lecname_title.fmt' with combined
    section/lecture numbering.

    The '_title' part is omitted when title is empty.
    """
    title_part = '_' + title if title else title
    return '%02d_%02d_%s%s.%s' % (secnum, lecnum, lecname, title_part, fmt)
|
||||
|
||||
|
||||
def get_lecture_filename(combined_section_lectures_nums,
                         section_dir,
                         secnum,
                         lecnum,
                         lecname,
                         title,
                         fmt):
    """
    Prepare a destination lecture filename.

    @param combined_section_lectures_nums: Flag that indicates whether
        section lectures should have combined numbering.
    @type combined_section_lectures_nums: bool

    @param section_dir: Path to current section directory.
    @type section_dir: str

    @param secnum: Section number.
    @type secnum: int

    @param lecnum: Lecture number.
    @type lecnum: int

    @param lecname: Lecture name.
    @type lecname: str

    @param title: Resource title.
    @type title: str

    @param fmt: Format of the resource (pdf, csv, etc)
    @type fmt: str

    @return: Lecture file name.
    @rtype: str
    """
    # FIXME: quick-and-dirty workaround for "Filename too long" errors:
    # truncate the format and title components to configured maximums.
    fmt = fmt[:FORMAT_MAX_LENGTH]
    title = title[:TITLE_MAX_LENGTH]

    if combined_section_lectures_nums:
        basename = format_combine_number_resource(
            secnum + 1, lecnum + 1, lecname, title, fmt)
    else:
        basename = format_resource(lecnum + 1, lecname, title, fmt)

    return os.path.join(section_dir, basename)
|
||||
25
coursera/playlist.py
Normal file
25
coursera/playlist.py
Normal file
|
|
@ -0,0 +1,25 @@
|
|||
import os
|
||||
|
||||
|
||||
def create_m3u_playlist(section_dir):
    """
    Create M3U playlists with the *.mp4 files found under `section_dir`.

    For each walked directory that contains videos, a playlist named after
    the directory is written inside it, listing the video file names (no
    directory prefix) in sorted order.  Directories without *.mp4 files
    get no playlist.

    @param section_dir: Path where to scan for *.mp4 files.
    @type section_dir: str
    """
    # BUG FIX: this module only imports `os`; `glob` was referenced without
    # being imported, raising NameError at runtime.  Import it locally until
    # the module's import block is updated.
    import glob

    for dirpath, _subdirs, _files in os.walk(section_dir):
        # Glob inside dirpath directly instead of chdir-ing into it, so the
        # process working directory is never mutated.
        videos = sorted(glob.glob(os.path.join(dirpath, '*.mp4')))
        if not videos:
            continue

        m3u_name = os.path.split(dirpath)[1] + '.m3u'
        with open(os.path.join(dirpath, m3u_name), 'w') as m3u:
            for video in videos:
                m3u.write(os.path.basename(video) + '\n')
|
||||
|
||||
|
|
@ -6,6 +6,7 @@ Test the downloaders.
|
|||
|
||||
from coursera import downloaders
|
||||
from coursera import coursera_dl
|
||||
from coursera.filter import find_resources_to_get
|
||||
|
||||
import pytest
|
||||
|
||||
|
|
@ -21,7 +22,7 @@ def sample_bag():
|
|||
|
||||
|
||||
def test_collect_all_resources(sample_bag):
|
||||
res = coursera_dl.find_resources_to_get(sample_bag, 'all', None)
|
||||
res = find_resources_to_get(sample_bag, 'all', None)
|
||||
|
||||
assert [('mp4', 'h://url1/lc1.mp4', 'video'),
|
||||
('pdf', 'h://url2/lc2.pdf', 'slides'),
|
||||
|
|
@ -29,13 +30,13 @@ def test_collect_all_resources(sample_bag):
|
|||
|
||||
|
||||
def test_collect_only_pdfs(sample_bag):
|
||||
res = coursera_dl.find_resources_to_get(sample_bag, 'pdf', None)
|
||||
res = find_resources_to_get(sample_bag, 'pdf', None)
|
||||
|
||||
assert [('pdf', 'h://url2/lc2.pdf', 'slides')] == sorted(res)
|
||||
|
||||
|
||||
def test_collect_with_filtering(sample_bag):
|
||||
res = coursera_dl.find_resources_to_get(sample_bag, 'all', 'de')
|
||||
res = find_resources_to_get(sample_bag, 'all', 'de')
|
||||
res = sorted(res)
|
||||
|
||||
assert [('mp4', 'h://url1/lc1.mp4', 'video'),
|
||||
|
|
|
|||
|
|
@ -19,6 +19,9 @@ from coursera import coursera_dl
|
|||
from coursera import api
|
||||
|
||||
from coursera.test.utils import slurp_fixture
|
||||
from coursera.formatting import (format_section, format_resource,
|
||||
format_combine_number_resource)
|
||||
from coursera.utils import total_seconds, is_course_complete
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
|
|
@ -98,32 +101,32 @@ def test_fix_url_removes_spaces():
|
|||
|
||||
|
||||
def test_format_combine_resource_works_correctly():
|
||||
rv = coursera_dl.format_combine_number_resource(5, 4, "Moving_the_furniture", 'The_Basics', "mp4")
|
||||
rv = format_combine_number_resource(5, 4, "Moving_the_furniture", 'The_Basics', "mp4")
|
||||
assert '05_04_Moving_the_furniture_The_Basics.mp4' == rv
|
||||
|
||||
|
||||
def test_format_combine_resource_works_correctly_without_title():
|
||||
rv = coursera_dl.format_combine_number_resource(5, 1, "Introduction", '', "mp4")
|
||||
rv = format_combine_number_resource(5, 1, "Introduction", '', "mp4")
|
||||
assert '05_01_Introduction.mp4' == rv
|
||||
|
||||
|
||||
def test_format_resource_works_correctly():
|
||||
rv = coursera_dl.format_resource(2, "Washing", "Dishes", "mp9")
|
||||
rv = format_resource(2, "Washing", "Dishes", "mp9")
|
||||
assert '02_Washing_Dishes.mp9' == rv
|
||||
|
||||
|
||||
def test_format_resource_works_correctly_without_title():
|
||||
rv = coursera_dl.format_resource(1, "Introduction", '', "mp2")
|
||||
rv = format_resource(1, "Introduction", '', "mp2")
|
||||
assert '01_Introduction.mp2' == rv
|
||||
|
||||
|
||||
def test_format_section_works_correctly():
|
||||
rv = coursera_dl.format_section(9, 'bob', 'WEAVING', False)
|
||||
rv = format_section(9, 'bob', 'WEAVING', False)
|
||||
assert '09_bob' == rv
|
||||
|
||||
|
||||
def test_format_section_works_correctly_with_verbose():
|
||||
rv = coursera_dl.format_section(9, 'bill', 'WEAVING', True)
|
||||
rv = format_section(9, 'bill', 'WEAVING', True)
|
||||
assert 'WEAVING_09_bill' == rv
|
||||
|
||||
|
||||
|
|
@ -146,25 +149,25 @@ def test_decode_input():
|
|||
|
||||
|
||||
def test_total_seconds():
|
||||
ts = coursera_dl.total_seconds(datetime.timedelta(days=30))
|
||||
ts = total_seconds(datetime.timedelta(days=30))
|
||||
assert ts == 2592000
|
||||
|
||||
|
||||
def test_is_course_complete_should_give_false_if_there_was_recent_update():
|
||||
|
||||
delta = coursera_dl.total_seconds(datetime.timedelta(days=29))
|
||||
delta = total_seconds(datetime.timedelta(days=29))
|
||||
tm = time() - delta
|
||||
|
||||
rv = coursera_dl.is_course_complete(tm)
|
||||
rv = is_course_complete(tm)
|
||||
assert rv is False
|
||||
|
||||
|
||||
def test_is_course_complete_should_give_true_if_there_was_no_recent_update():
|
||||
|
||||
delta = coursera_dl.total_seconds(datetime.timedelta(days=31))
|
||||
delta = total_seconds(datetime.timedelta(days=31))
|
||||
tm = time() - delta
|
||||
|
||||
rv = coursera_dl.is_course_complete(tm)
|
||||
rv = is_course_complete(tm)
|
||||
assert rv is True
|
||||
|
||||
|
||||
|
|
|
|||
|
|
@ -4,24 +4,22 @@
|
|||
This module provides utility functions that are used within the script.
|
||||
"""
|
||||
|
||||
import errno
|
||||
import os
|
||||
import random
|
||||
import re
|
||||
import string
|
||||
import sys
|
||||
import time
|
||||
import errno
|
||||
import random
|
||||
import string
|
||||
import logging
|
||||
import datetime
|
||||
|
||||
|
||||
import six
|
||||
from bs4 import BeautifulSoup as BeautifulSoup_
|
||||
|
||||
# Force use of bs4 with html5lib
|
||||
BeautifulSoup = lambda page: BeautifulSoup_(page, 'html5lib')
|
||||
|
||||
from .define import COURSERA_URL, WINDOWS_UNC_PREFIX
|
||||
|
||||
from six.moves import html_parser
|
||||
import six
|
||||
from six import iteritems
|
||||
from six.moves import html_parser
|
||||
from six.moves.urllib.parse import ParseResult
|
||||
from six.moves.urllib_parse import unquote_plus
|
||||
|
||||
|
|
@ -39,6 +37,11 @@ else:
|
|||
from string import letters as string_ascii_letters
|
||||
from string import digits as string_digits
|
||||
|
||||
from .define import COURSERA_URL, WINDOWS_UNC_PREFIX
|
||||
|
||||
# Force use of bs4 with html5lib
|
||||
BeautifulSoup = lambda page: BeautifulSoup_(page, 'html5lib')
|
||||
|
||||
|
||||
if six.PY2:
|
||||
def decode_input(x):
|
||||
|
|
@ -184,6 +187,36 @@ def fix_url(url):
|
|||
return url
|
||||
|
||||
|
||||
def is_course_complete(last_update):
    """
    Determine whether the course is likely to have been terminated.

    @param last_update: Timestamp (seconds since epoch) of the most recent
        file modification seen for this course, or a negative value when
        nothing has been downloaded yet.
    @type last_update: int or float

    @return: True if last_update is 30 days or older than now, suggesting
        the course has seen no updates in a month and is probably complete.
        False otherwise (including a negative last_update, since it is then
        too soon to declare the course complete).
    @rtype: bool
    """
    # A negative timestamp means "unknown"; never declare such a course done.
    if last_update < 0:
        return False

    # timedelta.total_seconds() exists since Python 2.7, so the local
    # total_seconds() backport helper is not needed here.
    max_delta = datetime.timedelta(days=30).total_seconds()
    return (time.time() - last_update) > max_delta
|
||||
|
||||
|
||||
def total_seconds(td):
    """
    Compute total seconds for a timedelta.

    Added for backward compatibility, pre 2.7.
    """
    whole_seconds = td.seconds + td.days * 24 * 3600
    in_microseconds = td.microseconds + whole_seconds * 10 ** 6
    return in_microseconds // 10 ** 6
|
||||
|
||||
|
||||
def make_coursera_absolute_url(url):
|
||||
"""
|
||||
If given url is relative adds coursera netloc,
|
||||
|
|
|
|||
213
coursera/workflow.py
Normal file
213
coursera/workflow.py
Normal file
|
|
@ -0,0 +1,213 @@
|
|||
import os
|
||||
import re
|
||||
import abc
|
||||
import time
|
||||
import codecs
|
||||
import logging
|
||||
import subprocess
|
||||
|
||||
import requests
|
||||
|
||||
from .formatting import format_section, get_lecture_filename
|
||||
from .playlist import create_m3u_playlist
|
||||
from .utils import is_course_complete, mkdir_p, normalize_path
|
||||
from .filter import find_resources_to_get, skip_format_url
|
||||
from .define import IN_MEMORY_MARKER
|
||||
|
||||
|
||||
def handle_resource(downloader,
                    lecture_filename,
                    fmt,
                    url,
                    overwrite,
                    resume,
                    skip_download,
                    skipped_urls,
                    last_update):
    """
    Handle one resource: decide whether to download it, then do so.

    @param downloader: Resource downloader instance.
    @type downloader: downloaders.Downloader

    @param lecture_filename: Destination file name.
    @type lecture_filename: str

    @param fmt: Format of the resource (pdf, csv, etc)
    @type fmt: str

    @param url: URL of the resource (may carry in-memory page contents).
    @type url: str

    @param overwrite: Flag that indicates whether files should be overwritten.
    @type overwrite: bool

    @param resume: Flag that indicates whether download should be resumed.
    @type resume: bool

    @param skip_download: Flag that indicates whether download should be skipped.
    @type skip_download: bool

    @param skipped_urls: List of skipped urls to update, or None to
        disable URL skipping.
    @type skipped_urls: None or list

    @param last_update: Latest mtime across files.
    @type last_update: timestamp

    @return: Updated latest mtime.
    @rtype: timestamp
    """
    # Nothing to do when the file is present and neither overwrite nor
    # resume was requested; just fold its mtime into the running maximum.
    if os.path.exists(lecture_filename) and not (overwrite or resume):
        logging.info('%s already downloaded', lecture_filename)
        return max(last_update, os.path.getmtime(lecture_filename))

    if skip_download:
        # Only touch the file and record "now" as the latest update.
        open(lecture_filename, 'w').close()
        return time.time()

    if url.startswith(IN_MEMORY_MARKER):
        # The "URL" actually carries the page contents inline.
        page_content = url[len(IN_MEMORY_MARKER):]
        logging.info('Saving page contents to: %s', lecture_filename)
        with codecs.open(lecture_filename, 'w', 'utf-8') as file_object:
            file_object.write(page_content)
        return last_update

    if skipped_urls is not None and skip_format_url(fmt, url):
        skipped_urls.append(url)
        return last_update

    logging.info('Downloading: %s', lecture_filename)
    downloader.download(url, lecture_filename, resume=resume)
    return last_update
|
||||
|
||||
|
||||
class CourseDownloader(object):
    """
    Abstract base class for course downloaders.

    Concrete subclasses must implement download_modules().
    """
    # Py2-style ABC declaration (the file still supports Python 2 via six).
    __metaclass__ = abc.ABCMeta

    def __init__(self):
        pass

    @abc.abstractmethod
    def download_modules(self, modules):
        """Download all of the given modules; implemented by subclasses."""
        pass
|
||||
|
||||
|
||||
class CourseraDownloader(CourseDownloader):
|
||||
def __init__(self,
|
||||
downloader,
|
||||
commandline_args,
|
||||
class_name,
|
||||
path='',
|
||||
ignored_formats=None,
|
||||
disable_url_skipping=False):
|
||||
super(CourseraDownloader, self).__init__()
|
||||
|
||||
self._downloader = downloader
|
||||
self._args = commandline_args
|
||||
self._class_name = class_name
|
||||
self._path = path
|
||||
self._ignored_formats = ignored_formats
|
||||
self._disable_url_skipping = disable_url_skipping
|
||||
|
||||
self.skipped_urls = [] if disable_url_skipping else None
|
||||
self.failed_urls = []
|
||||
|
||||
self._last_update = -1
|
||||
|
||||
def download_modules(self, modules):
|
||||
completed = True
|
||||
for idx, module in enumerate(modules):
|
||||
module_name = '%02d_%s' % (idx + 1, module[0])
|
||||
sections = module[1]
|
||||
result = self._download_sections(module_name, sections)
|
||||
completed = completed and result
|
||||
return completed
|
||||
|
||||
def _download_sections(self, module_name, sections):
|
||||
"""
|
||||
Download lecture resources described by sections.
|
||||
|
||||
Returns True if the class appears completed, False otherwise.
|
||||
"""
|
||||
self._last_update = -1
|
||||
|
||||
section_filter = self._args.section_filter
|
||||
verbose_dirs = self._args.verbose_dirs
|
||||
hooks = self._args.hooks
|
||||
playlist = self._args.playlist
|
||||
|
||||
for (secnum, (section, lectures)) in enumerate(sections):
|
||||
if section_filter and not re.search(section_filter, section):
|
||||
logging.debug('Skipping b/c of sf: %s %s', section_filter,
|
||||
section)
|
||||
continue
|
||||
|
||||
section_dir = os.path.join(
|
||||
self._path, self._class_name, module_name,
|
||||
format_section(secnum + 1, section,
|
||||
self._class_name, verbose_dirs))
|
||||
|
||||
self._download_lectures(lectures, secnum, section_dir)
|
||||
|
||||
# After fetching resources, create a playlist in M3U format with the
|
||||
# videos downloaded.
|
||||
if playlist:
|
||||
create_m3u_playlist(section_dir)
|
||||
|
||||
if hooks:
|
||||
original_dir = os.getcwd()
|
||||
for hook in hooks:
|
||||
logging.info('Running hook %s for section %s.', hook, section_dir)
|
||||
os.chdir(section_dir)
|
||||
subprocess.call(hook)
|
||||
os.chdir(original_dir)
|
||||
|
||||
# if we haven't updated any files in 1 month, we're probably
|
||||
# done with this course
|
||||
is_complete = is_course_complete(self._last_update)
|
||||
if is_complete:
|
||||
logging.info('COURSE PROBABLY COMPLETE: ' + self._class_name)
|
||||
|
||||
return is_complete
|
||||
|
||||
def _download_lectures(self, lectures, secnum, section_dir):
|
||||
lecture_filter = self._args.lecture_filter
|
||||
file_formats = self._args.file_formats
|
||||
resource_filter = self._args.resource_filter
|
||||
combined_section_lectures_nums = self._args.combined_section_lectures_nums
|
||||
overwrite = self._args.overwrite
|
||||
resume = self._args.resume
|
||||
skip_download = self._args.skip_download
|
||||
|
||||
for (lecnum, (lecname, lecture)) in enumerate(lectures):
|
||||
if lecture_filter and not re.search(lecture_filter,
|
||||
lecname):
|
||||
logging.debug('Skipping b/c of lf: %s %s', lecture_filter,
|
||||
lecname)
|
||||
continue
|
||||
|
||||
if not os.path.exists(section_dir):
|
||||
mkdir_p(normalize_path(section_dir))
|
||||
|
||||
resources_to_get = find_resources_to_get(lecture,
|
||||
file_formats,
|
||||
resource_filter,
|
||||
self._ignored_formats)
|
||||
|
||||
# write lecture resources
|
||||
for fmt, url, title in resources_to_get:
|
||||
lecture_filename = get_lecture_filename(
|
||||
combined_section_lectures_nums,
|
||||
section_dir, secnum, lecnum, lecname, title, fmt)
|
||||
|
||||
lecture_filename = normalize_path(lecture_filename)
|
||||
|
||||
try:
|
||||
self._last_update = handle_resource(
|
||||
self._downloader, lecture_filename, fmt, url,
|
||||
overwrite, resume, skip_download,
|
||||
self.skipped_urls, self._last_update)
|
||||
except requests.exceptions.RequestException as e:
|
||||
logging.error('The following error has occurred while '
|
||||
'downloading URL %s: %s', url, str(e))
|
||||
self.failed_urls.append(url)
|
||||
Loading…
Add table
Add a link
Reference in a new issue