Move things around, decompose into modules (WIP)

Yuri Bochkarev 2016-07-06 13:37:40 +03:00
parent afb6384e9c
commit f308e12e53
10 changed files with 438 additions and 415 deletions


@@ -42,16 +42,11 @@ For further documentation and examples, visit the project's home at:
"""
import datetime
import glob
import json
import logging
import os
import re
import shutil
import subprocess
import time
import codecs
from distutils.version import LooseVersion as V
@@ -60,23 +55,19 @@ from distutils.version import LooseVersion as V
# We may, perhaps, want to move these elsewhere.
import bs4
import six
from six import iteritems
import requests
from .cookies import (
AuthenticationFailed, ClassNotFound,
get_cookies_for_class, make_cookie_values, login, TLSAdapter)
from .define import (CLASS_URL, ABOUT_URL, PATH_CACHE,
OPENCOURSE_CONTENT_URL, IN_MEMORY_MARKER,
FORMAT_MAX_LENGTH, TITLE_MAX_LENGTH)
get_cookies_for_class, make_cookie_values, TLSAdapter)
from .define import (CLASS_URL, ABOUT_URL, PATH_CACHE)
from .downloaders import get_downloader
from .workflow import CourseraDownloader
from .utils import (clean_filename, get_anchor_format, mkdir_p, fix_url,
print_ssl_error_message, normalize_path,
print_ssl_error_message,
decode_input, BeautifulSoup, is_debug_run)
from .network import get_page, get_page_and_url
from .api import CourseraOnDemand, OnDemandCourseMaterialItems
from .filter import skip_format_url
from .commandline import parse_args
from .extractors import CourseraExtractor
@@ -323,333 +314,6 @@ def download_about(session, class_name, path='', overwrite=False,
about_file.write(json_data)
return element
def is_course_complete(last_update):
"""
Determine whether the course is likely to have been terminated.
We return True if the timestamp given by last_update is 30 days or older
than today's date. Otherwise, we return False, since it is probably too
soon to declare the course complete.
The intended use case for this is to detect whether a given course has not
seen any update in the last 30 days or more.
"""
rv = False
if last_update >= 0:
delta = time.time() - last_update
max_delta = total_seconds(datetime.timedelta(days=30))
if delta > max_delta:
rv = True
return rv
def format_section(num, section, class_name, verbose_dirs):
sec = '%02d_%s' % (num, section)
if verbose_dirs:
sec = class_name.upper() + '_' + sec
return sec
def format_resource(num, name, title, fmt):
if title:
title = '_' + title
return '%02d_%s%s.%s' % (num, name, title, fmt)
def format_combine_number_resource(secnum, lecnum, lecname, title, fmt):
if title:
title = '_' + title
return '%02d_%02d_%s%s.%s' % (secnum, lecnum, lecname, title, fmt)
def find_resources_to_get(lecture, file_formats, resource_filter, ignored_formats=None):
"""
Select formats to download.
"""
resources_to_get = []
if ignored_formats is None:
ignored_formats = []
if len(ignored_formats):
logging.info("The following file formats will be ignored: " + ",".join(ignored_formats))
for fmt, resources in iteritems(lecture):
fmt0 = fmt
if '.' in fmt:
fmt = fmt.split('.')[1]
if fmt in ignored_formats:
continue
if fmt in file_formats or 'all' in file_formats:
for r in resources:
if resource_filter and r[1] and not re.search(resource_filter, r[1]):
logging.debug('Skipping b/c of rf: %s %s',
resource_filter, r[1])
continue
resources_to_get.append((fmt0, r[0], r[1]))
else:
logging.debug(
'Skipping b/c format %s not in %s', fmt, file_formats)
return resources_to_get
def create_m3u_playlist(section_dir):
"""
Create M3U playlist with contents of `section_dir`/*.mp4. The playlist
will be created in that directory.
@param section_dir: Path where to scan for *.mp4 files.
@type section_dir: str
"""
path_to_return = os.getcwd()
for (_path, subdirs, files) in os.walk(section_dir):
os.chdir(_path)
globbed_videos = sorted(glob.glob("*.mp4"))
m3u_name = os.path.split(_path)[1] + ".m3u"
if len(globbed_videos):
with open(m3u_name, "w") as m3u:
for video in globbed_videos:
m3u.write(video + "\n")
os.chdir(path_to_return)
os.chdir(path_to_return)
def handle_resource(downloader,
lecture_filename,
fmt,
url,
overwrite,
resume,
skip_download,
section_dir,
skipped_urls,
last_update):
"""
Handle resource. This function decides whether the resource needs to be
downloaded and, if so, downloads it to lecture_filename.
@param downloader: Resource downloader instance.
@type downloader: downloaders.Downloader
@param lecture_filename: Target file name of the resource on disk.
@type lecture_filename: str
@param fmt: Format of the resource (pdf, csv, etc)
@type fmt: str
@param url: URL of the resource.
@type url: str
@param overwrite: Flag that indicates whether files should be overwritten.
@type overwrite: bool
@param resume: Flag that indicates whether download should be resumed.
@type resume: bool
@param skip_download: Flag that indicates whether download should be skipped.
@type skip_download: bool
@param section_dir: Path to current section directory.
@type section_dir: str
@param skipped_urls: List of skipped urls to update.
@type skipped_urls: None or list
@param last_update: Latest mtime across files.
@type last_update: timestamp
@return: Updated latest mtime.
@rtype: timestamp
"""
# Decide whether we need to download it
if overwrite or not os.path.exists(lecture_filename) or resume:
if not skip_download:
if url.startswith(IN_MEMORY_MARKER):
page_content = url[len(IN_MEMORY_MARKER):]
logging.info('Saving page contents to: %s', lecture_filename)
with codecs.open(lecture_filename, 'w', 'utf-8') as file_object:
file_object.write(page_content)
else:
if skipped_urls is not None and skip_format_url(fmt, url):
skipped_urls.append(url)
else:
logging.info('Downloading: %s', lecture_filename)
downloader.download(url, lecture_filename, resume=resume)
else:
open(lecture_filename, 'w').close() # touch
last_update = time.time()
else:
logging.info('%s already downloaded', lecture_filename)
# if this file hasn't been modified in a long time,
# record that time
last_update = max(last_update, os.path.getmtime(lecture_filename))
return last_update
def get_lecture_filename(combined_section_lectures_nums,
section_dir,
secnum,
lecnum,
lecname,
title,
fmt):
"""
Prepare a destination lecture filename.
@param combined_section_lectures_nums: Flag that indicates whether
section lectures should have combined numbering.
@type combined_section_lectures_nums: bool
@param section_dir: Path to current section directory.
@type section_dir: str
@param secnum: Section number.
@type secnum: int
@param lecnum: Lecture number.
@type lecnum: int
@param lecname: Lecture name.
@type lecname: str
@param title: Resource title.
@type title: str
@param fmt: Format of the resource (pdf, csv, etc)
@type fmt: str
@return: Lecture file name.
@rtype: str
"""
# FIXME: this is a quick and dirty solution to Filename too long
# problem. We need to think of a more general way to solve this
# issue.
fmt = fmt[:FORMAT_MAX_LENGTH]
title = title[:TITLE_MAX_LENGTH]
# Format lecture file name
if combined_section_lectures_nums:
lecture_filename = os.path.join(
section_dir,
format_combine_number_resource(
secnum + 1, lecnum + 1, lecname, title, fmt))
else:
lecture_filename = os.path.join(
section_dir, format_resource(lecnum + 1, lecname, title, fmt))
return lecture_filename
def download_lectures(downloader,
class_name,
sections,
file_formats,
overwrite=False,
skip_download=False,
section_filter=None,
lecture_filter=None,
resource_filter=None,
path='',
verbose_dirs=False,
preview=False,
combined_section_lectures_nums=False,
hooks=None,
playlist=False,
unrestricted_filenames=False,
ignored_formats=None,
resume=False,
skipped_urls=None,
failed_urls=None,
video_resolution='540p'):
"""
Download lecture resources described by sections.
Returns True if the class appears completed, False otherwise.
"""
last_update = -1
for (secnum, (section, lectures)) in enumerate(sections):
if section_filter and not re.search(section_filter, section):
logging.debug('Skipping b/c of sf: %s %s', section_filter,
section)
continue
section_dir = os.path.join(
path, class_name,
format_section(secnum + 1, section, class_name, verbose_dirs))
for (lecnum, (lecname, lecture)) in enumerate(lectures):
if lecture_filter and not re.search(lecture_filter,
lecname):
logging.debug('Skipping b/c of lf: %s %s', lecture_filter,
lecname)
continue
if not os.path.exists(section_dir):
mkdir_p(normalize_path(section_dir))
resources_to_get = find_resources_to_get(lecture,
file_formats,
resource_filter,
ignored_formats)
# write lecture resources
for fmt, url, title in resources_to_get:
lecture_filename = get_lecture_filename(
combined_section_lectures_nums,
section_dir, secnum, lecnum, lecname, title, fmt)
lecture_filename = normalize_path(lecture_filename)
try:
last_update = handle_resource(
downloader, lecture_filename, fmt, url,
overwrite, resume, skip_download,
section_dir, skipped_urls, last_update)
except requests.exceptions.RequestException as e:
logging.error('The following error has occurred while '
'downloading URL %s: %s', url, str(e))
if failed_urls is None:
logging.info('If you want to ignore HTTP errors, '
'please use "--ignore-http-errors" option')
raise
else:
failed_urls.append(url)
# After fetching resources, create a playlist in M3U format with the
# videos downloaded.
if playlist:
create_m3u_playlist(section_dir)
if hooks:
for hook in hooks:
logging.info('Running hook %s for section %s.', hook, section_dir)
os.chdir(section_dir)
subprocess.call(hook)
# if we haven't updated any files in 1 month, we're probably
# done with this course
rv = is_course_complete(last_update)
if rv:
logging.info('COURSE PROBABLY COMPLETE: ' + class_name)
return rv
def total_seconds(td):
"""
Compute total seconds for a timedelta.
Added for backward compatibility, pre 2.7.
"""
return (td.microseconds +
(td.seconds + td.days * 24 * 3600) * 10 ** 6) // 10 ** 6
def download_old_style_class(args, class_name):
"""
Download all requested resources from the class given in class_name.
@@ -742,25 +406,9 @@ def download_on_demand_class(args, class_name):
Returns True if the class appears completed.
"""
ignored_formats = []
if args.ignore_formats:
ignored_formats = args.ignore_formats.split(",")
session = get_session()
extractor = CourseraExtractor(session, args.username, args.password)
# login(session, args.username, args.password)
# get the syllabus listing
# page = get_on_demand_syllabus(session, class_name)
# parse it
# modules = parse_on_demand_syllabus(session, page,
# args.reverse,
# args.unrestricted_filenames,
# args.subtitle_language,
# args.video_resolution)
modules = extractor.get_modules(class_name,
args.reverse,
args.unrestricted_filenames,
@@ -775,46 +423,29 @@ def download_on_demand_class(args, class_name):
# obtain the resources
skipped_urls = []
failed_urls = []
ignored_formats = []
if args.ignore_formats:
ignored_formats = args.ignore_formats.split(",")
completed = True
for idx, module in enumerate(modules):
module_name = '%02d_%s' % (idx + 1, module[0])
sections = module[1]
course_downloader = CourseraDownloader(
downloader,
commandline_args=args,
class_name=class_name,
path=args.path,
ignored_formats=ignored_formats,
disable_url_skipping=args.disable_url_skipping
)
result = download_lectures(
downloader,
module_name,
sections,
args.file_formats,
args.overwrite,
args.skip_download,
args.section_filter,
args.lecture_filter,
args.resource_filter,
os.path.join(args.path, class_name),
args.verbose_dirs,
args.preview,
args.combined_section_lectures_nums,
args.hooks,
args.playlist,
args.unrestricted_filenames,
ignored_formats,
args.resume,
None if args.disable_url_skipping else skipped_urls,
failed_urls
)
completed = completed and result
completed = course_downloader.download_modules(modules)
# Print skipped URLs if any
if skipped_urls:
print_skipped_urls(skipped_urls)
if course_downloader.skipped_urls:
print_skipped_urls(course_downloader.skipped_urls)
# Print failed URLs if any
# FIXME: should we set non-zero exit code if we have failed URLs?
if failed_urls:
print_failed_urls(failed_urls)
if course_downloader.failed_urls:
print_failed_urls(course_downloader.failed_urls)
return completed


@@ -12,13 +12,17 @@ from __future__ import print_function
import logging
import math
import os
import requests
import subprocess
import sys
import time
import requests
from six import iteritems
#
# Below are file downloaders; they are wrappers for external downloaders.
#
class Downloader(object):
"""


@@ -65,7 +65,7 @@ class CourseraExtractor(PlatformExtractor):
course_name = dom['slug']
logging.info('Parsing syllabus of on-demand course. '
'This may take some time, please be patient ...')
'This may take some time, please be patient ...')
modules = []
json_modules = dom['courseMaterial']['elements']
course = CourseraOnDemand(session=self._session, course_id=dom['id'],


@@ -3,7 +3,9 @@ This module contains filtering functions.
"""
import re
import logging
from six import iteritems
from six.moves.urllib_parse import urlparse
@@ -77,3 +79,38 @@ def skip_format_url(format_, url):
# Do not skip
return False
def find_resources_to_get(lecture, file_formats, resource_filter, ignored_formats=None):
"""
Select formats to download.
"""
resources_to_get = []
if ignored_formats is None:
ignored_formats = []
if len(ignored_formats):
logging.info("The following file formats will be ignored: " + ",".join(ignored_formats))
for fmt, resources in iteritems(lecture):
fmt0 = fmt
if '.' in fmt:
fmt = fmt.split('.')[1]
if fmt in ignored_formats:
continue
if fmt in file_formats or 'all' in file_formats:
for r in resources:
if resource_filter and r[1] and not re.search(resource_filter, r[1]):
logging.debug('Skipping b/c of rf: %s %s',
resource_filter, r[1])
continue
resources_to_get.append((fmt0, r[0], r[1]))
else:
logging.debug(
'Skipping b/c format %s not in %s', fmt, file_formats)
return resources_to_get
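The lecture argument is a mapping from format name to a list of (url, title) pairs. The sample_bag fixture used by the tests further below is not shown in this diff, so the dict in this minimal sketch is a hypothetical stand-in shaped to match the expected results of those tests:

# Hypothetical lecture dict, mirroring the shape implied by the tests.
lecture = {
    'mp4': [('h://url1/lc1.mp4', 'video')],
    'pdf': [('h://url2/lc2.pdf', 'slides')],
}

# Keep only PDFs; no resource filter, nothing ignored.
find_resources_to_get(lecture, 'pdf', None)
# -> [('pdf', 'h://url2/lc2.pdf', 'slides')]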

coursera/formatting.py (new file, 76 lines added)

@@ -0,0 +1,76 @@
import os
from .define import FORMAT_MAX_LENGTH, TITLE_MAX_LENGTH
def format_section(num, section, class_name, verbose_dirs):
sec = '%02d_%s' % (num, section)
if verbose_dirs:
sec = class_name.upper() + '_' + sec
return sec
def format_resource(num, name, title, fmt):
if title:
title = '_' + title
return '%02d_%s%s.%s' % (num, name, title, fmt)
def format_combine_number_resource(secnum, lecnum, lecname, title, fmt):
if title:
title = '_' + title
return '%02d_%02d_%s%s.%s' % (secnum, lecnum, lecname, title, fmt)
def get_lecture_filename(combined_section_lectures_nums,
section_dir,
secnum,
lecnum,
lecname,
title,
fmt):
"""
Prepare a destination lecture filename.
@param combined_section_lectures_nums: Flag that indicates whether
section lectures should have combined numbering.
@type combined_section_lectures_nums: bool
@param section_dir: Path to current section directory.
@type section_dir: str
@param secnum: Section number.
@type secnum: int
@param lecnum: Lecture number.
@type lecnum: int
@param lecname: Lecture name.
@type lecname: str
@param title: Resource title.
@type title: str
@param fmt: Format of the resource (pdf, csv, etc)
@type fmt: str
@return: Lecture file name.
@rtype: str
"""
# FIXME: this is a quick and dirty solution to Filename too long
# problem. We need to think of a more general way to solve this
# issue.
fmt = fmt[:FORMAT_MAX_LENGTH]
title = title[:TITLE_MAX_LENGTH]
# Format lecture file name
if combined_section_lectures_nums:
lecture_filename = os.path.join(
section_dir,
format_combine_number_resource(
secnum + 1, lecnum + 1, lecname, title, fmt))
else:
lecture_filename = os.path.join(
section_dir, format_resource(lecnum + 1, lecname, title, fmt))
return lecture_filename
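These helpers are pure string builders; a few illustrative calls, with the expected outputs taken from test_coursera_dl.py further down in this diff:

# Illustrative values, matching the expectations in test_coursera_dl.py:
format_section(9, 'bob', 'WEAVING', False)       # '09_bob'
format_section(9, 'bill', 'WEAVING', True)       # 'WEAVING_09_bill'
format_resource(2, 'Washing', 'Dishes', 'mp9')   # '02_Washing_Dishes.mp9'
format_combine_number_resource(5, 4, 'Moving_the_furniture', 'The_Basics', 'mp4')
# -> '05_04_Moving_the_furniture_The_Basics.mp4'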

coursera/playlist.py (new file, 25 lines added)

@@ -0,0 +1,25 @@
import glob
import os
def create_m3u_playlist(section_dir):
"""
Create M3U playlist with contents of `section_dir`/*.mp4. The playlist
will be created in that directory.
@param section_dir: Path where to scan for *.mp4 files.
@type section_dir: str
"""
path_to_return = os.getcwd()
for (_path, subdirs, files) in os.walk(section_dir):
os.chdir(_path)
globbed_videos = sorted(glob.glob("*.mp4"))
m3u_name = os.path.split(_path)[1] + ".m3u"
if len(globbed_videos):
with open(m3u_name, "w") as m3u:
for video in globbed_videos:
m3u.write(video + "\n")
os.chdir(path_to_return)
os.chdir(path_to_return)
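A short usage sketch (the directory name below is hypothetical): after a section's videos are on disk, pointing create_m3u_playlist at the section directory writes one .m3u file per directory that contains .mp4 files, listing them in sorted order.

from coursera.playlist import create_m3u_playlist

# Walks 'ml-001/01_Introduction' and its subdirectories; each directory
# holding *.mp4 files gets a playlist named after the directory itself,
# e.g. '01_Introduction.m3u'.
create_m3u_playlist('ml-001/01_Introduction')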


@@ -6,6 +6,7 @@ Test the downloaders.
from coursera import downloaders
from coursera import coursera_dl
from coursera.filter import find_resources_to_get
import pytest
@@ -21,7 +22,7 @@ def sample_bag():
def test_collect_all_resources(sample_bag):
res = coursera_dl.find_resources_to_get(sample_bag, 'all', None)
res = find_resources_to_get(sample_bag, 'all', None)
assert [('mp4', 'h://url1/lc1.mp4', 'video'),
('pdf', 'h://url2/lc2.pdf', 'slides'),
@@ -29,13 +30,13 @@ def test_collect_all_resources(sample_bag):
def test_collect_only_pdfs(sample_bag):
res = coursera_dl.find_resources_to_get(sample_bag, 'pdf', None)
res = find_resources_to_get(sample_bag, 'pdf', None)
assert [('pdf', 'h://url2/lc2.pdf', 'slides')] == sorted(res)
def test_collect_with_filtering(sample_bag):
res = coursera_dl.find_resources_to_get(sample_bag, 'all', 'de')
res = find_resources_to_get(sample_bag, 'all', 'de')
res = sorted(res)
assert [('mp4', 'h://url1/lc1.mp4', 'video'),


@@ -19,6 +19,9 @@ from coursera import coursera_dl
from coursera import api
from coursera.test.utils import slurp_fixture
from coursera.formatting import (format_section, format_resource,
format_combine_number_resource)
from coursera.utils import total_seconds, is_course_complete
@pytest.mark.parametrize(
@@ -98,32 +101,32 @@ def test_fix_url_removes_spaces():
def test_format_combine_resource_works_correctly():
rv = coursera_dl.format_combine_number_resource(5, 4, "Moving_the_furniture", 'The_Basics', "mp4")
rv = format_combine_number_resource(5, 4, "Moving_the_furniture", 'The_Basics', "mp4")
assert '05_04_Moving_the_furniture_The_Basics.mp4' == rv
def test_format_combine_resource_works_correctly_without_title():
rv = coursera_dl.format_combine_number_resource(5, 1, "Introduction", '', "mp4")
rv = format_combine_number_resource(5, 1, "Introduction", '', "mp4")
assert '05_01_Introduction.mp4' == rv
def test_format_resource_works_correctly():
rv = coursera_dl.format_resource(2, "Washing", "Dishes", "mp9")
rv = format_resource(2, "Washing", "Dishes", "mp9")
assert '02_Washing_Dishes.mp9' == rv
def test_format_resource_works_correctly_without_title():
rv = coursera_dl.format_resource(1, "Introduction", '', "mp2")
rv = format_resource(1, "Introduction", '', "mp2")
assert '01_Introduction.mp2' == rv
def test_format_section_works_correctly():
rv = coursera_dl.format_section(9, 'bob', 'WEAVING', False)
rv = format_section(9, 'bob', 'WEAVING', False)
assert '09_bob' == rv
def test_format_section_works_correctly_with_verbose():
rv = coursera_dl.format_section(9, 'bill', 'WEAVING', True)
rv = format_section(9, 'bill', 'WEAVING', True)
assert 'WEAVING_09_bill' == rv
@@ -146,25 +149,25 @@ def test_decode_input():
def test_total_seconds():
ts = coursera_dl.total_seconds(datetime.timedelta(days=30))
ts = total_seconds(datetime.timedelta(days=30))
assert ts == 2592000
def test_is_course_complete_should_give_false_if_there_was_recent_update():
delta = coursera_dl.total_seconds(datetime.timedelta(days=29))
delta = total_seconds(datetime.timedelta(days=29))
tm = time() - delta
rv = coursera_dl.is_course_complete(tm)
rv = is_course_complete(tm)
assert rv is False
def test_is_course_complete_should_give_true_if_there_was_no_recent_update():
delta = coursera_dl.total_seconds(datetime.timedelta(days=31))
delta = total_seconds(datetime.timedelta(days=31))
tm = time() - delta
rv = coursera_dl.is_course_complete(tm)
rv = is_course_complete(tm)
assert rv is True


@@ -4,24 +4,22 @@
This module provides utility functions that are used within the script.
"""
import errno
import os
import random
import re
import string
import sys
import time
import errno
import random
import string
import logging
import datetime
import six
from bs4 import BeautifulSoup as BeautifulSoup_
# Force use of bs4 with html5lib
BeautifulSoup = lambda page: BeautifulSoup_(page, 'html5lib')
from .define import COURSERA_URL, WINDOWS_UNC_PREFIX
from six.moves import html_parser
import six
from six import iteritems
from six.moves import html_parser
from six.moves.urllib.parse import ParseResult
from six.moves.urllib_parse import unquote_plus
@@ -39,6 +37,11 @@ else:
from string import letters as string_ascii_letters
from string import digits as string_digits
from .define import COURSERA_URL, WINDOWS_UNC_PREFIX
# Force use of bs4 with html5lib
BeautifulSoup = lambda page: BeautifulSoup_(page, 'html5lib')
if six.PY2:
def decode_input(x):
@@ -184,6 +187,36 @@ def fix_url(url):
return url
def is_course_complete(last_update):
"""
Determine whether the course is likely to have been terminated.
We return True if the timestamp given by last_update is 30 days or older
than today's date. Otherwise, we return False, since it is probably too
soon to declare the course complete.
The intended use case for this is to detect whether a given course has not
seen any update in the last 30 days or more.
"""
rv = False
if last_update >= 0:
delta = time.time() - last_update
max_delta = total_seconds(datetime.timedelta(days=30))
if delta > max_delta:
rv = True
return rv
def total_seconds(td):
"""
Compute total seconds for a timedelta.
Added for backward compatibility, pre 2.7.
"""
return (td.microseconds +
(td.seconds + td.days * 24 * 3600) * 10 ** 6) // 10 ** 6
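For orientation, the unit tests below pin down how these two helpers combine; a quick sketch of the same arithmetic:

import datetime
import time

total_seconds(datetime.timedelta(days=30))    # 2592000 seconds in the 30-day window
# A course untouched for 31 days is considered complete, 29 days is not.
is_course_complete(time.time() - total_seconds(datetime.timedelta(days=29)))    # False
is_course_complete(time.time() - total_seconds(datetime.timedelta(days=31)))    # True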
def make_coursera_absolute_url(url):
"""
If given url is relative adds coursera netloc,

coursera/workflow.py (new file, 213 lines added)

@@ -0,0 +1,213 @@
import os
import re
import abc
import time
import codecs
import logging
import subprocess
import requests
import six
from .formatting import format_section, get_lecture_filename
from .playlist import create_m3u_playlist
from .utils import is_course_complete, mkdir_p, normalize_path
from .filter import find_resources_to_get, skip_format_url
from .define import IN_MEMORY_MARKER
def handle_resource(downloader,
lecture_filename,
fmt,
url,
overwrite,
resume,
skip_download,
skipped_urls,
last_update):
"""
Handle resource. This function decides whether the resource needs to be
downloaded and, if so, downloads it to lecture_filename.
@param downloader: Resource downloader instance.
@type downloader: downloaders.Downloader
@param lecture_filename: Target file name of the resource on disk.
@type lecture_filename: str
@param fmt: Format of the resource (pdf, csv, etc)
@type fmt: str
@param url: URL of the resource.
@type url: str
@param overwrite: Flag that indicates whether files should be overwritten.
@type overwrite: bool
@param resume: Flag that indicates whether download should be resumed.
@type resume: bool
@param skip_download: Flag that indicates whether download should be skipped.
@type skip_download: bool
@param skipped_urls: List of skipped urls to update.
@type skipped_urls: None or list
@param last_update: Latest mtime across files.
@type last_update: timestamp
@return: Updated latest mtime.
@rtype: timestamp
"""
# Decide whether we need to download it
if overwrite or not os.path.exists(lecture_filename) or resume:
if not skip_download:
if url.startswith(IN_MEMORY_MARKER):
page_content = url[len(IN_MEMORY_MARKER):]
logging.info('Saving page contents to: %s', lecture_filename)
with codecs.open(lecture_filename, 'w', 'utf-8') as file_object:
file_object.write(page_content)
else:
if skipped_urls is not None and skip_format_url(fmt, url):
skipped_urls.append(url)
else:
logging.info('Downloading: %s', lecture_filename)
downloader.download(url, lecture_filename, resume=resume)
else:
open(lecture_filename, 'w').close() # touch
last_update = time.time()
else:
logging.info('%s already downloaded', lecture_filename)
# if this file hasn't been modified in a long time,
# record that time
last_update = max(last_update, os.path.getmtime(lecture_filename))
return last_update
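A hedged sketch of the in-memory branch: when a URL starts with IN_MEMORY_MARKER, the rest of the string is the page body itself, so handle_resource writes it straight to the target file instead of invoking the downloader. The target file name here is hypothetical.

# Sketch: save in-memory page content; the downloader is not used on this path.
last_update = handle_resource(
    downloader=None,
    lecture_filename='01_about.html',    # hypothetical target name
    fmt='html',
    url=IN_MEMORY_MARKER + '<html>...</html>',
    overwrite=True,
    resume=False,
    skip_download=False,
    skipped_urls=None,
    last_update=-1)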
@six.add_metaclass(abc.ABCMeta)
class CourseDownloader(object):
def __init__(self):
pass
@abc.abstractmethod
def download_modules(self, modules):
pass
class CourseraDownloader(CourseDownloader):
def __init__(self,
downloader,
commandline_args,
class_name,
path='',
ignored_formats=None,
disable_url_skipping=False):
super(CourseraDownloader, self).__init__()
self._downloader = downloader
self._args = commandline_args
self._class_name = class_name
self._path = path
self._ignored_formats = ignored_formats
self._disable_url_skipping = disable_url_skipping
self.skipped_urls = None if disable_url_skipping else []
self.failed_urls = []
self._last_update = -1
def download_modules(self, modules):
completed = True
for idx, module in enumerate(modules):
module_name = '%02d_%s' % (idx + 1, module[0])
sections = module[1]
result = self._download_sections(module_name, sections)
completed = completed and result
return completed
def _download_sections(self, module_name, sections):
"""
Download lecture resources described by sections.
Returns True if the class appears completed, False otherwise.
"""
self._last_update = -1
section_filter = self._args.section_filter
verbose_dirs = self._args.verbose_dirs
hooks = self._args.hooks
playlist = self._args.playlist
for (secnum, (section, lectures)) in enumerate(sections):
if section_filter and not re.search(section_filter, section):
logging.debug('Skipping b/c of sf: %s %s', section_filter,
section)
continue
section_dir = os.path.join(
self._path, self._class_name, module_name,
format_section(secnum + 1, section,
self._class_name, verbose_dirs))
self._download_lectures(lectures, secnum, section_dir)
# After fetching resources, create a playlist in M3U format with the
# videos downloaded.
if playlist:
create_m3u_playlist(section_dir)
if hooks:
original_dir = os.getcwd()
for hook in hooks:
logging.info('Running hook %s for section %s.', hook, section_dir)
os.chdir(section_dir)
subprocess.call(hook)
os.chdir(original_dir)
# if we haven't updated any files in 1 month, we're probably
# done with this course
is_complete = is_course_complete(self._last_update)
if is_complete:
logging.info('COURSE PROBABLY COMPLETE: ' + self._class_name)
return is_complete
def _download_lectures(self, lectures, secnum, section_dir):
lecture_filter = self._args.lecture_filter
file_formats = self._args.file_formats
resource_filter = self._args.resource_filter
combined_section_lectures_nums = self._args.combined_section_lectures_nums
overwrite = self._args.overwrite
resume = self._args.resume
skip_download = self._args.skip_download
for (lecnum, (lecname, lecture)) in enumerate(lectures):
if lecture_filter and not re.search(lecture_filter,
lecname):
logging.debug('Skipping b/c of lf: %s %s', lecture_filter,
lecname)
continue
if not os.path.exists(section_dir):
mkdir_p(normalize_path(section_dir))
resources_to_get = find_resources_to_get(lecture,
file_formats,
resource_filter,
self._ignored_formats)
# write lecture resources
for fmt, url, title in resources_to_get:
lecture_filename = get_lecture_filename(
combined_section_lectures_nums,
section_dir, secnum, lecnum, lecname, title, fmt)
lecture_filename = normalize_path(lecture_filename)
try:
self._last_update = handle_resource(
self._downloader, lecture_filename, fmt, url,
overwrite, resume, skip_download,
self.skipped_urls, self._last_update)
except requests.exceptions.RequestException as e:
logging.error('The following error has occurred while '
'downloading URL %s: %s', url, str(e))
self.failed_urls.append(url)
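For context, the call site in download_on_demand_class (shown earlier in this diff) reduces to roughly the following once CourseraDownloader owns the per-module loop; args stands for the parsed command-line namespace:

course_downloader = CourseraDownloader(
    downloader,
    commandline_args=args,
    class_name=class_name,
    path=args.path,
    ignored_formats=ignored_formats,
    disable_url_skipping=args.disable_url_skipping)

completed = course_downloader.download_modules(modules)

# Skipped and failed URLs are exposed as attributes afterwards.
if course_downloader.skipped_urls:
    print_skipped_urls(course_downloader.skipped_urls)
if course_downloader.failed_urls:
    print_failed_urls(course_downloader.failed_urls)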