mirror of
https://github.com/coursera-dl/coursera-dl.git
synced 2026-01-23 02:35:37 +00:00
Moves things around, decompose into modules (WIP)
This commit is contained in:
parent
afb6384e9c
commit
f308e12e53
10 changed files with 438 additions and 415 deletions
|
|
@ -42,16 +42,11 @@ For further documentation and examples, visit the project's home at:
|
|||
"""
|
||||
|
||||
|
||||
import datetime
|
||||
import glob
|
||||
import json
|
||||
import logging
|
||||
import os
|
||||
import re
|
||||
import shutil
|
||||
import subprocess
|
||||
import time
|
||||
import codecs
|
||||
|
||||
from distutils.version import LooseVersion as V
|
||||
|
||||
|
|
@ -60,23 +55,19 @@ from distutils.version import LooseVersion as V
|
|||
# We may, perhaps, want to move these elsewhere.
|
||||
import bs4
|
||||
import six
|
||||
from six import iteritems
|
||||
import requests
|
||||
|
||||
from .cookies import (
|
||||
AuthenticationFailed, ClassNotFound,
|
||||
get_cookies_for_class, make_cookie_values, login, TLSAdapter)
|
||||
from .define import (CLASS_URL, ABOUT_URL, PATH_CACHE,
|
||||
OPENCOURSE_CONTENT_URL, IN_MEMORY_MARKER,
|
||||
FORMAT_MAX_LENGTH, TITLE_MAX_LENGTH)
|
||||
get_cookies_for_class, make_cookie_values, TLSAdapter)
|
||||
from .define import (CLASS_URL, ABOUT_URL, PATH_CACHE)
|
||||
from .downloaders import get_downloader
|
||||
from .workflow import CourseraDownloader
|
||||
from .utils import (clean_filename, get_anchor_format, mkdir_p, fix_url,
|
||||
print_ssl_error_message, normalize_path,
|
||||
print_ssl_error_message,
|
||||
decode_input, BeautifulSoup, is_debug_run)
|
||||
|
||||
from .network import get_page, get_page_and_url
|
||||
from .api import CourseraOnDemand, OnDemandCourseMaterialItems
|
||||
from .filter import skip_format_url
|
||||
from .commandline import parse_args
|
||||
from .extractors import CourseraExtractor
|
||||
|
||||
|
|
@ -323,333 +314,6 @@ def download_about(session, class_name, path='', overwrite=False,
|
|||
about_file.write(json_data)
|
||||
return element
|
||||
|
||||
|
||||
def is_course_complete(last_update):
    """
    Determine whether the course is likely to have been terminated.

    @param last_update: Timestamp (seconds since epoch) of the most recent
        file modification seen for this course, or a negative value when
        nothing has been downloaded yet.
    @type last_update: int or float

    @return: True if last_update is 30 days or older than now, suggesting
        the course has seen no updates in a month and is probably complete.
        False otherwise (including a negative last_update, since it is then
        too soon to declare the course complete).
    @rtype: bool
    """
    # A negative timestamp means "unknown"; never declare such a course done.
    if last_update < 0:
        return False

    # timedelta.total_seconds() exists since Python 2.7, so the local
    # total_seconds() backport helper is not needed here.
    max_delta = datetime.timedelta(days=30).total_seconds()
    return (time.time() - last_update) > max_delta
|
||||
|
||||
|
||||
def format_section(num, section, class_name, verbose_dirs):
    """Build a section directory name such as '01_intro'.

    When verbose_dirs is set, the upper-cased class name is prepended,
    e.g. 'CLASSNAME_01_intro'.
    """
    section_name = '%02d_%s' % (num, section)
    if not verbose_dirs:
        return section_name
    return '%s_%s' % (class_name.upper(), section_name)
|
||||
|
||||
|
||||
def format_resource(num, name, title, fmt):
    """Build a resource file name like '01_name_title.fmt'.

    The '_title' part is omitted when title is empty.
    """
    title_part = '_' + title if title else title
    return '%02d_%s%s.%s' % (num, name, title_part, fmt)
|
||||
|
||||
|
||||
def format_combine_number_resource(secnum, lecnum, lecname, title, fmt):
    """Build a file name like '05_04_lecname_title.fmt' with combined
    section/lecture numbering.

    The '_title' part is omitted when title is empty.
    """
    title_part = '_' + title if title else title
    return '%02d_%02d_%s%s.%s' % (secnum, lecnum, lecname, title_part, fmt)
|
||||
|
||||
|
||||
def find_resources_to_get(lecture, file_formats, resource_filter, ignored_formats=None):
    """
    Select formats to download.

    @param lecture: Mapping of format -> list of (url, title) pairs.
    @type lecture: dict

    @param file_formats: Formats to accept; the literal 'all' accepts any.
    @type file_formats: iterable of str (or str)

    @param resource_filter: Regex matched against resource titles; a
        resource with a non-empty title that does not match is skipped.
        May be None to disable title filtering.
    @type resource_filter: str or None

    @param ignored_formats: Formats to skip entirely, or None.
    @type ignored_formats: list or None

    @return: List of (format, url, title) tuples to download.
    @rtype: list
    """
    if ignored_formats is None:
        ignored_formats = []

    if ignored_formats:
        # Lazy %-style args avoid building the message when INFO is off.
        logging.info('The following file formats will be ignored: %s',
                     ','.join(ignored_formats))

    resources_to_get = []

    # dict.items() works on both Python 2 and 3; six.iteritems is not needed.
    for fmt, resources in lecture.items():
        # Format keys may look like 'txt.en'; compare on the extension part,
        # but remember the full key so callers see the original format.
        fmt0 = fmt
        if '.' in fmt:
            fmt = fmt.split('.')[1]

        if fmt in ignored_formats:
            continue

        if fmt not in file_formats and 'all' not in file_formats:
            logging.debug('Skipping b/c format %s not in %s', fmt, file_formats)
            continue

        for res in resources:
            url, title = res[0], res[1]
            if resource_filter and title and not re.search(resource_filter, title):
                logging.debug('Skipping b/c of rf: %s %s', resource_filter, title)
                continue
            resources_to_get.append((fmt0, url, title))

    return resources_to_get
|
||||
|
||||
|
||||
def create_m3u_playlist(section_dir):
    """
    Create M3U playlists with the *.mp4 files found under `section_dir`.

    For each walked directory that contains videos, a playlist named after
    the directory is written inside it, listing the video file names (no
    directory prefix) in sorted order.  Directories without *.mp4 files
    get no playlist.

    @param section_dir: Path where to scan for *.mp4 files.
    @type section_dir: str
    """
    for dirpath, _subdirs, _files in os.walk(section_dir):
        # Glob inside dirpath directly instead of chdir-ing into it, so the
        # process working directory is never mutated (the old implementation
        # chdir'd around, which is fragile and not thread/reentrancy-safe).
        videos = sorted(glob.glob(os.path.join(dirpath, '*.mp4')))
        if not videos:
            continue

        m3u_name = os.path.split(dirpath)[1] + '.m3u'
        with open(os.path.join(dirpath, m3u_name), 'w') as m3u:
            for video in videos:
                m3u.write(os.path.basename(video) + '\n')
|
||||
|
||||
|
||||
def handle_resource(downloader,
                    lecture_filename,
                    fmt,
                    url,
                    overwrite,
                    resume,
                    skip_download,
                    section_dir,
                    skipped_urls,
                    last_update):
    """
    Handle one resource: decide whether to download it, then do so.

    @param downloader: Resource downloader instance.
    @type downloader: downloaders.Downloader

    @param lecture_filename: Destination file name.
    @type lecture_filename: str

    @param fmt: Format of the resource (pdf, csv, etc)
    @type fmt: str

    @param url: URL of the resource (may carry in-memory page contents).
    @type url: str

    @param overwrite: Flag that indicates whether files should be overwritten.
    @type overwrite: bool

    @param resume: Flag that indicates whether download should be resumed.
    @type resume: bool

    @param skip_download: Flag that indicates whether download should be skipped.
    @type skip_download: bool

    @param section_dir: Path to current section directory.
    @type section_dir: str

    @param skipped_urls: List of skipped urls to update, or None to
        disable URL skipping.
    @type skipped_urls: None or list

    @param last_update: Latest mtime across files.
    @type last_update: timestamp

    @return: Updated latest mtime.
    @rtype: timestamp
    """
    # Nothing to do when the file is present and neither overwrite nor
    # resume was requested; just fold its mtime into the running maximum.
    if os.path.exists(lecture_filename) and not (overwrite or resume):
        logging.info('%s already downloaded', lecture_filename)
        return max(last_update, os.path.getmtime(lecture_filename))

    if skip_download:
        # Only touch the file and record "now" as the latest update.
        open(lecture_filename, 'w').close()
        return time.time()

    if url.startswith(IN_MEMORY_MARKER):
        # The "URL" actually carries the page contents inline.
        page_content = url[len(IN_MEMORY_MARKER):]
        logging.info('Saving page contents to: %s', lecture_filename)
        with codecs.open(lecture_filename, 'w', 'utf-8') as file_object:
            file_object.write(page_content)
        return last_update

    if skipped_urls is not None and skip_format_url(fmt, url):
        skipped_urls.append(url)
        return last_update

    logging.info('Downloading: %s', lecture_filename)
    downloader.download(url, lecture_filename, resume=resume)
    return last_update
|
||||
|
||||
|
||||
def get_lecture_filename(combined_section_lectures_nums,
                         section_dir,
                         secnum,
                         lecnum,
                         lecname,
                         title,
                         fmt):
    """
    Prepare a destination lecture filename.

    @param combined_section_lectures_nums: Flag that indicates whether
        section lectures should have combined numbering.
    @type combined_section_lectures_nums: bool

    @param section_dir: Path to current section directory.
    @type section_dir: str

    @param secnum: Section number.
    @type secnum: int

    @param lecnum: Lecture number.
    @type lecnum: int

    @param lecname: Lecture name.
    @type lecname: str

    @param title: Resource title.
    @type title: str

    @param fmt: Format of the resource (pdf, csv, etc)
    @type fmt: str

    @return: Lecture file name.
    @rtype: str
    """
    # FIXME: quick-and-dirty workaround for "Filename too long" errors:
    # truncate the format and title components to configured maximums.
    fmt = fmt[:FORMAT_MAX_LENGTH]
    title = title[:TITLE_MAX_LENGTH]

    if combined_section_lectures_nums:
        basename = format_combine_number_resource(
            secnum + 1, lecnum + 1, lecname, title, fmt)
    else:
        basename = format_resource(lecnum + 1, lecname, title, fmt)

    return os.path.join(section_dir, basename)
|
||||
|
||||
|
||||
def download_lectures(downloader,
                      class_name,
                      sections,
                      file_formats,
                      overwrite=False,
                      skip_download=False,
                      section_filter=None,
                      lecture_filter=None,
                      resource_filter=None,
                      path='',
                      verbose_dirs=False,
                      preview=False,
                      combined_section_lectures_nums=False,
                      hooks=None,
                      playlist=False,
                      unrestricted_filenames=False,
                      ignored_formats=None,
                      resume=False,
                      skipped_urls=None,
                      failed_urls=None,
                      video_resolution='540p'):
    """
    Download lecture resources described by sections.

    @param sections: Iterable of (section_name, lectures) pairs, where
        lectures is an iterable of (lecture_name, resources) pairs.
    @param skipped_urls: List updated in place with skipped URLs, or None
        to disable URL skipping.
    @param failed_urls: List updated in place with URLs that failed with
        an HTTP error; when None, the first HTTP error is re-raised.

    @return: True if the class appears completed, False otherwise.
    @rtype: bool
    """
    last_update = -1

    for secnum, (section, lectures) in enumerate(sections):
        if section_filter and not re.search(section_filter, section):
            logging.debug('Skipping b/c of sf: %s %s', section_filter, section)
            continue

        section_dir = os.path.join(
            path, class_name,
            format_section(secnum + 1, section, class_name, verbose_dirs))

        for lecnum, (lecname, lecture) in enumerate(lectures):
            if lecture_filter and not re.search(lecture_filter, lecname):
                logging.debug('Skipping b/c of lf: %s %s', lecture_filter,
                              lecname)
                continue

            if not os.path.exists(section_dir):
                mkdir_p(normalize_path(section_dir))

            resources_to_get = find_resources_to_get(lecture,
                                                     file_formats,
                                                     resource_filter,
                                                     ignored_formats)

            # write lecture resources
            for fmt, url, title in resources_to_get:
                lecture_filename = normalize_path(get_lecture_filename(
                    combined_section_lectures_nums,
                    section_dir, secnum, lecnum, lecname, title, fmt))

                try:
                    last_update = handle_resource(
                        downloader, lecture_filename, fmt, url,
                        overwrite, resume, skip_download,
                        section_dir, skipped_urls, last_update)
                except requests.exceptions.RequestException as e:
                    logging.error('The following error has occurred while '
                                  'downloading URL %s: %s', url, str(e))
                    if failed_urls is None:
                        logging.info('If you want to ignore HTTP errors, '
                                     'please use "--ignore-http-errors" option')
                        raise
                    failed_urls.append(url)

        # After fetching resources, create a playlist in M3U format with the
        # videos downloaded.
        if playlist:
            create_m3u_playlist(section_dir)

        if hooks:
            # BUG FIX: restore the working directory after running hooks.
            # Previously the process stayed chdir'd into section_dir, which
            # broke any later work done via relative paths (e.g. the next
            # section's section_dir when `path` is relative).
            original_dir = os.getcwd()
            try:
                for hook in hooks:
                    logging.info('Running hook %s for section %s.',
                                 hook, section_dir)
                    os.chdir(section_dir)
                    subprocess.call(hook)
            finally:
                os.chdir(original_dir)

    # if we haven't updated any files in 1 month, we're probably
    # done with this course
    rv = is_course_complete(last_update)
    if rv:
        logging.info('COURSE PROBABLY COMPLETE: %s', class_name)

    return rv
|
||||
|
||||
|
||||
def total_seconds(td):
    """
    Compute total seconds for a timedelta.

    Added for backward compatibility, pre 2.7.
    """
    whole_seconds = td.seconds + td.days * 24 * 3600
    in_microseconds = td.microseconds + whole_seconds * 10 ** 6
    return in_microseconds // 10 ** 6
|
||||
|
||||
|
||||
def download_old_style_class(args, class_name):
|
||||
"""
|
||||
Download all requested resources from the class given in class_name.
|
||||
|
|
@ -742,25 +406,9 @@ def download_on_demand_class(args, class_name):
|
|||
Returns True if the class appears completed.
|
||||
"""
|
||||
|
||||
ignored_formats = []
|
||||
if args.ignore_formats:
|
||||
ignored_formats = args.ignore_formats.split(",")
|
||||
|
||||
session = get_session()
|
||||
extractor = CourseraExtractor(session, args.username, args.password)
|
||||
|
||||
# login(session, args.username, args.password)
|
||||
|
||||
# get the syllabus listing
|
||||
# page = get_on_demand_syllabus(session, class_name)
|
||||
|
||||
# parse it
|
||||
# modules = parse_on_demand_syllabus(session, page,
|
||||
# args.reverse,
|
||||
# args.unrestricted_filenames,
|
||||
# args.subtitle_language,
|
||||
# args.video_resolution)
|
||||
|
||||
modules = extractor.get_modules(class_name,
|
||||
args.reverse,
|
||||
args.unrestricted_filenames,
|
||||
|
|
@ -775,46 +423,29 @@ def download_on_demand_class(args, class_name):
|
|||
|
||||
# obtain the resources
|
||||
|
||||
skipped_urls = []
|
||||
failed_urls = []
|
||||
ignored_formats = []
|
||||
if args.ignore_formats:
|
||||
ignored_formats = args.ignore_formats.split(",")
|
||||
|
||||
completed = True
|
||||
for idx, module in enumerate(modules):
|
||||
module_name = '%02d_%s' % (idx + 1, module[0])
|
||||
sections = module[1]
|
||||
course_downloader = CourseraDownloader(
|
||||
downloader,
|
||||
commandline_args=args,
|
||||
class_name=class_name,
|
||||
path=args.path,
|
||||
ignored_formats=ignored_formats,
|
||||
disable_url_skipping=args.disable_url_skipping
|
||||
)
|
||||
|
||||
result = download_lectures(
|
||||
downloader,
|
||||
module_name,
|
||||
sections,
|
||||
args.file_formats,
|
||||
args.overwrite,
|
||||
args.skip_download,
|
||||
args.section_filter,
|
||||
args.lecture_filter,
|
||||
args.resource_filter,
|
||||
os.path.join(args.path, class_name),
|
||||
args.verbose_dirs,
|
||||
args.preview,
|
||||
args.combined_section_lectures_nums,
|
||||
args.hooks,
|
||||
args.playlist,
|
||||
args.unrestricted_filenames,
|
||||
ignored_formats,
|
||||
args.resume,
|
||||
None if args.disable_url_skipping else skipped_urls,
|
||||
failed_urls
|
||||
)
|
||||
completed = completed and result
|
||||
completed = course_downloader.download_modules(modules)
|
||||
|
||||
# Print skipped URLs if any
|
||||
if skipped_urls:
|
||||
print_skipped_urls(skipped_urls)
|
||||
if course_downloader.skipped_urls:
|
||||
print_skipped_urls(course_downloader.skipped_urls)
|
||||
|
||||
# Print failed URLs if any
|
||||
# FIXME: should we set non-zero exit code if we have failed URLs?
|
||||
if failed_urls:
|
||||
print_failed_urls(failed_urls)
|
||||
if course_downloader.failed_urls:
|
||||
print_failed_urls(course_downloader.failed_urls)
|
||||
|
||||
return completed
|
||||
|
||||
|
|
|
|||
|
|
@ -12,13 +12,17 @@ from __future__ import print_function
|
|||
import logging
|
||||
import math
|
||||
import os
|
||||
import requests
|
||||
import subprocess
|
||||
import sys
|
||||
import time
|
||||
|
||||
import requests
|
||||
|
||||
from six import iteritems
|
||||
|
||||
#
|
||||
# Below are file downloaders, they are wrappers for external downloaders.
|
||||
#
|
||||
|
||||
class Downloader(object):
|
||||
"""
|
||||
|
|
|
|||
|
|
@ -65,7 +65,7 @@ class CourseraExtractor(PlatformExtractor):
|
|||
course_name = dom['slug']
|
||||
|
||||
logging.info('Parsing syllabus of on-demand course. '
|
||||
'This may take some time, please be patient ...')
|
||||
'This may take some time, please be patient ...')
|
||||
modules = []
|
||||
json_modules = dom['courseMaterial']['elements']
|
||||
course = CourseraOnDemand(session=self._session, course_id=dom['id'],
|
||||
|
|
|
|||
|
|
@ -3,7 +3,9 @@ This module contains filtering functions.
|
|||
"""
|
||||
|
||||
import re
|
||||
import logging
|
||||
|
||||
from six import iteritems
|
||||
from six.moves.urllib_parse import urlparse
|
||||
|
||||
|
||||
|
|
@ -77,3 +79,38 @@ def skip_format_url(format_, url):
|
|||
|
||||
# Do not skip
|
||||
return False
|
||||
|
||||
|
||||
def find_resources_to_get(lecture, file_formats, resource_filter, ignored_formats=None):
    """
    Select formats to download.

    @param lecture: Mapping of format -> list of (url, title) pairs.
    @type lecture: dict

    @param file_formats: Formats to accept; the literal 'all' accepts any.
    @type file_formats: iterable of str (or str)

    @param resource_filter: Regex matched against resource titles; a
        resource with a non-empty title that does not match is skipped.
        May be None to disable title filtering.
    @type resource_filter: str or None

    @param ignored_formats: Formats to skip entirely, or None.
    @type ignored_formats: list or None

    @return: List of (format, url, title) tuples to download.
    @rtype: list
    """
    if ignored_formats is None:
        ignored_formats = []

    if ignored_formats:
        # Lazy %-style args avoid building the message when INFO is off.
        logging.info('The following file formats will be ignored: %s',
                     ','.join(ignored_formats))

    resources_to_get = []

    # dict.items() works on both Python 2 and 3; six.iteritems is not needed.
    for fmt, resources in lecture.items():
        # Format keys may look like 'txt.en'; compare on the extension part,
        # but remember the full key so callers see the original format.
        fmt0 = fmt
        if '.' in fmt:
            fmt = fmt.split('.')[1]

        if fmt in ignored_formats:
            continue

        if fmt not in file_formats and 'all' not in file_formats:
            logging.debug('Skipping b/c format %s not in %s', fmt, file_formats)
            continue

        for res in resources:
            url, title = res[0], res[1]
            if resource_filter and title and not re.search(resource_filter, title):
                logging.debug('Skipping b/c of rf: %s %s', resource_filter, title)
                continue
            resources_to_get.append((fmt0, url, title))

    return resources_to_get
|
||||
|
|
|
|||
76
coursera/formatting.py
Normal file
76
coursera/formatting.py
Normal file
|
|
@ -0,0 +1,76 @@
|
|||
import os
|
||||
|
||||
from .define import FORMAT_MAX_LENGTH, TITLE_MAX_LENGTH
|
||||
|
||||
|
||||
def format_section(num, section, class_name, verbose_dirs):
    """Build a section directory name such as '01_intro'.

    When verbose_dirs is set, the upper-cased class name is prepended,
    e.g. 'CLASSNAME_01_intro'.
    """
    section_name = '%02d_%s' % (num, section)
    if not verbose_dirs:
        return section_name
    return '%s_%s' % (class_name.upper(), section_name)
|
||||
|
||||
|
||||
def format_resource(num, name, title, fmt):
    """Build a resource file name like '01_name_title.fmt'.

    The '_title' part is omitted when title is empty.
    """
    title_part = '_' + title if title else title
    return '%02d_%s%s.%s' % (num, name, title_part, fmt)
|
||||
|
||||
|
||||
def format_combine_number_resource(secnum, lecnum, lecname, title, fmt):
    """Build a file name like '05_04_lecname_title.fmt' with combined
    section/lecture numbering.

    The '_title' part is omitted when title is empty.
    """
    title_part = '_' + title if title else title
    return '%02d_%02d_%s%s.%s' % (secnum, lecnum, lecname, title_part, fmt)
|
||||
|
||||
|
||||
def get_lecture_filename(combined_section_lectures_nums,
                         section_dir,
                         secnum,
                         lecnum,
                         lecname,
                         title,
                         fmt):
    """
    Prepare a destination lecture filename.

    @param combined_section_lectures_nums: Flag that indicates whether
        section lectures should have combined numbering.
    @type combined_section_lectures_nums: bool

    @param section_dir: Path to current section directory.
    @type section_dir: str

    @param secnum: Section number.
    @type secnum: int

    @param lecnum: Lecture number.
    @type lecnum: int

    @param lecname: Lecture name.
    @type lecname: str

    @param title: Resource title.
    @type title: str

    @param fmt: Format of the resource (pdf, csv, etc)
    @type fmt: str

    @return: Lecture file name.
    @rtype: str
    """
    # FIXME: quick-and-dirty workaround for "Filename too long" errors:
    # truncate the format and title components to configured maximums.
    fmt = fmt[:FORMAT_MAX_LENGTH]
    title = title[:TITLE_MAX_LENGTH]

    if combined_section_lectures_nums:
        basename = format_combine_number_resource(
            secnum + 1, lecnum + 1, lecname, title, fmt)
    else:
        basename = format_resource(lecnum + 1, lecname, title, fmt)

    return os.path.join(section_dir, basename)
|
||||
25
coursera/playlist.py
Normal file
25
coursera/playlist.py
Normal file
|
|
@ -0,0 +1,25 @@
|
|||
import os
|
||||
|
||||
|
||||
def create_m3u_playlist(section_dir):
    """
    Create M3U playlists with the *.mp4 files found under `section_dir`.

    For each walked directory that contains videos, a playlist named after
    the directory is written inside it, listing the video file names (no
    directory prefix) in sorted order.  Directories without *.mp4 files
    get no playlist.

    @param section_dir: Path where to scan for *.mp4 files.
    @type section_dir: str
    """
    # BUG FIX: this module only imports `os`; `glob` was referenced without
    # being imported, raising NameError at runtime.  Import it locally until
    # the module's import block is updated.
    import glob

    for dirpath, _subdirs, _files in os.walk(section_dir):
        # Glob inside dirpath directly instead of chdir-ing into it, so the
        # process working directory is never mutated.
        videos = sorted(glob.glob(os.path.join(dirpath, '*.mp4')))
        if not videos:
            continue

        m3u_name = os.path.split(dirpath)[1] + '.m3u'
        with open(os.path.join(dirpath, m3u_name), 'w') as m3u:
            for video in videos:
                m3u.write(os.path.basename(video) + '\n')
|
||||
|
||||
|
|
@ -6,6 +6,7 @@ Test the downloaders.
|
|||
|
||||
from coursera import downloaders
|
||||
from coursera import coursera_dl
|
||||
from coursera.filter import find_resources_to_get
|
||||
|
||||
import pytest
|
||||
|
||||
|
|
@ -21,7 +22,7 @@ def sample_bag():
|
|||
|
||||
|
||||
def test_collect_all_resources(sample_bag):
|
||||
res = coursera_dl.find_resources_to_get(sample_bag, 'all', None)
|
||||
res = find_resources_to_get(sample_bag, 'all', None)
|
||||
|
||||
assert [('mp4', 'h://url1/lc1.mp4', 'video'),
|
||||
('pdf', 'h://url2/lc2.pdf', 'slides'),
|
||||
|
|
@ -29,13 +30,13 @@ def test_collect_all_resources(sample_bag):
|
|||
|
||||
|
||||
def test_collect_only_pdfs(sample_bag):
|
||||
res = coursera_dl.find_resources_to_get(sample_bag, 'pdf', None)
|
||||
res = find_resources_to_get(sample_bag, 'pdf', None)
|
||||
|
||||
assert [('pdf', 'h://url2/lc2.pdf', 'slides')] == sorted(res)
|
||||
|
||||
|
||||
def test_collect_with_filtering(sample_bag):
|
||||
res = coursera_dl.find_resources_to_get(sample_bag, 'all', 'de')
|
||||
res = find_resources_to_get(sample_bag, 'all', 'de')
|
||||
res = sorted(res)
|
||||
|
||||
assert [('mp4', 'h://url1/lc1.mp4', 'video'),
|
||||
|
|
|
|||
|
|
@ -19,6 +19,9 @@ from coursera import coursera_dl
|
|||
from coursera import api
|
||||
|
||||
from coursera.test.utils import slurp_fixture
|
||||
from coursera.formatting import (format_section, format_resource,
|
||||
format_combine_number_resource)
|
||||
from coursera.utils import total_seconds, is_course_complete
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
|
|
@ -98,32 +101,32 @@ def test_fix_url_removes_spaces():
|
|||
|
||||
|
||||
def test_format_combine_resource_works_correctly():
|
||||
rv = coursera_dl.format_combine_number_resource(5, 4, "Moving_the_furniture", 'The_Basics', "mp4")
|
||||
rv = format_combine_number_resource(5, 4, "Moving_the_furniture", 'The_Basics', "mp4")
|
||||
assert '05_04_Moving_the_furniture_The_Basics.mp4' == rv
|
||||
|
||||
|
||||
def test_format_combine_resource_works_correctly_without_title():
|
||||
rv = coursera_dl.format_combine_number_resource(5, 1, "Introduction", '', "mp4")
|
||||
rv = format_combine_number_resource(5, 1, "Introduction", '', "mp4")
|
||||
assert '05_01_Introduction.mp4' == rv
|
||||
|
||||
|
||||
def test_format_resource_works_correctly():
|
||||
rv = coursera_dl.format_resource(2, "Washing", "Dishes", "mp9")
|
||||
rv = format_resource(2, "Washing", "Dishes", "mp9")
|
||||
assert '02_Washing_Dishes.mp9' == rv
|
||||
|
||||
|
||||
def test_format_resource_works_correctly_without_title():
|
||||
rv = coursera_dl.format_resource(1, "Introduction", '', "mp2")
|
||||
rv = format_resource(1, "Introduction", '', "mp2")
|
||||
assert '01_Introduction.mp2' == rv
|
||||
|
||||
|
||||
def test_format_section_works_correctly():
|
||||
rv = coursera_dl.format_section(9, 'bob', 'WEAVING', False)
|
||||
rv = format_section(9, 'bob', 'WEAVING', False)
|
||||
assert '09_bob' == rv
|
||||
|
||||
|
||||
def test_format_section_works_correctly_with_verbose():
|
||||
rv = coursera_dl.format_section(9, 'bill', 'WEAVING', True)
|
||||
rv = format_section(9, 'bill', 'WEAVING', True)
|
||||
assert 'WEAVING_09_bill' == rv
|
||||
|
||||
|
||||
|
|
@ -146,25 +149,25 @@ def test_decode_input():
|
|||
|
||||
|
||||
def test_total_seconds():
|
||||
ts = coursera_dl.total_seconds(datetime.timedelta(days=30))
|
||||
ts = total_seconds(datetime.timedelta(days=30))
|
||||
assert ts == 2592000
|
||||
|
||||
|
||||
def test_is_course_complete_should_give_false_if_there_was_recent_update():
|
||||
|
||||
delta = coursera_dl.total_seconds(datetime.timedelta(days=29))
|
||||
delta = total_seconds(datetime.timedelta(days=29))
|
||||
tm = time() - delta
|
||||
|
||||
rv = coursera_dl.is_course_complete(tm)
|
||||
rv = is_course_complete(tm)
|
||||
assert rv is False
|
||||
|
||||
|
||||
def test_is_course_complete_should_give_true_if_there_was_no_recent_update():
|
||||
|
||||
delta = coursera_dl.total_seconds(datetime.timedelta(days=31))
|
||||
delta = total_seconds(datetime.timedelta(days=31))
|
||||
tm = time() - delta
|
||||
|
||||
rv = coursera_dl.is_course_complete(tm)
|
||||
rv = is_course_complete(tm)
|
||||
assert rv is True
|
||||
|
||||
|
||||
|
|
|
|||
|
|
@ -4,24 +4,22 @@
|
|||
This module provides utility functions that are used within the script.
|
||||
"""
|
||||
|
||||
import errno
|
||||
import os
|
||||
import random
|
||||
import re
|
||||
import string
|
||||
import sys
|
||||
import time
|
||||
import errno
|
||||
import random
|
||||
import string
|
||||
import logging
|
||||
import datetime
|
||||
|
||||
|
||||
import six
|
||||
from bs4 import BeautifulSoup as BeautifulSoup_
|
||||
|
||||
# Force use of bs4 with html5lib
|
||||
BeautifulSoup = lambda page: BeautifulSoup_(page, 'html5lib')
|
||||
|
||||
from .define import COURSERA_URL, WINDOWS_UNC_PREFIX
|
||||
|
||||
from six.moves import html_parser
|
||||
import six
|
||||
from six import iteritems
|
||||
from six.moves import html_parser
|
||||
from six.moves.urllib.parse import ParseResult
|
||||
from six.moves.urllib_parse import unquote_plus
|
||||
|
||||
|
|
@ -39,6 +37,11 @@ else:
|
|||
from string import letters as string_ascii_letters
|
||||
from string import digits as string_digits
|
||||
|
||||
from .define import COURSERA_URL, WINDOWS_UNC_PREFIX
|
||||
|
||||
# Force use of bs4 with html5lib
|
||||
BeautifulSoup = lambda page: BeautifulSoup_(page, 'html5lib')
|
||||
|
||||
|
||||
if six.PY2:
|
||||
def decode_input(x):
|
||||
|
|
@ -184,6 +187,36 @@ def fix_url(url):
|
|||
return url
|
||||
|
||||
|
||||
def is_course_complete(last_update):
    """
    Determine whether the course is likely to have been terminated.

    @param last_update: Timestamp (seconds since epoch) of the most recent
        file modification seen for this course, or a negative value when
        nothing has been downloaded yet.
    @type last_update: int or float

    @return: True if last_update is 30 days or older than now, suggesting
        the course has seen no updates in a month and is probably complete.
        False otherwise (including a negative last_update, since it is then
        too soon to declare the course complete).
    @rtype: bool
    """
    # A negative timestamp means "unknown"; never declare such a course done.
    if last_update < 0:
        return False

    # timedelta.total_seconds() exists since Python 2.7, so the local
    # total_seconds() backport helper is not needed here.
    max_delta = datetime.timedelta(days=30).total_seconds()
    return (time.time() - last_update) > max_delta
|
||||
|
||||
|
||||
def total_seconds(td):
    """
    Compute total seconds for a timedelta.

    Added for backward compatibility, pre 2.7.
    """
    whole_seconds = td.seconds + td.days * 24 * 3600
    in_microseconds = td.microseconds + whole_seconds * 10 ** 6
    return in_microseconds // 10 ** 6
|
||||
|
||||
|
||||
def make_coursera_absolute_url(url):
|
||||
"""
|
||||
If given url is relative adds coursera netloc,
|
||||
|
|
|
|||
213
coursera/workflow.py
Normal file
213
coursera/workflow.py
Normal file
|
|
@ -0,0 +1,213 @@
|
|||
import os
|
||||
import re
|
||||
import abc
|
||||
import time
|
||||
import codecs
|
||||
import logging
|
||||
import subprocess
|
||||
|
||||
import requests
|
||||
|
||||
from .formatting import format_section, get_lecture_filename
|
||||
from .playlist import create_m3u_playlist
|
||||
from .utils import is_course_complete, mkdir_p, normalize_path
|
||||
from .filter import find_resources_to_get, skip_format_url
|
||||
from .define import IN_MEMORY_MARKER
|
||||
|
||||
|
||||
def handle_resource(downloader,
                    lecture_filename,
                    fmt,
                    url,
                    overwrite,
                    resume,
                    skip_download,
                    skipped_urls,
                    last_update):
    """
    Handle one resource: decide whether to download it, then do so.

    @param downloader: Resource downloader instance.
    @type downloader: downloaders.Downloader

    @param lecture_filename: Destination file name.
    @type lecture_filename: str

    @param fmt: Format of the resource (pdf, csv, etc)
    @type fmt: str

    @param url: URL of the resource (may carry in-memory page contents).
    @type url: str

    @param overwrite: Flag that indicates whether files should be overwritten.
    @type overwrite: bool

    @param resume: Flag that indicates whether download should be resumed.
    @type resume: bool

    @param skip_download: Flag that indicates whether download should be skipped.
    @type skip_download: bool

    @param skipped_urls: List of skipped urls to update, or None to
        disable URL skipping.
    @type skipped_urls: None or list

    @param last_update: Latest mtime across files.
    @type last_update: timestamp

    @return: Updated latest mtime.
    @rtype: timestamp
    """
    # Nothing to do when the file is present and neither overwrite nor
    # resume was requested; just fold its mtime into the running maximum.
    if os.path.exists(lecture_filename) and not (overwrite or resume):
        logging.info('%s already downloaded', lecture_filename)
        return max(last_update, os.path.getmtime(lecture_filename))

    if skip_download:
        # Only touch the file and record "now" as the latest update.
        open(lecture_filename, 'w').close()
        return time.time()

    if url.startswith(IN_MEMORY_MARKER):
        # The "URL" actually carries the page contents inline.
        page_content = url[len(IN_MEMORY_MARKER):]
        logging.info('Saving page contents to: %s', lecture_filename)
        with codecs.open(lecture_filename, 'w', 'utf-8') as file_object:
            file_object.write(page_content)
        return last_update

    if skipped_urls is not None and skip_format_url(fmt, url):
        skipped_urls.append(url)
        return last_update

    logging.info('Downloading: %s', lecture_filename)
    downloader.download(url, lecture_filename, resume=resume)
    return last_update
|
||||
|
||||
|
||||
class CourseDownloader(object):
    """
    Abstract base class for course downloaders.

    Concrete subclasses must implement download_modules().
    """
    # Py2-style ABC declaration (the file still supports Python 2 via six).
    __metaclass__ = abc.ABCMeta

    def __init__(self):
        pass

    @abc.abstractmethod
    def download_modules(self, modules):
        """Download all of the given modules; implemented by subclasses."""
        pass
|
||||
|
||||
|
||||
class CourseraDownloader(CourseDownloader):
|
||||
def __init__(self,
|
||||
downloader,
|
||||
commandline_args,
|
||||
class_name,
|
||||
path='',
|
||||
ignored_formats=None,
|
||||
disable_url_skipping=False):
|
||||
super(CourseraDownloader, self).__init__()
|
||||
|
||||
self._downloader = downloader
|
||||
self._args = commandline_args
|
||||
self._class_name = class_name
|
||||
self._path = path
|
||||
self._ignored_formats = ignored_formats
|
||||
self._disable_url_skipping = disable_url_skipping
|
||||
|
||||
self.skipped_urls = [] if disable_url_skipping else None
|
||||
self.failed_urls = []
|
||||
|
||||
self._last_update = -1
|
||||
|
||||
def download_modules(self, modules):
|
||||
completed = True
|
||||
for idx, module in enumerate(modules):
|
||||
module_name = '%02d_%s' % (idx + 1, module[0])
|
||||
sections = module[1]
|
||||
result = self._download_sections(module_name, sections)
|
||||
completed = completed and result
|
||||
return completed
|
||||
|
||||
def _download_sections(self, module_name, sections):
|
||||
"""
|
||||
Download lecture resources described by sections.
|
||||
|
||||
Returns True if the class appears completed, False otherwise.
|
||||
"""
|
||||
self._last_update = -1
|
||||
|
||||
section_filter = self._args.section_filter
|
||||
verbose_dirs = self._args.verbose_dirs
|
||||
hooks = self._args.hooks
|
||||
playlist = self._args.playlist
|
||||
|
||||
for (secnum, (section, lectures)) in enumerate(sections):
|
||||
if section_filter and not re.search(section_filter, section):
|
||||
logging.debug('Skipping b/c of sf: %s %s', section_filter,
|
||||
section)
|
||||
continue
|
||||
|
||||
section_dir = os.path.join(
|
||||
self._path, self._class_name, module_name,
|
||||
format_section(secnum + 1, section,
|
||||
self._class_name, verbose_dirs))
|
||||
|
||||
self._download_lectures(lectures, secnum, section_dir)
|
||||
|
||||
# After fetching resources, create a playlist in M3U format with the
|
||||
# videos downloaded.
|
||||
if playlist:
|
||||
create_m3u_playlist(section_dir)
|
||||
|
||||
if hooks:
|
||||
original_dir = os.getcwd()
|
||||
for hook in hooks:
|
||||
logging.info('Running hook %s for section %s.', hook, section_dir)
|
||||
os.chdir(section_dir)
|
||||
subprocess.call(hook)
|
||||
os.chdir(original_dir)
|
||||
|
||||
# if we haven't updated any files in 1 month, we're probably
|
||||
# done with this course
|
||||
is_complete = is_course_complete(self._last_update)
|
||||
if is_complete:
|
||||
logging.info('COURSE PROBABLY COMPLETE: ' + self._class_name)
|
||||
|
||||
return is_complete
|
||||
|
||||
def _download_lectures(self, lectures, secnum, section_dir):
|
||||
lecture_filter = self._args.lecture_filter
|
||||
file_formats = self._args.file_formats
|
||||
resource_filter = self._args.resource_filter
|
||||
combined_section_lectures_nums = self._args.combined_section_lectures_nums
|
||||
overwrite = self._args.overwrite
|
||||
resume = self._args.resume
|
||||
skip_download = self._args.skip_download
|
||||
|
||||
for (lecnum, (lecname, lecture)) in enumerate(lectures):
|
||||
if lecture_filter and not re.search(lecture_filter,
|
||||
lecname):
|
||||
logging.debug('Skipping b/c of lf: %s %s', lecture_filter,
|
||||
lecname)
|
||||
continue
|
||||
|
||||
if not os.path.exists(section_dir):
|
||||
mkdir_p(normalize_path(section_dir))
|
||||
|
||||
resources_to_get = find_resources_to_get(lecture,
|
||||
file_formats,
|
||||
resource_filter,
|
||||
self._ignored_formats)
|
||||
|
||||
# write lecture resources
|
||||
for fmt, url, title in resources_to_get:
|
||||
lecture_filename = get_lecture_filename(
|
||||
combined_section_lectures_nums,
|
||||
section_dir, secnum, lecnum, lecname, title, fmt)
|
||||
|
||||
lecture_filename = normalize_path(lecture_filename)
|
||||
|
||||
try:
|
||||
self._last_update = handle_resource(
|
||||
self._downloader, lecture_filename, fmt, url,
|
||||
overwrite, resume, skip_download,
|
||||
self.skipped_urls, self._last_update)
|
||||
except requests.exceptions.RequestException as e:
|
||||
logging.error('The following error has occurred while '
|
||||
'downloading URL %s: %s', url, str(e))
|
||||
self.failed_urls.append(url)
|
||||
Loading…
Add table
Add a link
Reference in a new issue