diff --git a/.editorconfig b/.editorconfig new file mode 100644 index 0000000..c6e43d8 --- /dev/null +++ b/.editorconfig @@ -0,0 +1,7 @@ +root = true + +[*] +charset = utf-8 +indent_style = space +indent_size = 4 +insert_final_newline = true diff --git a/.gitattributes b/.gitattributes new file mode 100644 index 0000000..a76c228 --- /dev/null +++ b/.gitattributes @@ -0,0 +1 @@ +coursera/test/* linguist-vendored diff --git a/.gitignore b/.gitignore index 1726811..648ae3d 100644 --- a/.gitignore +++ b/.gitignore @@ -21,3 +21,4 @@ venv3 .python-version .ipynb_checkpoints .ropeproject +.mypy_cache diff --git a/.travis.yml b/.travis.yml index f61af34..53ee630 100644 --- a/.travis.yml +++ b/.travis.yml @@ -1,10 +1,10 @@ language: python python: - - "2.6" - "2.7" - - "3.3" - - "3.4" - - "3.5" + - "3.6" + - "3.7" + - "3.8" + - "3.9" - "pypy" matrix: allow_failures: diff --git a/CHANGELOG.md b/CHANGELOG.md index ff3668b..1aae0cb 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,57 @@ # Change Log +## 0.11.5 (2019-12-16) + +Features: + - add --cauth argument to specify CAUTH cookie directly from command-line (#724) + +## 0.11.4 (2018-06-24) + +Features: + - Do not expand class names if there is a specialization with the same name, + but add --specialization flag to do that explicitly (#673) + +## 0.11.3 (2018-06-24) + +Bugfixes: + - Switch to newer API for syllabus and lecture retrieval (#665, #673, #634) + +Features: + - You can now download specializations: the child courses will be + downloaded automatically + +## 0.11.2 (2018-06-03) + +Bugfixes: + - Use TLS v1.2 instead of v1.0 + - Switched to api.coursera.org subdomain for subtitles requests (#664) + +## 0.11.1 (2018-06-02) + +Bugfixes: + - Specify utf-8 encoding in setup.py to fix installation on Windows (#662) + +## 0.11.0 (2018-06-02) + +Features: + - Add support for "peer assignment" section (#650) + +Bugfixes: + - Switched to api.coursera.org subdomain for API requests (#660) + +## 0.10.0 (2018-02-19) + 
+Features: + - Support Coursera Notebooks (option: `--download-notebooks`) + - Add hints in the documentation for users in China + +## 0.9.0 (2017-05-25) + +Features: + - Default arguments are loaded from `coursera-dl.conf` file + - Added option `--mathjax-cdn ` to specify alternative MathJax CDN + - Added support for Resources section + ## 0.8.0 (2016-10-04) Features: diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index cca172d..2895dd5 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -4,7 +4,7 @@ developers/maintainers feel good when trying to change code that other people contributed. For the record, when this document mentions "I", it mostly means Rogério -Brito's (@rbrito) is the one to blame. +Theodoro de Brito's (@rbrito) is the one to blame. # Write good commit messages @@ -237,7 +237,23 @@ DRAFT `git add ... & git ci -m 'Bump version (old_version -> new_version)'` 4. `git tag new_version` 5. `git push && git push --tags` -6. `pandoc --from=markdown --to=rst --output=README.rst README.md`. - I think this is required for PyPI description to look nice. -7. `python setup.py sdist` to build the package -8. `twine upload dist/coursera-dl-0.6.1.tar.gz` to deploy the package. +6. `python setup.py sdist bdist_wheel --universal` to build the package +7. `twine upload dist/coursera-dl-0.6.1.tar.gz` to deploy the package. + +## Docker + +Build new Docker image from PyPI package: + +``` +docker build --tag courseradl/courseradl --build-arg VERSION=0.11.2 . 
+``` + +Run the image: +``` +docker run --rm -it -v "$(pwd):/courses" -v "$HOME/.netrc:/netrc" courseradl/courseradl -n /netrc -- google-machine-learning +``` + +Publish the image: +``` +docker push courseradl/courseradl +``` diff --git a/Dockerfile b/Dockerfile new file mode 100644 index 0000000..cef9e3c --- /dev/null +++ b/Dockerfile @@ -0,0 +1,15 @@ +FROM python:3.6-slim + +LABEL maintainer "https://github.com/coursera-dl/" + +RUN apt-get update && \ + apt-get install -y --no-install-recommends gcc g++ libssl-dev && \ + rm -rf /var/lib/apt/lists/* && \ + apt-get purge -y --auto-remove gcc g++ libssl-dev + +ARG VERSION +RUN pip install coursera-dl==$VERSION + +WORKDIR /courses +ENTRYPOINT ["coursera-dl"] +CMD ["--help"] diff --git a/MANIFEST.in b/MANIFEST.in index a6c2ce8..c4b71c1 100644 --- a/MANIFEST.in +++ b/MANIFEST.in @@ -1,3 +1,33 @@ include requirements*.txt include CONTRIBUTING.md include LICENSE + +exclude .coveragerc +exclude .ctags +exclude .gitattributes +exclude .github/ISSUE_TEMPLATE.md +exclude .github/PULL_REQUEST_TEMPLATE.md +exclude .gitignore +exclude .travis.yml +exclude AUTHORS.md +exclude CHANGELOG.md +exclude README.md +exclude appveyor.yml +exclude appveyor/install.ps1 +exclude appveyor/run_with_env.cmd +exclude assets/hat-logo.svg +exclude coursera-dl +exclude coursera-dl.bat +exclude deploy/.netrc +exclude deploy/Dockerfile +exclude deploy/README.md +exclude deploy/build.sh +exclude deploy/download.sh +exclude fabfile.py +exclude tox.ini + +prune appveyor/ +prune assets/ +prune deploy/ +prune coursera/test/ +prune .github/ diff --git a/README.md b/README.md index 28c3d70..49601c5 100644 --- a/README.md +++ b/README.md @@ -4,9 +4,11 @@ [![Build status](https://ci.appveyor.com/api/projects/status/3hru0ycv5fbny5k8/branch/master?svg=true)](https://ci.appveyor.com/project/balta2ar/coursera-dl/branch/master) [![Coverage Status](https://coveralls.io/repos/coursera-dl/coursera-dl/badge.svg)](https://coveralls.io/r/coursera-dl/coursera-dl) [![Latest version 
on PyPI](https://img.shields.io/pypi/v/coursera-dl.svg)](https://pypi.python.org/pypi/coursera-dl) -[![Downloads from PyPI](https://img.shields.io/pypi/dm/coursera-dl.svg)](https://pypi.python.org/pypi/coursera-dl) [![Code Climate](https://codeclimate.com/github/coursera-dl/coursera-dl/badges/gpa.svg)](https://codeclimate.com/github/coursera-dl/coursera-dl) + + +- [Coursera Downloader](#coursera-downloader) - [Introduction](#introduction) - [Features](#features) - [Disclaimer](#disclaimer) @@ -14,20 +16,28 @@ - [Recommended installation method for all Operating Systems](#recommended-installation-method-for-all-operating-systems) - [Alternative ways of installing missing dependencies](#alternative-ways-of-installing-missing-dependencies) - [Alternative installation method for Unix systems](#alternative-installation-method-for-unix-systems) + - [ArchLinux](#archlinux) - [Installing dependencies on your own](#installing-dependencies-on-your-own) + - [Docker](#docker) - [Windows](#windows) - [Create an account with Coursera](#create-an-account-with-coursera) - - [Running the script](#running-the-script) +- [Running the script](#running-the-script) - [Resuming downloads](#resuming-downloads) - [Troubleshooting](#troubleshooting) + - [China issues](#china-issues) - [Found 0 sections and 0 lectures on this page](#found-0-sections-and-0-lectures-on-this-page) + - [Download timeouts](#download-timeouts) + - [Windows: proxy support](#windows-proxy-support) - [Windows: Failed to create process](#windows-failed-to-create-process) - - [SSLError: Errno 1 _ssl.c:504: error:14094410:SSL routines:SSL3_READ_BYTES:sslv3 alert handshake failure](#sslerror-errno-1-_sslc504-error14094410ssl-routinesssl3_read_bytessslv3-alert-handshake-failure) + - [SSLError: [Errno 1] _ssl.c:504: error:14094410:SSL routines:SSL3_READ_BYTES:sslv3 alert handshake failure](#sslerror-errno-1-_sslc504-error14094410ssl-routinesssl3_read_bytessslv3-alert-handshake-failure) + - [Alternative CDN for 
`MathJax.js`](#alternative-cdn-for-mathjaxjs) - [Reporting issues](#reporting-issues) - [Filing an issue/Reporting a bug](#filing-an-issuereporting-a-bug) - [Feedback](#feedback) - [Contact](#contact) + + # Introduction [Coursera][1] is arguably the leader in *massive open online courses* (MOOC) @@ -69,6 +79,7 @@ I've downloaded many other good videos such as those from Khan Academy. certain resources. * File format extension filter to grab resource types you want. * Login credentials accepted on command-line or from `.netrc` file. + * Default arguments loaded from `coursera-dl.conf` file. * Core functionality tested on Linux, Mac and Windows. # Disclaimer @@ -90,11 +101,11 @@ relevant excerpt: # Installation instructions `coursera-dl` requires Python 2 or Python 3 and a free Coursera account -enrolled in the class of interest. (As of February of 2016, we test -automatically the execution of the program with Python versions 2.6, 2.7, -Pypy, 3.2, 3.3, 3.4, and 3.5). +enrolled in the class of interest. (As of February of 2020, we test +automatically the execution of the program with Python versions 2.7, Pypy, +3.6, 3.7, 3.8, and 3.9). -**Note:** We *strongly* recommend that you use a Python 3 interpreter (3.4 +**Note:** We *strongly* recommend that you use a Python 3 interpreter (3.9 or later). On any operating system, ensure that the Python executable location is added @@ -109,7 +120,7 @@ particular courses that you want to use with `coursera-dl`. ## Recommended installation method for all Operating Systems -From a command line (preferrably, from a virtual environment), simply issue +From a command line (preferably, from a virtual environment), simply issue the command: pip install coursera-dl @@ -134,7 +145,7 @@ installed in your system (or they can interfere with `coursera-dl`). Prefer to use the option `--user` to `pip install`, if you need can. 
**Note 2:** As already mentioned, we *strongly* recommend that you use a new -Python 3 interpreter (e.g., 3.4 or later), since Python 3 has better support +Python 3 interpreter (e.g., 3.9 or later), since Python 3 has better support for SSL/TLS (for secure connections) than earlier versions.
If you must use Python 2, be sure that you have at least Python 2.7.9 (later versions are OK).
@@ -212,22 +223,46 @@ your own, please check that the versions of your modules are at least those listed in the `requirements.txt` file (and, `requirements-dev.txt` file, if applicable). +## Docker + +If you prefer you can run this software inside Docker: + +``` +docker run --rm -it -v \ + "$(pwd):/courses" \ + courseradl/courseradl -u -p +``` + +Or using netrc file: + +``` +docker run --rm -it \ + -v "$(pwd):/courses" -v "$HOME/.netrc:/netrc" \ + courseradl/courseradl -n /netrc +``` + +The actual working dir for coursera-dl is /courses, all courses will be +downloaded there if you don't specify otherwise. + ## Windows `python -m pip install coursera-dl` -Be sure that the Python install path is added to the PATH system environment variables. This can be found in Control Panel > System > Advanced System Settings > Environment Variables. +Be sure that the Python install path is added to the PATH system environment +variables. This can be found in Control Panel > System > Advanced System +Settings > Environment Variables. ``` Example: -C:\Python35\Scripts\;C:\Python35\; +C:\Python39\Scripts\;C:\Python39\; ``` -Or if you have restricted installation permissions and you've installed Python under AppData, add this to your PATH. +Or if you have restricted installation permissions and you've installed Python +under AppData, add this to your PATH. ``` Example: -C:\Users\\AppData\Local\Programs\Python\Python35-32\Scripts;C:\Users\\AppData\Local\Programs\Python\Python35-32; +C:\Users\\AppData\Local\Programs\Python\Python39-32\Scripts;C:\Users\\AppData\Local\Programs\Python\Python39-32; ``` Coursera-dl can now be run from commandline or powershell. @@ -237,55 +272,93 @@ Coursera-dl can now be run from commandline or powershell. If you don't already have one, create a [Coursera][1] account and enroll in a class. See https://www.coursera.org/courses for the list of classes. 
-## Running the script +# Running the script + +Refer to `coursera-dl --help` for a complete, up-to-date reference on the runtime options +supported by this utility. Run the script to download the materials by providing your Coursera account credentials (e.g. email address and password or a `~/.netrc` file), the class names, as well as any additional parameters: - +``` General: coursera-dl -u -p modelthinking-004 + + With CAUTH parameter: coursera-dl -ca 'some-ca-value-from-browser' modelthinking-004 +``` +If you don't want to type your password in command line as plain text, you can use the +script without `-p` option. In this case you will be prompted for password once the +script is run. + +Here are some examples of how to invoke `coursera-dl` from the command line: +``` + Without -p field: coursera-dl -u modelthinking-004 Multiple classes: coursera-dl -u -p saas historyofrock1-001 algo-2012-002 Filter by section name: coursera-dl -u -p -sf "Chapter_Four" crypto-004 Filter by lecture name: coursera-dl -u -p -lf "3.1_" ml-2012-002 Download only ppt files: coursera-dl -u -p -f "ppt" qcomp-2012-001 Use a ~/.netrc file: coursera-dl -n -- matrix-001 Get the preview classes: coursera-dl -n -b ni-001 + Download videos at 720p: coursera-dl -n --video-resolution 720p ni-001 Specify download path: coursera-dl -n --path=C:\Coursera\Classes\ comnetworks-002 Display help: coursera-dl --help Maintain a list of classes in a dir: Initialize: mkdir -p CURRENT/{class1,class2,..classN} Update: coursera-dl -n --path CURRENT `\ls CURRENT` - +``` **Note:** If your `ls` command is aliased to display a colorized output, you may experience problems. Be sure to escape the `ls` command (use `\ls`) to assure that no special characters get sent to the script. -Note that we *do* support the New Platform ("on-demand") classes. +Note that we *do* support the New Platform ("on-demand") courses. + +By default, videos are downloaded at 540p resolution. 
For on-demand courses, the +`--video-resolution` flag accepts 360p, 540p, and 720p values. + +To download just the `.txt` and/or `.srt` subtitle files instead of the videos, +use `--ignore-formats mp4 --subtitle-language en` or whatever format the videos +are encoded in and desired languages for subtitles. On \*nix platforms, the use of a `~/.netrc` file is a good alternative to specifying both your username (i.e., your email address) and password every time on the command line. To use it, simply add a line like the one below to a file named `.netrc` in your home directory (or the [equivalent][8], if you are using Windows) with contents like: - +``` machine coursera-dl login password - +``` Create the file if it doesn't exist yet. From then on, you can switch from using `-u` and `-p` to simply call `coursera-dl` with the option `-n` instead. This is especially convenient, as typing usernames (email addresses) and passwords directly on the command line can get tiresome (even more if you happened to choose a "strong" password). +Alternatively, if you want to store your preferred parameters (which might +also include your username and password), create a file named `coursera-dl.conf` +where the script is supposed to be executed, with the following format: +``` + --username + --password + --subtitle-language en,zh-CN|zh-TW + --download-quizzes + #--mathjax-cdn https://cdn.bootcss.com/mathjax/2.7.1/MathJax.js + # more other parameters +``` +Parameters which are specified in the file will be overridden if they are +provided again on the commandline. + +**Note:** In `coursera-dl.conf`, the parameters must not be wrapped +in quotes. + ## Resuming downloads In default mode when you interrupt the download process by pressing CTRL+C, partially downloaded files will be deleted from your disk and -you have to start the download process from the begining. If your +you have to start the download process from the beginning. 
If your download was interrupted by something other than KeyboardInterrupt (CTRL+C) like sudden system crash, partially downloaded files will remain on your disk and the next time you start the process again, -these files will be discraded from download list!, therefore it's your +these files will be discarded from download list!, therefore it's your job to delete them manually before next start. For this reason we added an option called `--resume` which continues your downloads from where they stopped: @@ -316,7 +389,7 @@ one of the following actions solve your problem: * Make sure the class name you are using corresponds to the resource name used in the URL for that class: - `https://class.coursera.org//class/index` + `https://www.coursera.org/learn//home/welcome` * Have you tried to clean the cached cookies/credentials with the `--clear-cache` option? @@ -340,7 +413,7 @@ one of the following actions solve your problem: * If results show 0 sections, you most likely have provided invalid credentials (username and/or password in the command line or in your - `.netrc` file). + `.netrc` file or in your `coursera-dl.conf` file). 
* For courses that have not started yet, but have had a previous iteration sometimes a preview is available, containing all the classes from the last @@ -361,7 +434,7 @@ one of the following actions solve your problem: * You get an error when using `-n` to specify that you want to use a `.netrc` file and, * You want the script to use your default netrc file and, - * You get a message saying `coursera-dl: error: too few arguments` + * You get a message saying `coursera-dl: error: too few arguments` Then you should specify `--` as an argument after `-n`, that is, `-n --` or change the order in which you pass the arguments to the script, so that @@ -380,6 +453,13 @@ one of the following actions solve your problem: pip install coursera-dl ``` +## China issues + +If you are from China and you're having problems downloading videos, +adding "52.84.167.78 d3c33hcgiwev3.cloudfront.net" to the hosts file +(/etc/hosts) and flushing the DNS cache with "ipconfig /flushdns" may work +(see https://github.com/googlehosts/hosts for more info). + ## Found 0 sections and 0 lectures on this page First of all, make sure you are enrolled to the course you want to download. @@ -394,13 +474,50 @@ file that lists all the course materials. Maybe your friend who is enrolled could save that course page for you. In that case use the `--process_local_page` option. -Alternatively you may want to try this Chrome extension: https://chrome.google.com/webstore/detail/coursera-materials-downlo/ijkboagofaehocnjacacdhdcbbcpilih +Alternatively you may want to try the various browser extensions designed for +this problem. If none of the above works for you, there is nothing we can do. +## Download timeouts + +Coursera-dl supports external downloaders but note that they are only used to +download materials after the syllabus has been parsed, e.g. videos, PDFs, some +handouts and additional files (syllabus is always downloaded using the internal +downloader). 
If you experience problems with downloading such materials, you may +want to start using an external downloader and configure its timeout values. For +example, you can use the aria2c downloader by passing the `--aria2` option: + +``` +coursera-dl -n --path . --aria2 +``` + +And put this into aria2c's configuration file `~/.aria2/aria2.conf` to reduce +timeouts: + +``` +connect-timeout=2 +timeout=2 +bt-stop-timeout=1 +``` + +Timeout configuration for the internal downloader is not supported. + +## Windows: proxy support + +If you're on Windows behind a proxy, set up the environment variables +before running the script as follows: + +``` +set HTTP_PROXY=http://host:port +set HTTPS_PROXY=http://host:port +``` + +Related discussion: [#205](https://github.com/coursera-dl/coursera-dl/issues/205) + ## Windows: Failed to create process -In `C:\Users\\AppData\Local\Programs\Python\Python35-32\Scripts` +In `C:\Users\\AppData\Local\Programs\Python\Python39-32\Scripts` or wherever Python installed (above is default for Windows) edit below file in idle: (right click on script name and select 'edit with idle in menu) @@ -411,13 +528,13 @@ coursera-dl-script from ``` -#!c:\users\\appdata\local\programs\python\python35-32\python.exe +#!c:\users\\appdata\local\programs\python\python39-32\python.exe ``` to ``` -#"!c:\users\\appdata\local\programs\python\python35-32\python.exe" +#!"c:\users\\appdata\local\programs\python\python39-32\python.exe" ``` (add quotes). This is a known pip bug. @@ -443,6 +560,15 @@ If you still have the problem, please read the following issues for more ideas o This is also worth reading: https://urllib3.readthedocs.io/en/latest/security.html#insecureplatformwarning +## Alternative CDN for `MathJax.js` + +When saving a course page, we enabled `MathJax` rendering for math equations, by +injecting `MathJax.js` in the header. The script is using a CDN service provided +by [mathjax.org](https://cdn.mathjax.org/mathjax/latest/MathJax.js). 
However, that +url is not accessible in some countries/regions, you can provide a +`--mathjax-cdn ` parameter to specify the `MathJax.js` file that is +accessible in your region. + # Reporting issues Before reporting any issue please follow the steps below: @@ -509,10 +635,9 @@ I enjoy getting feedback. Here are a few of the comments I've received: # Contact -Please, post bugs and issues on [github][11]. Send other comments to Rogério -Theodoro de Brito (the current maintainer): rbrito@ime.usp.br (twitter: -[@rtdbrito][21]) or to John Lehmann (the original author): first last at -geemail dotcom (twitter: [@jplehmann][12]). +Please, post bugs and issues on [github][11]. Please, **DON'T** send support +requests privately to the maintainers! We are quite swamped with day-to-day +activities. If you have problems, **PLEASE**, file them on the issue tracker. [1]: https://www.coursera.org [2]: https://sourceforge.net/projects/gnuwin32/files/wget/1.11.4-1/wget-1.11.4-1-setup.exe @@ -525,7 +650,6 @@ geemail dotcom (twitter: [@jplehmann][12]). [9]: https://chrome.google.com/webstore/detail/cookietxt-export/lopabhfecdfhgogdbojmaicoicjekelh [10]: https://addons.mozilla.org/en-US/firefox/addon/export-cookies/ [11]: https://github.com/coursera-dl/coursera-dl/issues -[12]: https://twitter.com/jplehmann [13]: http://techcrunch.com/2013/02/20/coursera-adds-29-schools-90-courses-and-4-new-languages-to-its-online-learning-platform/ [14]: http://www.tunapanda.org [15]: https://github.com/html5lib/html5lib-python @@ -534,11 +658,8 @@ geemail dotcom (twitter: [@jplehmann][12]). 
[18]: http://ww45.python-distribute.org/pip_distribute.png [19]: https://pypi.python.org/pypi/six/ [20]: https://www.coursera.org/about/terms -[21]: https://twitter.com/rtdbrito [22]: https://pypi.python.org/ [23]: https://pypi.python.org/pypi/coursera-dl [issue213]: https://github.com/coursera-dl/coursera-dl/issues/213 [issue500]: https://github.com/coursera-dl/coursera-dl/issues/500 [pipinstallerbug]: http://stackoverflow.com/questions/31808180/installing-pyinstaller-via-pip-leads-to-failed-to-create-process - -[![Bitdeli Badge](https://d2weczhvl823v0.cloudfront.net/coursera-dl/coursera-dl/trend.png)](https://bitdeli.com/free "Bitdeli Badge") diff --git a/appveyor.yml b/appveyor.yml index 452468e..2aab721 100644 --- a/appveyor.yml +++ b/appveyor.yml @@ -24,14 +24,6 @@ environment: # a later point release. # See: http://www.appveyor.com/docs/installed-software#python - - PYTHON: "C:\\Python26" - PYTHON_VERSION: "2.6.x" # currently 2.6.6 - PYTHON_ARCH: "32" - - - PYTHON: "C:\\Python26-x64" - PYTHON_VERSION: "2.6.x" # currently 2.6.6 - PYTHON_ARCH: "64" - - PYTHON: "C:\\Python27" PYTHON_VERSION: "2.7.x" # currently 2.7.11 PYTHON_ARCH: "32" @@ -40,28 +32,36 @@ environment: PYTHON_VERSION: "2.7.x" # currently 2.7.11 PYTHON_ARCH: "64" - - PYTHON: "C:\\Python33" - PYTHON_VERSION: "3.3.x" # currently 3.3.5 + - PYTHON: "C:\\Python36" + PYTHON_VERSION: "3.6.x" # currently 3.6.? PYTHON_ARCH: "32" - - PYTHON: "C:\\Python33-x64" - PYTHON_VERSION: "3.3.x" # currently 3.3.5 + - PYTHON: "C:\\Python36-x64" + PYTHON_VERSION: "3.6.x" # currently 3.6.? PYTHON_ARCH: "64" - - PYTHON: "C:\\Python34" - PYTHON_VERSION: "3.4.x" # currently 3.4.3 + - PYTHON: "C:\\Python37" + PYTHON_VERSION: "3.7.x" # currently 3.7.? PYTHON_ARCH: "32" - - PYTHON: "C:\\Python34-x64" - PYTHON_VERSION: "3.4.x" # currently 3.4.3 + - PYTHON: "C:\\Python37-x64" + PYTHON_VERSION: "3.7.x" # currently 3.7.? 
PYTHON_ARCH: "64" - - PYTHON: "C:\\Python35" - PYTHON_VERSION: "3.5.x" # currently 3.5.1 + - PYTHON: "C:\\Python38" + PYTHON_VERSION: "3.8.x" # currently 3.8.? PYTHON_ARCH: "32" - - PYTHON: "C:\\Python35-x64" - PYTHON_VERSION: "3.5.x" # currently 3.5.1 + - PYTHON: "C:\\Python38-x64" + PYTHON_VERSION: "3.8.x" # currently 3.8.? + PYTHON_ARCH: "64" + + - PYTHON: "C:\\Python39" + PYTHON_VERSION: "3.9.x" # currently 3.9.? + PYTHON_ARCH: "32" + + - PYTHON: "C:\\Python39-x64" + PYTHON_VERSION: "3.9.x" # currently 3.9.? PYTHON_ARCH: "64" init: @@ -83,7 +83,7 @@ install: # Upgrade to the latest version of pip to avoid it displaying warnings # about it being out of date. - - "pip install --disable-pip-version-check --user --upgrade pip" + - "python -m pip install --disable-pip-version-check --user --upgrade pip" # Install requirements - "%CMD_IN_ENV% pip install -r requirements.txt" diff --git a/coursera-dl b/coursera-dl index 88a1e6c..16bba10 100755 --- a/coursera-dl +++ b/coursera-dl @@ -1,4 +1,4 @@ -#!/usr/bin/env python +#!/usr/bin/env python3 # -*- coding: utf-8 -*- from coursera import coursera_dl diff --git a/coursera/__init__.py b/coursera/__init__.py index 32a90a3..eebcbfe 100644 --- a/coursera/__init__.py +++ b/coursera/__init__.py @@ -1 +1 @@ -__version__ = '0.8.0' +__version__ = '0.11.5' diff --git a/coursera/api.py b/coursera/api.py index e22c32d..a7a11b1 100644 --- a/coursera/api.py +++ b/coursera/api.py @@ -5,13 +5,18 @@ downloader. 
""" import os +import re import json import base64 import logging +import time import requests -from collections import namedtuple +import urllib + +from collections import namedtuple, OrderedDict from six import iterkeys, iteritems from six.moves.urllib_parse import quote_plus +import attr from .utils import (BeautifulSoup, make_coursera_absolute_url, extend_supplement_links, clean_url, clean_filename, @@ -23,20 +28,37 @@ from .define import (OPENCOURSE_SUPPLEMENT_URL, OPENCOURSE_ASSETS_URL, OPENCOURSE_API_ASSETS_V1_URL, OPENCOURSE_ONDEMAND_COURSE_MATERIALS, - OPENCOURSE_VIDEO_URL, + OPENCOURSE_ONDEMAND_COURSE_MATERIALS_V2, + OPENCOURSE_ONDEMAND_COURSES_V1, + OPENCOURSE_ONDEMAND_LECTURE_VIDEOS_URL, + OPENCOURSE_ONDEMAND_LECTURE_ASSETS_URL, + OPENCOURSE_ONDEMAND_SPECIALIZATIONS_V1, OPENCOURSE_MEMBERSHIPS, + OPENCOURSE_REFERENCES_POLL_URL, + OPENCOURSE_REFERENCE_ITEM_URL, + OPENCOURSE_PROGRAMMING_IMMEDIATE_INSTRUCTIOINS_URL, + OPENCOURSE_PEER_ASSIGNMENT_INSTRUCTIONS, + + # New feature, Notebook (Python Jupyter) + OPENCOURSE_NOTEBOOK_DESCRIPTIONS, + OPENCOURSE_NOTEBOOK_LAUNCHES, + OPENCOURSE_NOTEBOOK_TREE, + OPENCOURSE_NOTEBOOK_DOWNLOAD, + POST_OPENCOURSE_API_QUIZ_SESSION, POST_OPENCOURSE_API_QUIZ_SESSION_GET_STATE, POST_OPENCOURSE_ONDEMAND_EXAM_SESSIONS, POST_OPENCOURSE_ONDEMAND_EXAM_SESSIONS_GET_STATE, - INSTRUCTIONS_HTML_INJECTION, + INSTRUCTIONS_HTML_INJECTION_PRE, + INSTRUCTIONS_HTML_MATHJAX_URL, + INSTRUCTIONS_HTML_INJECTION_AFTER, IN_MEMORY_EXTENSION, IN_MEMORY_MARKER) -from .cookies import prepape_auth_headers +from .cookies import prepare_auth_headers class QuizExamToMarkupConverter(object): @@ -111,7 +133,8 @@ class QuizExamToMarkupConverter(object): result = ['
'] for option in options: - option_text = unescape_html(option['display']['definition']['value']) + option_text = unescape_html( + option['display']['definition']['value']) # We need to replace with so that answer text # stays on the same line with checkbox/radio button @@ -135,9 +158,12 @@ class QuizExamToMarkupConverter(object): class MarkupToHTMLConverter(object): - def __init__(self, session): + def __init__(self, session, mathjax_cdn_url=None): self._session = session self._asset_retriever = AssetRetriever(session) + if not mathjax_cdn_url: + mathjax_cdn_url = INSTRUCTIONS_HTML_MATHJAX_URL + self._mathjax_cdn_url = mathjax_cdn_url def __call__(self, markup): """ @@ -170,7 +196,11 @@ class MarkupToHTMLConverter(object): soup.insert(0, meta) # 1. Inject basic CSS style - css_soup = BeautifulSoup(INSTRUCTIONS_HTML_INJECTION) + css = "".join([ + INSTRUCTIONS_HTML_INJECTION_PRE, + self._mathjax_cdn_url, + INSTRUCTIONS_HTML_INJECTION_AFTER]) + css_soup = BeautifulSoup(css) soup.append(css_soup) # 2. Replace with

@@ -215,7 +245,8 @@ class MarkupToHTMLConverter(object): asset = self._asset_retriever[image['assetid']] if asset.data is not None: encoded64 = base64.b64encode(asset.data).decode() - image['src'] = 'data:%s;base64,%s' % (asset.content_type, encoded64) + image['src'] = 'data:%s;base64,%s' % ( + asset.content_type, encoded64) def _convert_markup_audios(self, soup): """ @@ -241,9 +272,11 @@ class MarkupToHTMLConverter(object): asset = self._asset_retriever[audio['id']] if asset.data is not None: encoded64 = base64.b64encode(asset.data).decode() - data_string = 'data:%s;base64,%s' % (asset.content_type, encoded64) + data_string = 'data:%s;base64,%s' % ( + asset.content_type, encoded64) - source_tag = soup.new_tag('source', src=data_string, type=asset.content_type) + source_tag = soup.new_tag( + 'source', src=data_string, type=asset.content_type) controls_tag = soup.new_tag('audio', controls="") controls_tag.string = 'Your browser does not support the audio element.' @@ -251,10 +284,11 @@ class MarkupToHTMLConverter(object): audio.insert_after(controls_tag) -class OnDemandCourseMaterialItems(object): +class OnDemandCourseMaterialItemsV1(object): """ Helper class that allows accessing lecture JSONs by lesson IDs. """ + def __init__(self, items): """ Initialization. Build a map from lessonId to Lecture (item) @@ -284,7 +318,7 @@ class OnDemandCourseMaterialItems(object): dom = get_page(session, OPENCOURSE_ONDEMAND_COURSE_MATERIALS, json=True, class_name=course_name) - return OnDemandCourseMaterialItems( + return OnDemandCourseMaterialItemsV1( dom['linked']['onDemandCourseMaterialItems.v1']) def get(self, lesson_id): @@ -324,6 +358,7 @@ class Asset(namedtuple('Asset', 'id name type_name url content_type data')): This class contains information about an asset. 
""" __slots__ = () + def __repr__(self): return 'Asset(id="%s", name="%s", type_name="%s", url="%s", content_type="%s", data="<...>")' % ( self.id, self.name, self.type_name, self.url, self.content_type) @@ -333,6 +368,7 @@ class AssetRetriever(object): """ This class helps download assets by their ID. """ + def __init__(self, session): self._session = session self._asset_mapping = {} @@ -349,7 +385,8 @@ class AssetRetriever(object): id=','.join(asset_ids)) # Create a map "asset_id => asset" for easier access - asset_map = dict((asset['id'], asset) for asset in asset_list['elements']) + asset_map = dict((asset['id'], asset) + for asset in asset_list['elements']) for asset_id in asset_ids: # Download each asset @@ -377,6 +414,173 @@ class AssetRetriever(object): return result +@attr.s +class ModuleV1(object): + name = attr.ib() + id = attr.ib() + slug = attr.ib() + child_ids = attr.ib() + + def children(self, all_children): + return [all_children[child] for child in self.child_ids] + + +@attr.s +class ModulesV1(object): + children = attr.ib() + + @staticmethod + def from_json(data): + return ModulesV1(OrderedDict( + (item['id'], + ModuleV1(item['name'], + item['id'], + item['slug'], + item['lessonIds'])) + for item in data + )) + + def __getitem__(self, key): + return self.children[key] + + def __iter__(self): + return iter(self.children.values()) + + +@attr.s +class LessonV1(object): + name = attr.ib() + id = attr.ib() + slug = attr.ib() + child_ids = attr.ib() + + def children(self, all_children): + return [all_children[child] for child in self.child_ids] + + +@attr.s +class LessonsV1(object): + children = attr.ib() + + @staticmethod + def from_json(data): + return LessonsV1(OrderedDict( + (item['id'], + LessonV1(item['name'], + item['id'], + item['slug'], + item['itemIds'])) + for item in data + )) + + def __getitem__(self, key): + return self.children[key] + + +@attr.s +class ItemV2(object): + name = attr.ib() + id = attr.ib() + slug = attr.ib() + type_name = 
attr.ib() + lesson_id = attr.ib() + module_id = attr.ib() + + +@attr.s +class ItemsV2(object): + children = attr.ib() + + @staticmethod + def from_json(data): + return ItemsV2(OrderedDict( + (item['id'], + ItemV2(item['name'], + item['id'], + item['slug'], + item['contentSummary']['typeName'], + item['lessonId'], + item['moduleId'])) + for item in data + )) + + def __getitem__(self, key): + return self.children[key] + + +@attr.s +class VideoV1(object): + resolution = attr.ib() + mp4_video_url = attr.ib() + + +@attr.s +class VideosV1(object): + children = attr.ib() + + @staticmethod + def from_json(data): + + videos = [VideoV1(resolution, links['mp4VideoUrl']) + for resolution, links + in data['sources']['byResolution'].items()] + videos.sort(key=lambda video: video.resolution, reverse=True) + + videos = OrderedDict( + (video.resolution, video) + for video in videos + ) + return VideosV1(videos) + + def __contains__(self, key): + return key in self.children + + def __getitem__(self, key): + return self.children[key] + + def get_best(self): + return next(iter(self.children.values())) + + +def expand_specializations(session, class_names): + """ + Checks whether any given name is not a class but a specialization. + + If it's a specialization, expand the list of class names with the child + class names. 
+ """ + result = [] + for class_name in class_names: + specialization = SpecializationV1.create(session, class_name) + if specialization is None: + result.append(class_name) + else: + result.extend(specialization.children) + logging.info('Expanded specialization "%s" into the following' + ' classes: %s', + class_name, ' '.join(specialization.children)) + + return result + + +@attr.s +class SpecializationV1(object): + children = attr.ib() + + @staticmethod + def create(session, class_name): + try: + dom = get_page(session, OPENCOURSE_ONDEMAND_SPECIALIZATIONS_V1, + json=True, quiet=True, + class_name=class_name) + except requests.exceptions.HTTPError as e: + logging.debug('Could not expand %s: %s', class_name, e) + return None + + return SpecializationV1( + [course['slug'] for course in dom['linked']['courses.v1']]) + + class CourseraOnDemand(object): """ This is a class that provides a friendly interface to extract certain @@ -386,7 +590,8 @@ class CourseraOnDemand(object): """ def __init__(self, session, course_id, course_name, - unrestricted_filenames=False): + unrestricted_filenames=False, + mathjax_cdn_url=None): """ Initialize Coursera OnDemand API. 
@@ -402,6 +607,7 @@ class CourseraOnDemand(object): @type unrestricted_filenames: bool """ self._session = session + self._notebook_cookies = None self._course_id = course_id self._course_name = course_name @@ -409,7 +615,8 @@ class CourseraOnDemand(object): self._user_id = None self._quiz_to_markup = QuizExamToMarkupConverter(session) - self._markup_to_html = MarkupToHTMLConverter(session) + self._markup_to_html = MarkupToHTMLConverter( + session, mathjax_cdn_url=mathjax_cdn_url) self._asset_retriever = AssetRetriever(session) def obtain_user_id(self): @@ -438,7 +645,135 @@ class CourseraOnDemand(object): except requests.exceptions.HTTPError as exception: logging.error('Could not download exam %s: %s', exam_id, exception) if is_debug_run(): - logging.exception('Could not download exam %s: %s', exam_id, exception) + logging.exception( + 'Could not download exam %s: %s', exam_id, exception) + return None + + def _get_notebook_folder(self, url, jupyterId, **kwargs): + + supplement_links = {} + + url = url.format(**kwargs) + reply = get_page(self._session, url, json=True) + + for content in reply['content']: + + if content['type'] == 'directory': + a = self._get_notebook_folder( + OPENCOURSE_NOTEBOOK_TREE, jupyterId, jupId=jupyterId, + path=content['path'], timestamp=int(time.time())) + supplement_links.update(a) + + elif content['type'] == 'file': + tmp_url = OPENCOURSE_NOTEBOOK_DOWNLOAD.format( + path=content['path'], jupId=jupyterId, + timestamp=int(time.time())) + filename, extension = os.path.splitext(clean_url(tmp_url)) + + head, tail = os.path.split(content['path']) + # '/' in the following line is for a reason: + # @noureddin says: "I split head using split('/') not + # os.path.split() because it's seems to me that it comes from a + # web page, so the separator will always be /, so using the + # native path splitting function is not the most portable + # way to do it." 
+ # Original pull request: + # https://github.com/coursera-dl/coursera-dl/pull/654 + head = '/'.join([clean_filename(dir, minimal_change=True) + for dir in head.split('/')]) + tail = clean_filename(tail, minimal_change=True) + + if not os.path.isdir(self._course_name + "/notebook/" + head + "/"): + logging.info('Creating [%s] directories...', head) + os.makedirs(self._course_name + "/notebook/" + head + "/") + + r = requests.get(tmp_url.replace(" ", "%20"), + cookies=self._session.cookies) + if not os.path.exists(self._course_name + "/notebook/" + head + "/" + tail): + logging.info('Downloading %s into %s', tail, head) + with open(self._course_name + "/notebook/" + head + "/" + tail, 'wb+') as f: + f.write(r.content) + else: + logging.info('Skipping %s... (file exists)', tail) + + if str(extension[1:]) not in supplement_links: + supplement_links[str(extension[1:])] = [] + + supplement_links[str(extension[1:])].append( + (tmp_url.replace(" ", "%20"), filename)) + + elif content['type'] == 'notebook': + tmp_url = OPENCOURSE_NOTEBOOK_DOWNLOAD.format( + path=content['path'], jupId=jupyterId, timestamp=int(time.time())) + filename, extension = os.path.splitext(clean_url(tmp_url)) + + head, tail = os.path.split(content['path']) + + if not os.path.isdir(self._course_name + "/notebook/" + head + "/"): + logging.info('Creating [%s] directories...', head) + os.makedirs(self._course_name + "/notebook/" + head + "/") + + r = requests.get(tmp_url.replace(" ", "%20"), + cookies=self._session.cookies) + if not os.path.exists(self._course_name + "/notebook/" + head + "/" + tail): + logging.info( + 'Downloading Jupyter %s into %s', tail, head) + with open(self._course_name + "/notebook/" + head + "/" + tail, 'wb+') as f: + f.write(r.content) + else: + logging.info('Skipping %s... 
(file exists)', tail) + + if "ipynb" not in supplement_links: + supplement_links["ipynb"] = [] + + supplement_links["ipynb"].append( + (tmp_url.replace(" ", "%20"), filename)) + + else: + logging.info( + 'Unsupported typename %s in notebook', content['type']) + + return supplement_links + + def _get_notebook_json(self, notebook_id, authorizationId): + + headers = self._auth_headers_with_json() + reply = get_page( + self._session, + OPENCOURSE_NOTEBOOK_DESCRIPTIONS, + json=False, + authId=authorizationId, + headers=headers + ) + + jupyted_id = re.findall(r"\"\/user\/(.*)\/tree\"", reply) + if len(jupyted_id) == 0: + logging.error('Could not download notebook %s', notebook_id) + return None + + jupyted_id = jupyted_id[0] + + newReq = requests.Session() + req = newReq.get(OPENCOURSE_NOTEBOOK_TREE.format( + jupId=jupyted_id, path="/", timestamp=int(time.time())), + headers=headers) + + return self._get_notebook_folder( + OPENCOURSE_NOTEBOOK_TREE, jupyted_id, jupId=jupyted_id, + path="/", timestamp=int(time.time())) + + def extract_links_from_notebook(self, notebook_id): + + try: + authorizationId = self._extract_notebook_text(notebook_id) + ret = self._get_notebook_json(notebook_id, authorizationId) + return ret + except requests.exceptions.HTTPError as exception: + logging.error('Could not download notebook %s: %s', + notebook_id, exception) + if is_debug_run(): + logging.exception( + 'Could not download notebook %s: %s', notebook_id, exception) return None def extract_links_from_quiz(self, quiz_id): @@ -449,7 +784,8 @@ class CourseraOnDemand(object): except requests.exceptions.HTTPError as exception: logging.error('Could not download quiz %s: %s', quiz_id, exception) if is_debug_run(): - logging.exception('Could not download quiz %s: %s', quiz_id, exception) + logging.exception( + 'Could not download quiz %s: %s', quiz_id, exception) return None def _convert_quiz_json_to_links(self, quiz_json, filename_suffix): @@ -504,7 +840,7 @@ class CourseraOnDemand(object): def 
_get_quiz_session_id(self, quiz_id): headers = self._auth_headers_with_json() - data = {"contentRequestBody":[]} + data = {"contentRequestBody": []} reply = get_page(self._session, POST_OPENCOURSE_API_QUIZ_SESSION, json=True, @@ -518,15 +854,15 @@ class CourseraOnDemand(object): return reply['contentResponseBody']['session']['id'] def _auth_headers_with_json(self): - headers = prepape_auth_headers(self._session, include_cauth=True) + headers = prepare_auth_headers(self._session, include_cauth=True) headers.update({ 'Content-Type': 'application/json; charset=UTF-8' }) return headers - def extract_links_from_lecture(self, + def extract_links_from_lecture(self, course_id, video_id, subtitle_language='en', - resolution='540p', assets=None): + resolution='540p'): """ Return the download URLs of on-demand course video. @@ -539,29 +875,37 @@ class CourseraOnDemand(object): @param resolution: Preferred video resolution. @type resolution: str - @param assets: List of assets that may present in the video. - @type assets: [str] - @return: @see CourseraOnDemand._extract_links_from_text """ - if assets is None: - assets = [] - try: links = self._extract_videos_and_subtitles_from_lecture( - video_id, subtitle_language, resolution) + course_id, video_id, subtitle_language, resolution) + assets = self._get_lecture_asset_ids(course_id, video_id) assets = self._normalize_assets(assets) extend_supplement_links( links, self._extract_links_from_lecture_assets(assets)) return links except requests.exceptions.HTTPError as exception: - logging.error('Could not download lecture %s: %s', video_id, exception) + logging.error('Could not download lecture %s: %s', + video_id, exception) if is_debug_run(): - logging.exception('Could not download lecture %s: %s', video_id, exception) + logging.exception( + 'Could not download lecture %s: %s', video_id, exception) return None + def _get_lecture_asset_ids(self, course_id, video_id): + """ + Obtain a list of asset ids from a lecture. 
+ """ + dom = get_page(self._session, OPENCOURSE_ONDEMAND_LECTURE_ASSETS_URL, + json=True, course_id=course_id, video_id=video_id) + # Note that we extract here "id", not definition -> assetId, as it + # be extracted later. + return [asset['id'] + for asset in dom['linked']['openCourseAssets.v1']] + def _normalize_assets(self, assets): """ Perform asset normalization. For some reason, assets that are sometimes @@ -685,65 +1029,40 @@ class CourseraOnDemand(object): return urls def _extract_videos_and_subtitles_from_lecture(self, + course_id, video_id, subtitle_language='en', resolution='540p'): - dom = get_page(self._session, OPENCOURSE_VIDEO_URL, - json=True, - video_id=video_id) - logging.debug('Parsing JSON for video_id <%s>.', video_id) + + dom = get_page(self._session, OPENCOURSE_ONDEMAND_LECTURE_VIDEOS_URL, + json=True, + course_id=course_id, + video_id=video_id) + dom = dom['linked']['onDemandVideos.v1'][0] + + videos = VideosV1.from_json(dom) video_content = {} - # videos - logging.debug('Gathering video URLs for video_id <%s>.', video_id) - sources = dom['sources'] - sources.sort(key=lambda src: src['resolution']) - sources.reverse() - - # Try to select resolution requested by the user. - filtered_sources = [source - for source in sources - if source['resolution'] == resolution] - - if len(filtered_sources) == 0: - # We will just use the 'vanilla' version of sources here, instead of - # filtered_sources. - logging.warning('Requested resolution %s not available for <%s>. ' - 'Downloading highest resolution available instead.', - resolution, video_id) - else: + if resolution in videos: + source = videos[resolution] logging.debug('Proceeding with download of resolution %s of <%s>.', resolution, video_id) - sources = filtered_sources + else: + source = videos.get_best() + logging.warning( + 'Requested resolution %s not available for <%s>. 
' + 'Downloading highest resolution (%s) available instead.', + resolution, video_id, source.resolution) - video_url = sources[0]['formatSources']['video/mp4'] - video_content['mp4'] = video_url + video_content['mp4'] = source.mp4_video_url - # subtitles and transcripts - subtitle_nodes = [ - ('subtitles', 'srt', 'subtitle'), - ('subtitlesTxt', 'txt', 'transcript'), - ] - for (subtitle_node, subtitle_extension, subtitle_description) in subtitle_nodes: - logging.debug('Gathering %s URLs for video_id <%s>.', subtitle_description, video_id) - subtitles = dom.get(subtitle_node) - if subtitles is not None: - if subtitle_language == 'all': - for current_subtitle_language in subtitles: - video_content[current_subtitle_language + '.' + subtitle_extension] = make_coursera_absolute_url(subtitles.get(current_subtitle_language)) - else: - if subtitle_language != 'en' and subtitle_language not in subtitles: - logging.warning("%s unavailable in '%s' language for video " - "with video id: [%s], falling back to 'en' " - "%s", subtitle_description.capitalize(), subtitle_language, video_id, subtitle_description) - subtitle_language = 'en' + subtitle_link = self._extract_subtitles_from_video_dom( + dom, subtitle_language, video_id) - subtitle_url = subtitles.get(subtitle_language) - if subtitle_url is not None: - # some subtitle urls are relative! - video_content[subtitle_language + '.' 
+ subtitle_extension] = make_coursera_absolute_url(subtitle_url) + for key, value in iteritems(subtitle_link): + video_content[key] = value lecture_video_content = {} for key, value in iteritems(video_content): @@ -751,6 +1070,103 @@ class CourseraOnDemand(object): return lecture_video_content + def _extract_subtitles_from_video_dom(self, video_dom, + subtitle_language, video_id): + # subtitles and transcripts + subtitle_nodes = [ + ('subtitles', 'srt', 'subtitle'), + ('subtitlesTxt', 'txt', 'transcript'), + ] + subtitle_set_download = set() + subtitle_set_nonexist = set() + subtitle_links = {} + for (subtitle_node, subtitle_extension, subtitle_description) \ + in subtitle_nodes: + logging.debug('Gathering %s URLs for video_id <%s>.', + subtitle_description, video_id) + subtitles = video_dom.get(subtitle_node) + download_all_subtitle = False + if subtitles is not None: + subtitles_set = set(subtitles) + requested_subtitle_list = [s.strip() for s in + subtitle_language.split(",")] + for language_with_alts in requested_subtitle_list: + if download_all_subtitle: + break + grouped_language_list = [l.strip() for l in + language_with_alts.split("|")] + for language in grouped_language_list: + if language == "all": + download_all_subtitle = True + break + elif language in subtitles_set: + subtitle_set_download.update([language]) + break + else: + subtitle_set_nonexist.update([language]) + + if download_all_subtitle and subtitles is not None: + subtitle_set_download = set(subtitles) + + if not download_all_subtitle and subtitle_set_nonexist: + logging.warning("%s unavailable in '%s' language for video " + "with video id: [%s]," + "%s", subtitle_description.capitalize(), + ", ".join(subtitle_set_nonexist), video_id, + subtitle_description) + if not subtitle_set_download: + logging.warning("%s all requested subtitles are unavailable," + "with video id: [%s], falling back to 'en' " + "%s", subtitle_description.capitalize(), + video_id, + subtitle_description) + 
subtitle_set_download = set(['en']) + + for current_subtitle_language in subtitle_set_download: + subtitle_url = subtitles.get(current_subtitle_language) + if subtitle_url is not None: + # some subtitle urls are relative! + subtitle_links[ + "%s.%s" % (current_subtitle_language, + subtitle_extension) + ] = make_coursera_absolute_url(subtitle_url) + return subtitle_links + + def extract_links_from_programming_immediate_instructions(self, element_id): + """ + Return a dictionary with links to supplement files (pdf, csv, zip, + ipynb, html and so on) extracted from graded programming assignment. + + @param element_id: Element ID to extract files from. + @type element_id: str + + @return: @see CourseraOnDemand._extract_links_from_text + """ + logging.debug('Extracting links from programming immediate ' + 'instructions for element_id <%s>.', element_id) + + try: + # Assignment text (instructions) contains asset tags which describe + # supplementary files. + text = ''.join( + self._extract_programming_immediate_instructions_text(element_id)) + if not text: + return {} + + supplement_links = self._extract_links_from_text(text) + instructions = (IN_MEMORY_MARKER + self._markup_to_html(text), + 'instructions') + extend_supplement_links( + supplement_links, {IN_MEMORY_EXTENSION: [instructions]}) + return supplement_links + except requests.exceptions.HTTPError as exception: + logging.error('Could not download programming assignment %s: %s', + element_id, exception) + if is_debug_run(): + logging.exception('Could not download programming assignment %s: %s', + element_id, exception) + return None + def extract_links_from_programming(self, element_id): """ Return a dictionary with links to supplement files (pdf, csv, zip, @@ -761,7 +1177,8 @@ class CourseraOnDemand(object): @return: @see CourseraOnDemand._extract_links_from_text """ - logging.debug('Gathering supplement URLs for element_id <%s>.', element_id) + logging.debug( + 'Gathering supplement URLs for element_id <%s>.', 
element_id) try: # Assignment text (instructions) contains asset tags which describe @@ -784,6 +1201,40 @@ class CourseraOnDemand(object): element_id, exception) return None + def extract_links_from_peer_assignment(self, element_id): + """ + Return a dictionary with links to supplement files (pdf, csv, zip, + ipynb, html and so on) extracted from peer assignment. + + @param element_id: Element ID to extract files from. + @type element_id: str + + @return: @see CourseraOnDemand._extract_links_from_text + """ + logging.debug( + 'Gathering supplement URLs for element_id <%s>.', element_id) + + try: + # Assignment text (instructions) contains asset tags which describe + # supplementary files. + text = ''.join(self._extract_peer_assignment_text(element_id)) + if not text: + return {} + + supplement_links = self._extract_links_from_text(text) + instructions = (IN_MEMORY_MARKER + self._markup_to_html(text), + 'peer_assignment_instructions') + extend_supplement_links( + supplement_links, {IN_MEMORY_EXTENSION: [instructions]}) + return supplement_links + except requests.exceptions.HTTPError as exception: + logging.error('Could not download peer assignment %s: %s', + element_id, exception) + if is_debug_run(): + logging.exception('Could not download peer assignment %s: %s', + element_id, exception) + return None + def extract_links_from_supplement(self, element_id): """ Return a dictionary with supplement files (pdf, csv, zip, ipynb, html @@ -791,13 +1242,14 @@ class CourseraOnDemand(object): @return: @see CourseraOnDemand._extract_links_from_text """ - logging.debug('Gathering supplement URLs for element_id <%s>.', element_id) + logging.debug( + 'Gathering supplement URLs for element_id <%s>.', element_id) try: dom = get_page(self._session, OPENCOURSE_SUPPLEMENT_URL, - json=True, - course_id=self._course_id, - element_id=element_id) + json=True, + course_id=self._course_id, + element_id=element_id) supplement_content = {} @@ -876,6 +1328,113 @@ class 
CourseraOnDemand(object): 'url': element['url'].strip()} for element in dom['elements']] + def extract_references_poll(self): + try: + dom = get_page(self._session, + OPENCOURSE_REFERENCES_POLL_URL.format( + course_id=self._course_id), + json=True + ) + logging.info('Downloaded resource poll (%d bytes)', len(dom)) + return dom['elements'] + + except requests.exceptions.HTTPError as exception: + logging.error('Could not download resource section: %s', + exception) + if is_debug_run(): + logging.exception('Could not download resource section: %s', + exception) + return None + + def extract_links_from_reference(self, short_id): + """ + Return a dictionary with supplement files (pdf, csv, zip, ipynb, html + and so on) extracted from supplement page. + + @return: @see CourseraOnDemand._extract_links_from_text + """ + logging.debug('Gathering resource URLs for short_id <%s>.', short_id) + + try: + dom = get_page(self._session, OPENCOURSE_REFERENCE_ITEM_URL, + json=True, + course_id=self._course_id, + short_id=short_id) + + resource_content = {} + + # Supplement content has structure as follows: + # 'linked' { + # 'openCourseAssets.v1' [ { + # 'definition' { + # 'value' + + for asset in dom['linked']['openCourseAssets.v1']: + value = asset['definition']['value'] + # Supplement lecture types are known to contain both tags + # and tags (depending on the course), so we extract + # both of them. 
+ extend_supplement_links( + resource_content, self._extract_links_from_text(value)) + + instructions = (IN_MEMORY_MARKER + self._markup_to_html(value), + 'resources') + extend_supplement_links( + resource_content, {IN_MEMORY_EXTENSION: [instructions]}) + + return resource_content + except requests.exceptions.HTTPError as exception: + logging.error('Could not download supplement %s: %s', + short_id, exception) + if is_debug_run(): + logging.exception('Could not download supplement %s: %s', + short_id, exception) + return None + + def _extract_programming_immediate_instructions_text(self, element_id): + """ + Extract assignment text (instructions). + + @param element_id: Element id to extract assignment instructions from. + @type element_id: str + + @return: List of assignment text (instructions). + @rtype: [str] + """ + dom = get_page(self._session, OPENCOURSE_PROGRAMMING_IMMEDIATE_INSTRUCTIOINS_URL, + json=True, + course_id=self._course_id, + element_id=element_id) + + return [element['assignmentInstructions']['definition']['value'] + for element in dom['elements']] + + def _extract_notebook_text(self, element_id): + """ + Extract notebook text (instructions). + + @param element_id: Element id to extract notebook links. + @type element_id: str + + @return: Notebook URL. + @rtype: [str] + """ + headers = self._auth_headers_with_json() + data = {'courseId': self._course_id, + 'learnerId': self._user_id, 'itemId': element_id} + dom = get_page(self._session, OPENCOURSE_NOTEBOOK_LAUNCHES, + post=True, + json=True, + user_id=self._user_id, + course_id=self._course_id, + headers=headers, + element_id=element_id, + data=json.dumps(data) + ) + + # Return authorization id. This id changes on each request + return dom['elements'][0]['authorizationId'] + def _extract_assignment_text(self, element_id): """ Extract assignment text (instructions). 
@@ -895,6 +1454,43 @@ class CourseraOnDemand(object): ['assignmentInstructions']['definition']['value'] for element in dom['elements']] + def _extract_peer_assignment_text(self, element_id): + """ + Extract peer assignment text (instructions). + + @param element_id: Element id to extract peer assignment instructions from. + @type element_id: str + + @return: List of peer assignment text (instructions). + @rtype: [str] + """ + dom = get_page(self._session, OPENCOURSE_PEER_ASSIGNMENT_INSTRUCTIONS, + json=True, + user_id=self._user_id, + course_id=self._course_id, + element_id=element_id) + + result = [] + + for element in dom['elements']: + # There is only one section with Instructions + if 'introduction' in element['instructions']: + result.append(element['instructions'] + ['introduction']['definition']['value']) + + # But there may be multiple sections in Sections + for section in element['instructions'].get('sections', []): + section_value = section['content']['definition']['value'] + section_title = section.get('title') + if section_title is not None: + # If section title is present, put it in the beginning of + # section value as if it was there. + section_value = ('%s' % + section_title) + section_value + result.append(section_value) + + return result + def _extract_links_from_text(self, text): """ Extract supplement links from the html text. Links may be provided diff --git a/coursera/commandline.py b/coursera/commandline.py index dc9a2bb..2fd80d7 100644 --- a/coursera/commandline.py +++ b/coursera/commandline.py @@ -6,13 +6,15 @@ handling. The primary candidate is argument parser. 
import os import sys import logging -import argparse +import configargparse as argparse from coursera import __version__ from .credentials import get_credentials, CredentialsError, keyring from .utils import decode_input +LOCAL_CONF_FILE_NAME = 'coursera-dl.conf' + def class_name_arg_required(args): """ @@ -33,321 +35,418 @@ def parse_args(args=None): Parse the arguments/options passed to the program on the command line. """ - parser = argparse.ArgumentParser( - description='Download Coursera.org lecture material and resources.') + parse_kwargs = { + "description": 'Download Coursera.org lecture material and resources.' + } + + conf_file_path = os.path.join(os.getcwd(), LOCAL_CONF_FILE_NAME) + if os.path.isfile(conf_file_path): + parse_kwargs["default_config_files"] = [conf_file_path] + parser = argparse.ArgParser(**parse_kwargs) # Basic options group_basic = parser.add_argument_group('Basic options') - group_basic.add_argument('class_names', - action='store', - nargs='*', - help='name(s) of the class(es) (e.g. "ml-005")') + group_basic.add_argument( + 'class_names', + action='store', + nargs='*', + help='name(s) of the class(es) (e.g. "ml-005")') - group_basic.add_argument('-u', - '--username', - dest='username', - action='store', - default=None, - help='coursera username') + group_basic.add_argument( + '-u', + '--username', + dest='username', + action='store', + default=None, + help='username (email) that you use to login to Coursera') - group_basic.add_argument('-p', - '--password', - dest='password', - action='store', - default=None, - help='coursera password') + group_basic.add_argument( + '-p', + '--password', + dest='password', + action='store', + default=None, + help='coursera password') - group_basic.add_argument('--jobs', - dest='jobs', - action='store', - default=1, - type=int, - help='number of parallel jobs to use for ' - 'downloading resources. 
(Default: 1)') + group_basic.add_argument( + '--jobs', + dest='jobs', + action='store', + default=1, + type=int, + help='number of parallel jobs to use for ' + 'downloading resources. (Default: 1)') - group_basic.add_argument('--download-delay', - dest='download_delay', - action='store', - default=60, - type=int, - help='number of seconds to wait before downloading ' - 'next course. (Default: 60)') + group_basic.add_argument( + '--download-delay', + dest='download_delay', + action='store', + default=60, + type=int, + help='number of seconds to wait before downloading ' + 'next course. (Default: 60)') - group_basic.add_argument('-b', # FIXME: kill this one-letter option - '--preview', - dest='preview', - action='store_true', - default=False, - help='get videos from preview pages. (Default: False)') + group_basic.add_argument( + '-b', # FIXME: kill this one-letter option + '--preview', + dest='preview', + action='store_true', + default=False, + help='get videos from preview pages. (Default: False)') - group_basic.add_argument('--path', - dest='path', - action='store', - default='', - help='path to where to save the file. (Default: current directory)') + group_basic.add_argument( + '--path', + dest='path', + action='store', + default='', + help='path to where to save the file. (Default: current directory)') - group_basic.add_argument('-sl', # FIXME: deprecate this option - '--subtitle-language', - dest='subtitle_language', - action='store', - default='all', - help='Choose language to download subtitles and transcripts. (Default: all)' - 'Use special value "all" to download all available.') + group_basic.add_argument( + '-sl', # FIXME: deprecate this option + '--subtitle-language', + dest='subtitle_language', + action='store', + default='all', + help='Choose language to download subtitles and transcripts.' + '(Default: all) Use special value "all" to download all available.' 
+ 'To download subtitles and transcripts of multiple languages,' + 'use comma(s) (without spaces) to seperate the names of the languages,' + ' i.e., "en,zh-CN".' + 'To download subtitles and transcripts of alternative language(s) ' + 'if only the current language is not available,' + 'put an "|" for each of the alternative languages after ' + 'the current language, i.e., "en|fr,zh-CN|zh-TW|de", and make sure ' + 'the parameter are wrapped with quotes when "|" presents.' + + ) # Selection of material to download - group_material = parser.add_argument_group('Selection of material to download') + group_material = parser.add_argument_group( + 'Selection of material to download') - group_material.add_argument('--only-syllabus', - dest='only_syllabus', - action='store_true', - default=False, - help='download only syllabus, skip course content. ' - '(Default: False)') + group_material.add_argument( + '--specialization', + dest='specialization', + action='store_true', + default=False, + help='treat given class names as specialization names and try to ' + 'download its courses, if available. Note that there are name ' + 'clashes, e.g. "machine-learning" is both a course and a ' + 'specialization (Default: False)') - group_material.add_argument('--download-quizzes', - dest='download_quizzes', - action='store_true', - default=False, - help='download quiz and exam questions. (Default: False)') + group_material.add_argument( + '--only-syllabus', + dest='only_syllabus', + action='store_true', + default=False, + help='download only syllabus, skip course content. ' + '(Default: False)') - group_material.add_argument('--about', # FIXME: should be --about-course - dest='about', - action='store_true', - default=False, - help='download "about" metadata. (Default: False)') + group_material.add_argument( + '--download-quizzes', + dest='download_quizzes', + action='store_true', + default=False, + help='download quiz and exam questions. 
(Default: False)') - group_material.add_argument('-f', - '--formats', - dest='file_formats', - action='store', - default='all', - help='file format extensions to be downloaded in' - ' quotes space separated, e.g. "mp4 pdf" ' - '(default: special value "all")') + group_material.add_argument( + '--download-notebooks', + dest='download_notebooks', + action='store_true', + default=False, + help='download Python Jupyther Notebooks. (Default: False)') - group_material.add_argument('--ignore-formats', - dest='ignore_formats', - action='store', - default=None, - help='file format extensions of resources to ignore' - ' (default: None)') + group_material.add_argument( + '--about', # FIXME: should be --about-course + dest='about', + action='store_true', + default=False, + help='download "about" metadata. (Default: False)') - group_material.add_argument('-sf', # FIXME: deprecate this option - '--section_filter', - dest='section_filter', - action='store', - default=None, - help='only download sections which contain this' - ' regex (default: disabled)') + group_material.add_argument( + '-f', + '--formats', + dest='file_formats', + action='store', + default='all', + help='file format extensions to be downloaded in' + ' quotes space separated, e.g. 
"mp4 pdf" ' + '(default: special value "all")') - group_material.add_argument('-lf', # FIXME: deprecate this option - '--lecture_filter', - dest='lecture_filter', - action='store', - default=None, - help='only download lectures which contain this regex' - ' (default: disabled)') + group_material.add_argument( + '--ignore-formats', + dest='ignore_formats', + action='store', + default=None, + help='file format extensions of resources to ignore' + ' (default: None)') - group_material.add_argument('-rf', # FIXME: deprecate this option - '--resource_filter', - dest='resource_filter', - action='store', - default=None, - help='only download resources which match this regex' - ' (default: disabled)') + group_material.add_argument( + '-sf', # FIXME: deprecate this option + '--section_filter', + dest='section_filter', + action='store', + default=None, + help='only download sections which contain this' + ' regex (default: disabled)') - group_material.add_argument('--video-resolution', - dest='video_resolution', - action='store', - default='540p', - help='video resolution to download (default: 540p); ' - 'only valid for on-demand courses; ' - 'only values allowed: 360p, 540p, 720p') + group_material.add_argument( + '-lf', # FIXME: deprecate this option + '--lecture_filter', + dest='lecture_filter', + action='store', + default=None, + help='only download lectures which contain this regex' + ' (default: disabled)') - group_material.add_argument('--disable-url-skipping', - dest='disable_url_skipping', - action='store_true', - default=False, - help='disable URL skipping, all URLs will be ' - 'downloaded (default: False)') + group_material.add_argument( + '-rf', # FIXME: deprecate this option + '--resource_filter', + dest='resource_filter', + action='store', + default=None, + help='only download resources which match this regex' + ' (default: disabled)') + + group_material.add_argument( + '--video-resolution', + dest='video_resolution', + action='store', + default='540p', + 
help='video resolution to download (default: 540p); ' + 'only valid for on-demand courses; ' + 'only values allowed: 360p, 540p, 720p') + + group_material.add_argument( + '--disable-url-skipping', + dest='disable_url_skipping', + action='store_true', + default=False, + help='disable URL skipping, all URLs will be ' + 'downloaded (default: False)') # Parameters related to external downloaders group_external_dl = parser.add_argument_group('External downloaders') - group_external_dl.add_argument('--wget', - dest='wget', - action='store', - nargs='?', - const='wget', - default=None, - help='use wget for downloading,' - 'optionally specify wget bin') - group_external_dl.add_argument('--curl', - dest='curl', - action='store', - nargs='?', - const='curl', - default=None, - help='use curl for downloading,' - ' optionally specify curl bin') - group_external_dl.add_argument('--aria2', - dest='aria2', - action='store', - nargs='?', - const='aria2c', - default=None, - help='use aria2 for downloading,' - ' optionally specify aria2 bin') - group_external_dl.add_argument('--axel', - dest='axel', - action='store', - nargs='?', - const='axel', - default=None, - help='use axel for downloading,' - ' optionally specify axel bin') - group_external_dl.add_argument('--downloader-arguments', - dest='downloader_arguments', - default='', - help='additional arguments passed to the' - ' downloader') + group_external_dl.add_argument( + '--wget', + dest='wget', + action='store', + nargs='?', + const='wget', + default=None, + help='use wget for downloading,' + 'optionally specify wget bin') - parser.add_argument('--list-courses', - dest='list_courses', - action='store_true', - default=False, - help='list course names (slugs) and quit. 
Listed ' - 'course names can be put into program arguments') + group_external_dl.add_argument( + '--curl', + dest='curl', + action='store', + nargs='?', + const='curl', + default=None, + help='use curl for downloading,' + ' optionally specify curl bin') - parser.add_argument('--resume', - dest='resume', - action='store_true', - default=False, - help='resume incomplete downloads (default: False)') + group_external_dl.add_argument( + '--aria2', + dest='aria2', + action='store', + nargs='?', + const='aria2c', + default=None, + help='use aria2 for downloading,' + ' optionally specify aria2 bin') - parser.add_argument('-o', - '--overwrite', - dest='overwrite', - action='store_true', - default=False, - help='whether existing files should be overwritten' - ' (default: False)') + group_external_dl.add_argument( + '--axel', + dest='axel', + action='store', + nargs='?', + const='axel', + default=None, + help='use axel for downloading,' + ' optionally specify axel bin') - parser.add_argument('--verbose-dirs', - dest='verbose_dirs', - action='store_true', - default=False, - help='include class name in section directory name') + group_external_dl.add_argument( + '--downloader-arguments', + dest='downloader_arguments', + default='', + help='additional arguments passed to the' + ' downloader') - parser.add_argument('--quiet', - dest='quiet', - action='store_true', - default=False, - help='omit as many messages as possible' - ' (only printing errors)') + parser.add_argument( + '--list-courses', + dest='list_courses', + action='store_true', + default=False, + help='list course names (slugs) and quit. 
Listed ' + 'course names can be put into program arguments') - parser.add_argument('-r', - '--reverse', - dest='reverse', - action='store_true', - default=False, - help='download sections in reverse order') + parser.add_argument( + '--resume', + dest='resume', + action='store_true', + default=False, + help='resume incomplete downloads (default: False)') - parser.add_argument('--combined-section-lectures-nums', - dest='combined_section_lectures_nums', - action='store_true', - default=False, - help='include lecture and section name in final files') + parser.add_argument( + '-o', + '--overwrite', + dest='overwrite', + action='store_true', + default=False, + help='whether existing files should be overwritten' + ' (default: False)') - parser.add_argument('--unrestricted-filenames', - dest='unrestricted_filenames', - action='store_true', - default=False, - help='Do not limit filenames to be ASCII-only') + parser.add_argument( + '--verbose-dirs', + dest='verbose_dirs', + action='store_true', + default=False, + help='include class name in section directory name') + + parser.add_argument( + '--quiet', + dest='quiet', + action='store_true', + default=False, + help='omit as many messages as possible' + ' (only printing errors)') + + parser.add_argument( + '-r', + '--reverse', + dest='reverse', + action='store_true', + default=False, + help='download sections in reverse order') + + parser.add_argument( + '--combined-section-lectures-nums', + dest='combined_section_lectures_nums', + action='store_true', + default=False, + help='include lecture and section name in final files') + + parser.add_argument( + '--unrestricted-filenames', + dest='unrestricted_filenames', + action='store_true', + default=False, + help='Do not limit filenames to be ASCII-only') # Advanced authentication - group_adv_auth = parser.add_argument_group('Advanced authentication options') + group_adv_auth = parser.add_argument_group( + 'Advanced authentication options') - group_adv_auth.add_argument('-c', - 
'--cookies_file', - dest='cookies_file', - action='store', - default=None, - help='full path to the cookies.txt file') + group_adv_auth.add_argument( + '-ca', + '--cauth', + dest='cookies_cauth', + action='store', + default=None, + help='cauth cookie value from browser') - group_adv_auth.add_argument('-n', - '--netrc', - dest='netrc', - nargs='?', - action='store', - const=True, - default=False, - help='use netrc for reading passwords, uses default' - ' location if no path specified') + group_adv_auth.add_argument( + '-c', + '--cookies_file', + dest='cookies_file', + action='store', + default=None, + help='full path to the cookies.txt file') - group_adv_auth.add_argument('-k', - '--keyring', - dest='use_keyring', - action='store_true', - default=False, - help='use keyring provided by operating system to ' - 'save and load credentials') + group_adv_auth.add_argument( + '-n', + '--netrc', + dest='netrc', + nargs='?', + action='store', + const=True, + default=False, + help='use netrc for reading passwords, uses default' + ' location if no path specified') - group_adv_auth.add_argument('--clear-cache', - dest='clear_cache', - action='store_true', - default=False, - help='clear cached cookies') + group_adv_auth.add_argument( + '-k', + '--keyring', + dest='use_keyring', + action='store_true', + default=False, + help='use keyring provided by operating system to ' + 'save and load credentials') + + group_adv_auth.add_argument( + '--clear-cache', + dest='clear_cache', + action='store_true', + default=False, + help='clear cached cookies') # Advanced miscellaneous options - group_adv_misc = parser.add_argument_group('Advanced miscellaneous options') + group_adv_misc = parser.add_argument_group( + 'Advanced miscellaneous options') - group_adv_misc.add_argument('--hook', - dest='hooks', - action='append', - default=[], - help='hooks to run when finished') + group_adv_misc.add_argument( + '--hook', + dest='hooks', + action='append', + default=[], + help='hooks to run when 
finished') - group_adv_misc.add_argument('-pl', - '--playlist', - dest='playlist', - action='store_true', - default=False, - help='generate M3U playlists for course weeks') + group_adv_misc.add_argument( + '-pl', + '--playlist', + dest='playlist', + action='store_true', + default=False, + help='generate M3U playlists for course weeks') + + group_adv_misc.add_argument( + '--mathjax-cdn', + dest='mathjax_cdn_url', + default='https://cdn.mathjax.org/mathjax/latest/MathJax.js', + help='the cdn address of MathJax.js' + ) # Debug options group_debug = parser.add_argument_group('Debugging options') - group_debug.add_argument('--skip-download', - dest='skip_download', - action='store_true', - default=False, - help='for debugging: skip actual downloading of files') + group_debug.add_argument( + '--skip-download', + dest='skip_download', + action='store_true', + default=False, + help='for debugging: skip actual downloading of files') - group_debug.add_argument('--debug', - dest='debug', - action='store_true', - default=False, - help='print lots of debug information') + group_debug.add_argument( + '--debug', + dest='debug', + action='store_true', + default=False, + help='print lots of debug information') - group_debug.add_argument('--cache-syllabus', - dest='cache_syllabus', - action='store_true', - default=False, - help='cache course syllabus into a file') + group_debug.add_argument( + '--cache-syllabus', + dest='cache_syllabus', + action='store_true', + default=False, + help='cache course syllabus into a file') - group_debug.add_argument('--version', - dest='version', - action='store_true', - default=False, - help='display version and exit') + group_debug.add_argument( + '--version', + dest='version', + action='store_true', + default=False, + help='display version and exit') - group_debug.add_argument('-l', # FIXME: remove short option from rarely used ones - '--process_local_page', - dest='local_page', - help='uses or creates local cached version of syllabus' - ' page') + 
group_debug.add_argument( + '-l', # FIXME: remove short option from rarely used ones + '--process_local_page', + dest='local_page', + help='uses or creates local cached version of syllabus' + ' page') # Final parsing of the options args = parser.parse_args(args) @@ -372,7 +471,8 @@ def parse_args(args=None): # show version? if args.version: # we use print (not logging) function because version may be used - # by some external script while logging may output excessive information + # by some external script while logging may output excessive + # information print(__version__) sys.exit(0) @@ -388,7 +488,8 @@ def parse_args(args=None): # check arguments if args.use_keyring and args.password: - logging.warning('--keyring and --password cannot be specified together') + logging.warning( + '--keyring and --password cannot be specified together') args.use_keyring = False if args.use_keyring and not keyring: @@ -399,7 +500,7 @@ def parse_args(args=None): logging.error('Cookies file not found: %s', args.cookies_file) sys.exit(1) - if not args.cookies_file: + if not args.cookies_file and not args.cookies_cauth: try: args.username, args.password = get_credentials( username=args.username, password=args.password, @@ -409,5 +510,3 @@ def parse_args(args=None): sys.exit(1) return args - - diff --git a/coursera/cookies.py b/coursera/cookies.py index 36fa1df..f12ed8f 100644 --- a/coursera/cookies.py +++ b/coursera/cookies.py @@ -25,7 +25,7 @@ from .utils import mkdir_p, random_string # Monkey patch cookielib.Cookie.__init__. # Reason: The expires value may be a decimal string, # but the Cookie class uses int() ... 
-__orginal_init__ = cookielib.Cookie.__init__ +__original_init__ = cookielib.Cookie.__init__ def __fixed_init__(self, version, name, value, @@ -41,7 +41,7 @@ def __fixed_init__(self, version, name, value, rfc2109=False): if expires is not None: expires = float(expires) - __orginal_init__(self, version, name, value, + __original_init__(self, version, name, value, port, port_specified, domain, domain_specified, domain_initial_dot, path, path_specified, @@ -53,6 +53,7 @@ def __fixed_init__(self, version, name, value, rest, rfc2109=False) + cookielib.Cookie.__init__ = __fixed_init__ @@ -68,15 +69,15 @@ class AuthenticationFailed(BaseException): """ -def prepape_auth_headers(session, include_cauth=False): +def prepare_auth_headers(session, include_cauth=False): """ - This function prepapes headers with CSRF/CAUTH tokens that can + This function prepares headers with CSRF/CAUTH tokens that can be used in POST requests such as login/get_quiz. @param session: Requests session. @type session: requests.Session - @param include_cauth: Flag that indicates whethe CAUTH cookies should be + @param include_cauth: Flag that indicates whether CAUTH cookies should be included as well. @type include_cauth: bool @@ -132,7 +133,7 @@ def login(session, username, password, class_name=None): logging.error(e) raise ClassNotFound(class_name) - headers = prepape_auth_headers(session, include_cauth=False) + headers = prepare_auth_headers(session, include_cauth=False) data = { 'email': username, @@ -170,7 +171,8 @@ def down_the_wabbit_hole(session, class_name): try: r.raise_for_status() except requests.exceptions.HTTPError as e: - raise AuthenticationFailed('Cannot login on class.coursera.org: %s' % e) + raise AuthenticationFailed( + 'Cannot login on class.coursera.org: %s' % e) logging.debug('Exiting "deep" authentication.') @@ -353,7 +355,7 @@ def get_cookies_for_class(session, class_name, Get the cookies for the given class. 
We do not validate the cookies if they are loaded from a cookies file - because this is intented for debugging purposes or if the coursera + because this is intended for debugging purposes or if the coursera authentication process has changed. """ if cookies_file: @@ -375,8 +377,9 @@ class TLSAdapter(HTTPAdapter): A customized HTTP Adapter which uses TLS v1.2 for encrypted connections. """ + def init_poolmanager(self, connections, maxsize, block=False): self.poolmanager = PoolManager(num_pools=connections, maxsize=maxsize, block=block, - ssl_version=ssl.PROTOCOL_TLSv1) + ssl_version=ssl.PROTOCOL_TLSv1_2) diff --git a/coursera/coursera_dl.py b/coursera/coursera_dl.py index 124868e..da18281 100644 --- a/coursera/coursera_dl.py +++ b/coursera/coursera_dl.py @@ -3,7 +3,7 @@ # Authors and copyright: # © 2012-2013, John Lehmann (first last at geemail dotcom or @jplehmann) -# © 2012-2015, Rogério Brito (r lastname at ime usp br) +# © 2012-2020, Rogério Theodoro de Brito # © 2013, Jonas De Taeye (first dt at fastmail fm) # # Contributions are welcome, but please add new unit tests to test your changes @@ -60,15 +60,17 @@ import requests from .cookies import ( AuthenticationFailed, ClassNotFound, - get_cookies_for_class, make_cookie_values, TLSAdapter) + get_cookies_for_class, make_cookie_values, TLSAdapter, login) from .define import (CLASS_URL, ABOUT_URL, PATH_CACHE) from .downloaders import get_downloader from .workflow import CourseraDownloader from .parallel import ConsecutiveDownloader, ParallelDownloader from .utils import (clean_filename, get_anchor_format, mkdir_p, fix_url, print_ssl_error_message, - decode_input, BeautifulSoup, is_debug_run) + decode_input, BeautifulSoup, is_debug_run, + spit_json, slurp_json) +from .api import expand_specializations from .network import get_page, get_page_and_url from .commandline import parse_args from .extractors import CourseraExtractor @@ -103,46 +105,48 @@ def list_courses(args): @type args: namedtuple """ session = 
get_session() - extractor = CourseraExtractor(session, args.username, args.password) + login(session, args.username, args.password) + extractor = CourseraExtractor(session) courses = extractor.list_courses() logging.info('Found %d courses', len(courses)) for course in courses: logging.info(course) -def download_on_demand_class(args, class_name): +def download_on_demand_class(session, args, class_name): """ - Download all requested resources from the on-demand class given in class_name. + Download all requested resources from the on-demand class given + in class_name. @return: Tuple of (bool, bool), where the first bool indicates whether - errors occured while parsing syllabus, the second bool indicaters + errors occurred while parsing syllabus, the second bool indicates whether the course appears to be completed. @rtype: (bool, bool) """ - error_occured = False - session = get_session() - extractor = CourseraExtractor(session, args.username, args.password) + error_occurred = False + extractor = CourseraExtractor(session) cached_syllabus_filename = '%s-syllabus-parsed.json' % class_name if args.cache_syllabus and os.path.isfile(cached_syllabus_filename): - with open(cached_syllabus_filename) as syllabus_file: - modules = json.load(syllabus_file) + modules = slurp_json(cached_syllabus_filename) else: - error_occured, modules = extractor.get_modules( + error_occurred, modules = extractor.get_modules( class_name, args.reverse, args.unrestricted_filenames, args.subtitle_language, args.video_resolution, - args.download_quizzes) + args.download_quizzes, + args.mathjax_cdn_url, + args.download_notebooks + ) if is_debug_run or args.cache_syllabus(): - with open(cached_syllabus_filename, 'w') as file_object: - json.dump(modules, file_object, indent=4) + spit_json(modules, cached_syllabus_filename) if args.only_syllabus: - return error_occured, False + return error_occurred, False downloader = get_downloader(session, class_name, args) downloader_wrapper = 
ParallelDownloader(downloader, args.jobs) \ @@ -174,7 +178,7 @@ def download_on_demand_class(args, class_name): if course_downloader.failed_urls: print_failed_urls(course_downloader.failed_urls) - return error_occured, completed + return error_occurred, completed def print_skipped_urls(skipped_urls): @@ -197,17 +201,17 @@ def print_failed_urls(failed_urls): logging.info('-' * 80) -def download_class(args, class_name): +def download_class(session, args, class_name): """ Try to download on-demand class. @return: Tuple of (bool, bool), where the first bool indicates whether - errors occured while parsing syllabus, the second bool indicaters + errors occurred while parsing syllabus, the second bool indicates whether the course appears to be completed. @rtype: (bool, bool) """ logging.debug('Downloading new style (on demand) class %s', class_name) - return download_on_demand_class(args, class_name) + return download_on_demand_class(session, args, class_name) def main(): @@ -228,14 +232,23 @@ def main(): list_courses(args) return + session = get_session() + if args.cookies_cauth: + session.cookies.set('CAUTH', args.cookies_cauth) + else: + login(session, args.username, args.password) + if args.specialization: + args.class_names = expand_specializations(session, args.class_names) + for class_index, class_name in enumerate(args.class_names): try: logging.info('Downloading class: %s (%d / %d)', class_name, class_index + 1, len(args.class_names)) - error_occured, completed = download_class(args, class_name) + error_occurred, completed = download_class( + session, args, class_name) if completed: completed_classes.append(class_name) - if error_occured: + if error_occurred: classes_with_errors.append(class_name) except requests.exceptions.HTTPError as e: logging.error('HTTPError %s', e) @@ -246,10 +259,10 @@ def main(): print_ssl_error_message(e) if is_debug_run(): raise - except ClassNotFound as cnf: - logging.error('Could not find class: %s', cnf) - except 
AuthenticationFailed as af: - logging.error('Could not authenticate: %s', af) + except ClassNotFound as e: + logging.error('Could not find class: %s', e) + except AuthenticationFailed as e: + logging.error('Could not authenticate: %s', e) if class_index + 1 != len(args.class_names): logging.info('Sleeping for %d seconds before downloading next course. ' diff --git a/coursera/credentials.py b/coursera/credentials.py index d6a36cd..aacd1d1 100644 --- a/coursera/credentials.py +++ b/coursera/credentials.py @@ -134,7 +134,8 @@ def authenticate_through_netrc(path=None): error_messages = '\n'.join(str(e) for e in errors) raise CredentialsError( - 'Did not find valid netrc file:\n' + error_messages) + 'Did not find valid netrc file:\n' + error_messages + + '\nPlease run this command: chmod og-rw ~/.netrc') def get_credentials(username=None, password=None, netrc=None, use_keyring=False): diff --git a/coursera/define.py b/coursera/define.py index 1a72241..64d3f64 100644 --- a/coursera/define.py +++ b/coursera/define.py @@ -11,13 +11,13 @@ import tempfile HTTP_FORBIDDEN = 403 -COURSERA_URL = 'https://www.coursera.org' +COURSERA_URL = 'https://api.coursera.org' AUTH_URL = 'https://accounts.coursera.org/api/v1/login' -AUTH_URL_V3 = 'https://www.coursera.org/api/login/v3' +AUTH_URL_V3 = 'https://api.coursera.org/api/login/v3' CLASS_URL = 'https://class.coursera.org/{class_name}' -# The following link is left just for illustative purposes: -# https://www.coursera.org/api/courses.v1?fields=display%2CpartnerIds%2CphotoUrl%2CstartDate%2Cpartners.v1(homeLink%2Cname)&includes=partnerIds&q=watchlist&start=0 +# The following link is left just for illustrative purposes: +# https://api.coursera.org/api/courses.v1?fields=display%2CpartnerIds%2CphotoUrl%2CstartDate%2Cpartners.v1(homeLink%2Cname)&includes=partnerIds&q=watchlist&start=0 # Reply is as follows: # { # "elements": [ @@ -34,10 +34,10 @@ CLASS_URL = 'https://class.coursera.org/{class_name}' # }, # "linked": {} # } 
-OPENCOURSE_LIST_COURSES = 'https://www.coursera.org/api/courses.v1?q=watchlist&start={start}' +OPENCOURSE_LIST_COURSES = 'https://api.coursera.org/api/courses.v1?q=watchlist&start={start}' -# The following link is left just for illustative purposes: -# https://www.coursera.org/api/memberships.v1?fields=courseId,enrolledTimestamp,grade,id,lastAccessedTimestamp,onDemandSessionMembershipIds,onDemandSessionMemberships,role,v1SessionId,vc,vcMembershipId,courses.v1(courseStatus,display,partnerIds,photoUrl,specializations,startDate,v1Details,v2Details),partners.v1(homeLink,name),v1Details.v1(sessionIds),v1Sessions.v1(active,certificatesReleased,dbEndDate,durationString,hasSigTrack,startDay,startMonth,startYear),v2Details.v1(onDemandSessions,plannedLaunchDate,sessionsEnabledAt),specializations.v1(logo,name,partnerIds,shortName)&includes=courseId,onDemandSessionMemberships,vcMembershipId,courses.v1(partnerIds,specializations,v1Details,v2Details),v1Details.v1(sessionIds),v2Details.v1(onDemandSessions),specializations.v1(partnerIds)&q=me&showHidden=true&filter=current,preEnrolled +# The following link is left just for illustrative purposes: +# https://api.coursera.org/api/memberships.v1?fields=courseId,enrolledTimestamp,grade,id,lastAccessedTimestamp,onDemandSessionMembershipIds,onDemandSessionMemberships,role,v1SessionId,vc,vcMembershipId,courses.v1(courseStatus,display,partnerIds,photoUrl,specializations,startDate,v1Details,v2Details),partners.v1(homeLink,name),v1Details.v1(sessionIds),v1Sessions.v1(active,certificatesReleased,dbEndDate,durationString,hasSigTrack,startDay,startMonth,startYear),v2Details.v1(onDemandSessions,plannedLaunchDate,sessionsEnabledAt),specializations.v1(logo,name,partnerIds,shortName)&includes=courseId,onDemandSessionMemberships,vcMembershipId,courses.v1(partnerIds,specializations,v1Details,v2Details),v1Details.v1(sessionIds),v2Details.v1(onDemandSessions),specializations.v1(partnerIds)&q=me&showHidden=true&filter=current,preEnrolled # Sample 
reply: # { # "elements": [ @@ -60,13 +60,21 @@ OPENCOURSE_LIST_COURSES = 'https://www.coursera.org/api/courses.v1?q=watchlist&s # ] # } # } -OPENCOURSE_MEMBERSHIPS = 'https://www.coursera.org/api/memberships.v1?includes=courseId,courses.v1&q=me&showHidden=true&filter=current,preEnrolled' -OPENCOURSE_CONTENT_URL = 'https://www.coursera.org/api/opencourse.v1/course/{class_name}?showLockedItems=true' -OPENCOURSE_VIDEO_URL = 'https://www.coursera.org/api/opencourse.v1/video/{video_id}' -OPENCOURSE_SUPPLEMENT_URL = 'https://www.coursera.org/api/onDemandSupplements.v1/'\ +OPENCOURSE_MEMBERSHIPS = 'https://api.coursera.org/api/memberships.v1?includes=courseId,courses.v1&q=me&showHidden=true&filter=current,preEnrolled' +OPENCOURSE_ONDEMAND_LECTURE_VIDEOS_URL = \ + 'https://api.coursera.org/api/onDemandLectureVideos.v1/'\ + '{course_id}~{video_id}?includes=video&'\ + 'fields=onDemandVideos.v1(sources%2Csubtitles%2CsubtitlesVtt%2CsubtitlesTxt)' +OPENCOURSE_SUPPLEMENT_URL = 'https://api.coursera.org/api/onDemandSupplements.v1/'\ '{course_id}~{element_id}?includes=asset&fields=openCourseAssets.v1%28typeName%29,openCourseAssets.v1%28definition%29' OPENCOURSE_PROGRAMMING_ASSIGNMENTS_URL = \ - 'https://www.coursera.org/api/onDemandProgrammingLearnerAssignments.v1/{course_id}~{element_id}?fields=submissionLearnerSchema' + 'https://api.coursera.org/api/onDemandProgrammingLearnerAssignments.v1/{course_id}~{element_id}?fields=submissionLearnerSchema' +OPENCOURSE_PROGRAMMING_IMMEDIATE_INSTRUCTIOINS_URL = \ + 'https://api.coursera.org/api/onDemandProgrammingImmediateInstructions.v1/{course_id}~{element_id}' +OPENCOURSE_REFERENCES_POLL_URL = \ + "https://api.coursera.org/api/onDemandReferences.v1/?courseId={course_id}&q=courseListed&fields=name%2CshortId%2Cslug%2Ccontent&includes=assets" +OPENCOURSE_REFERENCE_ITEM_URL = \ + "https://api.coursera.org/api/onDemandReferences.v1/?courseId={course_id}&q=shortId&shortId={short_id}&fields=name%2CshortId%2Cslug%2Ccontent&includes=assets" # 
These are ids that are present in tag in assignment text: # @@ -89,7 +97,24 @@ OPENCOURSE_PROGRAMMING_ASSIGNMENTS_URL = \ # "linked": null # } OPENCOURSE_ASSET_URL = \ - 'https://www.coursera.org/api/assetUrls.v1?ids={ids}' + 'https://api.coursera.org/api/assetUrls.v1?ids={ids}' + +# Sample response: +# "linked": { +# "openCourseAssets.v1": [ +# { +# "typeName": "asset", +# "definition": { +# "assetId": "fytYX5rYEeedWRLokafKRg", +# "name": "Lecture slides" +# }, +# "id": "j6g7VZrYEeeUVgpv-dYMig" +# } +# ] +# } +OPENCOURSE_ONDEMAND_LECTURE_ASSETS_URL = \ + 'https://api.coursera.org/api/onDemandLectureAssets.v1/'\ + '{course_id}~{video_id}/?includes=openCourseAssets' # These ids are provided in lecture json: # @@ -137,7 +162,7 @@ OPENCOURSE_ASSET_URL = \ # "linked": null # } OPENCOURSE_ASSETS_URL = \ - 'https://www.coursera.org/api/openCourseAssets.v1/{id}' + 'https://api.coursera.org/api/openCourseAssets.v1/{id}' # These asset ids are ids returned from OPENCOURSE_ASSETS_URL request: # See example above. 
@@ -160,13 +185,39 @@ OPENCOURSE_ASSETS_URL = \ # "linked": null # } OPENCOURSE_API_ASSETS_V1_URL = \ - 'https://www.coursera.org/api/assets.v1?ids={id}' + 'https://api.coursera.org/api/assets.v1?ids={id}' OPENCOURSE_ONDEMAND_COURSE_MATERIALS = \ - 'https://www.coursera.org/api/onDemandCourseMaterials.v1/?'\ - 'q=slug&slug={class_name}&includes=moduleIds%2ClessonIds%2CpassableItemGroups%2CpassableItemGroupChoices%2CpassableLessonElements%2CitemIds%2Ctracks'\ - '&fields=moduleIds%2ConDemandCourseMaterialModules.v1(name%2Cslug%2Cdescription%2CtimeCommitment%2ClessonIds%2Coptional)%2ConDemandCourseMaterialLessons.v1(name%2Cslug%2CtimeCommitment%2CelementIds%2Coptional%2CtrackId)%2ConDemandCourseMaterialPassableItemGroups.v1(requiredPassedCount%2CpassableItemGroupChoiceIds%2CtrackId)%2ConDemandCourseMaterialPassableItemGroupChoices.v1(name%2Cdescription%2CitemIds)%2ConDemandCourseMaterialPassableLessonElements.v1(gradingWeight)%2ConDemandCourseMaterialItems.v1(name%2Cslug%2CtimeCommitment%2Ccontent%2CisLocked%2ClockableByItem%2CitemLockedReasonCode%2CtrackId)%2ConDemandCourseMaterialTracks.v1(passablesCount)'\ - '&showLockedItems=true' + 'https://api.coursera.org/api/onDemandCourseMaterials.v1/?'\ + 'q=slug&slug={class_name}&includes=moduleIds%2ClessonIds%2CpassableItemGroups%2CpassableItemGroupChoices%2CpassableLessonElements%2CitemIds%2Ctracks'\ + 
'&fields=moduleIds%2ConDemandCourseMaterialModules.v1(name%2Cslug%2Cdescription%2CtimeCommitment%2ClessonIds%2Coptional)%2ConDemandCourseMaterialLessons.v1(name%2Cslug%2CtimeCommitment%2CelementIds%2Coptional%2CtrackId)%2ConDemandCourseMaterialPassableItemGroups.v1(requiredPassedCount%2CpassableItemGroupChoiceIds%2CtrackId)%2ConDemandCourseMaterialPassableItemGroupChoices.v1(name%2Cdescription%2CitemIds)%2ConDemandCourseMaterialPassableLessonElements.v1(gradingWeight)%2ConDemandCourseMaterialItems.v1(name%2Cslug%2CtimeCommitment%2Ccontent%2CisLocked%2ClockableByItem%2CitemLockedReasonCode%2CtrackId)%2ConDemandCourseMaterialTracks.v1(passablesCount)'\ + '&showLockedItems=true' + +OPENCOURSE_ONDEMAND_COURSE_MATERIALS_V2 = \ + 'https://api.coursera.org/api/onDemandCourseMaterials.v2/?q=slug&slug={class_name}'\ + '&includes=modules%2Clessons%2CpassableItemGroups%2CpassableItemGroupChoices%2CpassableLessonElements%2Citems%2Ctracks%2CgradePolicy&'\ + '&fields=moduleIds%2ConDemandCourseMaterialModules.v1(name%2Cslug%2Cdescription%2CtimeCommitment%2ClessonIds%2Coptional%2ClearningObjectives)%2ConDemandCourseMaterialLessons.v1(name%2Cslug%2CtimeCommitment%2CelementIds%2Coptional%2CtrackId)%2ConDemandCourseMaterialPassableItemGroups.v1(requiredPassedCount%2CpassableItemGroupChoiceIds%2CtrackId)%2ConDemandCourseMaterialPassableItemGroupChoices.v1(name%2Cdescription%2CitemIds)%2ConDemandCourseMaterialPassableLessonElements.v1(gradingWeight%2CisRequiredForPassing)%2ConDemandCourseMaterialItems.v2(name%2Cslug%2CtimeCommitment%2CcontentSummary%2CisLocked%2ClockableByItem%2CitemLockedReasonCode%2CtrackId%2ClockedStatus%2CitemLockSummary)%2ConDemandCourseMaterialTracks.v1(passablesCount)'\ + '&showLockedItems=true' + +OPENCOURSE_ONDEMAND_SPECIALIZATIONS_V1 = \ + 'https://api.coursera.org/api/onDemandSpecializations.v1?q=slug'\ + '&slug={class_name}&fields=courseIds,interchangeableCourseIds,launchedAt,'\ + 'logo,memberships,metadata,partnerIds,premiumExperienceVariant,'\ + 
'onDemandSpecializationMemberships.v1(suggestedSessionSchedule),'\ + 'onDemandSpecializationSuggestedSchedule.v1(suggestedSessions),'\ + 'partners.v1(homeLink,name),courses.v1(courseProgress,description,'\ + 'membershipIds,startDate,v2Details,vcMembershipIds),v2Details.v1('\ + 'onDemandSessions,plannedLaunchDate),memberships.v1(grade,'\ + 'vcMembershipId),vcMemberships.v1(certificateCodeWithGrade)'\ + '&includes=courseIds,memberships,partnerIds,'\ + 'onDemandSpecializationMemberships.v1(suggestedSessionSchedule),'\ + 'courses.v1(courseProgress,membershipIds,v2Details,vcMembershipIds),'\ + 'v2Details.v1(onDemandSessions)' + +OPENCOURSE_ONDEMAND_COURSES_V1 = \ + 'https://api.coursera.org/api/onDemandCourses.v1?q=slug&slug={class_name}&'\ + 'includes=instructorIds%2CpartnerIds%2C_links&'\ + 'fields=brandingImage%2CcertificatePurchaseEnabledAt%2Cpartners.v1(squareLogo%2CrectangularLogo)%2Cinstructors.v1(fullName)%2CoverridePartnerLogos%2CsessionsEnabledAt%2CdomainTypes%2CpremiumExperienceVariant%2CisRestrictedMembership' ABOUT_URL = ('https://api.coursera.org/api/catalog.v1/courses?' 'fields=largeIcon,photo,previewLink,shortDescription,smallIcon,' @@ -179,7 +230,111 @@ ABOUT_URL = ('https://api.coursera.org/api/catalog.v1/courses?' 
AUTH_REDIRECT_URL = ('https://class.coursera.org/{class_name}' '/auth/auth_redirector?type=login&subtype=normal') -#POST_OPENCOURSE_API_QUIZ_SESSION = 'https://www.coursera.org/api/opencourse.v1/user/4958/course/text-mining/item/7OQHc/quiz/session' +# Sample URL: +# +# https://api.coursera.org/api/onDemandPeerAssignmentInstructions.v1/?q=latest&userId=4958&courseId=RcnRZHHtEeWxvQr3acyajw&itemId=2yTvX&includes=gradingMetadata%2CreviewSchemas%2CsubmissionSchemas&fields=instructions%2ConDemandPeerAssignmentGradingMetadata.v1(requiredAuthoredReviewCount%2CisMentorGraded%2CassignmentDetails)%2ConDemandPeerReviewSchemas.v1(reviewSchema)%2ConDemandPeerSubmissionSchemas.v1(submissionSchema) +# +# Sample response: +# +# { +# "elements": [ +# { +# "instructions": { +# "introduction": { +# "typeName": "cml", +# "definition": { +# "dtdId": "assess/1", +# "value": "Ваше первое задание заключается в установке Python и библиотек.." +# } +# }, +# "sections": [ +# { +# "typeId": "unknown", +# "title": "Review criteria", +# "content": { +# "typeName": "cml", +# "definition": { +# "dtdId": "assess/1", +# "value": "В результате работы вы установите на компьютер Python и библиотеки, необходимые для дальнейшего прохождения курса.." +# } +# } +# } +# ] +# }, +# "id": "4958~RcnRZHHtEeWxvQr3acyajw~2yTvX~8x7Qhs66EeW2Tw715xhIPQ@13" +# } +# ], +# "paging": {}, +# "linked": { +# "onDemandPeerSubmissionSchemas.v1": [ +# { +# "submissionSchema": { +# "parts": [ +# { +# "details": { +# "typeName": "fileUpload", +# "definition": { +# "required": false +# } +# }, +# "id": "_fcfP3bPT5W4pkfkshmUAQ", +# "prompt": { +# "typeName": "cml", +# "definition": { +# "dtdId": "assess/1", +# "value": "Загрузите скриншот №1." +# } +# } +# }, +# { +# "details": { +# "typeName": "fileUpload", +# "definition": { +# "required": false +# } +# }, +# "id": "92ea4b4e-3492-41eb-ee32-2624ee807bd3", +# "prompt": { +# "typeName": "cml", +# "definition": { +# "dtdId": "assess/1", +# "value": "Загрузите скриншот №2." 
+# } +# } +# } +# ] +# }, +# "id": "4958~RcnRZHHtEeWxvQr3acyajw~2yTvX~8x7Qhs66EeW2Tw715xhIPQ@13" +# } +# ], +# "onDemandPeerAssignmentGradingMetadata.v1": [ +# { +# "assignmentDetails": { +# "typeName": "phased", +# "definition": { +# "receivedReviewCutoffs": { +# "count": 3 +# }, +# "passingFraction": 0.8 +# } +# }, +# "requiredAuthoredReviewCount": 3, +# "isMentorGraded": false, +# "id": "4958~RcnRZHHtEeWxvQr3acyajw~2yTvX~8x7Qhs66EeW2Tw715xhIPQ@13" +# } +# ], +# "onDemandPeerReviewSchemas.v1": [] +# } +# } +# +# This URL is used to retrieve "phasedPeer" typename instructions' contents +OPENCOURSE_PEER_ASSIGNMENT_INSTRUCTIONS = ( + 'https://api.coursera.org/api/onDemandPeerAssignmentInstructions.v1/?' + 'q=latest&userId={user_id}&courseId={course_id}&itemId={element_id}&' + 'includes=gradingMetadata%2CreviewSchemas%2CsubmissionSchemas&' + 'fields=instructions%2ConDemandPeerAssignmentGradingMetadata.v1(requiredAuthoredReviewCount%2CisMentorGraded%2CassignmentDetails)%2ConDemandPeerReviewSchemas.v1(reviewSchema)%2ConDemandPeerSubmissionSchemas.v1(submissionSchema)') + +#POST_OPENCOURSE_API_QUIZ_SESSION = 'https://api.coursera.org/api/opencourse.v1/user/4958/course/text-mining/item/7OQHc/quiz/session' # Sample response: # # { @@ -195,9 +350,9 @@ AUTH_REDIRECT_URL = ('https://class.coursera.org/{class_name}' # "progressState": "Started" # } # } -POST_OPENCOURSE_API_QUIZ_SESSION = 'https://www.coursera.org/api/opencourse.v1/user/{user_id}/course/{class_name}/item/{quiz_id}/quiz/session' +POST_OPENCOURSE_API_QUIZ_SESSION = 'https://api.coursera.org/api/opencourse.v1/user/{user_id}/course/{class_name}/item/{quiz_id}/quiz/session' -#POST_OPENCOURSE_API_QUIZ_SESSION_GET_STATE = 'https://www.coursera.org/api/opencourse.v1/user/4958/course/text-mining/item/7OQHc/quiz/session/opencourse~bVgqTevEEeWvGQrWsIkLlw:4958:BiNDdOvPEeWAkwqbKEEh3w@13:1468773901987@1/action/getState?autoEnroll=false' +#POST_OPENCOURSE_API_QUIZ_SESSION_GET_STATE = 
'https://api.coursera.org/api/opencourse.v1/user/4958/course/text-mining/item/7OQHc/quiz/session/opencourse~bVgqTevEEeWvGQrWsIkLlw:4958:BiNDdOvPEeWAkwqbKEEh3w@13:1468773901987@1/action/getState?autoEnroll=false' # Sample response: # # { @@ -279,9 +434,9 @@ POST_OPENCOURSE_API_QUIZ_SESSION = 'https://www.coursera.org/api/opencourse.v1/u # } # } # -POST_OPENCOURSE_API_QUIZ_SESSION_GET_STATE = 'https://www.coursera.org/api/opencourse.v1/user/{user_id}/course/{class_name}/item/{quiz_id}/quiz/session/{session_id}/action/getState?autoEnroll=false' +POST_OPENCOURSE_API_QUIZ_SESSION_GET_STATE = 'https://api.coursera.org/api/opencourse.v1/user/{user_id}/course/{class_name}/item/{quiz_id}/quiz/session/{session_id}/action/getState?autoEnroll=false' -#POST_OPENCOURSE_ONDEMAND_EXAM_SESSIONS = 'https://www.coursera.org/api/onDemandExamSessions.v1/-N44X0IJEeWpogr5ZO8qxQ~YV0W4~10!~1467462079068/actions?includes=gradingAttempts' +#POST_OPENCOURSE_ONDEMAND_EXAM_SESSIONS = 'https://api.coursera.org/api/onDemandExamSessions.v1/-N44X0IJEeWpogr5ZO8qxQ~YV0W4~10!~1467462079068/actions?includes=gradingAttempts' # Sample response: # # { @@ -422,14 +577,14 @@ POST_OPENCOURSE_API_QUIZ_SESSION_GET_STATE = 'https://www.coursera.org/api/openc # Request payload: # {"courseId":"-N44X0IJEeWpogr5ZO8qxQ","itemId":"YV0W4"} # -#POST_OPENCOURSE_ONDEMAND_EXAM_SESSIONS = 'https://www.coursera.org/api/onDemandExamSessions.v1/-N44X0IJEeWpogr5ZO8qxQ~YV0W4~10!~1467462079068/actions?includes=gradingAttempts' +#POST_OPENCOURSE_ONDEMAND_EXAM_SESSIONS = 'https://api.coursera.org/api/onDemandExamSessions.v1/-N44X0IJEeWpogr5ZO8qxQ~YV0W4~10!~1467462079068/actions?includes=gradingAttempts' # Response for this request is empty. Result (session_id) should be taken # either from Location header or from X-Coursera-Id header. 
# # Request payload: # {"courseId":"-N44X0IJEeWpogr5ZO8qxQ","itemId":"YV0W4"} -POST_OPENCOURSE_ONDEMAND_EXAM_SESSIONS = 'https://www.coursera.org/api/onDemandExamSessions.v1' +POST_OPENCOURSE_ONDEMAND_EXAM_SESSIONS = 'https://api.coursera.org/api/onDemandExamSessions.v1' # Sample response: # { @@ -741,7 +896,7 @@ POST_OPENCOURSE_ONDEMAND_EXAM_SESSIONS = 'https://www.coursera.org/api/onDemandE # # Request payload: # {"name":"getState","argument":[]} -POST_OPENCOURSE_ONDEMAND_EXAM_SESSIONS_GET_STATE = 'https://www.coursera.org/api/onDemandExamSessions.v1/{session_id}/actions?includes=gradingAttempts' +POST_OPENCOURSE_ONDEMAND_EXAM_SESSIONS_GET_STATE = 'https://api.coursera.org/api/onDemandExamSessions.v1/{session_id}/actions?includes=gradingAttempts' ROOT = os.path.abspath(os.path.join(os.path.dirname(__file__), '..')) # define a per-user cache folder @@ -772,7 +927,7 @@ FORMAT_MAX_LENGTH = 20 TITLE_MAX_LENGTH = 200 #: CSS that is usen to prettify instructions -INSTRUCTIONS_HTML_INJECTION = ''' +INSTRUCTIONS_HTML_INJECTION_PRE = ''' ''' + +# The following url is the root url (tree) for a Coursera Course +OPENCOURSE_NOTEBOOK_DESCRIPTIONS = "https://hub.coursera-notebooks.org/hub/coursera_login?token={authId}&next=/" +OPENCOURSE_NOTEBOOK_LAUNCHES = "https://api.coursera.org/api/onDemandNotebookWorkspaceLaunches.v1/?fields=authorizationId%2CcontentPath%2CuseLegacySystem" +OPENCOURSE_NOTEBOOK_TREE = "https://hub.coursera-notebooks.org/user/{jupId}/api/contents/{path}?type=directory&_={timestamp}" +OPENCOURSE_NOTEBOOK_DOWNLOAD = "https://hub.coursera-notebooks.org/user/{jupId}/files/{path}?download=1" diff --git a/coursera/extractors.py b/coursera/extractors.py index a4e8ade..7bb44fd 100644 --- a/coursera/extractors.py +++ b/coursera/extractors.py @@ -9,11 +9,11 @@ import abc import json import logging -from .api import CourseraOnDemand, OnDemandCourseMaterialItems -from .define import OPENCOURSE_CONTENT_URL -from .cookies import login +from .api import (CourseraOnDemand, 
OnDemandCourseMaterialItemsV1, + ModulesV1, LessonsV1, ItemsV2) +from .define import OPENCOURSE_ONDEMAND_COURSE_MATERIALS_V2 from .network import get_page -from .utils import is_debug_run +from .utils import is_debug_run, spit_json class PlatformExtractor(object): @@ -27,9 +27,8 @@ class PlatformExtractor(object): class CourseraExtractor(PlatformExtractor): - def __init__(self, session, username, password): - login(session, username, password) - + def __init__(self, session): + self._notebook_downloaded = False self._session = session def list_courses(self): @@ -47,31 +46,38 @@ class CourseraExtractor(PlatformExtractor): def get_modules(self, class_name, reverse=False, unrestricted_filenames=False, subtitle_language='en', video_resolution=None, - download_quizzes=False): + download_quizzes=False, mathjax_cdn_url=None, + download_notebooks=False): page = self._get_on_demand_syllabus(class_name) - error_occured, modules = self._parse_on_demand_syllabus( + error_occurred, modules = self._parse_on_demand_syllabus( + class_name, page, reverse, unrestricted_filenames, subtitle_language, video_resolution, - download_quizzes) - return error_occured, modules + download_quizzes, mathjax_cdn_url, download_notebooks) + + return error_occurred, modules def _get_on_demand_syllabus(self, class_name): """ Get the on-demand course listing webpage. 
""" - url = OPENCOURSE_CONTENT_URL.format(class_name=class_name) + url = OPENCOURSE_ONDEMAND_COURSE_MATERIALS_V2.format( + class_name=class_name) page = get_page(self._session, url) - logging.info('Downloaded %s (%d bytes)', url, len(page)) + logging.debug('Downloaded %s (%d bytes)', url, len(page)) return page - def _parse_on_demand_syllabus(self, page, reverse=False, + def _parse_on_demand_syllabus(self, course_name, page, reverse=False, unrestricted_filenames=False, subtitle_language='en', video_resolution=None, - download_quizzes=False): + download_quizzes=False, + mathjax_cdn_url=None, + download_notebooks=False + ): """ Parse a Coursera on-demand course listing/syllabus page. @@ -82,96 +88,152 @@ class CourseraExtractor(PlatformExtractor): """ dom = json.loads(page) - course_name = dom['slug'] + class_id = dom['elements'][0]['id'] - logging.info('Parsing syllabus of on-demand course. ' - 'This may take some time, please be patient ...') + logging.info('Parsing syllabus of on-demand course (id=%s). 
' + 'This may take some time, please be patient ...', + class_id) modules = [] - json_modules = dom['courseMaterial']['elements'] - course = CourseraOnDemand(session=self._session, course_id=dom['id'], - course_name=course_name, - unrestricted_filenames=unrestricted_filenames) + + json_modules = dom['linked']['onDemandCourseMaterialItems.v2'] + course = CourseraOnDemand( + session=self._session, course_id=class_id, + course_name=course_name, + unrestricted_filenames=unrestricted_filenames, + mathjax_cdn_url=mathjax_cdn_url) course.obtain_user_id() - ondemand_material_items = OnDemandCourseMaterialItems.create( + ondemand_material_items = OnDemandCourseMaterialItemsV1.create( session=self._session, course_name=course_name) if is_debug_run(): - with open('%s-syllabus-raw.json' % course_name, 'w') as file_object: - json.dump(dom, file_object, indent=4) - with open('%s-course-material-items.json' % course_name, 'w') as file_object: - json.dump(ondemand_material_items._items, file_object, indent=4) + spit_json(dom, '%s-syllabus-raw.json' % course_name) + spit_json(json_modules, '%s-material-items-v2.json' % course_name) + spit_json(ondemand_material_items._items, + '%s-course-material-items.json' % course_name) - error_occured = False + error_occurred = False - for module in json_modules: - module_slug = module['slug'] - logging.info('Processing module %s', module_slug) - sections = [] - json_sections = module['elements'] - for section in json_sections: - section_slug = section['slug'] - logging.info('Processing section %s', section_slug) + all_modules = ModulesV1.from_json( + dom['linked']['onDemandCourseMaterialModules.v1']) + all_lessons = LessonsV1.from_json( + dom['linked']['onDemandCourseMaterialLessons.v1']) + all_items = ItemsV2.from_json( + dom['linked']['onDemandCourseMaterialItems.v2']) + + for module in all_modules: + logging.info('Processing module %s', module.slug) + lessons = [] + for section in module.children(all_lessons): + logging.info('Processing 
section %s', section.slug) lectures = [] - json_lectures = section['elements'] + available_lectures = section.children(all_items) # Certain modules may be empty-looking programming assignments - # e.g. in data-structures, algorithms-on-graphs ondemand courses - if not json_lectures: - lesson_id = section['id'] - lecture = ondemand_material_items.get(lesson_id) + # e.g. in data-structures, algorithms-on-graphs ondemand + # courses + if not available_lectures: + lecture = ondemand_material_items.get(section.id) if lecture is not None: - json_lectures = [lecture] + available_lectures = [lecture] - for lecture in json_lectures: - lecture_slug = lecture['slug'] - typename = lecture['content']['typeName'] + for lecture in available_lectures: + typename = lecture.type_name logging.info('Processing lecture %s (%s)', - lecture_slug, typename) + lecture.slug, typename) # Empty dictionary means there were no data - # None means an error occured + # None means an error occurred links = {} if typename == 'lecture': - lecture_video_id = lecture['content']['definition']['videoId'] - assets = lecture['content']['definition'].get('assets', []) + # lecture_video_id = lecture['content']['definition']['videoId'] + # assets = lecture['content']['definition'].get( + # 'assets', []) + lecture_video_id = lecture.id + # assets = [] links = course.extract_links_from_lecture( + class_id, lecture_video_id, subtitle_language, - video_resolution, assets) + video_resolution) elif typename == 'supplement': links = course.extract_links_from_supplement( - lecture['id']) + lecture.id) + + elif typename == 'phasedPeer': + links = course.extract_links_from_peer_assignment( + lecture.id) elif typename in ('gradedProgramming', 'ungradedProgramming'): - links = course.extract_links_from_programming(lecture['id']) + links = course.extract_links_from_programming( + lecture.id) elif typename == 'quiz': if download_quizzes: - links = course.extract_links_from_quiz(lecture['id']) + links = 
course.extract_links_from_quiz( + lecture.id) elif typename == 'exam': if download_quizzes: - links = course.extract_links_from_exam(lecture['id']) + links = course.extract_links_from_exam( + lecture.id) + + elif typename == 'programming': + if download_quizzes: + links = course.extract_links_from_programming_immediate_instructions( + lecture.id) + + elif typename == 'notebook': + if download_notebooks and not self._notebook_downloaded: + logging.warning( + 'According to notebooks platform, content will be downloaded first') + links = course.extract_links_from_notebook( + lecture.id) + self._notebook_downloaded = True else: - logging.info('Unsupported typename "%s" in lecture "%s"', - typename, lecture_slug) + logging.info( + 'Unsupported typename "%s" in lecture "%s" (lecture id "%s")', + typename, lecture.slug, lecture.id) continue if links is None: - error_occured = True + error_occurred = True elif links: - lectures.append((lecture_slug, links)) + lectures.append((lecture.slug, links)) if lectures: - sections.append((section_slug, lectures)) + lessons.append((section.slug, lectures)) - if sections: - modules.append((module_slug, sections)) + if lessons: + modules.append((module.slug, lessons)) if modules and reverse: modules.reverse() - return error_occured, modules + # Processing resources section + json_references = course.extract_references_poll() + references = [] + if json_references: + logging.info('Processing resources') + for json_reference in json_references: + reference = [] + reference_slug = json_reference['slug'] + logging.info('Processing resource %s', + reference_slug) + + links = course.extract_links_from_reference( + json_reference['shortId']) + if links is None: + error_occurred = True + elif links: + reference.append(('', links)) + + if reference: + references.append((reference_slug, reference)) + + if references: + modules.append(("Resources", references)) + + return error_occurred, modules diff --git a/coursera/filtering.py 
b/coursera/filtering.py index d9ed5d5..4331831 100644 --- a/coursera/filtering.py +++ b/coursera/filtering.py @@ -94,15 +94,16 @@ def find_resources_to_get(lecture, file_formats, resource_filter, ignored_format logging.info("The following file formats will be ignored: " + ",".join(ignored_formats)) for fmt, resources in iteritems(lecture): - fmt0 = fmt - if '.' in fmt: - fmt = fmt.split('.')[1] - if fmt in ignored_formats: + short_fmt = None + if '.' in fmt: + short_fmt = fmt.split('.')[1] + + if fmt in ignored_formats or (short_fmt != None and short_fmt in ignored_formats) : continue - if fmt in file_formats or 'all' in file_formats: + if fmt in file_formats or (short_fmt != None and short_fmt in file_formats) or 'all' in file_formats: for r in resources: if resource_filter and r[1] and not re.search(resource_filter, r[1]): logging.debug('Skipping b/c of rf: %s %s', diff --git a/coursera/network.py b/coursera/network.py index c06736e..50908ba 100644 --- a/coursera/network.py +++ b/coursera/network.py @@ -9,7 +9,7 @@ import logging import requests -def get_reply(session, url, post=False, data=None, headers=None): +def get_reply(session, url, post=False, data=None, headers=None, quiet=False): """ Download an HTML page using the requests session. Low-level function that allows for flexible request configuration. @@ -29,6 +29,10 @@ def get_reply(session, url, post=False, data=None, headers=None): @param headers: Additional headers to send with request. @type headers: dict + @param quiet: Flag that tells whether to print error message when status + code != 200. + @type quiet: bool + @return: Requests response. 
@rtype: requests.Response """ @@ -46,8 +50,9 @@ def get_reply(session, url, post=False, data=None, headers=None): try: reply.raise_for_status() except requests.exceptions.HTTPError as e: - logging.error("Error %s getting page %s", e, url) - logging.error("The server replied: %s", reply.text) + if not quiet: + logging.error("Error %s getting page %s", e, url) + logging.error("The server replied: %s", reply.text) raise return reply @@ -59,6 +64,7 @@ def get_page(session, post=False, data=None, headers=None, + quiet=False, **kwargs): """ Download an HTML page using the requests session. @@ -82,7 +88,8 @@ def get_page(session, @rtype: str """ url = url.format(**kwargs) - reply = get_reply(session, url, post=post, data=data, headers=headers) + reply = get_reply(session, url, post=post, data=data, headers=headers, + quiet=quiet) return reply.json() if json else reply.text diff --git a/coursera/test/fixtures/json/peer-assignment-instructions-all.json b/coursera/test/fixtures/json/peer-assignment-instructions-all.json new file mode 100644 index 0000000..70d9f5c --- /dev/null +++ b/coursera/test/fixtures/json/peer-assignment-instructions-all.json @@ -0,0 +1,29 @@ +{ + "elements": [ + { + "instructions": { + "introduction": { + "typeName": "cml", + "definition": { + "dtdId": "assess/1", + "value": "intro" + } + }, + "sections": [ + { + "typeId": "unknown", + "title": "Review criteria", + "content": { + "typeName": "cml", + "definition": { + "dtdId": "assess/1", + "value": "section" + } + } + } + ] + }, + "id": "4958~RcnRZHHtEeWxvQr3acyajw~2yTvX~8x7Qhs66EeW2Tw715xhIPQ@13" + } + ] +} diff --git a/coursera/test/fixtures/json/peer-assignment-instructions-no-title.json b/coursera/test/fixtures/json/peer-assignment-instructions-no-title.json new file mode 100644 index 0000000..f210263 --- /dev/null +++ b/coursera/test/fixtures/json/peer-assignment-instructions-no-title.json @@ -0,0 +1,28 @@ +{ + "elements": [ + { + "instructions": { + "introduction": { + "typeName": "cml", + 
"definition": { + "dtdId": "assess/1", + "value": "intro" + } + }, + "sections": [ + { + "typeId": "unknown", + "content": { + "typeName": "cml", + "definition": { + "dtdId": "assess/1", + "value": "section" + } + } + } + ] + }, + "id": "4958~RcnRZHHtEeWxvQr3acyajw~2yTvX~8x7Qhs66EeW2Tw715xhIPQ@13" + } + ] +} diff --git a/coursera/test/fixtures/json/peer-assignment-instructions-only-introduction.json b/coursera/test/fixtures/json/peer-assignment-instructions-only-introduction.json new file mode 100644 index 0000000..7a186c4 --- /dev/null +++ b/coursera/test/fixtures/json/peer-assignment-instructions-only-introduction.json @@ -0,0 +1,16 @@ +{ + "elements": [ + { + "instructions": { + "introduction": { + "typeName": "cml", + "definition": { + "dtdId": "assess/1", + "value": "intro" + } + } + }, + "id": "4958~RcnRZHHtEeWxvQr3acyajw~2yTvX~8x7Qhs66EeW2Tw715xhIPQ@13" + } + ] +} diff --git a/coursera/test/fixtures/json/peer-assignment-instructions-only-sections.json b/coursera/test/fixtures/json/peer-assignment-instructions-only-sections.json new file mode 100644 index 0000000..7cd735c --- /dev/null +++ b/coursera/test/fixtures/json/peer-assignment-instructions-only-sections.json @@ -0,0 +1,22 @@ +{ + "elements": [ + { + "instructions": { + "sections": [ + { + "typeId": "unknown", + "title": "Review criteria", + "content": { + "typeName": "cml", + "definition": { + "dtdId": "assess/1", + "value": "section" + } + } + } + ] + }, + "id": "4958~RcnRZHHtEeWxvQr3acyajw~2yTvX~8x7Qhs66EeW2Tw715xhIPQ@13" + } + ] +} diff --git a/coursera/test/fixtures/json/peer-assignment-no-instructions.json b/coursera/test/fixtures/json/peer-assignment-no-instructions.json new file mode 100644 index 0000000..9764791 --- /dev/null +++ b/coursera/test/fixtures/json/peer-assignment-no-instructions.json @@ -0,0 +1,4 @@ +{ + "elements": [ + ] +} diff --git a/coursera/test/fixtures/json/references-poll-output.json b/coursera/test/fixtures/json/references-poll-output.json new file mode 100644 index 
0000000..7da94f1 --- /dev/null +++ b/coursera/test/fixtures/json/references-poll-output.json @@ -0,0 +1,24 @@ +[ + { + "id": "Tk_5NiCREeeSVwJCrBEADA", + "slug": "tutorials", + "content": { + "org.coursera.ondemand.reference.AssetReferenceContent": { + "assetId": "4e66aa537abf1bdec8ecc508324891ac" + } + }, + "shortId": "zVvo7", + "name": "Tutorials" + }, + { + "id": "Tk_5MSCREeeSVwJCrBEADA", + "slug": "test-cases", + "content": { + "org.coursera.ondemand.reference.AssetReferenceContent": { + "assetId": "7c84e5c5249eb551d95444c172592274" + } + }, + "shortId": "a4I28", + "name": "Test Cases" + } +] \ No newline at end of file diff --git a/coursera/test/fixtures/json/references-poll-reply.json b/coursera/test/fixtures/json/references-poll-reply.json new file mode 100644 index 0000000..e4dbd04 --- /dev/null +++ b/coursera/test/fixtures/json/references-poll-reply.json @@ -0,0 +1,47 @@ +{ + "paging": {}, + "linked": { + "openCourseAssets.v1": [ + { + "typeName": "cml", + "definition": { + "dtdId": "supplement/1", + "value": "supplement1" + }, + "id": "4e66aa537abf1bdec8ecc508324891ac" + }, + { + "typeName": "cml", + "definition": { + "dtdId": "supplement/1", + "value": "supplement2" + }, + "id": "7c84e5c5249eb551d95444c172592274" + } + ] + }, + "elements": [ + { + "name": "Tutorials", + "id": "Tk_5NiCREeeSVwJCrBEADA", + "slug": "tutorials", + "content": { + "org.coursera.ondemand.reference.AssetReferenceContent": { + "assetId": "4e66aa537abf1bdec8ecc508324891ac" + } + }, + "shortId": "zVvo7" + }, + { + "name": "Test Cases", + "id": "Tk_5MSCREeeSVwJCrBEADA", + "slug": "test-cases", + "content": { + "org.coursera.ondemand.reference.AssetReferenceContent": { + "assetId": "7c84e5c5249eb551d95444c172592274" + } + }, + "shortId": "a4I28" + } + ] +} \ No newline at end of file diff --git a/coursera/test/fixtures/json/supplement-programming-immediate-instructions-empty-instructions.json 
b/coursera/test/fixtures/json/supplement-programming-immediate-instructions-empty-instructions.json new file mode 100644 index 0000000..0439c72 --- /dev/null +++ b/coursera/test/fixtures/json/supplement-programming-immediate-instructions-empty-instructions.json @@ -0,0 +1,18 @@ +{ + "elements": [ + { + "id": "Gtv4Xb1-EeS-ViIACwYKVQ~8f3qT", + "itemId": "8f3qT", + "courseId": "Gtv4Xb1-EeS-ViIACwYKVQ", + "assignmentInstructions": { + "definition": { + "dtdId": "", + "value": "" + }, + "typeName": "cml" + } + } + ], + "linked": {}, + "paging": {} +} \ No newline at end of file diff --git a/coursera/test/fixtures/json/supplement-programming-immediate-instructions-no-instructions.json b/coursera/test/fixtures/json/supplement-programming-immediate-instructions-no-instructions.json new file mode 100644 index 0000000..98faaae --- /dev/null +++ b/coursera/test/fixtures/json/supplement-programming-immediate-instructions-no-instructions.json @@ -0,0 +1,6 @@ +{ + "elements": [ + ], + "paging": null, + "linked": null +} diff --git a/coursera/test/fixtures/json/supplement-programming-immediate-instructions-one-asset.json b/coursera/test/fixtures/json/supplement-programming-immediate-instructions-one-asset.json new file mode 100644 index 0000000..34d6423 --- /dev/null +++ b/coursera/test/fixtures/json/supplement-programming-immediate-instructions-one-asset.json @@ -0,0 +1,18 @@ +{ + "elements": [ + { + "id": "Gtv4Xb1-EeS-ViIACwYKVQ~e4hZk", + "itemId": "e4hZk", + "courseId": "Gtv4Xb1-EeS-ViIACwYKVQ", + "assignmentInstructions": { + "definition": { + "dtdId": "", + "value": ". 
" + }, + "typeName": "cml" + } + } + ], + "linked": {}, + "paging": {} +} \ No newline at end of file diff --git a/coursera/test/fixtures/json/video-output-1-all.json b/coursera/test/fixtures/json/video-output-1-all.json new file mode 100644 index 0000000..1e04337 --- /dev/null +++ b/coursera/test/fixtures/json/video-output-1-all.json @@ -0,0 +1,16 @@ +{ + "zh-CN.txt": "https://api.coursera.org/api/subtitleAssetProxy.v1/UKEuZoMQRcChLmaDEMXAsA?expiry=1495238400000&hmac=eNyKwEu_aMQtn7bg0mUj6uIyVZvjahFSE5x2CrbOXOU&fileExtension=txt", + "en.srt": "https://api.coursera.org/api/subtitleAssetProxy.v1/GgGZN65HQkyBmTeuR2JMsw?expiry=1495238400000&hmac=afqFhv9FWfxxEeSka8PCA4ihiyX3g2Z6K4jWJPFlcdo&fileExtension=srt", + "zh-CN.srt": "https://api.coursera.org/api/subtitleAssetProxy.v1/UKEuZoMQRcChLmaDEMXAsA?expiry=1495238400000&hmac=nmGzGoF4oNLv28ZDLUtX5dF4xPXUABgym76XMs4UzDE&fileExtension=srt", + "en.txt": "https://api.coursera.org/api/subtitleAssetProxy.v1/GgGZN65HQkyBmTeuR2JMsw?expiry=1495238400000&hmac=2Z37WW5Rc7GoT0eft1vdK0HX5imBqoZTKULMTiZ2EjM&fileExtension=txt", + "hi.srt": "https://api.coursera.org/api/subtitleAssetProxy.v1/v2OWSJUVSqCjlkiVFRqgng?expiry=1495238400000&hmac=qk--Ptsc4w3u6c-5BFPO9vhjyczMHzlSqUOQskjbfZ0&fileExtension=srt", + "es.srt": "https://api.coursera.org/api/subtitleAssetProxy.v1/jtYTHsSQToaWEx7EkJ6G4A?expiry=1495238400000&hmac=Ts5QKzu0jwhUafwsaHk7RKoQJK26d4_bzrX2M6iuRaQ&fileExtension=srt", + "pl.srt": "https://api.coursera.org/api/subtitleAssetProxy.v1/RGtowSWPQxSraMElj3MUbA?expiry=1495238400000&hmac=mcaMPGeK3J7Fn9RRwnuVFnHkyr1COFnLXYKVkUbyfSg&fileExtension=srt", + "ja.srt": "https://api.coursera.org/api/subtitleAssetProxy.v1/758f7ykrRcWfH-8pK3XFHw?expiry=1495238400000&hmac=huh5qtCJVj4rEJnsJ6D7MJdCcqN-s9cMd-M6xlSicLc&fileExtension=srt", + "pt-BR.srt": "https://api.coursera.org/api/subtitleAssetProxy.v1/1kRk9rXlSSeEZPa15aknhQ?expiry=1495238400000&hmac=XYyDJ71d9gl3HOqNplyJeEr7Wd2UhU3DhT-9w_Yudzs&fileExtension=srt", + "hi.txt": 
"https://api.coursera.org/api/subtitleAssetProxy.v1/v2OWSJUVSqCjlkiVFRqgng?expiry=1495238400000&hmac=earWLk_RUi3K5UpZfEVOlBgOcpSE9efXz2njRKu31rQ&fileExtension=txt", + "es.txt": "https://api.coursera.org/api/subtitleAssetProxy.v1/jtYTHsSQToaWEx7EkJ6G4A?expiry=1495238400000&hmac=sd6_C14J-qEkvvbqNTgI8W5eUCvOKwW6RzHcz8yF2Jk&fileExtension=txt", + "pl.txt": "https://api.coursera.org/api/subtitleAssetProxy.v1/RGtowSWPQxSraMElj3MUbA?expiry=1495238400000&hmac=sFwO_BWNlhZEDHsXYkFlnOEtHBIX8lSsVGIOLIHeZZ0&fileExtension=txt", + "ja.txt": "https://api.coursera.org/api/subtitleAssetProxy.v1/758f7ykrRcWfH-8pK3XFHw?expiry=1495238400000&hmac=WMhDBDbF6SiBuvRwg_QEkglLSK36bj8_5y6kZ9z94YY&fileExtension=txt", + "pt-BR.txt": "https://api.coursera.org/api/subtitleAssetProxy.v1/1kRk9rXlSSeEZPa15aknhQ?expiry=1495238400000&hmac=uQaL2V2AJ_Wp5dlCZH1HeyTU_AQo9VdJ2cphUhG8yxk&fileExtension=txt" +} \ No newline at end of file diff --git a/coursera/test/fixtures/json/video-output-1-en.json b/coursera/test/fixtures/json/video-output-1-en.json new file mode 100644 index 0000000..e2cb7cb --- /dev/null +++ b/coursera/test/fixtures/json/video-output-1-en.json @@ -0,0 +1,4 @@ +{ + "en.srt": "https://api.coursera.org/api/subtitleAssetProxy.v1/GgGZN65HQkyBmTeuR2JMsw?expiry=1495238400000&hmac=afqFhv9FWfxxEeSka8PCA4ihiyX3g2Z6K4jWJPFlcdo&fileExtension=srt", + "en.txt": "https://api.coursera.org/api/subtitleAssetProxy.v1/GgGZN65HQkyBmTeuR2JMsw?expiry=1495238400000&hmac=2Z37WW5Rc7GoT0eft1vdK0HX5imBqoZTKULMTiZ2EjM&fileExtension=txt" +} \ No newline at end of file diff --git a/coursera/test/fixtures/json/video-output-1.json b/coursera/test/fixtures/json/video-output-1.json new file mode 100644 index 0000000..315a1af --- /dev/null +++ b/coursera/test/fixtures/json/video-output-1.json @@ -0,0 +1,6 @@ +{ + "zh-CN.txt": "https://api.coursera.org/api/subtitleAssetProxy.v1/UKEuZoMQRcChLmaDEMXAsA?expiry=1495238400000&hmac=eNyKwEu_aMQtn7bg0mUj6uIyVZvjahFSE5x2CrbOXOU&fileExtension=txt", + "en.srt": 
"https://api.coursera.org/api/subtitleAssetProxy.v1/GgGZN65HQkyBmTeuR2JMsw?expiry=1495238400000&hmac=afqFhv9FWfxxEeSka8PCA4ihiyX3g2Z6K4jWJPFlcdo&fileExtension=srt", + "zh-CN.srt": "https://api.coursera.org/api/subtitleAssetProxy.v1/UKEuZoMQRcChLmaDEMXAsA?expiry=1495238400000&hmac=nmGzGoF4oNLv28ZDLUtX5dF4xPXUABgym76XMs4UzDE&fileExtension=srt", + "en.txt": "https://api.coursera.org/api/subtitleAssetProxy.v1/GgGZN65HQkyBmTeuR2JMsw?expiry=1495238400000&hmac=2Z37WW5Rc7GoT0eft1vdK0HX5imBqoZTKULMTiZ2EjM&fileExtension=txt" +} \ No newline at end of file diff --git a/coursera/test/fixtures/json/video-output-2.json b/coursera/test/fixtures/json/video-output-2.json new file mode 100644 index 0000000..0b29d8c --- /dev/null +++ b/coursera/test/fixtures/json/video-output-2.json @@ -0,0 +1,6 @@ +{ + "zh-TW.srt": "https://api.coursera.org/api/subtitleAssetProxy.v1/j8femXUVQaGH3pl1FYGh-Q?expiry=1495238400000&hmac=-sOeJbk_bICP9OMfbtkjLuwUAIZZcjGasIMk8JO6n0Q&fileExtension=srt", + "en.txt": "https://api.coursera.org/api/subtitleAssetProxy.v1/r3LdPY_CTUqy3T2Pwu1KVQ?expiry=1495238400000&hmac=xhMK0SSslbfwxl-vzjAXy-bd_iQQTY9iAIrNP4QHxq4&fileExtension=txt", + "en.srt": "https://api.coursera.org/api/subtitleAssetProxy.v1/r3LdPY_CTUqy3T2Pwu1KVQ?expiry=1495238400000&hmac=nO6NGCExQ5FO0aFFnr_YVXtd_lVW4JQaT34WS9tJi6c&fileExtension=srt", + "zh-TW.txt": "https://api.coursera.org/api/subtitleAssetProxy.v1/j8femXUVQaGH3pl1FYGh-Q?expiry=1495238400000&hmac=O9DKhZW6bOsI7ncNZIZPBMXmsreSrgulhGf3eyTCULo&fileExtension=txt" +} \ No newline at end of file diff --git a/coursera/test/fixtures/json/video-reply-1.json b/coursera/test/fixtures/json/video-reply-1.json new file mode 100644 index 0000000..5fe9d2e --- /dev/null +++ b/coursera/test/fixtures/json/video-reply-1.json @@ -0,0 +1,47 @@ +{ + "sources": [ + { + "resolution": "540p", + "formatSources": { + "video/webm": 
"https://d3c33hcgiwev3.cloudfront.net/20.1-Conclusion-SummaryAndThankYou.63149a70b22b11e4aca907c8d9623f2b/full/540p/index.webm?Expires=1495238400&Signature=Oj2j7hCpfrpp1ugtZHjRITM9D4MjaOJm2x34ecUPGH2nm~BIvt6RY25XpKgCFZ0qbIK01eloymAUolfupBwzJYIjwANxibwpIJ3bX43dtxnoy1dRh2F1YZoZ6lbPCVOSCxODJFod7bZPCuqRTfXvK6X6F0o-IzkbXy8myk6G5Js_&Key-Pair-Id=APKAJLTNE6QMUY6HBC5A", + "video/mp4": "https://d3c33hcgiwev3.cloudfront.net/20.1-Conclusion-SummaryAndThankYou.63149a70b22b11e4aca907c8d9623f2b/full/540p/index.mp4?Expires=1495238400&Signature=TW8nTCrNKfrBGLCloVNFvNB~7qsNXaRI8T~gBUCxyxxPzumARwdw8W9ONVd-8j2TrI8Zvm~j4ysS4UedtLTKynDewxOzjrbsVc3HRBLTqrcQNjjLkG9vzbrGz2wUMMRUwX6qlwT8xFTVuNjh7-W72gq83bzk4eyaALHO~YKXvxk_&Key-Pair-Id=APKAJLTNE6QMUY6HBC5A" + } + } + ], + "subtitles": { + "hi": "/api/subtitleAssetProxy.v1/v2OWSJUVSqCjlkiVFRqgng?expiry=1495238400000&hmac=qk--Ptsc4w3u6c-5BFPO9vhjyczMHzlSqUOQskjbfZ0&fileExtension=srt", + "en": "/api/subtitleAssetProxy.v1/GgGZN65HQkyBmTeuR2JMsw?expiry=1495238400000&hmac=afqFhv9FWfxxEeSka8PCA4ihiyX3g2Z6K4jWJPFlcdo&fileExtension=srt", + "zh-CN": "/api/subtitleAssetProxy.v1/UKEuZoMQRcChLmaDEMXAsA?expiry=1495238400000&hmac=nmGzGoF4oNLv28ZDLUtX5dF4xPXUABgym76XMs4UzDE&fileExtension=srt", + "es": "/api/subtitleAssetProxy.v1/jtYTHsSQToaWEx7EkJ6G4A?expiry=1495238400000&hmac=Ts5QKzu0jwhUafwsaHk7RKoQJK26d4_bzrX2M6iuRaQ&fileExtension=srt", + "pl": "/api/subtitleAssetProxy.v1/RGtowSWPQxSraMElj3MUbA?expiry=1495238400000&hmac=mcaMPGeK3J7Fn9RRwnuVFnHkyr1COFnLXYKVkUbyfSg&fileExtension=srt", + "ja": "/api/subtitleAssetProxy.v1/758f7ykrRcWfH-8pK3XFHw?expiry=1495238400000&hmac=huh5qtCJVj4rEJnsJ6D7MJdCcqN-s9cMd-M6xlSicLc&fileExtension=srt", + "pt-BR": "/api/subtitleAssetProxy.v1/1kRk9rXlSSeEZPa15aknhQ?expiry=1495238400000&hmac=XYyDJ71d9gl3HOqNplyJeEr7Wd2UhU3DhT-9w_Yudzs&fileExtension=srt" + }, + "playlists": { + "hls": 
"https://d3c33hcgiwev3.cloudfront.net/assetMasterHlsPlaylists.v1/DB-HfUh-EeWWUA71mMib3w?expiry=1495238400000&hmac=kaIpHBiD8US9yrHVsABKCkRPRgIzsBA7tWzB-PHEqhY&mediaCdn=cloudfront" + }, + "subtitlesVtt": { + "hi": "/api/subtitleAssetProxy.v1/v2OWSJUVSqCjlkiVFRqgng?expiry=1495238400000&hmac=4S0NgqfShX81v0QJckTU0IbeAJJnD9m_iZcTYJltbNc&fileExtension=vtt", + "en": "/api/subtitleAssetProxy.v1/GgGZN65HQkyBmTeuR2JMsw?expiry=1495238400000&hmac=lMlV1hdtArRLJvePvHHqFJhekv1Gs-P6WzPQz_TEnzE&fileExtension=vtt", + "zh-CN": "/api/subtitleAssetProxy.v1/UKEuZoMQRcChLmaDEMXAsA?expiry=1495238400000&hmac=H7jCFYLCxt9yS5y5YiLkUGwd3-0fWiioGGVVwiYFfjo&fileExtension=vtt", + "es": "/api/subtitleAssetProxy.v1/jtYTHsSQToaWEx7EkJ6G4A?expiry=1495238400000&hmac=nTa427_IEx68vJJYeNQvNQhWQaNOSwJkWUqfAYW2tYI&fileExtension=vtt", + "pl": "/api/subtitleAssetProxy.v1/RGtowSWPQxSraMElj3MUbA?expiry=1495238400000&hmac=jPndFFisYDio-2FgSWp-vMVOg9Ybx-Zh4tODJsHvUWY&fileExtension=vtt", + "ja": "/api/subtitleAssetProxy.v1/758f7ykrRcWfH-8pK3XFHw?expiry=1495238400000&hmac=m9-S0joHnqjaCJ72qKLtL199dsngS9zoP1jmvI7_AA4&fileExtension=vtt", + "pt-BR": "/api/subtitleAssetProxy.v1/1kRk9rXlSSeEZPa15aknhQ?expiry=1495238400000&hmac=ePd5ZewhjQBA5J6QybTLY-U8yEz2FlIqZuvCE7gdYxQ&fileExtension=vtt" + }, + "subtitlesTxt": { + "hi": "/api/subtitleAssetProxy.v1/v2OWSJUVSqCjlkiVFRqgng?expiry=1495238400000&hmac=earWLk_RUi3K5UpZfEVOlBgOcpSE9efXz2njRKu31rQ&fileExtension=txt", + "en": "/api/subtitleAssetProxy.v1/GgGZN65HQkyBmTeuR2JMsw?expiry=1495238400000&hmac=2Z37WW5Rc7GoT0eft1vdK0HX5imBqoZTKULMTiZ2EjM&fileExtension=txt", + "zh-CN": "/api/subtitleAssetProxy.v1/UKEuZoMQRcChLmaDEMXAsA?expiry=1495238400000&hmac=eNyKwEu_aMQtn7bg0mUj6uIyVZvjahFSE5x2CrbOXOU&fileExtension=txt", + "es": "/api/subtitleAssetProxy.v1/jtYTHsSQToaWEx7EkJ6G4A?expiry=1495238400000&hmac=sd6_C14J-qEkvvbqNTgI8W5eUCvOKwW6RzHcz8yF2Jk&fileExtension=txt", + "pl": 
"/api/subtitleAssetProxy.v1/RGtowSWPQxSraMElj3MUbA?expiry=1495238400000&hmac=sFwO_BWNlhZEDHsXYkFlnOEtHBIX8lSsVGIOLIHeZZ0&fileExtension=txt", + "ja": "/api/subtitleAssetProxy.v1/758f7ykrRcWfH-8pK3XFHw?expiry=1495238400000&hmac=WMhDBDbF6SiBuvRwg_QEkglLSK36bj8_5y6kZ9z94YY&fileExtension=txt", + "pt-BR": "/api/subtitleAssetProxy.v1/1kRk9rXlSSeEZPa15aknhQ?expiry=1495238400000&hmac=uQaL2V2AJ_Wp5dlCZH1HeyTU_AQo9VdJ2cphUhG8yxk&fileExtension=txt" + }, + "posters": [ + { + "url": "https://d3c33hcgiwev3.cloudfront.net/imageAssetProxy.v1/20.1-Conclusion-SummaryAndThankYou.63149a70b22b11e4aca907c8d9623f2b/thumbnails/540p/0.jpg?expiry=1495238400000&hmac=LuHD_gGzBtwVeNIQqABnGU69sWteFl8xdMozFAsGPco", + "resolution": "540p" + } + ] +} \ No newline at end of file diff --git a/coursera/test/fixtures/json/video-reply-2.json b/coursera/test/fixtures/json/video-reply-2.json new file mode 100644 index 0000000..9f4435d --- /dev/null +++ b/coursera/test/fixtures/json/video-reply-2.json @@ -0,0 +1,77 @@ +{ + "posters": [ + { + "url": "https://d3c33hcgiwev3.cloudfront.net/imageAssetProxy.v1/qHTMkA4fEeW2rSIAC2yC6g.processed/thumbnails/540p/0.jpg?expiry=1495238400000&hmac=ES_Ho42kS5yI6VWTPDEY2LNWyvBMOoKwoGXJHb6LqnU", + "resolution": "540p" + } + ], + "subtitles": { + "en": "/api/subtitleAssetProxy.v1/r3LdPY_CTUqy3T2Pwu1KVQ?expiry=1495238400000&hmac=nO6NGCExQ5FO0aFFnr_YVXtd_lVW4JQaT34WS9tJi6c&fileExtension=srt", + "fr": "/api/subtitleAssetProxy.v1/wAkpEeN1SE6JKRHjdWhOTw?expiry=1495238400000&hmac=DeXzNpOf_7RhvGBaqMWqud5J96PoIh6At1eSDWF1RUM&fileExtension=srt", + "lt": "/api/subtitleAssetProxy.v1/qDmNTUOsQoG5jU1DrEKBtw?expiry=1495238400000&hmac=ifyDt77JEmZu5SgJ0nubLGsy6JV9d2IAzNvZdVevBcE&fileExtension=srt", + "ko": "/api/subtitleAssetProxy.v1/BatzP4LfQA6rcz-C35AOXQ?expiry=1495238400000&hmac=U70Yc3wumD-G_XE3AYEISRbqtjtl9WSOqQMjlHI2OGM&fileExtension=srt", + "hi": 
"/api/subtitleAssetProxy.v1/-v2-jzNFSxq9vo8zRVsa5Q?expiry=1495238400000&hmac=teLyzOnnfT0P8sZVvD4O8fFDgd0RojSYCxB3n6RvTk0&fileExtension=srt", + "fa": "/api/subtitleAssetProxy.v1/MVu-cT46QyqbvnE-OvMq0g?expiry=1495238400000&hmac=gQgvsV9WFOC-cbOuUjLEIepCtka_W8fup6mltVcnJq8&fileExtension=srt", + "es": "/api/subtitleAssetProxy.v1/_mNV07xnT_ejVdO8Z2_3Ig?expiry=1495238400000&hmac=fC2H9Lajgsm_WkxnsuNaL6TIfgEzpfhdY2dIJhNasMI&fileExtension=srt", + "he": "/api/subtitleAssetProxy.v1/keFd5T42SI6hXeU-NkiO1w?expiry=1495238400000&hmac=DiJ-h_TfbyqgajhWVb-1302MXPoqD5x0JEachin0c3Q&fileExtension=srt", + "ar": "/api/subtitleAssetProxy.v1/vHGgsHVsQ2CxoLB1bKNg9w?expiry=1495238400000&hmac=pmjuKhL8-SNhzXi8FAETaJcakRt5S1yqay-G--r4C0I&fileExtension=srt", + "bn": "/api/subtitleAssetProxy.v1/hi47OjiORsmuOzo4jqbJBQ?expiry=1495238400000&hmac=tcHBX4hMne23haoOlu1olxiKa7M1n_CSOX4eOmH0U14&fileExtension=srt", + "pl": "/api/subtitleAssetProxy.v1/lBH5IsEBRySR-SLBAQcknw?expiry=1495238400000&hmac=QHDKKoNSt2A9Bd4PfRiR82OJwSAsCQmbxZEZoFhLnIg&fileExtension=srt", + "tr": "/api/subtitleAssetProxy.v1/jdBjiWbSSlWQY4lm0rpVzw?expiry=1495238400000&hmac=5WoVi4jm3274ClP3U4JZAIhJt-V5ulNHkNtUACqgR6w&fileExtension=srt", + "zh-TW": "/api/subtitleAssetProxy.v1/j8femXUVQaGH3pl1FYGh-Q?expiry=1495238400000&hmac=-sOeJbk_bICP9OMfbtkjLuwUAIZZcjGasIMk8JO6n0Q&fileExtension=srt", + "hr": "/api/subtitleAssetProxy.v1/k_PHTdx0Rw-zx03cdOcPyw?expiry=1495238400000&hmac=EeciDHPlRVHxIgsgzSJ0uAOjnaatGmqt4bw1hAyGh9A&fileExtension=srt", + "id": "/api/subtitleAssetProxy.v1/xIop0OAYTKaKKdDgGKym3g?expiry=1495238400000&hmac=BT9i5NUXcz5RTOUzjMTK8NOQciU5o-gGI6rTxjNWhbM&fileExtension=srt" + }, + "sources": [ + { + "formatSources": { + "video/webm": 
"https://d3c33hcgiwev3.cloudfront.net/qHTMkA4fEeW2rSIAC2yC6g.processed/full/540p/index.webm?Expires=1495238400&Signature=iAKFovwfSpnnEflg4blQajE9Tle2peDHKv0ScvXSK5rMVBQnX9SWq8M2CvsawxPwKdA2Xo02lOZBi~-5I5F6fCag9mzzlV-8Q-dxVoS1yWZLu7HtGROStDSwOiJYvGoLGSgS2dT0dGYG4rNJ9hxQmElQzBCOYkeQsP6lIsh0Ejg_&Key-Pair-Id=APKAJLTNE6QMUY6HBC5A", + "video/mp4": "https://d3c33hcgiwev3.cloudfront.net/qHTMkA4fEeW2rSIAC2yC6g.processed/full/540p/index.mp4?Expires=1495238400&Signature=SkYBCgVvj~W2eeSAKjtX4igYZ2WLKq8crzqLbFqgDbZxhH48jW3nXZNWB5~H6ev0EIHkSAbMSQWP4xUGKhzGhcciL8B9jB8LjI180wsSv1jfNknCP9S5p9vd1mdCidheNmJftBtIjfr54m8CgFYUqe1WAo2aLUzRimiS~nf8kxk_&Key-Pair-Id=APKAJLTNE6QMUY6HBC5A" + }, + "resolution": "540p" + }, + { + "formatSources": { + "video/webm": "https://d3c33hcgiwev3.cloudfront.net/qHTMkA4fEeW2rSIAC2yC6g.processed/full/360p/index.webm?Expires=1495238400&Signature=CBaZCNZGbCJ9UpJdWyfGA~KwtUx4UxqnwM8v6GS7T2fsynUVexCfjBcE7IMowDzWa~GZ-jdOI43~s5e4kVc4hbiNOaQoZ-p-te1AffsRJaMhlhI529vxfWUQJGhO3bUGbn9Az9ueSehoe8WLojtHHb5q-IIr53MX-rftl~d3srI_&Key-Pair-Id=APKAJLTNE6QMUY6HBC5A", + "video/mp4": "https://d3c33hcgiwev3.cloudfront.net/qHTMkA4fEeW2rSIAC2yC6g.processed/full/360p/index.mp4?Expires=1495238400&Signature=dt9-1ARw8I5U1IrIKJGbQzy5MkCqjySGXutT~KFNx8~~UD0v3T0cFwUG3ggLhL3lkyMAztA-dTpARKfi2igKgq6Q8qfcnTfs8~iu5Ayt1vRZVHDfgomlasB3aElmOHB7WaWkQbZJCChXYlVgg2fDKLGxMcfEGf797AzzDrhEFBY_&Key-Pair-Id=APKAJLTNE6QMUY6HBC5A" + } + } + ], + "subtitlesTxt": { + "en": "/api/subtitleAssetProxy.v1/r3LdPY_CTUqy3T2Pwu1KVQ?expiry=1495238400000&hmac=xhMK0SSslbfwxl-vzjAXy-bd_iQQTY9iAIrNP4QHxq4&fileExtension=txt", + "fr": "/api/subtitleAssetProxy.v1/wAkpEeN1SE6JKRHjdWhOTw?expiry=1495238400000&hmac=Y9ysEdDIlObbFPyLBWorn1XEOeJ57TEYPxWsnzE3x5Q&fileExtension=txt", + "lt": "/api/subtitleAssetProxy.v1/qDmNTUOsQoG5jU1DrEKBtw?expiry=1495238400000&hmac=92ArkEZV3O5nkxp5f7pxyUClVWbywOWjJ8DFE86foKA&fileExtension=txt", + "ko": 
"/api/subtitleAssetProxy.v1/BatzP4LfQA6rcz-C35AOXQ?expiry=1495238400000&hmac=aVqHA-CIZDni9UrmUGB6aR2VjDstYzdquDrOvzusKbc&fileExtension=txt", + "hi": "/api/subtitleAssetProxy.v1/-v2-jzNFSxq9vo8zRVsa5Q?expiry=1495238400000&hmac=fN1qhDyCL5aMzYRW2NbkfNyikEypjzB57LJtO9QXb2Q&fileExtension=txt", + "fa": "/api/subtitleAssetProxy.v1/MVu-cT46QyqbvnE-OvMq0g?expiry=1495238400000&hmac=JLJVoBvCluUXGXqzDti9uaW0gDjCsRWQIBOqxARAM9w&fileExtension=txt", + "es": "/api/subtitleAssetProxy.v1/_mNV07xnT_ejVdO8Z2_3Ig?expiry=1495238400000&hmac=FzDR-l0J3CTljW9aywGiBn56TTWdmzh1TEYzsfmdceo&fileExtension=txt", + "he": "/api/subtitleAssetProxy.v1/keFd5T42SI6hXeU-NkiO1w?expiry=1495238400000&hmac=xwD0e-3s7FwTbDuQn-nNVIi9eoAueviOZl4Ezofd_rY&fileExtension=txt", + "ar": "/api/subtitleAssetProxy.v1/vHGgsHVsQ2CxoLB1bKNg9w?expiry=1495238400000&hmac=it_tXtSCiX5oX9MV9VDZQuz1hj5IFZOohCAyYOn8pR4&fileExtension=txt", + "bn": "/api/subtitleAssetProxy.v1/hi47OjiORsmuOzo4jqbJBQ?expiry=1495238400000&hmac=fZebyPIojlQlJL3HuHkEOgoQHlwPsJJ5YEC4Pd_a4Sg&fileExtension=txt", + "pl": "/api/subtitleAssetProxy.v1/lBH5IsEBRySR-SLBAQcknw?expiry=1495238400000&hmac=jmygEnmsUvBv4-sDUbYZK0MsJht9Mg24AAyaI5iMhmg&fileExtension=txt", + "tr": "/api/subtitleAssetProxy.v1/jdBjiWbSSlWQY4lm0rpVzw?expiry=1495238400000&hmac=kgkFFJSswv5HLtPMkjV7rsgLQZzoSBpWHbIffW1FUKc&fileExtension=txt", + "zh-TW": "/api/subtitleAssetProxy.v1/j8femXUVQaGH3pl1FYGh-Q?expiry=1495238400000&hmac=O9DKhZW6bOsI7ncNZIZPBMXmsreSrgulhGf3eyTCULo&fileExtension=txt", + "hr": "/api/subtitleAssetProxy.v1/k_PHTdx0Rw-zx03cdOcPyw?expiry=1495238400000&hmac=DO3oN6U9JBwZcScxzOsIAI8Nn2CTaGnlWGi4pAxjDEE&fileExtension=txt", + "id": "/api/subtitleAssetProxy.v1/xIop0OAYTKaKKdDgGKym3g?expiry=1495238400000&hmac=HIQ_jyC6_xWBpz4dCF6hxiG5ay1tVSJwQJF8LSCo0gk&fileExtension=txt" + }, + "subtitlesVtt": { + "en": "/api/subtitleAssetProxy.v1/r3LdPY_CTUqy3T2Pwu1KVQ?expiry=1495238400000&hmac=WtAfot596syGVoSQ-UJ-QpyWQWqhSQ4auwDRijJy7IM&fileExtension=vtt", + "fr": 
"/api/subtitleAssetProxy.v1/wAkpEeN1SE6JKRHjdWhOTw?expiry=1495238400000&hmac=KDSprAsOxTLlNvgUVon4RRUAAN7BhOd6rRuTm8KWEU0&fileExtension=vtt", + "lt": "/api/subtitleAssetProxy.v1/qDmNTUOsQoG5jU1DrEKBtw?expiry=1495238400000&hmac=Hwu1AvpVFsSZUrlSh-VxQ2Rvj6dJ3pgu1_VXDflnH-k&fileExtension=vtt", + "ko": "/api/subtitleAssetProxy.v1/BatzP4LfQA6rcz-C35AOXQ?expiry=1495238400000&hmac=I1QSwiy02AWEChUfrmw5C8XKrDReprSmP0mqBv-ipno&fileExtension=vtt", + "hi": "/api/subtitleAssetProxy.v1/-v2-jzNFSxq9vo8zRVsa5Q?expiry=1495238400000&hmac=p6JE3hXLwYbJmlTCQf9vZtIO0Gsg8TVAvPjobSFU0eg&fileExtension=vtt", + "fa": "/api/subtitleAssetProxy.v1/MVu-cT46QyqbvnE-OvMq0g?expiry=1495238400000&hmac=BkasYNlGSg-zm2GSKqcDLzfwzISJu6agsq1qirTV3xU&fileExtension=vtt", + "es": "/api/subtitleAssetProxy.v1/_mNV07xnT_ejVdO8Z2_3Ig?expiry=1495238400000&hmac=zlOw3ifj1lIy4i7GNROw5muCIBidW2MCFnSHRHZ5vhI&fileExtension=vtt", + "he": "/api/subtitleAssetProxy.v1/keFd5T42SI6hXeU-NkiO1w?expiry=1495238400000&hmac=9kbtSLEO8NqxGhzvRMUEJGIZ0rlUQ0IO2F_Un-Bpmj0&fileExtension=vtt", + "ar": "/api/subtitleAssetProxy.v1/vHGgsHVsQ2CxoLB1bKNg9w?expiry=1495238400000&hmac=YnlfoVEGoQ7LAtnEjrMFXfRPFy5F0-tq_C7s_Vlqf8o&fileExtension=vtt", + "bn": "/api/subtitleAssetProxy.v1/hi47OjiORsmuOzo4jqbJBQ?expiry=1495238400000&hmac=J0pV67Lu5PRMeSP_6Mk-pN6CdlguHW024MYQDf1ZOjI&fileExtension=vtt", + "pl": "/api/subtitleAssetProxy.v1/lBH5IsEBRySR-SLBAQcknw?expiry=1495238400000&hmac=knptQ0ipo3LojO72PQawrSjdiy6VqBjlFHX62ECyFPg&fileExtension=vtt", + "tr": "/api/subtitleAssetProxy.v1/jdBjiWbSSlWQY4lm0rpVzw?expiry=1495238400000&hmac=9gYXktWObxjtSPjkiBB3qo__LZhsRzWoTEPc32uRxhA&fileExtension=vtt", + "zh-TW": "/api/subtitleAssetProxy.v1/j8femXUVQaGH3pl1FYGh-Q?expiry=1495238400000&hmac=coTFw8DgESXYsiMqySKkp1JXRfUkwx2fBY_lniYl-i0&fileExtension=vtt", + "hr": "/api/subtitleAssetProxy.v1/k_PHTdx0Rw-zx03cdOcPyw?expiry=1495238400000&hmac=OUzErWIwafoewJ97evxPxAdJRiHgTUQEkPydMHHJElk&fileExtension=vtt", + "id": 
"/api/subtitleAssetProxy.v1/xIop0OAYTKaKKdDgGKym3g?expiry=1495238400000&hmac=rt0Afn6mQCoiOAVn258RUI0qiXnInP73f7DM3qldWYY&fileExtension=vtt" + }, + "playlists": { + "hls": "https://d3c33hcgiwev3.cloudfront.net/assetMasterHlsPlaylists.v1/qHTMkA4fEeW2rSIAC2yC6g?expiry=1495238400000&hmac=PwXl2HLjLXI3lkJ4YeUu1pNM9Y8XG39j2QzOLFwQ8F4&mediaCdn=cloudfront" + } +} \ No newline at end of file diff --git a/coursera/test/test_api.py b/coursera/test/test_api.py index c5586ef..535a974 100644 --- a/coursera/test/test_api.py +++ b/coursera/test/test_api.py @@ -10,7 +10,7 @@ from mock import patch, Mock from coursera import api from coursera import define -from coursera.test.utils import slurp_fixture +from coursera.test.utils import slurp_fixture, links_to_plain_text from coursera.utils import BeautifulSoup from requests.exceptions import HTTPError @@ -73,7 +73,7 @@ def test_extract_links_from_lecture_http_error(get_page, course): locked_response.status_code = define.HTTP_FORBIDDEN get_page.side_effect = HTTPError('Mocked HTTP error', response=locked_response) - assert None == course.extract_links_from_lecture('0') + assert None == course.extract_links_from_lecture('fake_course_id', '0') @patch('coursera.api.get_page') @@ -89,23 +89,113 @@ def test_extract_links_from_quiz_http_error(get_page, course): assert None == course.extract_links_from_quiz('0') +@patch('coursera.api.get_page') +def test_extract_references_poll_http_error(get_page, course): + """ + This test checks that downloader skips locked programming assignments + instead of throwing an error. 
(Locked == returning 403 error code) + """ + locked_response = Response() + locked_response.status_code = define.HTTP_FORBIDDEN + get_page.side_effect = HTTPError('Mocked HTTP error', + response=locked_response) + assert None == course.extract_references_poll() + + +@patch('coursera.api.get_page') +def test_extract_links_from_reference_http_error(get_page, course): + """ + This test checks that downloader skips locked resources + instead of throwing an error. (Locked == returning 403 error code) + """ + locked_response = Response() + locked_response.status_code = define.HTTP_FORBIDDEN + get_page.side_effect = HTTPError('Mocked HTTP error', + response=locked_response) + assert None == course.extract_links_from_reference('0') + + +@patch('coursera.api.get_page') +def test_extract_links_from_programming_immediate_instructions_http_error( + get_page, course): + """ + This test checks that downloader skips locked programming immediate instructions + instead of throwing an error. (Locked == returning 403 error code) + """ + locked_response = Response() + locked_response.status_code = define.HTTP_FORBIDDEN + get_page.side_effect = HTTPError('Mocked HTTP error', + response=locked_response) + assert ( + None == course.extract_links_from_programming_immediate_instructions('0')) + + @patch('coursera.api.get_page') def test_ondemand_programming_supplement_no_instructions(get_page, course): - no_instructions = slurp_fixture('json/supplement-programming-no-instructions.json') + no_instructions = slurp_fixture( + 'json/supplement-programming-no-instructions.json') get_page.return_value = json.loads(no_instructions) output = course.extract_links_from_programming('0') assert {} == output +@patch('coursera.api.get_page') +@pytest.mark.parametrize( + "input_filename,expected_output", [ + ('peer-assignment-instructions-all.json', 'intro Review criteria section'), + ('peer-assignment-instructions-no-title.json', 'intro section'), + 
('peer-assignment-instructions-only-introduction.json', 'intro'), + ('peer-assignment-instructions-only-sections.json', 'Review criteria section'), + ('peer-assignment-no-instructions.json', ''), + ] +) +def test_ondemand_from_peer_assignment_instructions( + get_page, course, input_filename, expected_output): + instructions = slurp_fixture('json/%s' % input_filename) + get_page.return_value = json.loads(instructions) + + output = course.extract_links_from_peer_assignment('0') + assert expected_output == links_to_plain_text(output) + + +@patch('coursera.api.get_page') +def test_ondemand_from_programming_immediate_instructions_no_instructions( + get_page, course): + no_instructions = slurp_fixture( + 'json/supplement-programming-immediate-instructions-no-instructions.json') + get_page.return_value = json.loads(no_instructions) + + output = course.extract_links_from_programming_immediate_instructions('0') + assert {} == output + + @patch('coursera.api.get_page') def test_ondemand_programming_supplement_empty_instructions(get_page, course): - empty_instructions = slurp_fixture('json/supplement-programming-empty-instructions.json') + empty_instructions = slurp_fixture( + 'json/supplement-programming-empty-instructions.json') get_page.return_value = json.loads(empty_instructions) output = course.extract_links_from_programming('0') # Make sure that SOME html content has been extracted, but remove - # it immeditely because it's a hassle to properly prepare test input + # it immediately because it's a hassle to properly prepare test input + # for it. FIXME later. 
+ assert 'html' in output + del output['html'] + + assert {} == output + + +@patch('coursera.api.get_page') +def test_ondemand_programming_immediate_instructions_empty_instructions( + get_page, course): + empty_instructions = slurp_fixture( + 'json/supplement-programming-immediate-instructions-empty-instructions.json') + get_page.return_value = json.loads(empty_instructions) + output = course.extract_links_from_programming_immediate_instructions('0') + + # Make sure that SOME html content has been extracted, but remove + # it immediately because it's a hassle to properly prepare test input # for it. FIXME later. assert 'html' in output del output['html'] @@ -119,14 +209,50 @@ def test_ondemand_programming_supplement_one_asset(get_page, course): one_asset_url = slurp_fixture('json/asset-urls-one.json') asset_json = json.loads(one_asset_url) get_page.side_effect = [json.loads(one_asset_tag), - json.loads(one_asset_url)] + json.loads(one_asset_url)] expected_output = {'pdf': [(asset_json['elements'][0]['url'], - 'statement-pca')]} + 'statement-pca')]} output = course.extract_links_from_programming('0') # Make sure that SOME html content has been extracted, but remove - # it immeditely because it's a hassle to properly prepare test input + # it immediately because it's a hassle to properly prepare test input + # for it. FIXME later. + assert 'html' in output + del output['html'] + + assert expected_output == output + + +@patch('coursera.api.get_page') +def test_extract_references_poll(get_page, course): + """ + Test extracting course references. 
+ """ + get_page.side_effect = [ + json.loads(slurp_fixture('json/references-poll-reply.json')) + ] + expected_output = json.loads( + slurp_fixture('json/references-poll-output.json')) + output = course.extract_references_poll() + assert expected_output == output + + +@patch('coursera.api.get_page') +def test_ondemand_programming_immediate_instructions_one_asset(get_page, course): + one_asset_tag = slurp_fixture( + 'json/supplement-programming-immediate-instructions-one-asset.json') + one_asset_url = slurp_fixture('json/asset-urls-one.json') + asset_json = json.loads(one_asset_url) + get_page.side_effect = [json.loads(one_asset_tag), + json.loads(one_asset_url)] + + expected_output = {'pdf': [(asset_json['elements'][0]['url'], + 'statement-pca')]} + output = course.extract_links_from_programming_immediate_instructions('0') + + # Make sure that SOME html content has been extracted, but remove + # it immediately because it's a hassle to properly prepare test input # for it. FIXME later. assert 'html' in output del output['html'] @@ -136,17 +262,19 @@ def test_ondemand_programming_supplement_one_asset(get_page, course): @patch('coursera.api.get_page') def test_ondemand_programming_supplement_three_assets(get_page, course): - three_assets_tag = slurp_fixture('json/supplement-programming-three-assets.json') + three_assets_tag = slurp_fixture( + 'json/supplement-programming-three-assets.json') three_assets_url = slurp_fixture('json/asset-urls-three.json') get_page.side_effect = [json.loads(three_assets_tag), - json.loads(three_assets_url)] + json.loads(three_assets_url)] - expected_output = json.loads(slurp_fixture('json/supplement-three-assets-output.json')) + expected_output = json.loads(slurp_fixture( + 'json/supplement-three-assets-output.json')) output = course.extract_links_from_programming('0') output = json.loads(json.dumps(output)) # Make sure that SOME html content has been extracted, but remove - # it immeditely because it's a hassle to properly prepare test 
input + # it immediately because it's a hassle to properly prepare test input # for it. FIXME later. assert 'html' in output del output['html'] @@ -156,12 +284,15 @@ def test_ondemand_programming_supplement_three_assets(get_page, course): @patch('coursera.api.get_page') def test_extract_links_from_lecture_assets_typename_asset(get_page, course): - open_course_assets_reply = slurp_fixture('json/supplement-open-course-assets-reply.json') - api_assets_v1_reply = slurp_fixture('json/supplement-api-assets-v1-reply.json') + open_course_assets_reply = slurp_fixture( + 'json/supplement-open-course-assets-reply.json') + api_assets_v1_reply = slurp_fixture( + 'json/supplement-api-assets-v1-reply.json') get_page.side_effect = [json.loads(open_course_assets_reply), - json.loads(api_assets_v1_reply)] + json.loads(api_assets_v1_reply)] - expected_output = json.loads(slurp_fixture('json/supplement-extract-links-from-lectures-output.json')) + expected_output = json.loads(slurp_fixture( + 'json/supplement-extract-links-from-lectures-output.json')) assets = ['giAxucdaEeWJTQ5WTi8YJQ'] output = course._extract_links_from_lecture_assets(assets) output = json.loads(json.dumps(output)) @@ -175,14 +306,20 @@ def test_extract_links_from_lecture_assets_typname_url_and_asset(get_page, cours links both from typename == 'asset' and == 'url'. 
""" get_page.side_effect = [ - json.loads(slurp_fixture('json/supplement-open-course-assets-typename-url-reply-1.json')), - json.loads(slurp_fixture('json/supplement-open-course-assets-typename-url-reply-2.json')), - json.loads(slurp_fixture('json/supplement-open-course-assets-typename-url-reply-3.json')), - json.loads(slurp_fixture('json/supplement-open-course-assets-typename-url-reply-4.json')), - json.loads(slurp_fixture('json/supplement-open-course-assets-typename-url-reply-5.json')), + json.loads(slurp_fixture( + 'json/supplement-open-course-assets-typename-url-reply-1.json')), + json.loads(slurp_fixture( + 'json/supplement-open-course-assets-typename-url-reply-2.json')), + json.loads(slurp_fixture( + 'json/supplement-open-course-assets-typename-url-reply-3.json')), + json.loads(slurp_fixture( + 'json/supplement-open-course-assets-typename-url-reply-4.json')), + json.loads(slurp_fixture( + 'json/supplement-open-course-assets-typename-url-reply-5.json')), ] - expected_output = json.loads(slurp_fixture('json/supplement-extract-links-from-lectures-url-asset-output.json')) + expected_output = json.loads(slurp_fixture( + 'json/supplement-extract-links-from-lectures-url-asset-output.json')) assets = ['Yry0spSKEeW8oA5fR3afVQ', 'kMQyUZSLEeWj-hLVp2Pm8w', 'xkAloZmJEeWjYA4jOOgP8Q'] @@ -190,6 +327,7 @@ def test_extract_links_from_lecture_assets_typname_url_and_asset(get_page, cours output = json.loads(json.dumps(output)) assert expected_output == output + @patch('coursera.api.get_page') def test_list_courses(get_page, course): """ @@ -198,31 +336,66 @@ def test_list_courses(get_page, course): get_page.side_effect = [ json.loads(slurp_fixture('json/list-courses-input.json')) ] - expected_output = json.loads(slurp_fixture('json/list-courses-output.json')) + expected_output = json.loads( + slurp_fixture('json/list-courses-output.json')) expected_output = expected_output['courses'] output = course.list_courses() assert expected_output == output +@pytest.mark.parametrize( + 
"input_filename,output_filename,subtitle_language,video_id", [ + ('video-reply-1.json', 'video-output-1.json', + 'en,zh-CN|zh-TW', "None"), + ('video-reply-1.json', 'video-output-1-en.json', + 'zh-TW', "None"), + ('video-reply-1.json', 'video-output-1-en.json', + 'en', "None"), + ('video-reply-1.json', 'video-output-1-all.json', + 'all', "None"), + ('video-reply-1.json', 'video-output-1-all.json', + 'zh-TW,all|zh-CN', "None"), + ('video-reply-2.json', 'video-output-2.json', + 'en,zh-CN|zh-TW', "None"), + ] +) +def test_extract_subtitles_from_video_dom(input_filename, output_filename, subtitle_language, video_id): + video_dom = json.loads(slurp_fixture('json/%s' % input_filename)) + expected_output = json.loads(slurp_fixture('json/%s' % output_filename)) + course = api.CourseraOnDemand( + session=Mock(cookies={}), course_id='0', course_name='test_course') + actual_output = course._extract_subtitles_from_video_dom( + video_dom, subtitle_language, video_id) + actual_output = json.loads(json.dumps(actual_output)) + assert actual_output == expected_output + + @pytest.mark.parametrize( "input_filename,output_filename", [ ('empty-input.json', 'empty-output.txt'), - ('answer-text-replaced-with-span-input.json', 'answer-text-replaced-with-span-output.txt'), - ('question-type-textExactMatch-input.json', 'question-type-textExactMatch-output.txt'), + ('answer-text-replaced-with-span-input.json', + 'answer-text-replaced-with-span-output.txt'), + ('question-type-textExactMatch-input.json', + 'question-type-textExactMatch-output.txt'), ('question-type-regex-input.json', 'question-type-regex-output.txt'), - ('question-type-mathExpression-input.json', 'question-type-mathExpression-output.txt'), + ('question-type-mathExpression-input.json', + 'question-type-mathExpression-output.txt'), ('question-type-checkbox-input.json', 'question-type-checkbox-output.txt'), ('question-type-mcq-input.json', 'question-type-mcq-output.txt'), - ('question-type-singleNumeric-input.json', 
'question-type-singleNumeric-output.txt'), + ('question-type-singleNumeric-input.json', + 'question-type-singleNumeric-output.txt'), ('question-type-reflect-input.json', 'question-type-reflect-output.txt'), - ('question-type-mcqReflect-input.json', 'question-type-mcqReflect-output.txt'), + ('question-type-mcqReflect-input.json', + 'question-type-mcqReflect-output.txt'), ('question-type-unknown-input.json', 'question-type-unknown-output.txt'), ('multiple-questions-input.json', 'multiple-questions-output.txt'), ] ) def test_quiz_exam_to_markup_converter(input_filename, output_filename): - quiz_json = json.loads(slurp_fixture('json/quiz-to-markup/%s' % input_filename)) - expected_output = slurp_fixture('json/quiz-to-markup/%s' % output_filename).strip() + quiz_json = json.loads(slurp_fixture( + 'json/quiz-to-markup/%s' % input_filename)) + expected_output = slurp_fixture( + 'json/quiz-to-markup/%s' % output_filename).strip() converter = api.QuizExamToMarkupConverter(session=None) actual_output = converter(quiz_json).strip() @@ -238,17 +411,34 @@ class TestMarkupToHTMLConverter: STYLE = None def setup_method(self, test_method): - self.STYLE = self._p(define.INSTRUCTIONS_HTML_INJECTION) + self.STYLE = self._p( + "".join([define.INSTRUCTIONS_HTML_INJECTION_PRE, + define.INSTRUCTIONS_HTML_MATHJAX_URL, + define.INSTRUCTIONS_HTML_INJECTION_AFTER]) + ) self.markup_to_html = api.MarkupToHTMLConverter(session=None) + ALTERNATIVE_MATHJAX_CDN = "https://alternative/mathjax/cdn.js" + self.STYLE_WITH_ALTER = self._p( + "".join([define.INSTRUCTIONS_HTML_INJECTION_PRE, + ALTERNATIVE_MATHJAX_CDN, + define.INSTRUCTIONS_HTML_INJECTION_AFTER]) + ) + self.markup_to_html_with_alter_mjcdn = api.MarkupToHTMLConverter( + session=None, mathjax_cdn_url=ALTERNATIVE_MATHJAX_CDN) + def test_empty(self): output = self.markup_to_html("") - assert self._p(""" + output_with_alter_mjcdn = self.markup_to_html_with_alter_mjcdn("") + markup = """ - """) + self.STYLE == output + """ + assert 
self._p(markup) + self.STYLE == output + assert self._p(markup) + \ + self.STYLE_WITH_ALTER == output_with_alter_mjcdn def test_replace_text_tag(self): - output = self.markup_to_html(""" + markup = """ TestNested @@ -257,8 +447,8 @@ class TestMarkupToHTMLConverter: Test2 - """) - assert self._p(""" + """ + result = """

@@ -268,7 +458,12 @@ class TestMarkupToHTMLConverter: Test2

\n - """) + self.STYLE == output + """ + output = self.markup_to_html(markup) + output_with_alter_mjcdn = self.markup_to_html_with_alter_mjcdn(markup) + assert self._p(result) + self.STYLE == output + assert self._p(result) + \ + self.STYLE_WITH_ALTER == output_with_alter_mjcdn def test_replace_heading(self): output = self.markup_to_html(""" @@ -331,7 +526,8 @@ class TestMarkupToHTMLConverter: 'nodata': Mock(data=None, content_type='image/png') } mock_asset_retriever.__call__ = Mock(return_value=None) - mock_asset_retriever.__getitem__ = Mock(side_effect=replies.__getitem__) + mock_asset_retriever.__getitem__ = Mock( + side_effect=replies.__getitem__) self.markup_to_html._asset_retriever = mock_asset_retriever output = self.markup_to_html(""" @@ -362,7 +558,8 @@ class TestMarkupToHTMLConverter: 'bWTK9sYwEeW7AxLLCrgDQQ': Mock(data=b'b', content_type='unknown') } mock_asset_retriever.__call__ = Mock(return_value=None) - mock_asset_retriever.__getitem__ = Mock(side_effect=replies.__getitem__) + mock_asset_retriever.__getitem__ = Mock( + side_effect=replies.__getitem__) self.markup_to_html._asset_retriever = mock_asset_retriever output = self.markup_to_html(""" @@ -400,6 +597,7 @@ def test_quiz_converter(): with open('quiz.html', 'w') as file: file.write(result) + def test_quiz_converter_all(): pytest.skip() import os @@ -413,8 +611,8 @@ def test_quiz_converter_all(): markup_to_html = api.MarkupToHTMLConverter(session=session) path = 'quiz_json' - for filename in ['quiz-audio.json']: #os.listdir(path): - # for filename in ['all_question_types.json']: + for filename in ['quiz-audio.json']: # os.listdir(path): + # for filename in ['all_question_types.json']: # if 'YV0W4' not in filename: # continue # if 'QVHj1' not in filename: @@ -430,6 +628,7 @@ def test_quiz_converter_all(): with open('quiz_html/' + filename + '.html', 'w') as f: f.write(result) + def create_session(): from coursera.coursera_dl import get_session from coursera.credentials import get_credentials @@ 
-455,10 +654,14 @@ def test_asset_retriever(get_reply, get_page): 'vdqUTz61Eea_CQ5dfWSAjQ'] expected_output = [ - api.Asset(id="bWTK9sYwEeW7AxLLCrgDQQ", name="M111.mp3", type_name="audio", url="url4", content_type="image/png", data="<...>"), - api.Asset(id="VceKeChKEeaOMw70NkE3iw", name="09_graph_decomposition_problems_1.pdf", type_name="pdf", url="url7", content_type="image/png", data="<...>"), - api.Asset(id="VcmGXShKEea4ehL5RXz3EQ", name="09_graph_decomposition_starter_files_1.zip", type_name="generic", url="url2", content_type="image/png", data="<...>"), - api.Asset(id="vdqUTz61Eea_CQ5dfWSAjQ", name="Capture.PNG", type_name="image", url="url9", content_type="image/png", data="<...>"), + api.Asset(id="bWTK9sYwEeW7AxLLCrgDQQ", name="M111.mp3", type_name="audio", + url="url4", content_type="image/png", data="<...>"), + api.Asset(id="VceKeChKEeaOMw70NkE3iw", name="09_graph_decomposition_problems_1.pdf", + type_name="pdf", url="url7", content_type="image/png", data="<...>"), + api.Asset(id="VcmGXShKEea4ehL5RXz3EQ", name="09_graph_decomposition_starter_files_1.zip", + type_name="generic", url="url2", content_type="image/png", data="<...>"), + api.Asset(id="vdqUTz61Eea_CQ5dfWSAjQ", name="Capture.PNG", + type_name="image", url="url9", content_type="image/png", data="<...>"), ] retriever = api.AssetRetriever(session=None) diff --git a/coursera/test/test_parsing.py b/coursera/test/test_parsing.py index 385e506..4eac879 100644 --- a/coursera/test/test_parsing.py +++ b/coursera/test/test_parsing.py @@ -65,7 +65,7 @@ def test_that_we_parse_and_write_json_correctly(get_page, json_path): def get_old_style_video(monkeypatch): pytest.skip() """ - Mock some methods that would, otherwise, create repeateadly many web + Mock some methods that would, otherwise, create repeatedly many web requests. 
More specifically, we mock: @@ -139,7 +139,7 @@ def test_get_on_demand_supplement_url_accumulates_assets(mocked): output = course.extract_links_from_supplement('element_id') # Make sure that SOME html content has been extracted, but remove - # it immeditely because it's a hassle to properly prepare test input + # it immediately because it's a hassle to properly prepare test input # for it. FIXME later. assert 'html' in output del output['html'] diff --git a/coursera/test/test_utils.py b/coursera/test/test_utils.py index ec198d5..8724519 100644 --- a/coursera/test/test_utils.py +++ b/coursera/test/test_utils.py @@ -34,7 +34,7 @@ from coursera.utils import total_seconds, is_course_complete ('Week 3: Data and Abstraction', 'Week_3-_Data_and_Abstraction'), ('  (Week 1) BRANDING: Marketing Strategy and Brand Positioning', 'Week_1_BRANDING-__Marketing_Strategy_and_Brand_Positioning'), - ('test & " adfas', 'test___adfas'), + ('test & " adfas', 'test__-_adfas'), # `"` were changed first to `-` (' ', ''), ('☂℮﹩т ω☤☂ℌ Ṳᾔ☤ḉ◎ⅾε', '__') ] @@ -54,7 +54,7 @@ def test_clean_filename(unclean, clean): 'Week 3- Data and Abstraction'), ('  (Week 1) BRANDING: Marketing Strategy and Brand Positioning', '  (Week 1) BRANDING- Marketing Strategy and Brand Positioning'), - ('test & " adfas', 'test & " adfas'), + ('test & " adfas', 'test & - adfas'), # `"` are forbidden on Windows (' ', u'\xa0'), ('☂℮﹩т ω☤☂ℌ Ṳᾔ☤ḉ◎ⅾε', '☂℮﹩т ω☤☂ℌ Ṳᾔ☤ḉ◎ⅾε') ] diff --git a/coursera/test/test_workflow.py b/coursera/test/test_workflow.py index 1c36224..99567b6 100644 --- a/coursera/test/test_workflow.py +++ b/coursera/test/test_workflow.py @@ -37,7 +37,7 @@ class MockedFailingDownloader(Downloader): raise self._exception_to_throw -TEST_URL = "https://www.coursera.org/api/test-url" +TEST_URL = "https://api.coursera.org/api/test-url" def make_test_modules(): @@ -110,7 +110,7 @@ def test_iter_modules(): (0, '01_section1'), (0, normpath('test_class/01_section1/01_module1')), (0, 'lecture1', 'en.txt', 'title'), - 
('en.txt', 'https://www.coursera.org/api/test-url', 'title') + ('en.txt', 'https://api.coursera.org/api/test-url', 'title') ] collected_output = [] @@ -138,7 +138,7 @@ def test_walk_modules(): (0, '01_section1', 0, normpath('test_class/01_section1/01_module1'), 0, 'lecture1', normpath('test_class/01_section1/01_module1/01_lecture1_title.en.txt'), - 'https://www.coursera.org/api/test-url')] + 'https://api.coursera.org/api/test-url')] collected_output = [] for module, section, lecture, resource in _walk_modules( diff --git a/coursera/test/utils.py b/coursera/test/utils.py index cc6805e..0e8e1a2 100644 --- a/coursera/test/utils.py +++ b/coursera/test/utils.py @@ -2,9 +2,43 @@ Helper functions that are only used in tests. """ import os +import re from io import open +from six import iteritems + +from coursera.define import IN_MEMORY_MARKER +from coursera.utils import BeautifulSoup + def slurp_fixture(path): return open(os.path.join(os.path.dirname(__file__), "fixtures", path), encoding='utf8').read() + + +def links_to_plain_text(links): + """ + Converts extracted links into text and cleans up extra whitespace. Only HTML + sections are converted. This is a helper to be used in tests. + + @param links: Links obtained from such methods as extract_links_from_peer_assignment. + @type links: @see CourseraOnDemand._extract_links_from_text + + @return: HTML converted to plain text with extra space removed. 
+ @rtype: str + """ + result = [] + for filetype, contents in iteritems(links): + if filetype != 'html': + continue + + for content, _prefix in contents: + if content.startswith(IN_MEMORY_MARKER): + content = content[len(IN_MEMORY_MARKER):] + + soup = BeautifulSoup(content) + [script.extract() for script in soup(["script", "style"])] + text = re.sub(r'[ \t\r\n]+', ' ', soup.get_text()).strip() + result.append(text) + + return ''.join(result) diff --git a/coursera/utils.py b/coursera/utils.py index 6fd4814..9ba577a 100644 --- a/coursera/utils.py +++ b/coursera/utils.py @@ -8,6 +8,7 @@ import os import re import sys import time +import json import errno import random import string @@ -41,7 +42,9 @@ else: from .define import COURSERA_URL, WINDOWS_UNC_PREFIX # Force us of bs4 with html.parser -BeautifulSoup = lambda page: BeautifulSoup_(page, 'html.parser') + + +def BeautifulSoup(page): return BeautifulSoup_(page, 'html.parser') if six.PY2: @@ -55,6 +58,16 @@ else: return x +def spit_json(obj, filename): + with open(filename, 'w') as file_object: + json.dump(obj, file_object, indent=4) + + +def slurp_json(filename): + with open(filename) as file_object: + return json.load(file_object) + + def is_debug_run(): """ Check whether we're running with DEBUG loglevel. 
@@ -106,13 +119,24 @@ def clean_filename(s, minimal_change=False): s = unquote_plus(s) # Strip forbidden characters + # https://msdn.microsoft.com/en-us/library/windows/desktop/aa365247(v=vs.85).aspx s = ( s.replace(':', '-') .replace('/', '-') + .replace('<', '-') + .replace('>', '-') + .replace('"', '-') + .replace('\\', '-') + .replace('|', '-') + .replace('?', '-') + .replace('*', '-') .replace('\x00', '-') - .replace('\n', '') + .replace('\n', ' ') ) + # Remove trailing dots and spaces; forbidden on Windows + s = s.rstrip(' .') + if minimal_change: return s diff --git a/deploy/.netrc b/deploy/.netrc deleted file mode 100644 index bd0c698..0000000 --- a/deploy/.netrc +++ /dev/null @@ -1 +0,0 @@ -machine coursera-dl login password diff --git a/deploy/Dockerfile b/deploy/Dockerfile deleted file mode 100644 index 3a73264..0000000 --- a/deploy/Dockerfile +++ /dev/null @@ -1,14 +0,0 @@ -FROM ubuntu:14.04 -MAINTAINER Dmitry Senin - -RUN apt-get update -RUN DEBIAN_FRONTEND=noninteractive apt-get install -y git build-essential libssl-dev libffi-dev -RUN DEBIAN_FRONTEND=noninteractive apt-get install -y python-pip python-dev -RUN pip install ndg-httpsclient - -COPY .netrc /root/.netrc -RUN chmod 0600 /root/.netrc - -RUN cd /root && git clone https://github.com/coursera-dl/coursera.git -RUN cd /root/coursera && pip install -r requirements.txt -RUN cd /usr/bin && ln -s /root/coursera/coursera-dl coursera-dl diff --git a/deploy/README.md b/deploy/README.md deleted file mode 100644 index 30725b9..0000000 --- a/deploy/README.md +++ /dev/null @@ -1,10 +0,0 @@ -# How to launch the container - -1. [optional] Insert your username and password in the `.netrc` file if you - plan to use the `-n` optionof `coursera-dl` (edit template in this - directory). -2. Build Docker image: - `./build.sh` -3. Run Docker container to download courses A, B and C: - `./download.sh A B C` -4. 
All courses will be downloaded in directory `~/courses` diff --git a/deploy/build.sh b/deploy/build.sh deleted file mode 100755 index 55c1661..0000000 --- a/deploy/build.sh +++ /dev/null @@ -1,7 +0,0 @@ -#!/bin/sh - -if groups | grep -q "docker" ; then - docker build --tag coursera-img --rm . -else - sudo docker build --tag coursera-img --rm . -fi diff --git a/deploy/download.sh b/deploy/download.sh deleted file mode 100755 index bcda3b9..0000000 --- a/deploy/download.sh +++ /dev/null @@ -1,15 +0,0 @@ -#!/bin/sh - -COURSES=$* - -if [ ! -e ~/courses ]; then - mkdir ~/courses -fi - -if groups | grep -q "docker" ; then - docker run --rm --name coursera -v ~/courses:/courses coursera-img \ - coursera-dl -n --path /courses $COURSES -else - sudo docker run --rm --name coursera -v ~/courses:/courses coursera-img \ - coursera-dl -n --path /courses $COURSES -fi diff --git a/requirements.txt b/requirements.txt index 51f84fc..dc3ac2a 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,6 +1,8 @@ beautifulsoup4>=4.1.3 requests>=2.10.0 six>=1.5.0 -urllib3>=1.10 +urllib3>=1.23 pyasn1>=0.1.7 keyring>=4.0 +configargparse>=0.12.0 +attrs==18.1.0 diff --git a/setup.py b/setup.py index 19c0be5..a3da7fb 100644 --- a/setup.py +++ b/setup.py @@ -10,6 +10,8 @@ from __future__ import print_function import os.path import subprocess import sys +# For compatibility with Python2.7 +from io import open from setuptools import setup @@ -48,7 +50,7 @@ def read_file(filename, alt=None): lines = None try: - with open(filename) as f: + with open(filename, encoding='utf-8') as f: lines = f.read() except IOError: lines = [] if alt is None else alt @@ -58,9 +60,8 @@ def read_file(filename, alt=None): generate_readme_rst() long_description = read_file( - 'README.rst', - 'Generate README.rst from README.md via pandoc!\n\nExample: ' - 'pandoc --from=markdown --to=rst --output=README.rst README.md' + 'README.md', + 'Cannot read README.md' ) requirements = read_file('requirements.txt') dev_requirements 
= read_file('requirements-dev.txt') @@ -72,12 +73,11 @@ trove_classifiers = [ 'License :: OSI Approved :: GNU Lesser General Public License v3 or later (LGPLv3+)', 'Operating System :: OS Independent', 'Programming Language :: Python :: 2', - 'Programming Language :: Python :: 2.6', 'Programming Language :: Python :: 2.7', 'Programming Language :: Python :: 3', - 'Programming Language :: Python :: 3.3', 'Programming Language :: Python :: 3.4', 'Programming Language :: Python :: 3.5', + 'Programming Language :: Python :: 3.6', 'Programming Language :: Python :: Implementation :: CPython', 'Programming Language :: Python :: Implementation :: PyPy', 'Programming Language :: Python', @@ -88,7 +88,7 @@ setup( name='coursera-dl', version=__version__, maintainer='Rogério Theodoro de Brito', - maintainer_email='rbrito@ime.usp.br', + maintainer_email='rbrito@gmail.com', license='LGPL', url='https://github.com/coursera-dl/coursera-dl', @@ -100,7 +100,9 @@ setup( description='Script for downloading Coursera.org videos and naming them.', long_description=long_description, - keywords=['coursera-dl', 'coursera', 'download', 'education', 'MOOCs', 'video'], + long_description_content_type='text/markdown', + keywords=['coursera-dl', 'coursera', + 'download', 'education', 'MOOCs', 'video'], classifiers=trove_classifiers, packages=["coursera"], diff --git a/tox.ini b/tox.ini index c3dbce8..bb378d5 100644 --- a/tox.ini +++ b/tox.ini @@ -1,5 +1,5 @@ [tox] -envlist = py26,py27,py33,py34,py35 +envlist = py26,py27,py33,py34,py35,py36 [testenv] downloadcache = .tox/_download/ @@ -14,9 +14,19 @@ deps = six>=1.5.0 urllib3>=1.10 keyrings.alt>=1.1 + configargparse>=0.12.0 commands = py.test -v --junitxml={envlogdir}/result.xml coursera/test # Original command: install_command = pip install {opts} {packages} # {opts} is remove to prevent passing option "--download-cache" to pip # which is already gone. install_command = pip install {packages} + +# Notes for developers. 
Depending on your system configuration, +# you may find this bash function useful to run before running tox: +# +# activate_pyenv () { +# export PYENV_ROOT="$HOME/.pyenv" +# export PATH="$PYENV_ROOT/bin:$PATH" +# eval "$(pyenv init -)" +# }