#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
"""
__author__ = "Jeffrey Mvutu Mabilama"
__version__ = "0.1.2.3"
__license__ = "CC-BY"
import os
import argparse
import getpass
import netrc
import string
import requests
from selenium.webdriver.common.keys import Keys
from bs4 import BeautifulSoup
import jmm.browsers
import utils
from util_markdown import html_to_markdown
from video_players import vimeo_video_infos
from utils import soupify_html
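# note: `jmm.browsers`, `utils`, `util_markdown` and `video_players` appear to be
# local helper modules shipped with this script (Selenium wrapper, download and
# soup helpers, HTML-to-Markdown conversion, Vimeo metadata parsing) rather than
# PyPI packages.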
def reach_page(browser, url, time_to_wait='default'):
"""Navigates to a page and ensures the page has finished loading properly
before returning its source code.
:param time_to_wait: time to wait before the page is considered fully loaded
:rtype: str
:returns: source code of the target page.
"""
time_to_wait = 5 if time_to_wait == 'default' else time_to_wait
helper_print_func = lambda s, i: (".../" if i > 0 else "") + "/".join(s.split('/')[i:])
print("reaching page @ '%s'" % (helper_print_func(url, 3)))
browser.driver.get(url)
# wait until finished loading
# print("... did we wait until finished loading ? ...\n\t (url: '%s')" % (url))
browser.waitTime(time_to_wait)
return browser.driver.page_source
def helper_parse_course_page_url(url):
"""
Example URLs:
course presentation page url example:
https://openclassrooms.com/fr/courses/4011851-initiez-vous-au-machine-learning
course chapter page url example:
https://openclassrooms.com/fr/courses/4011851-initiez-vous-au-machine-learning/4011858-identifez-les-differentes-etapes-de-modelisation
:return: tuple of str: (course path id, course page subpath or None, language)
"""
hostname = 'openclassrooms.com/'
has_hostname = url.find(hostname) >= 0
if has_hostname:
path = url[url.find(hostname)+len(hostname):]
else:
path = url[1:] if (url[0] == '/') else url
lang, _, course_id, *course_page = path.split('/')
course_page = course_page[0] if len(course_page) > 0 else None
course_page = course_page if course_page and len(course_page) > 0 else None
# for url="https://openclassrooms.com/fr/courses/4011851-initiez-vous-au-machine-learning/4011858-identifez-les-differentes-etapes-de-modelisation"
# would return: ('4011851-initiez-vous-au-machine-learning', '4011858-identifez-les-differentes-etapes-de-modelisation', 'fr')
return (course_id, course_page, lang)
def helper_course_page_url(course_id, course_page=None, lang=None):
"""
"""
lang = "fr" if lang is None else lang
arr = ["https://openclassrooms.com", lang, "courses", course_id]
if course_page is not None:
arr.append(course_page)
return "/".join(arr)
def extract_course_chapters(html_page):
"""
:param html_page:
:rtype: list<tuple>
:returns: list of chapters infos
a chapter is a tuple consisting of
(the part number, the chapter number, chapter path, chapter title, chapter url)
"""
hostname = 'https://openclassrooms.com'
course_description_page_soup = BeautifulSoup(html_page, 'lxml')
# course_url = course_description_page_soup.find('link', {'rel': 'canonical'}).get('href')
# course_id, course_chapter_subpath, lang = helper_parse_course_page_url(course_url)
course_timeline = course_description_page_soup.find('div', {'class': 'timeline__steps'})
timeline_elmts = course_timeline.findChildren(recursive=False)
# the first and last children are chapter separators, so drop them
timeline_elmts = timeline_elmts[1:-1]
nbr_of_course_parts = len(course_timeline.find_all('span', {'class': 'timeline__splitChapter'})) - 2
curr_part_nbr = 1
curr_chap = 0
course_page_url_path = course_description_page_soup.find('div', {'class': 'timeline__inner'}).find('a', {"class": 'timeline__roundIcon'}).get('href')
course_title = course_description_page_soup.find('h1', 'courseHeader__title').get_text().strip()
intro_chapter = (0, 1, course_page_url_path, course_title, (hostname + course_page_url_path))
chapters = [intro_chapter]
for chapter_timeline_soup in timeline_elmts:
node_classes = chapter_timeline_soup.attrs.get('class', [])
if 'timeline__splitChapter' in node_classes:
curr_part_nbr += 1
curr_chap = 0
continue
elif 'timeline__step' in node_classes:
# a chapter node
curr_chap += 1
chap_title = chapter_timeline_soup.attrs.get('title')
# chap_path is absolute path, so already has '/'
chap_path = chapter_timeline_soup.attrs.get('href')
chap_url = (hostname + chap_path)
chapter = (curr_part_nbr, curr_chap, chap_path, chap_title, chap_url)
chapters.append(chapter)
pass
else:
print('extract_course_chapters() :: unrecognized node "%s"' % (chapter_timeline_soup))
pass
return chapters
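# Illustrative return value (shortened; titles are hypothetical, URLs from the
# docstring example above):
#   [(0, 1, '/fr/courses/4011851-initiez-vous-au-machine-learning',
#     'Initiez-vous au Machine Learning',
#     'https://openclassrooms.com/fr/courses/4011851-initiez-vous-au-machine-learning'),
#    (1, 1, '/fr/courses/4011851-.../4011858-...',
#     'Identifez les différentes étapes de modélisation',
#     'https://openclassrooms.com/fr/courses/4011851-.../4011858-...'), ...]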
###############################################################################
def argParser():
"""Creates the argument parser of the program.
See the following answer to see how to implement a multi-level arg parse:
https://stackoverflow.com/a/10579924/4418092
Or see here https://docs.python.org/dev/library/argparse.html#sub-commands
to see how to implement one-level subcommands.
"""
parser = argparse.ArgumentParser(
formatter_class=argparse.RawTextHelpFormatter,
description="""OpenClassrooms.com course downloader.
""",
epilog="""Usage examples:
# Example: download the course with videos at 720p resolution to the current
# directory. You will be asked for your username and password interactively.
url="https://openclassrooms.com/fr/courses/4425126-testez-votre-projet-avec-python/4434934-decouvrez-les-tests"
python oc_course_downloader_selenium.py [-n] -q 720p $url
# Download only chapter 3-4 of the course, with videos at 540p resolution.
python oc_course_downloader_selenium.py [-n] --onlyChapters 3-4 -q 540p $url [--dispatchVideoFiles]
# Skip video files by passing an invalid quality such as 0p.
python oc_course_downloader_selenium.py [-n] --onlyChapters 3-4 -q 0p $url
"""
)
parser.add_argument('--username', '-u', help="username or email of the service")
parser.add_argument('--password', '-p', help="password of the service. If not provided it will be asked in a secure way interactively")
parser.add_argument('--netrc', '-n', action="store_true", help="Reads the credentials from a netrc file")
parser.add_argument('--dispatchVideoFiles', action="store_false", help="""Dispatches video files into their respective chapter directories (by default, all of a course's videos are grouped in one media directory).""")
# parser.add_argument('--groupMedias', '-g', action="store_true", help="""Groups media files and downloads them to the same location.""")
parser.add_argument('--videoQuality', '--quality', '-q', default='360p', help="""The video quality to download (available formats are often "360p", "540p", "720p" or "1080p"). Pass 0 or an invalid quality to skip video files. Default is '360p', which is normally the lowest quality.""")
parser.add_argument('--destination', '-d', default='.', help="""The directory to download the course to. Default will download in the current directory.""")
parser.add_argument('--overwrite', '-o', action="store_true", help="""Overwrite images and videos (will fetch again even if there is already an existing file)""")
parser.add_argument('--ignoreChapters', '-x', nargs="*", default=[], help="""Ignored chapters (in the form part-chapter like 2-4 to ignore chapter 4 of part 2). Example: 0-1 1-1 1-2 2-1 2-3""")
parser.add_argument('--onlyChapters', '--only', nargs="+", default=[], help="""The only chapters to fetch, in the form part-chapter (like 2-4 to fetch only chapter 4 of part 2). Example: 0-1 1-1 1-2 2-1 2-3.
If --ignoreChapters is also specified, the ignored chapters will filter this list.""")
parser.add_argument('courseUrls', nargs="+", help="Course urls of the courses to fetch. Example: https://openclassrooms.com/fr/courses/4425126-testez-votre-projet-avec-python/")
return parser
def credentials_from_netrc(filepath=None):
# import netrc
netrc_reader = netrc.netrc(filepath)
auth_infos = netrc_reader.authenticators("openclassrooms.com")
username, password = (auth_infos[0], auth_infos[2]) if auth_infos is not None else (None, None)
return username, password
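# A minimal sketch of the ~/.netrc entry read above (placeholder credentials):
#   machine openclassrooms.com
#       login your.email@example.com
#       password your-password
# note: Python's netrc module may reject a ~/.netrc readable by other users
# (chmod 600 ~/.netrc is recommended).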
###############################################################################
def extract_course_quiz_page_as_markdown(html_page):
soup = soupify_html(html_page)
page_content_tag = soup.find('div', {'class': "contentWithSidebar__content"})
page_content_tag = page_content_tag if page_content_tag else soup.body
# chapter title tag
title_tag = page_content_tag.h1 if page_content_tag.h1 else soup.body.title
title = title_tag.get_text().strip()
markdown_text = html_to_markdown(str(page_content_tag)).strip()
return markdown_text, title
def extract_course_activity_page(browser):
"""
:param browser: navigator helper
"""
# this func accepts an argument that can be <str> (HTML source code) or browser
html_page = browser.driver.page_source if getattr(browser, 'driver', None) else browser
browser = browser if getattr(browser, 'driver', None) else None
if browser:
## TODO
browser.click_element('.p2p__stepBoxAction > button')
browser.waitTime(5)
html_page = browser.driver.page_source
markdown_text, title = None, None
if html_page:
soup = soupify_html(html_page)
try:
page_content_tag = soup.find('div', {'class': "contentWithSidebar__content"})
title = page_content_tag.h2.get_text().strip() # chapter title
markdown_text = html_to_markdown(str(page_content_tag)).strip()
except Exception:
# in case the source page format changes beyond recognition
title = soup.title.get_text().strip() if soup.title is not None else ""
markdown_text = html_to_markdown(str(soup.body))
return markdown_text, title, html_page
def extract_course_page_main_text_as_markdown(html_page):
soup = soupify_html(html_page)
page_content_tag = soup.find('div', {'class': "contentWithSidebar__content"})
try:
title_tag = page_content_tag.h2
if title_tag is None:
# it is probably a Quiz page
# title_tag = page_content_tag.h1.get_text.strip()
return extract_course_quiz_page_as_markdown(html_page)
else:
title = title_tag.get_text().strip() # chapter title
content = page_content_tag.find('div', {'class': 'static'}).section.find('div', {'itemprop': "articleBody"})
markdown_text = html_to_markdown(str(content)).strip()
markdown_text = """## %s\n\n%s\n""" % (title, markdown_text)
return markdown_text, title
except Exception:
### In case we encounter an unexpected page or if the page source
### format were to change in the future, we just markdown everything
markdown_text = html_to_markdown(str(soup.body)).strip()
title = soup.title.get_text() if soup.title is not None else ""
return markdown_text, title
def extract_course_page_images(html_page):
hostname = "https://openclassrooms.com"
soup = soupify_html(html_page)
page_content_tag = soup.find('div', {'class': "contentWithSidebar__content"})
page_content_tag = page_content_tag if page_content_tag else soup.body
images_to_fetch = []
image_tags = page_content_tag('img')
for i, tag in enumerate(image_tags):
image_desc = tag.get('alt')
image_src = tag['src']
if image_src.startswith('//'):
# make the URL absolute for `<img src="//static.oc-static.com/prod/images/courses/certif.jpg">`
image_src = ("https:" + image_src)
elif image_src.startswith('/'):
# make the URL absolute for `<img src="/path/to/img.jpg">` (the src already begins with '/')
image_src = (hostname + image_src)
image_file_basename = image_src.split('/')[-1].split('?')[0].split('#')[0]
image_info = (i + 1, image_src, image_desc, image_file_basename)
images_to_fetch.append(image_info)
return images_to_fetch
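# Illustrative entry of the returned list (hypothetical alt text, URL taken from
# the comment above):
#   (1, 'https://static.oc-static.com/prod/images/courses/certif.jpg', 'certification badge', 'certif.jpg')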
def fetch_course_page_video_informations(html_page, video_pages_html=None):
"""
:param list<str> video_pages_html: if provided, the parser will take the HTML content
from this list (in order) instead of fetching it each time it encounters a video
"""
hostname = "https://openclassrooms.com"
soup = soupify_html(html_page)
page_content_tag = soup.find('div', {'class': "contentWithSidebar__content"})
page_content_tag = page_content_tag if page_content_tag else soup.body
videos_to_fetch = []
video_frame_tags = [[j, iframe_tag, ("https:" + iframe_tag['src'])] for j, iframe_tag in enumerate(page_content_tag('iframe'))
if 'src' in iframe_tag.attrs and iframe_tag.get('src').find('player.vimeo') >= 0]
if len(video_frame_tags) == 0:
# in case we fetch without javascript, Vimeo videos are in the following form
# <video id="r-4452461" data-claire-element-id="7904382" src="https://vimeo.com/217633069"><a href="https://vimeo.com/217633069">https://vimeo.com/217633069</a></video>
# "https://vimeo.com/217633069" -> "https://player.vimeo.com/video/217633069?color=7451eb"
vimeo_video_url_to_player_url = lambda url: "https://player.vimeo.com/video/%s?color=7451eb" % (url.split('?')[0].split('/')[-1])
video_frame_tags = [[j, None, vimeo_video_url_to_player_url(video_tag['src'])] for j, video_tag in enumerate(page_content_tag('video'))
if 'src' in video_tag.attrs and video_tag.get('src').find('//vimeo.com/') >= 0]
assert video_pages_html is None or len(video_pages_html) >= len(video_frame_tags)
get_inner_tag_content = lambda x: x.get_text() if len(x.get_text()) > 0 else (x.string or "")
for k, tag_details in enumerate(video_frame_tags):
# j, iframe_tag, video_src_page_url = tag_details
_, _, video_src_page_url = tag_details
## getting the content of the video/iframe tags of the course's videos
if video_pages_html:
iframe_source = video_pages_html[k]
else:
print("######### fetching page at ")
resp = requests.get(video_src_page_url)
iframe_source = resp.text
vimeo_page_soup = soupify_html(iframe_source)
script_tags_content = [get_inner_tag_content(tag) for tag in vimeo_page_soup.find_all('script')
if get_inner_tag_content(tag).find('''"mime":"video/mp4"''') >= 0]
# There used to be one and only one iframe with video details about the
# video to load in the vimeo video. (in 2019)
assert len(script_tags_content) == 1, "There are {} script tags with videos instead of just 1".format(len(script_tags_content))
content = script_tags_content[0]
video_formats_infos_summary, video_formats_infos = vimeo_video_infos(content)
video_title = vimeo_page_soup.title.get_text().strip()
video_info = (k + 1, video_title, video_formats_infos_summary, video_formats_infos)
videos_to_fetch.append(video_info)
return videos_to_fetch
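# Each entry of the list returned above is shaped as follows (a hedged
# reconstruction based on the comments in paths_for_course() below):
#   (1-based index, video title,
#    [(quality e.g. '540p', (width, height), video url, file extension), ...],
#    raw per-format infos from vimeo_video_infos)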
def extract_course_page_main_content(html_page, video_pages_html=None):
content, chapter_title = extract_course_page_main_text_as_markdown(html_page)
images_to_fetch = extract_course_page_images(html_page)
videos_to_fetch = fetch_course_page_video_informations(html_page, video_pages_html=video_pages_html)
page_infos = {
'title': chapter_title,
'markdown_text': content,
'to_fetch': {
'images': images_to_fetch,
'videos': videos_to_fetch
},
'html': html_page
}
return page_infos
def paths_for_course(chapter_infos, part_nbr, chapter_nbr, video_quality, prefix, all_videos_in_same_folder):
"""Returns the target text and media infos, especially including the
target filepath
"""
### fetching the chapter's page
# base_chapter_path: "/path/to/course/directory/1-3"
base_chapter_path = os.path.join(prefix, '%i-%i' % (part_nbr, chapter_nbr))
# base_chapter_name: "1-3"
base_chapter_name = os.path.basename(base_chapter_path)
# course_root_path: "/path/to/course/directory"
course_root_path = os.path.dirname(base_chapter_path)
text_infos = (0, os.path.join(base_chapter_path, chapter_infos['title'] + ".md"), chapter_infos['markdown_text'])
html_infos = (0, os.path.join(base_chapter_path, chapter_infos['title'] + ".html"), chapter_infos['html'])
### fetching the images
# base_media_path = '.'
base_media_path = 'medias'
images = chapter_infos.get('to_fetch').get('images') if chapter_infos.get('to_fetch') is not None and chapter_infos['to_fetch'].get('images') else []
# download_infos: [(path to save to, url, image description), ...]
images_download_infos = [(os.path.join(base_chapter_path, base_media_path, "{}_{}".format(base_chapter_name, image_info[3])), # destination path
image_info[1], # url
image_info[2]) # image description
for image_info in images]
### fetching the videos
get_at_index_or_default = lambda x, k, default_value: x[k] if (x is not None and k < len(x)) else default_value
first_element = lambda arr: arr[0] if len(arr) > 0 else None
video_infos_for_quality = lambda video_infos_tuple, quality: first_element([video_infos for video_infos in video_infos_tuple[2] if str(video_infos[0]) == str(quality)])
video_for_quality = lambda video_infos_tuple, quality: get_at_index_or_default(first_element([video_infos for video_infos in video_infos_tuple[2] if str(video_infos[0]) == str(quality)]), 2, None)
videos = chapter_infos['to_fetch']['videos'] if chapter_infos.get('to_fetch') is not None and chapter_infos['to_fetch'].get('videos') else []
## Passing an invalid quality like 0 px should mean that we do NOT download videos
video_download_infos = []
for i, video_info in enumerate(videos):
# video_info: (k+1, video_title, video_formats_infos_summary, video_formats_infos)
# video_formats_infos_summary: list of (video quality (e.g. '540p'), (width (int), height (int)), video url, file extension)
extension = get_at_index_or_default(video_infos_for_quality(video_info, video_quality), 3, None)
url = video_for_quality(video_info, video_quality)
if url is not None:
video_title = video_info[1]
filename = "{}_{}.{}".format(base_chapter_name, video_title, extension)
if all_videos_in_same_folder:
dest_path = os.path.join(course_root_path, base_media_path, filename)
else:
dest_path = os.path.join(base_chapter_path, base_media_path, filename)
url_parts = url.split('?')
assert len(url_parts) in (1, 2), "Malformed URL. Perhaps the URL extraction has a flaw or the URL format has changed."
if len(url_parts) == 2 and url_parts[-1].find("source=1") >= 0:
url = url_parts[0] + '?' + url_parts[-1].replace("source=1", "") # or source=0
title = video_title
video_download_infos.append((dest_path, url, title))
return text_infos, html_infos, images_download_infos, video_download_infos
def clean_filepath(path):
"""Removes characters may not be used in a file path
"""
valid_chars = string.ascii_letters + string.hexdigits
valid_chars += """.-_,()[]{}!#' =@$%"""
valid_chars += os.path.sep
valid_chars = set(valid_chars)
drive, relative = os.path.splitdrive(path)
filtered = "".join([c for c in relative if c in valid_chars])
components = filtered.split(os.path.sep)
components = [comp.strip() for comp in components]
relative = os.path.sep.join(components)
result = os.path.join(drive, relative)
return result if len(result) > 0 else None
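# A hedged example (POSIX separators; ':' and '?' are outside the whitelist so
# they are dropped, and each path component is stripped):
#   clean_filepath("1-3/Intro: qu'est-ce?.md")  ->  "1-3/Intro qu'est-ce.md"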
def fetch_and_save_course_chapter_infos(chapter_infos, part_nbr, chapter_nbr, video_quality, prefix, overwrite, all_videos_in_same_folder):
"""Fetches and writes following the architecture pattern.
:param infos:
...
:param video_quality:
Also accepts 'low', 'medium', 'hd', 'full'
"""
text_infos, html_infos, images_download_infos, video_download_infos = paths_for_course(chapter_infos, part_nbr, chapter_nbr, video_quality, prefix, all_videos_in_same_folder)
### Saving the text and HTML
filepath = text_infos[1]
filepath = clean_filepath(filepath)
parent = os.path.dirname(filepath)
os.makedirs(parent, exist_ok=True)
if not os.path.exists(filepath) or overwrite:
with open(filepath, 'w', encoding="utf-8") as fh:
fh.write(text_infos[2])
filepath = html_infos[1]
filepath = clean_filepath(filepath)
parent = os.path.dirname(filepath)
os.makedirs(parent, exist_ok=True)
if not os.path.exists(filepath) or overwrite:
with open(filepath, 'w', encoding="utf-8") as fh:
fh.write(html_infos[2])
### fetching the images and videos
download_infos = images_download_infos + video_download_infos
for i, media_infos in enumerate(download_infos):
filepath, url, description, *_ = media_infos
if url is not None:
_descriptive_filepath = filepath # os.path.relpath(filepath)
if not os.path.exists(filepath) or overwrite:
print("""%i/%i) Fetching "%s" \n to "%s" ...""" % (i+1, len(download_infos), description, _descriptive_filepath))
utils.download_with_progress_indicator(url, filepath, True)
else:
print("%i/%i) Found already fetched content %s" % (i+1, len(download_infos), _descriptive_filepath))
else:
print("Did not find quality '%s' for the video %s" % (description))
print()
pass
def fetch_page_and_contents(driver, url, directory, content_prefix, image_prefix=None, video_prefix=None):
"""Fetches the page and saves it to disk
:param str directory: directory to save to.
:param str content_prefix: a filepath prefix to prepend to the saved files.
The prefix is relative to the `directory` parameter.
:param str image_prefix: a filepath prefix to prepend to image files.
The prefix is relative to the `directory` parameter.
If None, a default architecture will be used.
The default architecture uses content_prefix.
:param str video_prefix: a filepath prefix to prepend to video files.
The prefix is relative to the `directory` parameter.
If None, a default architecture will be used.
The default architecture uses content_prefix.
"""
directory = os.path.abspath(directory)
# ...
# TODO: fetch the text content and save it to the prefixed location
# TODO: fetch the images and save them to the prefixed location
# TODO: fetch the videos and save them to the prefixed location
# iFrame = d.find_elements_by_tag_name("iframe")[0]
# if driver:
# # iframes that have
# driver.switch_to.frame(iframe);
# driver.getPageSource();
# driver.switch_to.default_content();
pass
def fetch_course(browser, course_url, video_quality, overwrite=False, directory=None, only_chapters=None, ignored_chapters=None, all_videos_in_same_folder=True):
"""
:param str directory: where to download the course
"""
print("Configuration:\n video_quality: {}\n directory: {}\n all_videos_in_same_folder: {}\n\n".format(video_quality, directory, all_videos_in_same_folder))
ignored_chapters = [] if ignored_chapters is None else ignored_chapters
course_id, course_page, lang = helper_parse_course_page_url(course_url)
course_home_page_url = helper_course_page_url(course_id, None, lang)
reach_page(browser, course_home_page_url)
chapters = extract_course_chapters(browser.driver.page_source)
soup = soupify_html(browser.driver.page_source)
course_title = soup.title.get_text().strip()
# extract_course_chapters() already includes the course home page as the 0-1 chapter
prefix = os.path.join(directory, course_title)
os.makedirs(prefix, exist_ok=True)
print("Will download course corresponding to url: \n\t'{}' \n\tto \n\t'{}'\n".format(course_url, prefix))
### cycle through the URLs and pages
for chapter in chapters:
part_nbr, chapter_nbr, chap_path, chap_title, chap_url = chapter
should_ignore = (part_nbr, chapter_nbr) in ignored_chapters
if only_chapters:
should_ignore = should_ignore or (part_nbr, chapter_nbr) not in only_chapters
if should_ignore:
print("Ignored chapter %i-%i" % (part_nbr, chapter_nbr))
continue
### go to a page
reach_page(browser, chap_url)
# quiz and activity chapter titles start with "Quiz" / "Activité" respectively
is_quiz_chapter = chap_title.strip().lower().startswith('quiz')
is_exercise_chapter = chap_title.strip().lower().startswith('activité')
if is_quiz_chapter or is_exercise_chapter:
html_page = browser.driver.page_source if getattr(browser, 'driver', None) else browser
if is_quiz_chapter:
try:
markdown_text, title = extract_course_quiz_page_as_markdown(html_page)
except Exception as err:
print("""Error fetching chapter {}-{}: {}\n\tOther infos: chapter path: '{}'. Chapter title: '{}'""".format(part_nbr, chapter_nbr, err, chap_path, chap_title))
raise
else:
# activity ("Activité") pages need the extra click performed by extract_course_activity_page()
markdown_text, title, html_page = extract_course_activity_page(browser)
chapter_infos = {
'title': chap_title,
'markdown_text': markdown_text,
'html': html_page
}
else:
chapter_infos = extract_course_page_main_content(browser.driver.page_source)
### save page to disk
fetch_and_save_course_chapter_infos(chapter_infos, part_nbr, chapter_nbr, video_quality, prefix, overwrite, all_videos_in_same_folder)
print()
pass
def main_selenium():
parser = argParser()
args = parser.parse_args()
if args.netrc:
args.username, args.password = credentials_from_netrc()
else:
if args.username is None or args.username == '-':
args.username = input("Please input your OpenClassrooms.com login email: ")
if args.password is None or args.password == '-':
args.password = getpass.getpass("Openclassrooms.com password: ")
nav = jmm.browsers.SeleniumHelper()
### login
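# the login appears to be a two-step form: submitting the email first (RETURN)
# makes the password field available, hence the intermediate wait below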
nav.get('https://openclassrooms.com/fr/login')
nav.waitTillExists('input#fielduserEmail')
nav.enter_textfield('input#fielduserEmail', args.username)
nav.enter_textfield('input#fielduserEmail', Keys.RETURN)
nav.waitTime(2)
nav.enter_textfield('input#field_password', args.password)
nav.click_element('button#login-button')
nav.waitTime(5)
for url in args.courseUrls:
print("Fetching course for %s" % url)
directory = os.path.abspath(os.path.expanduser(args.destination))
# print("Parent destination directory: %s" % directory)
only_chapters = [(int(tup.split('-')[0]), int(tup.split('-')[1])) for tup in args.onlyChapters]
ignored_chapters = [(int(tup.split('-')[0]), int(tup.split('-')[1])) for tup in args.ignoreChapters]
fetch_course(nav, url, args.videoQuality, args.overwrite, directory, only_chapters=only_chapters, ignored_chapters=ignored_chapters, all_videos_in_same_folder=args.dispatchVideoFiles)
print("---- Finished fetching the course %s ----\n" % (url))
if __name__ == '__main__':
try:
main_selenium()
except KeyboardInterrupt:
print("\n...User keyboard interruption")