-
Notifications
You must be signed in to change notification settings - Fork 0
/
videoTranscriptDownloader.py
139 lines (122 loc) · 4.82 KB
/
videoTranscriptDownloader.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
from lxml import html,etree
from HTMLParser import HTMLParser
from pytube import YouTube
from concurrent.futures import ThreadPoolExecutor
import concurrent.futures
import csv
import os
import pytube
import requests
import sys # sys print function on threads
### Constants
# Directory containing this script, with a trailing slash; downloaded data is
# stored underneath it
ROOT_DIR = "%s/" % os.path.dirname(os.path.realpath(__file__))
# Names of the sub-directories holding each kind of output
CSV_DIR, TXT_DIR, VIDEO_DIR = "phraseCSVs", "fullTexts", "fullVideos"
# Column names written as the first line of each generated CSV file
HEADER_STRING = "phrase,start,duration"
### Utilities
# Single parser instance kept around only for its unescape() helper
htmlDecoder = HTMLParser()


def decodeHtml(rawHtml):
    """ Returns rawHtml with its HTML escape sequences unescaped """
    decodedText = htmlDecoder.unescape(rawHtml)
    return decodedText
def makeDirsIfNeeded(dirName):
    """ Creates dirName (including any missing parent directories) unless the
    path already exists.

    Raises OSError when the directory cannot be created and the path is
    still absent afterwards. """
    # Attempt the creation directly instead of checking first: another thread
    # or process could create the path between a check and the makedirs call,
    # which would otherwise surface as a spurious "already exists" error.
    try:
        os.makedirs(dirName)
    except OSError:
        # An existing path is the desired end state; anything else is a
        # genuine failure that the caller needs to see.
        if not os.path.exists(dirName):
            raise
def downloadVideoForId(videoId, videoDir):
    """ Saves the video with the given ID into videoDir, unless a file named
    <videoId>.mp4 is already there """
    cachedPath = videoDir + videoId + ".mp4"
    if not os.path.exists(cachedPath):
        sys.stdout.write("Downloading video " + videoId + "\n")
        tube = YouTube("http://youtube.com/watch?v=%s" % videoId)
        tube.set_filename("%s" % videoId)
        # The 360p mp4 is requested: a low resolution is enough for testing
        stream = tube.get("mp4", "360p")
        stream.download(videoDir)
        return
    # Nothing to fetch: reuse the file left by an earlier run
    sys.stdout.write("Using cached video for " + videoId + "\n")
def _writeUtf8File(filename, contents):
    """ Writes contents to filename as UTF-8, closing the file even when the
    write itself fails. Raises IOError/OSError on failure. """
    # Encode explicitly: handing non-ASCII text to a text-mode file relies on
    # the interpreter's default encoding and fails for many captions.
    if not isinstance(contents, bytes):
        contents = contents.encode("utf-8")
    with open(filename, "wb") as f:
        f.write(contents)


def downloadTranscriptForId(videoId, csvFilename, textFilename):
    """ Downloads the raw XML transcript for a video and converts it into a
    useable CSV file plus a plain text file.

    videoId      -- ID of the video whose English transcript is requested
    csvFilename  -- path of the CSV to write (phrase, start, duration rows)
    textFilename -- path of the text file receiving the phrases joined by
                    spaces

    Every failure is reported on stdout. When the transcript cannot be
    fetched or parsed the function returns without writing any file. """
    sys.stdout.write("Downloading transcript for video " + videoId + "\n")
    # Retrieving the transcript will only work on videos with captions enabled
    transcriptUrl = "http://video.google.com/timedtext?lang=en&v=%s" % videoId
    try:
        rawData = requests.get(transcriptUrl)
        xmlTree = etree.fromstring(rawData.content)
    except (requests.RequestException, etree.XMLSyntaxError, ValueError):
        sys.stdout.write(">> Failed to get transcript for: %s\n" % (videoId))
        # Without a parsed tree there is nothing to convert
        return

    csvRows = [HEADER_STRING + "\n"]
    phrases = []
    for child in xmlTree:
        # Remove HTML escapes; an element without text becomes an empty
        # phrase. Commas are dropped because rows are written unquoted.
        cleanText = decodeHtml(child.text or "").replace(",", "")
        # Replace newlines with spaces for CSV consistency
        cleanText = cleanText.replace("\n", " ")
        # A missing timing attribute is recorded as 0 rather than aborting
        # the whole transcript
        csvRows.append("%s,%.3f,%.3f\n" % (
            cleanText,
            float(child.get("start", 0)),
            float(child.get("dur", 0))))
        # Copy regular text for the text file
        phrases.append(cleanText + " ")

    try:
        _writeUtf8File(csvFilename, "".join(csvRows))
    except (IOError, OSError):
        sys.stdout.write(">> Failed to write CSV file for: %s\n" % (videoId))
    try:
        _writeUtf8File(textFilename, "".join(phrases))
    except (IOError, OSError):
        sys.stdout.write(">> Failed to write text file for: %s\n" % (videoId))
def downloadAllData(videoIdList, speaker):
    """ Downloads the video and the transcript for every ID in videoIdList,
    storing the results in directories named after the speaker """
    outputDirs = ["%s/%s/%s/" % (ROOT_DIR, speaker, leaf)
                  for leaf in (CSV_DIR, TXT_DIR, VIDEO_DIR)]
    csvDir, textDir, videoDir = outputDirs
    for outputDir in outputDirs:
        makeDirsIfNeeded(outputDir)
    # Concurrent downloads based on
    # https://docs.python.org/3/library/concurrent.futures.html
    with ThreadPoolExecutor(max_workers=6) as pool:
        # Two tasks are queued per video: the video itself and its transcript
        pending = []
        for videoId in videoIdList:
            pending.append(pool.submit(downloadVideoForId, videoId, videoDir))
            pending.append(pool.submit(
                downloadTranscriptForId,
                videoId,
                "%s%s.csv" % (csvDir, videoId),
                "%s%s.txt" % (textDir, videoId)))
        # Collect the tasks as they finish so that one failure is reported
        # without stopping the others
        for finished in concurrent.futures.as_completed(pending):
            try:
                finished.result()
            except Exception as exc:
                print('generated an exception: %s' % (exc))
    print("All threads completed!")
def retrieveVideoIds(channelName, limit=10):
    """ Fetches the channel's videos page and returns the video IDs found
    among its first `limit` tile links """
    watchPrefix = "/watch?v="
    response = requests.get(
        "https://www.youtube.com/user/%s/videos" % channelName)
    pageTree = html.fromstring(response.content)
    foundIds = []
    tileLinks = pageTree.cssselect('.yt-uix-tile-link')
    for position, anchor in enumerate(tileLinks, 1):
        # Every tile link counts toward the limit, video link or not
        if position > limit:
            break
        target = anchor.get('href')
        if watchPrefix not in target:
            continue
        foundIds.append(target.replace(watchPrefix, ""))
    return foundIds
def lastWeekTonight():
    """ Downloads videos and transcripts from the LastWeekTonight channel,
    filed under the speaker "John Oliver" """
    downloadAllData(retrieveVideoIds("LastWeekTonight"), "John Oliver")
def tedTalks():
    """ Downloads videos and transcripts from the TEDtalksDirector channel,
    filed under the speaker "TED" """
    downloadAllData(retrieveVideoIds("TEDtalksDirector"), "TED")
# Running the file as a script downloads the TED data set
if __name__ == "__main__":
    tedTalks()