-
Notifications
You must be signed in to change notification settings - Fork 0
/
scraper.rb
68 lines (60 loc) · 2.69 KB
/
scraper.rb
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
require 'mechanize'
require 'fileutils'
require 'yaml'
# Downloads course material from the Blackboard site at ROOT_URL into a local
# directory tree laid out as <save_dir>/<course>/<section>/<folder>/...
#
# Configuration is read once, when the class is loaded, from two YAML files
# located next to this source file:
#
# * courses.yml -- maps a course name to a hash with a "course_id" and a
#   "sections" hash; each section maps a section name to a hash with a
#   "content_id".
# * config.yml  -- provides the "save_dir", "pawprint" and "password" entries.
class Scraper
  ROOT_URL = "https://bblearn.missouri.edu"
  COURSE_SCHEMA = YAML.load_file(File.join(File.dirname(__FILE__), 'courses.yml'))
  CONFIG = YAML.load_file(File.join(File.dirname(__FILE__), 'config.yml'))
  SAVE_DIR = File.expand_path CONFIG["save_dir"]
  USERNAME = CONFIG["pawprint"]
  PASSWORD = CONFIG["password"]

  # A single agent is shared by every request so that the session established
  # by .login is reused for the section and attachment downloads.
  @@mechanize = Mechanize.new

  # Logs in and walks every section of every course listed in COURSE_SCHEMA,
  # saving the attachments it finds below +save_dir+.
  #
  # @param username [String] login name (defaults to the configured pawprint)
  # @param password [String] login password
  # @param save_dir [String] root directory for downloaded files
  def self.scrape!(username=USERNAME, password=PASSWORD, save_dir=SAVE_DIR)
    @@mechanize.get("#{ROOT_URL}/webapps/portal/execute/defaultTab") do |login_page|
      login(login_page, username, password)
      COURSE_SCHEMA.each do |course_name, course|
        # A course without a "sections" entry simply has nothing to scrape.
        (course["sections"] || {}).each do |section_name, section|
          begin
            section_url = "#{ROOT_URL}/webapps/blackboard/content/listContent.jsp?course_id=#{course['course_id']}&content_id=#{section['content_id']}"
            scrape_section section_url, [course_name, section_name], save_dir
          rescue Mechanize::ResponseCodeError => e
            # Any HTTP error raised while scraping a section (listing page or
            # attachment) abandons the remaining sections of this course and
            # moves on to the next course. Report it instead of failing
            # silently.
            warn "Skipping the rest of #{course_name} (failed in #{section_name}): #{e.message}"
            break
          end
        end
      end
    end
  end

  # Fills in and submits the login form found on +page+.
  #
  # @param page [Mechanize::Page] page expected to contain the login form
  # @param username [String]
  # @param password [String]
  # @return the result of submitting the form
  # @raise [RuntimeError] if the page has no form posting to /webapps/login/
  def self.login(page, username, password)
    form = page.form_with(:action => '/webapps/login/')
    raise "Login form (action /webapps/login/) not found on #{page.uri}" unless form

    username_field = form.field_with(:name => "user_id")
    username_field.value = username
    password_field = form.field_with(:name => "password")
    password_field.value = password
    form.submit
  end

  # Scrapes one content-listing page: saves every attachment of every content
  # item and recurses into items that link to another listing page (folders).
  #
  # @param page_url [String] absolute URL of the listing page
  # @param dir_array [Array<String>] path components, relative to +save_dir+,
  #   under which this page's files are stored
  # @param save_dir [String] root directory for downloaded files
  def self.scrape_section(page_url, dir_array, save_dir)
    page = @@mechanize.get(page_url)
    # Content items are <li> elements whose id starts with "contentListItem".
    page.search("//li[substring(@id, 1, 15) = 'contentListItem']").each do |content|
      title_node = content.search("h3 span").last
      next unless title_node # item without a title: nothing to name files after

      title = title_node.text
      # Drop a trailing " - Updated <...>" marker so the name stays stable.
      match = title.match(/(.*) - Updated (.*)/)
      title = match[1] if match

      content.search("ul.attachments a").each do |attachment|
        save_attachment(attachment, title, dir_array, save_dir)
      end

      folder_link = content.search("h3 a").first
      next unless folder_link

      url = folder_link["href"]
      # Only links to another listing page are folders worth descending into.
      next unless url && url.include?("/webapps/blackboard/content/listContent.jsp")

      scrape_section(File.join(ROOT_URL, url), dir_array + [title], save_dir)
    end
  end

  # Downloads a single attachment link unless a file whose name starts with
  # the same content title already exists in the target directory.
  def self.save_attachment(attachment, title, dir_array, save_dir)
    url = attachment["href"]
    return unless url

    subtitle_match = attachment.text.match(/(.*) - Click here to view/)
    subtitle = subtitle_match ? " -- #{subtitle_match[1]}" : ""
    content_title = "#{title}#{subtitle}"

    # The extension is unknown until the file is fetched, so look for any
    # existing file with this title as a prefix. The literal part of the
    # pattern is escaped so titles such as "Notes [v2]" are matched verbatim.
    existing_prefix = File.join(save_dir, *dir_array, content_title)
    return if Dir.glob("#{glob_escape(existing_prefix)}*").any?

    downloaded_file = @@mechanize.get(File.join(ROOT_URL, url))
    file_ext = downloaded_file.filename[/\.[^.]*$/]
    save_path = File.join(save_dir, *dir_array, "#{content_title}#{file_ext}")
    puts "Saving #{content_title} to #{save_dir}"
    downloaded_file.save!(save_path)
  end
  private_class_method :save_attachment

  # Backslash-escapes the characters Dir.glob treats as pattern syntax.
  def self.glob_escape(path)
    path.gsub(/[\\\[\]{}*?]/) { |char| "\\#{char}" }
  end
  private_class_method :glob_escape
end