-
Notifications
You must be signed in to change notification settings - Fork 0
/
scrape_mars.py
113 lines (97 loc) · 4.41 KB
/
scrape_mars.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
# Import BeautifulSoup, Splinter, Chromedriver
from bs4 import BeautifulSoup
from splinter import Browser
from selenium.webdriver.chrome.options import Options
import pandas as pd
import requests
import time
def init_browser():
#setting up browser parser
executable_path = {"executable_path": "chromedriver"}
options=Options()
options.add_argument('--ignore-certificate-errors')
browser = Browser("chrome", **executable_path, headless=False,chrome_options=options)
return browser
# Step 1 - Scraping:
def scrape():
browser = init_browser()
# creating data dictionary to fill with all the retrieved data from several websites
data={}
# NASA Mars news
url = "https://mars.nasa.gov/news/?page=0&per_page=40&order=publish_date+desc%2Ccreated_at+desc&search=&category=19%2C165%2C184%2C204&blank_scope=Latest"
browser.visit(url)
html = browser.html
soup = BeautifulSoup(html, 'html.parser')
NASA_Mars_news={}
for item_list in soup.find_all('li',class_='slide'):
item=item_list.find('div',class_='image_and_description_container')
news_title=item.find('div', class_='content_title').text
news_p=item.find('div', class_='article_teaser_body').text
#print(news_title)
#print(news_p)
#print()
NASA_Mars_news.update({news_title:news_p})
data.update({'NASA_MARS_NEWS':NASA_Mars_news})
# JPL Mars Space Images - Featured Image
#browser = Browser("chrome", **executable_path, headless=False,chrome_options=options)
url = "https://www.jpl.nasa.gov/spaceimages/?search=&category=Mars"
browser.visit(url)
browser.click_link_by_partial_text('FULL IMAGE')
time.sleep(3)
html = browser.html
soup = BeautifulSoup(html, 'html.parser')
temp = soup.find('div', {'id':'main_container'}).\
find('div', {'id':'site_body'}).find('div', {'id':'page'}).\
find('div', class_='carousel_container')
url_part=temp.find('article')['style'].split('/',1)[1][:-3]
featured_image_url="https://www.jpl.nasa.gov/" + url_part
#print(featured_image_url)
data.update({'JPL_MARS_SPACE_IMAGE':featured_image_url})
# Mars Weather
#browser = Browser("chrome", **executable_path, headless=False,chrome_options=options)
url='https://twitter.com/marswxreport?lang=en'
browser.visit(url)
html = browser.html
soup = BeautifulSoup(html, 'html.parser')
mars_weather=soup.find('div',{'id':'doc'}).find('div',{'id':'timeline'}).\
find('div',class_='stream').find('div',class_='content').p.text
#print(mars_weather)
data.update({'MARS_WEATHER':mars_weather})
# Mars Facts: we do NOT need to update the data dictionary for this part
request_mars_space_facts = requests.get("https://space-facts.com/mars/")
mars_facts= pd.read_html(request_mars_space_facts.text)[0]
mars_facts.columns=['Facts', 'values']
mars_facts.set_index('Facts',inplace=True)
mars_facts=mars_facts.to_html(index='False',index_names='False')
data.update({'MARS_FACTS':mars_facts})
# Mars Hemispheres
#browser = Browser("chrome", **executable_path, headless=False,chrome_options=options)
url='https://astrogeology.usgs.gov/search/results?q=hemisphere+enhanced&k1=target&v1=Mars'
browser.visit(url)
html = browser.html
soup = BeautifulSoup(html, 'html.parser')
base_url='https://astrogeology.usgs.gov/'
## Hemisphere titles
collapse=soup.find('div', class_='collapsible results')
hemisphere_titles=[]
for no,item in enumerate(collapse.find_all('div',class_='item')):
hemisphere_titles.append(item.a['href'].split('/')[-1])
## Hemisphere images
img_base_url='https://astrogeology.usgs.gov/search/map/Mars/Viking/'
hemisphere_title_image =[]
for no,tk in enumerate(hemisphere_titles):
hem_tit_img={}
#browser = Browser("chrome", **executable_path, headless=False,chrome_options=options)
browser.visit(img_base_url+tk)
time.sleep(5)
html = browser.html
soup = BeautifulSoup(html, 'html.parser')
hem_tit_img['title']=tk
hem_tit_img['image']=soup.find('div',class_='container').\
find('div',class_='downloads').ul.li.a['href']
hemisphere_title_image.append(hem_tit_img)
data.update({'MARS_HEMISPHERES_IMG':hemisphere_title_image})
# Close the browser after scraping
browser.quit()
return (data)
#print(scrape())