# scraping_science_education.py
import requests
from bs4 import BeautifulSoup
import os
import json
import csv
from datetime import datetime
import unicodedata
"""**Function to scrape articles links from a given URL**"""
def scrape_articles_links(url):
try:
response = requests.get(url)
response.encoding = 'utf-8' # Set the encoding to UTF-8
soup = BeautifulSoup(response.text, 'html.parser')
posts_div = soup.find('div', class_='search-main')
# Initialize a list to store links
links = []
# Find all the <div> tags with class "content-box"
if posts_div:
for div in posts_div.find_all('span', class_='read-more'):
# Find all the <a> tags within the div
for link in div.find_all('a'):
# Get the href attribute of each <a> tag
href = link.get('href')
# Append the link to the list
links.append(href)
return links
except Exception as e:
print(f"Error scraping links from {url}: {e}")
return None
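
# Illustrative sketch, not called by the original script: a request helper with an explicit
# timeout and User-Agent header. The helper name, the header value, and the 10-second
# timeout are assumptions for illustration, not values taken from the original code.
def fetch_html(url, timeout=10):
    headers = {'User-Agent': 'Mozilla/5.0 (compatible; article-scraper)'}
    response = requests.get(url, headers=headers, timeout=timeout)
    response.raise_for_status()  # Raise an error for non-2xx responses
    response.encoding = 'utf-8'
    return response.text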
"""**Function to process links**"""
def process_links(articles_links):
unique_links = set()
for link in articles_links:
full_url = "https://ssec.si.edu/" + link
unique_links.add(full_url)
return list(unique_links)
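
# Illustrative alternative, not used above: urljoin resolves both relative hrefs
# ('stemvisions-blog/x') and root-relative hrefs ('/stemvisions-blog/x') against the site
# base without producing doubled slashes. The function name and example hrefs are
# hypothetical; this is a sketch, not the original implementation.
from urllib.parse import urljoin

def process_links_joined(articles_links):
    base = "https://ssec.si.edu/"
    # Resolve each href against the base URL and deduplicate with a set
    return list({urljoin(base, link) for link in articles_links})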
"""**Function to scrape data from a given URL**"""
def scrape_data(url):
article = {}
try:
response = requests.get(url)
soup = BeautifulSoup(response.text, 'html.parser')
article_content = soup.find('div', class_='blog-content-wrapper')
if article_content:
article['link'] = url
print(url)
article_title_div = article_content.find('div', class_='field-title-field')
if article_title_div:
h2_tag = article_title_div.find('h2') # Find the <h2> tag within the div
if h2_tag:
article_title = h2_tag.get_text(strip=True) # Extract text from <h2> tag
article['title']=article_title
else:
print("No <h2> tag found within the div")
article['title']=None
body = article_content.find('div', class_='field-body')
if body:
text_elements = []
# Iterate through the children of the article tag
for child in body.children:
# Check if the child is a <p> tag
if child.name == 'p':
text_elements.append(child.get_text(strip=True))
# Check if the child is an <h2> tag
elif child.name == 'h2':
text_elements.append(child.get_text(strip=True))
# Check if the child is a <ul> tag
elif child.name == 'ul':
# Iterate through the <li> tags within the <ul> tag
for li in child.find_all('li'):
text_elements.append(li.get_text(strip=True))
# Join the text elements into a single list
text = ' '.join(text_elements)
else:
# Handle the case where the article body is not found
text = None
# Remove non-ASCII characters from the content
article['content'] = ''.join(char for char in text if ord(char) < 128) if text else None
if text:
# Calculate the total number of words
words_count = len(text.split())
else:
words_count = 0
article['words_count'] = words_count
# Get current datetime
now = datetime.now()
article['datetime'] = now.strftime("%Y-%m-%d %H:%M:%S")
return article
except Exception as e:
print(f"Error scraping data from {url}: {e}")
return None
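
# Illustrative alternative, not used above: rather than dropping every non-ASCII character,
# the unicodedata module imported at the top of the script can transliterate accented
# characters to their closest ASCII equivalents first. The helper name is hypothetical;
# this is a sketch, not part of the original pipeline.
def normalize_text(text):
    # NFKD decomposition splits base characters from combining accents; the accents are
    # then discarded by the ASCII encode/decode round trip.
    return unicodedata.normalize('NFKD', text).encode('ascii', 'ignore').decode('ascii')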
"""**Function to save scraped data (dictionary) to a file as JSON format**"""
def save_to_json(data, filename):
try:
with open(filename, 'w', encoding='utf-8') as f:
json.dump(data, f, ensure_ascii=False, indent=4)
print(f"Data saved to {filename}")
except Exception as e:
print(f"Error saving data to {filename}: {e}")
"""**Function to save scraped data (dictionary) to a file as CSV format**"""
def save_to_csv(data, filename):
try:
with open(filename, 'w', newline='', encoding='utf-8') as csvfile:
fieldnames = ['datetime','label', 'title', 'link', 'content', 'words_count']
writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
writer.writeheader()
for article in data:
writer.writerow({'datetime': article['datetime'],'label': article['label'], 'title': article['title'], 'link': article['link'], 'content': article['content'], 'words_count': article['words_count']})
print(f"Data saved to {filename}")
except Exception as e:
print(f"Error saving data to {filename}: {e}")
"""**Function to scrape articles**"""
def scrape_articles(url_base, pages, min_articles):
articles_data = []
articles_count = 0
for i in range(0, pages + 1):
url = f"{url_base}?page/{i}"
print("Page Link:" + url)
articles_links = scrape_articles_links(url)
links = process_links(articles_links)
for link in links:
data = scrape_data(link)
if data and data.get('words_count', 0) > 500:
articles_data.append({'index': articles_count + 1, 'label': 'science_education', **data})
articles_count += 1
if articles_count >= min_articles:
return articles_data
return articles_data
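
# Note (suggestion, not part of the original script): when scraping more pages, it may be
# worth pausing between requests so the server is not hit in a tight loop. A minimal
# sketch, assuming a one-second delay is acceptable, would add the following inside the
# page loop of scrape_articles:
#
#     import time
#     time.sleep(1)  # pause between page requests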
"""Main function"""
# URL to scrape
sport_url = 'https://ssec.si.edu/stemvisions-blog'
pages = 4 # Number of pages to scrape
min_articles = 15 # Minimum number of articles to scrape
# Scrape articles
articles_data = scrape_articles(sport_url, pages, min_articles)
# Create a directory to save the articles
os.makedirs("articles", exist_ok=True)
json_file = 'articles/science_education_articles.json'
csv_file = 'articles/science_education_articles.csv'
# Save to JSON file
save_to_json(articles_data, json_file)
# Save to CSV file
save_to_csv(articles_data, csv_file)