This repository has been archived by the owner on Jan 20, 2024. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 0
/
SimplePythonScraper.py
126 lines (99 loc) · 5.11 KB
/
SimplePythonScraper.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
# ====== Written by KaramMTayyem ======
import praw
import argparse
import sys
import re
import os
from urllib.request import urlopen
#================================================================================================#
# Method scrape_list is responsible for traversing through the subreddit list. It creates
# a new folder for each subreddit, then calls a method to start scraping the given subreddit.
#================================================================================================#
def scrape_list(list_name, directory_to_save, number, only_sfw):
    """Traverse a text file of subreddit names (one per line) and scrape each.

    For every non-blank line, a sub-folder named after the subreddit is
    created under *directory_to_save* (if missing) and the subreddit is
    scraped into it.

    list_name         -- path to the text file listing subreddit names
    directory_to_save -- root folder that receives one sub-folder per subreddit
    number            -- number of image posts to download per subreddit
    only_sfw          -- when True, NSFW subreddits/posts are skipped
    """
    with open(list_name) as sublist:
        for line in sublist:
            sub = line.strip()
            if not sub:
                # Skip blank lines: an empty name would target the root
                # folder itself and request a nonexistent subreddit "".
                continue
            sub_directory = os.path.join(directory_to_save, sub)
            # exist_ok avoids the race between an existence check and creation.
            os.makedirs(sub_directory, exist_ok=True)
            scrape_subreddit(sub, number, sub_directory, only_sfw)
#================================================================================================#
# Method scrape_subreddit is responsible for scraping an individual subreddit. It creates a new
# instance of the subreddit, then traverses through the top posts of all time. For each post,
# it checks whether it is a picture and, if requested, whether it is NSFW. It then
# downloads the image to the subdirectory named after the subreddit.
#================================================================================================#
def scrape_subreddit(subreddit_name, number_of_posts, sub_directory, only_sfw):
    """Download up to *number_of_posts* images from one subreddit.

    Iterates the subreddit's top posts of all time (via the module-level
    `reddit` PRAW instance), keeping only direct image links and — when
    *only_sfw* is set — only SFW content.  Files are saved into
    *sub_directory* as 0.<ext>, 1.<ext>, ...
    """
    downloaded = 0
    if number_of_posts > 1000:
        # Reddit's API caps any listing at 1000 items, so asking for more
        # cannot be honored.
        print("[WARNING] Reddit limits the number of each listing to 1000.")
    subreddit = reddit.subreddit(subreddit_name)
    if subreddit.over18 and only_sfw:
        print("[WARNING]", subreddit_name, "is an NSFW subreddit. Skipping...")
        return
    for submission in subreddit.top('all', limit=None):
        if not is_picture(submission.url):
            continue
        if submission.over_18 and only_sfw:
            continue
        # Preserve the real file extension (a .png stays .png) instead of
        # saving every image with a ".jpg" suffix regardless of format.
        match = re.search(r'\.(jpg|jpeg|png)', submission.url, flags=re.IGNORECASE)
        extension = ("." + match.group(1).lower()) if match else ".jpg"
        download_image(submission.url, str(downloaded) + extension, sub_directory)
        print("[*] Downloaded", submission.url)
        downloaded += 1
        if downloaded >= number_of_posts:
            return
#================================================================================================#
# Method is_picture checks whether a given URL is a URL of an image judging by file extension.
#================================================================================================#
def is_picture(url):
    """Return True if *url* looks like a direct link to a jpg/jpeg/png image.

    Matching is case-insensitive.  NOTE(review): the pattern accepts the
    extension anywhere after the scheme, so URLs that merely contain an
    image extension mid-path also pass — confirm this is acceptable.
    """
    return re.search(r'https?:.*\.(jpg|jpeg|png)', url, flags=re.IGNORECASE) is not None
#================================================================================================#
# Method download_image downloads image file from a given URL.
#================================================================================================#
def download_image(url, filename, directory):
    """Best-effort download of *url* into *directory*/*filename*.

    Failures are reported and swallowed so one bad link does not abort the
    whole scrape.
    """
    try:
        # "with" closes the HTTP connection promptly instead of leaking it.
        with urlopen(url) as response:
            imagedata = response.read()
        with open(os.path.join(directory, filename), mode='wb') as imagefile:
            imagefile.write(imagedata)
    except Exception:
        # Narrowed from a bare "except:" so KeyboardInterrupt/SystemExit
        # still propagate; download errors stay non-fatal by design.
        print("[ERROR] Failed to download", url)
#================================================================================================#
# Shared read-only PRAW Reddit instance used by scrape_subreddit.
# YOU MUST INSERT YOUR client_id and client_secret here before running
# (obtained from https://www.reddit.com/prefs/apps).
#================================================================================================#
reddit = praw.Reddit(client_id='<YOUR_CLIENT_ID>',
                     client_secret='<YOUR_CLIENT_SECRET>',
                     user_agent='Python:SimpleRedditScraper:1.0 (by KTayyem)')
#================================================================================================#
# NOTE(review): this flag is never read anywhere in the file — the
# scrape_list call below passes args.sfw directly, so changing is_sfw has
# no effect.  Candidate for removal; kept to avoid altering the module's
# public names.
#================================================================================================#
is_sfw = False
#================================================================================================#
# Command-line interface.  The add_argument registration order below also
# fixes the order in which options appear in the generated --help text.
#================================================================================================#
parser = argparse.ArgumentParser()
# -r is the only required option: the text file listing subreddits to scrape.
parser.add_argument('-r', '--subreddits',
                    nargs='?', metavar='subreddits list',
                    help = "specify a text file containing list of subreddits.",
                    required=True)
# Destination root; one sub-folder per subreddit is created beneath it.
parser.add_argument('-d', '--dest',
                    nargs='?',
                    metavar='destination folder',
                    help='specify directory for saving the images (default = current directory).',
                    default='.')
# Per-subreddit download cap (Reddit listings max out at 1000 items).
parser.add_argument('-n', '--num_posts',
                    nargs='?',
                    metavar='number of posts',
                    type=int,
                    help='specify number of posts to check (default = 50).',
                    default=50)
# Boolean flag: present -> True, absent -> False (store_true).
parser.add_argument('-s', '--sfw',
                    help='add this to only download SFW posts.',
                    action='store_true')
args = parser.parse_args()
#================================================================================================#
# Entry point: kick off the scrape with the parsed CLI options.
#================================================================================================#
scrape_list(args.subreddits, args.dest, args.num_posts, args.sfw)