forked from visitishan/WebDigger
-
Notifications
You must be signed in to change notification settings - Fork 3
/
digfile.py
158 lines (140 loc) · 5.93 KB
/
digfile.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
# library imports
from bs4 import BeautifulSoup
import requests
from fake_useragent import UserAgent
from bannerfile import *
# Result links (site URLs) scraped from every Google search page fetched so far.
sites = []
movies = [] # NOTE(review): never written or read anywhere in this file — appears to be dead state
googleallpages = [] # Absolute URLs of the next Google result pages (filled by googlekpages())
websitesFromCurrentSearchPage = [] # Scratch list: holds only the sites from the page just fetched (cleared after use)
# Random fake User-Agent per run; helps prevent Google from blocking repeated scripted requests.
ua = UserAgent()
headers = {'User-Agent': ua.random}
# Placeholders overwritten later: `soup` by getResLinks(), `ext` by the type menu at the bottom.
soup = ' '
ext = ' '
# Extension presets — a candidate file link must end with one of these (see getMovielink()).
ext1 = ('mkv','mov','avi','mp4','mpg','wmv')
ext2 = ('mp3','wav','ac3','ogg','flac','wma','m4a')
ext3 = ('mobi','pdf','rtf','doc','docx','txt')
ext4 = ('exe','iso','tar','rar','zip','apk')
ext5 = ('jpg','png','bmp','gif','tif','tiff','psd')
ext6 = ' ' # Replaced with a user-supplied tuple when menu option 6 (custom types) is chosen
#Function to make URL from search term and file extensions.
def makeURL(name, filetype):
    """Build the Google "index of" search URL for *name* and the given extensions.

    name     -- raw search term typed by the user (may contain spaces etc.)
    filetype -- iterable of file extensions, e.g. ('mkv', 'mp4')
    Returns the complete URL of the first search results page.
    """
    from urllib.parse import quote  # stdlib; local import keeps the file's top untouched
    # URL Prefix
    urlPref = "https://www.google.com/search?q="
    # URL mid part
    urlMid = "%20%2B("
    # File extensions to be searched, joined with an encoded '|' (OR operator)
    FileExt = '%7C'.join(filetype)
    # URL Suffix: exclude dynamic pages and known spam hosts, require "index of" listings
    urlSuff = ")%20-inurl%3A(jsp%7Cpl%7Cphp%7Chtml%7Caspx%7Chtm%7Ccf%7Cshtml)%20intitle%3Aindex.of%20-inurl%3A(listen77%7Cmp3raid%7Cmp3toss%7Cmp3drug%7Cindex_of%7Cwallywashis)"
    # Bug fix: percent-encode the search term — a raw space or '&' in `name`
    # previously produced an invalid or wrong query URL.
    return urlPref + quote(name) + urlMid + FileExt + urlSuff
#Function to find and append the websites on a google search page into sites[] list
def getResLinks(url):
    """Fetch one Google results page, print each <cite> link and append it to sites[].

    Side effects: rebinds the module-global `soup` (used later by googlekpages())
    and extends the module-global `sites` list.
    """
    global soup
    response = requests.get(url, headers=headers)
    soup = BeautifulSoup(response.text, "html.parser")
    print("\n~~~~~~~~~~~~~~~~~~~~~~~~~~~ WEB FOLDER LINKS ~~~~~~~~~~~~~~~~~~~~~~~~")
    # Google renders each result's display URL inside a <cite> element.
    for cite_tag in soup.findAll("cite"):
        site_text = cite_tag.get_text()
        sites.append(site_text)
        print(site_text)
#function to find next 9 search pages of google and store them in googleallpages[] list
def googlekpages():
    """Collect the pagination links from the last fetched results page.

    Reads the module-global `soup` (set by getResLinks) and rebinds the
    module-global `googleallpages` to the absolute URLs of the next pages.
    """
    global googleallpages
    collected = []
    # Pagination anchors carry class "fl"; real "next page" links contain "&start=".
    for anchor in soup.find_all("a", class_="fl"):
        absolute = 'http://google.com' + anchor['href']
        if "&start=" in absolute:
            collected.append(absolute)
    googleallpages = collected
#Function to check whether the Search term is present in final file link. It'll return 1 if matches else 0.
def match(filekalink, searchterm):
    """Return 1 when every word of *searchterm* appears as a word in *filekalink*, else 0.

    Both strings are lowercased and split on common URL/filename delimiters
    before the word-set comparison.
    """
    # Bug fix: this was a set, so the replacement order was arbitrary — if '%'
    # happened to be replaced before '%20', the encoded space degraded to ' 20'
    # and matching silently broke. An ordered list with the multi-character
    # delimiters first makes the tokenisation deterministic and correct.
    delimiters = ['%20', '%5', '_', '.', ',', '-', '/', ':', '%', '(', ')', '{', '}', '[', ']']
    filekalink = filekalink.lower()
    searchterm = searchterm.lower()
    for delimiter in delimiters:
        filekalink = filekalink.replace(delimiter, ' ')
        searchterm = searchterm.replace(delimiter, ' ')
    link_words = filekalink.split(' ')
    if set(searchterm.split(' ')).issubset(link_words):
        return 1
    # Bug fix: return an explicit 0 instead of falling through to an implicit None.
    return 0
#Function to get the link of file from websites in sites[] list.
def getMovielink(websites):
    """Visit each website, scrape its anchors and print links that look like wanted files.

    websites -- iterable of site URLs (with or without an http/https scheme).
    A link is printed when it ends with one of the extensions in the module-global
    `ext` AND every word of the module-global `name` appears in it (see match()).
    Sites that fail to download or parse are skipped silently (best-effort crawl).
    """
    print("\n~~~~~~~~~~~~~~~~~~~~~~~~~~ FILE LINKS ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~")
    for site in websites:
        # str.startswith accepts a tuple of prefixes — one call instead of an `or` chain.
        if site.startswith(("https://", "http://")):
            fullsitelink = site
        else:
            fullsitelink = "http://" + site
        try:
            innersite = requests.get(fullsitelink, headers=headers)
            innersoup = BeautifulSoup(innersite.text, "html.parser")
            for movielink in innersoup.findAll("a"):
                # Bug fix: anchors without an href raised KeyError, which the old
                # bare `except` turned into skipping the ENTIRE site.
                movie = movielink.get('href')
                if movie is None:
                    continue
                # NOTE(review): naive concatenation — an absolute href would yield
                # a malformed link; preserved as-is since output format is unchanged.
                milgaya = fullsitelink + movie
                if milgaya.endswith(ext) and match(milgaya, name) == 1:
                    print(milgaya)
        # Bug fix: was a bare `except:`, which also swallowed KeyboardInterrupt
        # and SystemExit, making the crawl impossible to interrupt cleanly.
        except Exception:
            continue
# Function to ask user for more results and bring results from next search pages.
def moreLinks(linkCount):
    """Interactively fetch and show the remaining Google result pages one at a time.

    linkCount -- number of extra search pages available (len(googleallpages)).
    Stops when the user answers 'N' or when all pages have been shown.
    """
    # Starting index into the global sites[] for the next batch; the first 10
    # results were already shown by the initial search.
    # NOTE(review): assumes every extra page contributes exactly 10 sites — TODO confirm.
    linkStart = 10
    pageIndex = 0  # next entry of googleallpages to fetch
    while pageIndex < linkCount:
        ask = input("\nWant more links? (Y/N) : ").upper()
        if ask == 'Y':
            # Prints this page's web-folder links and appends them to the global sites[].
            getResLinks(googleallpages[pageIndex])
            # Only the entries appended by the call above belong to this page.
            for onesite in range(linkStart, len(sites)):
                websitesFromCurrentSearchPage.append(sites[onesite])
            getMovielink(websitesFromCurrentSearchPage)
            websitesFromCurrentSearchPage.clear()  # scratch list cleared after use
            linkStart = linkStart + 10
            # Bug fix: advance the page index only on a valid 'Y'. Previously the
            # for-loop counter also advanced on invalid input, silently skipping a page.
            pageIndex += 1
        elif ask == 'N':
            print("\nThank you for using WebDigger !\n")
            break
        else:
            print("\nInvalid Input !!!\n")
banner() # Prints Web Digger ASCII banner on top
# ---- main script: ask for a search term and a file-type preset, then crawl ----
#Name of file to be searched - Search Term
name = input("What are you looking for? ")
contenttype = input("\nEnter its type -\nPress : \n |- 1. Video, Movies, Clips, TV Shows, Documentaries \n |- 2. Music, Songs, Audio \n |- 3. E-books, PDFs, Document, Spreadsheets, Presentations \n |- 4. Softwares, Applications, Zip Folders, ISOs \n |- 5. Images, Photos, Albums, Graphics, GIFs, PSDs \n |- 6. Custom file type \n |---> ")
# Map menu choices to the extension presets; replaces the if/elif ladder.
presets = {'1': ext1, '2': ext2, '3': ext3, '4': ext4, '5': ext5}
if contenttype == '6':
    ext6 = input("\nEnter file extensions seperated by comma(,) - For ex: txt,jpg,mp3\n\t")
    # Bug fix: the old replace(',', ' ').split(' ') produced empty-string tokens
    # for inputs like "txt, jpg"; an empty '' entry makes str.endswith(ext)
    # match EVERY link. Strip whitespace and drop empties instead.
    ext = tuple(part.strip() for part in ext6.split(',') if part.strip())
else:
    # Unknown choices leave `ext` at its placeholder value, as before.
    ext = presets.get(contenttype, ext)
url = makeURL(name, ext)
getResLinks(url)        # first results page: print web-folder links, fill sites[]
getMovielink(sites)     # crawl those sites for matching file links
googlekpages()          # discover the next result pages
totalSearchPages = len(googleallpages)
moreLinks(totalSearchPages)