Skip to content

Commit

Permalink
Poster filters
Browse files Browse the repository at this point in the history
  • Loading branch information
SantiiRepair committed Sep 19, 2023
1 parent 5a9e169 commit 31fc4c0
Show file tree
Hide file tree
Showing 5 changed files with 321 additions and 0 deletions.
2 changes: 2 additions & 0 deletions .gitattributes
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
# Auto detect text files and perform LF normalization
* text=auto
65 changes: 65 additions & 0 deletions INTRO.ipynb
Original file line number Diff line number Diff line change
@@ -0,0 +1,65 @@
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"<h3>Google Image Scraper for Juypter Notebook</h3>"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import os\n",
"from GoogleImageScraper import GoogleImageScraper\n",
"from patch import webdriver_executable\n",
"\n",
"webdriver_path = os.path.normpath(os.path.join(os.getcwd(), 'webdriver', webdriver_executable()))\n",
"image_path = os.path.normpath(os.path.join(os.getcwd(), 'photos'))\n",
"#add new search key into array [\"cat\",\"t-shirt\",\"apple\",\"orange\",\"pear\",\"fish\"]\n",
"search_keys= [\"cat\",\"t-shirt\"]\n",
"number_of_images = 20\n",
"headless = False\n",
"#min_resolution = (width,height)\n",
"min_resolution=(0,0)\n",
"#max_resolution = (width,height)\n",
"max_resolution=(1920,1080)\n",
"for search_key in search_keys:\n",
" image_scraper = GoogleImageScraper(webdriver_path,image_path,search_key,number_of_images,headless,min_resolution,max_resolution)\n",
" image_urls = image_scraper.find_image_urls()\n",
" image_scraper.save_images(image_urls)\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.5"
}
},
"nbformat": 4,
"nbformat_minor": 4
}
209 changes: 209 additions & 0 deletions bot/helpers/GoogleImageScraper.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,209 @@
# -*- coding: utf-8 -*-
"""
Created on Sat Jul 18 13:01:02 2020
@author: OHyic
"""
#import selenium drivers
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import NoSuchElementException

#import helper libraries
import time
import urllib.request
from urllib.parse import urlparse
import os
import requests
import io
from PIL import Image
import re

#custom patch libraries
import patch

class GoogleImageScraper():
    """Scrape image URLs from a Google Images search and download the photos.

    Typical usage:
        scraper = GoogleImageScraper(webdriver_path, image_path, "cat", 10)
        urls = scraper.find_image_urls()
        scraper.save_images(urls)
    """

    def __init__(self, webdriver_path, image_path, search_key="cat", number_of_images=1, headless=True, min_resolution=(0, 0), max_resolution=(1920, 1080), max_missed=10):
        """Validate parameters, prepare the output folder and start Chrome.

        Args:
            webdriver_path: Path to the chromedriver executable.
            image_path: Root output folder; images go into image_path/search_key.
            search_key: Google Images query string.
            number_of_images: How many image URLs to collect.
            headless: Run Chrome without a visible window when True.
            min_resolution: (width, height) lower bound for kept images.
            max_resolution: (width, height) upper bound for kept images.
            max_missed: Consecutive scrape failures tolerated before giving up.
        """
        # Each search key gets its own subfolder under image_path.
        image_path = os.path.join(image_path, search_key)
        if not isinstance(number_of_images, int):
            # NOTE: keeps the original contract — print and return, leaving
            # the instance uninitialized rather than raising.
            print("[Error] Number of images must be integer value.")
            return
        if not os.path.exists(image_path):
            print("[INFO] Image path not found. Creating a new folder.")
            os.makedirs(image_path)

        # Download a chromedriver if none exists at the given path.
        if not os.path.isfile(webdriver_path):
            is_patched = patch.download_lastest_chromedriver()
            if not is_patched:
                exit("[ERR] Please update the chromedriver.exe in the webdriver folder according to your chrome version:https://chromedriver.chromium.org/downloads")

        driver = None  # guard against NameError if startup fails below
        for i in range(1):
            try:
                # Smoke-test the driver by loading www.google.com.
                options = Options()
                if headless:
                    options.add_argument('--headless')
                # selenium 4.10+ removed the positional executable path and
                # the chrome_options keyword (requirements pin 4.11.2), so
                # Service(...) + options= is the supported invocation.
                driver = webdriver.Chrome(service=Service(webdriver_path), options=options)
                driver.set_window_size(1400, 1050)
                driver.get("https://www.google.com")
                try:
                    # Dismiss the cookie-consent dialog when it is shown.
                    WebDriverWait(driver, 5).until(EC.element_to_be_clickable((By.ID, "W0wltc"))).click()
                except Exception:
                    continue
            except Exception as e:
                # A Chrome/driver version-mismatch message embeds the browser
                # version; use it to fetch a matching chromedriver.
                versions = list(set(re.findall(r'(\d+\.\d+\.\d+\.\d+)', str(e))))
                if versions:
                    is_patched = patch.download_lastest_chromedriver(versions[0])
                else:
                    is_patched = patch.download_lastest_chromedriver()
                if not is_patched:
                    exit("[ERR] Please update the chromedriver.exe in the webdriver folder according to your chrome version:https://chromedriver.chromium.org/downloads")

        self.driver = driver
        self.search_key = search_key
        self.number_of_images = number_of_images
        self.webdriver_path = webdriver_path
        self.image_path = image_path
        self.url = "https://www.google.com/search?q=%s&source=lnms&tbm=isch&sa=X&ved=2ahUKEwie44_AnqLpAhUhBWMBHUFGD90Q_AUoAXoECBUQAw&biw=1920&bih=947"%(search_key)
        self.headless=headless
        self.min_resolution = min_resolution
        self.max_resolution = max_resolution
        self.max_missed = max_missed

    def find_image_urls(self):
        """Search Google Images and return a list of direct image URLs.

        Walks the result grid by XPath index, clicks each thumbnail and reads
        the full-size image src from the popup. Stops after collecting
        self.number_of_images URLs or after self.max_missed consecutive
        failures.

        Returns:
            list[str]: direct (http...) image URLs.
        """
        print("[INFO] Gathering image links")
        self.driver.get(self.url)
        image_urls=[]
        count = 0
        missed_count = 0
        # Google's grid is sometimes flat (div[i]) and sometimes nested
        # (div[i]/div[j]); indx_1/indx_2 track both layouts.
        indx_1 = 0
        indx_2 = 0
        search_string = '//*[@id="islrg"]/div[1]/div[%s]/a[1]/div[1]/img'
        time.sleep(3)
        while self.number_of_images > count and missed_count < self.max_missed:
            if indx_2 > 0:
                # Nested layout already detected: advance the inner index.
                try:
                    imgurl = self.driver.find_element(By.XPATH, search_string%(indx_1,indx_2+1))
                    imgurl.click()
                    indx_2 = indx_2 + 1
                    missed_count = 0
                except Exception:
                    try:
                        # Inner row exhausted: move to the next outer row.
                        imgurl = self.driver.find_element(By.XPATH, search_string%(indx_1+1,1))
                        imgurl.click()
                        indx_2 = 1
                        indx_1 = indx_1 + 1
                    except Exception:
                        indx_2 = indx_2 + 1
                        missed_count = missed_count + 1
            else:
                # Flat layout (or layout not yet known).
                try:
                    imgurl = self.driver.find_element(By.XPATH, search_string%(indx_1+1))
                    imgurl.click()
                    missed_count = 0
                    indx_1 = indx_1 + 1
                except Exception:
                    try:
                        # Probe for the nested layout and switch the XPath
                        # template to it once it matches.
                        imgurl = self.driver.find_element(By.XPATH, '//*[@id="islrg"]/div[1]/div[%s]/div[%s]/a[1]/div[1]/img'%(indx_1,indx_2+1))
                        imgurl.click()
                        missed_count = 0
                        indx_2 = indx_2 + 1
                        search_string = '//*[@id="islrg"]/div[1]/div[%s]/div[%s]/a[1]/div[1]/img'
                    except Exception:
                        indx_1 = indx_1 + 1
                        missed_count = missed_count + 1

            try:
                # Select the full-size image from the popup; the class name
                # varies, so try each known candidate.
                time.sleep(1)
                class_names = ["n3VNCb","iPVvYb","r48jcc","pT0Scc"]
                images = [self.driver.find_elements(By.CLASS_NAME, class_name) for class_name in class_names if len(self.driver.find_elements(By.CLASS_NAME, class_name)) != 0 ][0]
                for image in images:
                    # Only keep direct links; "encrypted" srcs are thumbnails.
                    src_link = image.get_attribute("src")
                    if(("http" in src_link) and (not "encrypted" in src_link)):
                        print(
                            f"[INFO] {self.search_key} \t #{count} \t {src_link}")
                        image_urls.append(src_link)
                        count +=1
                        break
            except Exception:
                print("[INFO] Unable to get link")

            try:
                # Scroll periodically and click "show more" to load the
                # next page of results.
                if(count%3==0):
                    self.driver.execute_script("window.scrollTo(0, "+str(indx_1*60)+");")
                element = self.driver.find_element(By.CLASS_NAME,"mye4qd")
                element.click()
                print("[INFO] Loading next page")
                time.sleep(3)
            except Exception:
                time.sleep(1)



        self.driver.quit()
        print("[INFO] Google search ended")
        return image_urls

    def save_images(self, image_urls, keep_filenames=False):
        """Download each URL into this scraper's image folder.

        Args:
            image_urls: Iterable of direct image URLs, e.g.
                ["https://example_1.jpg", "https://example_2.jpg"].
            keep_filenames: When True, reuse the filename embedded in the
                URL; otherwise name files "<searchkey><index>.<ext>".
                Defaults to False so existing one-argument callers keep
                working.
        """
        print("[INFO] Saving image, please wait...")
        for indx,image_url in enumerate(image_urls):
            try:
                print("[INFO] Image url:%s"%(image_url))
                search_string = ''.join(e for e in self.search_key if e.isalnum())
                image = requests.get(image_url,timeout=5)
                if image.status_code == 200:
                    with Image.open(io.BytesIO(image.content)) as image_from_web:
                        try:
                            if (keep_filenames):
                                #extract filename without extension from URL
                                o = urlparse(image_url)
                                image_url = o.scheme + "://" + o.netloc + o.path
                                name = os.path.splitext(os.path.basename(image_url))[0]
                                #join filename and extension
                                filename = "%s.%s"%(name,image_from_web.format.lower())
                            else:
                                filename = "%s%s.%s"%(search_string,str(indx),image_from_web.format.lower())

                            image_path = os.path.join(self.image_path, filename)
                            print(
                                f"[INFO] {self.search_key} \t {indx} \t Image saved at: {image_path}")
                            image_from_web.save(image_path)
                        except OSError:
                            # Some modes (e.g. RGBA) cannot be saved as JPEG
                            # directly; retry after converting to RGB.
                            rgb_im = image_from_web.convert('RGB')
                            rgb_im.save(image_path)
                        image_resolution = image_from_web.size
                        if image_resolution is not None:
                            # Discard images outside the requested bounds.
                            if image_resolution[0]<self.min_resolution[0] or image_resolution[1]<self.min_resolution[1] or image_resolution[0]>self.max_resolution[0] or image_resolution[1]>self.max_resolution[1]:
                                image_from_web.close()
                                os.remove(image_path)

                        image_from_web.close()
            except Exception as e:
                print("[ERROR] Download failed: ",e)
                pass
        print("--------------------------------------------------")
        print("[INFO] Downloads completed. Please note that some photos were not downloaded as they were not in the correct format (e.g. jpg, jpeg, png)")
44 changes: 44 additions & 0 deletions bot/helpers/main.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,44 @@
import os
import concurrent.futures
from GoogleImageScraper import GoogleImageScraper
from patch import webdriver_executable


def worker_thread(search_key):
    """Scrape and download images for one search key (one worker per key).

    Reads the scraper configuration (paths, limits, flags) from module-level
    globals set in the __main__ block.
    """
    scraper = GoogleImageScraper(
        webdriver_path,
        image_path,
        search_key,
        number_of_images,
        headless,
        min_resolution,
        max_resolution,
        max_missed)
    urls = scraper.find_image_urls()
    scraper.save_images(urls, keep_filenames)

    # Drop the scraper (and its webdriver) once this key is done.
    del scraper

if __name__ == "__main__":
    #Define file path
    #chromedriver lives in ./webdriver; downloads go to ./photos
    webdriver_path = os.path.normpath(os.path.join(os.getcwd(), 'webdriver', webdriver_executable()))
    image_path = os.path.normpath(os.path.join(os.getcwd(), 'photos'))

    #Add new search key into array ["cat","t-shirt","apple","orange","pear","fish"]
    #list(set(...)) removes duplicate keys before dispatching workers
    search_keys = list(set(["cat","t-shirt"]))

    #Parameters
    number_of_images = 5                # Desired number of images
    headless = True                     # True = No Chrome GUI
    min_resolution = (0, 0)             # Minimum desired image resolution
    max_resolution = (9999, 9999)       # Maximum desired image resolution
    max_missed = 10                     # Max number of failed images before exit
    number_of_workers = 1               # Number of "workers" used
    keep_filenames = False              # Keep original URL image filenames

    #Run each search_key in a separate thread
    #Automatically waits for all threads to finish
    #Removes duplicate strings from search_keys
    with concurrent.futures.ThreadPoolExecutor(max_workers=number_of_workers) as executor:
        executor.map(worker_thread, search_keys)
1 change: 1 addition & 0 deletions requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@ pyrogram==2.0.106
python-dotenv==1.0.0
PyPDF2==3.0.1
requests==2.31.0
selenium==4.11.2
shutup==0.2.0
speedtest-cli==2.1.3
termcolor==1.1
Expand Down

0 comments on commit 31fc4c0

Please sign in to comment.