Skip to content

Commit

Permalink
Increase webscraper efficiency and speed by only initializing it once
Browse files Browse the repository at this point in the history
  • Loading branch information
January committed Jan 17, 2024
1 parent 80148fe commit cce1557
Show file tree
Hide file tree
Showing 3 changed files with 106 additions and 89 deletions.
2 changes: 1 addition & 1 deletion brib.py
Original file line number Diff line number Diff line change
Expand Up @@ -57,7 +57,7 @@
#not fully supported yet
pluginMangager()
# gets the default browser and system information
browser = get_default_browser()
browser = WebScraper.get_default_browser()
saveBrowser(config, browser)
# gets the version of Alfred
version = versionToPass
Expand Down
18 changes: 10 additions & 8 deletions modules/scanmodules.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,11 @@

date = datetime.date.today()

# gets driver from the config
config.read("./config/config.ini")
selected_webdriver = config.get("main", "browser")
scraper = WebScraper(selected_webdriver)


# scan logic
def Startscan(
Expand All @@ -33,7 +38,7 @@ def Startscan(
):
try:
headers = {"User-Agent": config["Personalizations"]["useragent"]}

# Timeout and proxies settings based on the mode flags
timeout_setting = 1.5
allow_redirects_setting = ars if "-d" in modes else False
Expand All @@ -47,20 +52,17 @@ def Startscan(
proxies=proxies_setting,
json=False, # Assuming json=False is default for all requests
)

if not webscrape:
result = ""
if webscrape:
# error message to find
error_to_find = siteErrors
# combines the site URL and the username to make the full URL
website_url = siteN + uname
# gets driver from the config
config.read("./config/config.ini")
selected_webdriver = config.get("main", "browser")
if selected_webdriver:
result = scrape(
website_url, error_to_find, selected_webdriver, language_module
result = scraper.scrape(
website_url, error_to_find, language_module
)

# print(result)
Expand Down Expand Up @@ -176,7 +178,7 @@ def Startscan(
if response.status_code == 406 and "-N" not in modes and result == "No":
print("[" + Fore.GREEN + "+" + Fore.RESET + "] " + siteN + uname)
f.write("[" + "+" + "] " + siteN + uname + "\n")

if not webscrape and "-a" in modes and 300 <= response.status_code <= 510:
print("[" + Fore.RED + "-" + Fore.RESET + "] " + siteN + uname)
f.write(str(date) + "[" + "-" + "] " + siteN + uname + "\n")
Expand Down
175 changes: 95 additions & 80 deletions modules/webscrape.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,110 +11,125 @@
from selenium.webdriver.firefox.options import Options as FirefoxOptions
from selenium.webdriver.remote.remote_connection import LOGGER

class WebScraper:
selected_webdriver = ""
language_module = None
driver = None

def get_default_browser_windows():
    """Query the registry for the current user's default HTTP handler.

    Runs ``reg query`` on the ``UserChoice`` key and returns the last
    whitespace-separated token of its output (the ``ProgId`` value).
    Returns None if the command cannot be run or prints nothing.
    """
    registry_path = r"HKEY_CURRENT_USER\Software\Microsoft\Windows\Shell\Associations\UrlAssociations\http\UserChoice"
    try:
        with os.popen(f'reg query "{registry_path}" /v ProgId') as pipe:
            tokens = pipe.read().split()
        # An empty token list raises IndexError, handled below.
        return tokens[-1].strip()
    except Exception:
        return None


def get_default_browser_mac():
    """Probe for Safari on macOS via ``osascript``.

    Returns "Safari" when the command output mentions Safari, "Unknown"
    when it does not, and None when the command fails to run.
    """
    probe = "osascript -e 'get id of app id \"com.apple.Safari\"'"
    try:
        reply = subprocess.check_output(probe, shell=True, text=True)
    except Exception:
        return None
    return "Safari" if "Safari" in reply else "Unknown"


def get_default_browser_linux():
    """Return the default browser on Linux.

    The ``BROWSER`` environment variable wins when it is set and non-empty;
    otherwise the output of ``xdg-settings get default-web-browser`` is
    used. Returns "Unknown" when neither yields a value and None when an
    exception is raised along the way.
    """
    try:
        from_env = os.getenv("BROWSER")
        if from_env:
            return from_env
        # Fall back to asking xdg-settings
        from_xdg = os.popen("xdg-settings get default-web-browser").read().strip()
        return from_xdg if from_xdg else "Unknown"
    except Exception:
        return None


def get_default_browser():
    """Detect the default browser using the helper for the current OS.

    Dispatches on ``platform.system()``; any platform other than Windows,
    macOS ("Darwin") or Linux yields "Unknown".
    """
    system = platform.system()
    if system == "Windows":
        return get_default_browser_windows()
    if system == "Darwin":  # macOS
        return get_default_browser_mac()
    if system == "Linux":
        return get_default_browser_linux()
    return "Unknown"


# web scraper
def scrape(url, target_error_message, selected_webdriver, language_module):
try:
# Set the log level to suppress webdriver console output
LOGGER.setLevel(logging.ERROR)
def __init__(self, selected_webdriver):
print("Initializing WebScraper.")
self.selected_webdriver = selected_webdriver
if(selected_webdriver not in ["Chrome", "Firefox", "Edge"]):
print(f"Alfred doesn't recognize webdriver {selected_webdriver}!")
print("We're going to try to use Chrome.")
selected_webdriver = "Chrome"

if selected_webdriver == "Chrome":
chrome_options = Options()
chrome_options.add_argument("--headless")
chrome_options.headless = True
prefs = {"profile.managed_default_content_settings.images": 2}
chrome_options.add_experimental_option("prefs", prefs)
chrome_options.add_argument(
"--log-level=3"
) # Set the log level to suppress logging
driver = webdriver.Chrome(options=chrome_options)
WebScraper.driver = webdriver.Chrome(options=chrome_options)
elif selected_webdriver == "Firefox":
firefox_options = FirefoxOptions()
firefox_options.add_argument("--headless")
firefox_options.headless = True
firefox_options.set_preference('permissions.default.image', 2)
firefox_options.set_preference('dom.ipc.plugins.enabled.libflashplayer.so', 'false')
firefox_options.add_argument(
"--log-level=3"
) # Set the log level to suppress logging
driver = webdriver.Firefox(options=firefox_options)
WebScraper.driver = webdriver.Firefox(options=firefox_options)
elif selected_webdriver == "Edge":
edge_options = EdgeOptions()
edge_options.use_chromium = True
edge_options.add_argument("--headless")
edge_options.headless = True
prefs = {"profile.managed_default_content_settings.images": 2}
edge_options.add_experimental_option("prefs", prefs)
edge_options.add_argument(
"--log-level=3"
) # Set the log level to suppress logging
driver = webdriver.Edge(options=edge_options)
WebScraper.driver = webdriver.Edge(options=edge_options)
else:
print(language_module.error10)
print(WebScraper.language_module.error10)
return None

def get_default_browser_windows():
try:
browser_key = r"Software\Microsoft\Windows\Shell\Associations\UrlAssociations\http\UserChoice"
with os.popen(
f'reg query "HKEY_CURRENT_USER\\{browser_key}" /v ProgId'
) as reg_query:
output = reg_query.read()
browser_name = output.split()[-1].strip()
return browser_name
except Exception:
return None

def get_default_browser_mac():
try:
command = "osascript -e 'get id of app id \"com.apple.Safari\"'"
output = subprocess.check_output(command, shell=True, text=True)
if "Safari" in output:
return "Safari"
else:
return "Unknown"
except Exception:
return None

def get_default_browser_linux():
try:
# Check the BROWSER environment variable
browser = os.getenv("BROWSER")
if browser:
return browser
# Try using xdg-settings
xdg_browser_command = "xdg-settings get default-web-browser"
browser = os.popen(xdg_browser_command).read().strip()
if browser:
return browser
return "Unknown"
except Exception:
return None

@staticmethod
def get_default_browser():
    """Detect the default browser using the helper for the current OS.

    Declared as a static method because it takes no ``self``; this keeps
    ``WebScraper.get_default_browser()`` working and also allows the call
    through an instance.

    Returns:
        Whatever the platform-specific helper returns for Windows, macOS
        ("Darwin") or Linux, and "Unknown" for any other platform.
    """
    os_name = platform.system()
    if os_name == "Windows":
        return WebScraper.get_default_browser_windows()
    if os_name == "Darwin":  # macOS
        return WebScraper.get_default_browser_mac()
    if os_name == "Linux":
        return WebScraper.get_default_browser_linux()
    return "Unknown"

# web scraper
def scrape(self, url, target_error_message, language_module):
    """Open *url* in the already-initialized webdriver and look for a message.

    Args:
        url: Page to load.
        target_error_message: Text to search for inside the page's text nodes.
        language_module: Object whose ``error11`` attribute is the prefix
            printed when scraping fails.

    Returns:
        "Yes" if at least one element's text contains the message, "No" if
        none does, and None if anything raised while scraping.
    """
    try:
        # Set the log level to suppress webdriver console output
        LOGGER.setLevel(logging.ERROR)

        # Build a valid XPath string literal. XPath 1.0 has no escape
        # character, so a message containing both quote kinds has to be
        # assembled with concat(). Messages without a double quote produce
        # exactly the same expression as before.
        message = str(target_error_message)
        if '"' not in message:
            literal = f'"{message}"'
        elif "'" not in message:
            literal = f"'{message}'"
        else:
            pieces = message.split('"')
            literal = (
                "concat("
                + ", '\"', ".join(f'"{piece}"' for piece in pieces)
                + ")"
            )

        # self.driver resolves to the class-level driver set by __init__.
        driver = self.driver
        driver.get(url)
        driver.implicitly_wait(0.5)
        elements = driver.find_elements(
            By.XPATH, f"//*[contains(text(), {literal})]"
        )
        return "Yes" if elements else "No"
    except Exception as e:
        # Use the language module passed by the caller: the class attribute
        # WebScraper.language_module is never assigned, so reading error11
        # from it would raise AttributeError inside this handler.
        print(f"{language_module.error11}{e}")
        return None

driver.get(url)
driver.implicitly_wait(10)
elements = driver.find_elements(
By.XPATH, f'//*[contains(text(), "{target_error_message}")]'
)
if elements:
there = "Yes"
# line 133 is for dev testing
# print(f"Found the error message: '{target_error_message} {url}'")
return there
else: # f"Error message '{target_error_message}' not found on the page."
there = "No"
# line 138 is for dev testing
# print(f"Error message '{target_error_message}' not found on the page. '{url}'")
return there
except Exception as e:
print(f"{language_module.error11}{e}")
return None

0 comments on commit cce1557

Please sign in to comment.