diff --git a/README.md b/README.md
index 6a88c81..7c67574 100644
--- a/README.md
+++ b/README.md
@@ -1,6 +1,6 @@
 [![Python >= 3.8](https://img.shields.io/badge/python->=3.8-red.svg)](https://www.python.org/downloads/) [![](https://badgen.net/github/release/deedy5/fake_traffic)](https://github.com/deedy5/fake_traffic/releases) [![](https://badge.fury.io/py/fake-traffic.svg)](https://pypi.org/project/fake-traffic)
 # fake_traffic
-Imitating an Internet user by mimicking popular web traffic (internet traffic generator).
+Internet traffic generator.
 
 ---
 ### Install
@@ -20,14 +20,10 @@
 CLI examples:
 ```python3
 # user located in Turkey, who speaks Kurdish and is interested in top stories
 fake_traffic -c tr -l ku-tr -ca h
-# user located in Brazil, who speaks Portuguese and is interested in sports
-fake_traffic -c br -l pt-br -ca s
 # save logs into 'fake_traffic.log'
 fake_traffic -c ru -l ru-ru -ca s -lf
-# define wait times between requests
-fake_traffic -c fr -l fr-fr -ca b -min_w 1 -max_w 100 -lf
 # use non-headless mode
-fake_traffic -c en -l en-us -ca t -nh -lf
+fake_traffic -c en -l en-us -ca t -nh
 ```
 ---
 ### Simple usage
@@ -42,14 +38,12 @@ FakeTraffic(country='US', language='en-US').crawl()
 from fake_traffic import FakeTraffic
 
-ft = FakeTraffic(country='US', language='en-US', category='h', min_wait=1, max_wait=5, headless=True)
+ft = FakeTraffic(country='US', language='en-US', category='h', headless=True)
-"""Imitating an Internet user by mimicking popular web traffic (internet traffic generator).
+"""Internet traffic generator.
 country = country code ISO 3166-1 Alpha-2 code (https://www.iso.org/obp/ui/),
 language = country-language code ISO-639 and ISO-3166 (https://www.fincher.org/Utilities/CountryLanguageList.shtml),
 category = category of interest of a user (defaults to 'h'):
     'all' (all), 'b' (business), 'e' (entertainment), 'm' (health), 's' (sports), 't' (sci/tech), 'h' (top stories);
-min_wait = minimal delay between requests (defaults to 1),
-max_wait = maximum delay between requests (defaults to 10),
 headless = True/False (defaults to True).
 """
 ft.crawl()
 
@@ -91,11 +85,11 @@
 Country | Language | Function |
 France | French | `FakeTraffic(country="FR", language="fr-FR")` |
 Germany | German | `FakeTraffic(country="DE", language="de-DE", category='b')` |
 India | English | `FakeTraffic(country="IN", language="en-IN", category='all')` |
-India | Hindi | `FakeTraffic(country="IN", language="hi-IN", max_wait=10)` |
+India | Hindi | `FakeTraffic(country="IN", language="hi-IN")` |
 Russia | English | `FakeTraffic(country="RU", language="en-US", category='b', headless=False)` |
-Russia | Russian | `FakeTraffic(country="RU", language="ru-RU", min_wait=0.5, max_wait=3)` |
-Brazil | Portuguese | `FakeTraffic(country="BR", language="pt-BR", category='s', threads=2, max_wait=60)` |
+Russia | Russian | `FakeTraffic(country="RU", language="ru-RU")` |
+Brazil | Portuguese | `FakeTraffic(country="BR", language="pt-BR", category='s')` |
 United Kingdom | English | `FakeTraffic(country="GB", language="en-GB")` |
-United States | English | `FakeTraffic(country="US", language="en-US", min_wait=60, max_wait=300)` |
+United States | English | `FakeTraffic(country="US", language="en-US")` |
 United States | Hebrew (Israel) | `FakeTraffic(country="US", language="he-IL")` |
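Side note for reviewers: each CLI example maps one-to-one onto the library API shown in the table just above. A minimal sketch of the first CLI example (TR/ku-TR taken straight from that example):

```python3
from fake_traffic import FakeTraffic

# library equivalent of `fake_traffic -c tr -l ku-tr -ca h`:
# a user located in Turkey, speaking Kurdish, reading top stories
FakeTraffic(country="TR", language="ku-TR", category="h").crawl()
```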
""" ft.crawl() @@ -91,11 +85,11 @@ Country | Language | Function | France | French | `FakeTraffic(country="FR", language="fr-FR")` | Germany | German | `FakeTraffic(country="DE", language="de-DE", category='b')` | India | English | `FakeTraffic(country="IN", language="en-IN", category='all')` | -India | Hindi | `FakeTraffic(country="IN", language="hi-IN", max_wait=10)` | +India | Hindi | `FakeTraffic(country="IN", language="hi-IN")` | Russia | English | `FakeTraffic(country="RU", language="en-US", category='b', headless=False)` | -Russia | Russian | `FakeTraffic(country="RU", language="ru-RU", min_wait=0.5, max_wait=3)` | -Brazil | Portuguese | `FakeTraffic(country="BR", language="pt-BR", category='s', threads=2, max_wait=60)` | +Russia | Russian | `FakeTraffic(country="RU", language="ru-RU")` | +Brazil | Portuguese | `FakeTraffic(country="BR", language="pt-BR", category='s')` | United Kingdom | English | `FakeTraffic(country="GB", language="en-GB")` | -United States | English | `FakeTraffic(country="US", language="en-US", min_wait=60, max_wait=300)` | +United States | English | `FakeTraffic(country="US", language="en-US")` | United States | Hebrew Israel | `FakeTraffic(country="US", language="he-IL")` | diff --git a/fake_traffic/__init__.py b/fake_traffic/__init__.py index df13cb1..14e247f 100644 --- a/fake_traffic/__init__.py +++ b/fake_traffic/__init__.py @@ -1,2 +1,8 @@ +import logging + from .fake_traffic import FakeTraffic -from .version import __version__ \ No newline at end of file +from .version import __version__ + +# A do-nothing logging handler +# https://docs.python.org/3.3/howto/logging.html#configuring-logging-for-a-library +logging.getLogger("fake_traffic").addHandler(logging.NullHandler()) diff --git a/fake_traffic/cli.py b/fake_traffic/cli.py index bbe3777..751d0bd 100644 --- a/fake_traffic/cli.py +++ b/fake_traffic/cli.py @@ -4,9 +4,7 @@ from .fake_traffic import FakeTraffic -parser = argparse.ArgumentParser( - description="fake_traffic. Imitating an Internet user by mimicking popular web traffic (internet traffic generator)." -) +parser = argparse.ArgumentParser(description="Internet traffic generator") parser.add_argument( "-c", "--country", @@ -29,20 +27,6 @@ choices=["all", "b", "e", "m", "s", "t", "h"], required=False, ) -parser.add_argument( - "-min_w", - "--min_wait", - default=1, - help="default=1. Minimum wait time between requests.", - required=False, -) -parser.add_argument( - "-max_w", - "--max_wait", - default=10, - help="default=10. Maximum wait time between requests.", - required=False, -) parser.add_argument( "-nh", "--no-headless", @@ -51,16 +35,9 @@ help="Run the browser in non-headless mode", required=False, ) -parser.add_argument( - "-ll", - "--logging_level", - default="INFO", - help="logging level. 
default=INFO", - required=False, -) parser.add_argument( "-lf", - "--logging_file", + "--logfile", action="store_true", help="save the log into 'fake_traffic.log'", required=False, @@ -69,12 +46,12 @@ # logging logging.basicConfig( - level=args.logging_level, + level=logging.INFO, format="%(asctime)s | %(levelname)s | %(message)s", datefmt="%Y-%m-%d %H:%M:%S", force=True, handlers=[logging.FileHandler("fake_traffic.log"), logging.StreamHandler()] - if args.logging_file + if args.logfile else [logging.StreamHandler()], ) @@ -82,7 +59,7 @@ language_split = args.language.split("-") language = f"{language_split[0]}-{language_split[1].upper()}" logging.info( - f"Run crawl with: {country=}, {language=}, category={args.category} min_w={args.min_wait}, max_w={args.max_wait}, headless={args.headless}, logging_level={args.logging_level}, logging_file={args.logging_file}" + f"Run crawl with: {country=}, {language=}, category={args.category}, headless={args.headless}, logfile={args.logfile}" ) @@ -90,8 +67,6 @@ country=country, language=language, category=args.category, - min_wait=int(args.min_wait), - max_wait=int(args.max_wait), headless=args.headless, ) fake_traffic.crawl() diff --git a/fake_traffic/fake_traffic.py b/fake_traffic/fake_traffic.py index 6ca2da4..ec32151 100644 --- a/fake_traffic/fake_traffic.py +++ b/fake_traffic/fake_traffic.py @@ -1,12 +1,12 @@ +import asyncio import logging import subprocess -from collections import deque -from random import choice, randint, shuffle, uniform -from time import sleep -from urllib.parse import urljoin -from playwright.sync_api import sync_playwright -from playwright_stealth import stealth_sync +from playwright.async_api import async_playwright +from playwright_stealth import stealth_async + +logger = logging.getLogger("__name__") +SEMAPHORE = asyncio.Semaphore(5) # playwright install chromium @@ -17,66 +17,7 @@ capture_output=True, text=True, ) -logging.info(res.stdout) - -BLACKLIST = ( - ".cs", - ".css", - ".gif", - ".ico", - ".iso", - ".jpeg", - ".jpg", - ".js", - ".json", - ".png", - ".svg", - ".xml", - "/auth/", - "/authorize?", - "/captcha", - "/chat", - "/click", - "/feed?", - "/help", - "/join?", - "/joinchat", - "/privacy", - "/registration", - "/share", - "/showcaptcha", - "/stat/", - "/support", - "/terms", - "/tos", - "/tweet", - "Login", - "Special:", - "_click_", - "bit.ly", - "clickserve", - "https://t.co", - "itunes.apple.com", - "javascript:", - "l.facebook.com", - "legal.twitter.com", - "login", - "mail.", - "mailto:", - "mediawiki", - "messenger.com", - "policies", - "s.click", - "showcaptcha?", - "signup", - "smart-captcha/", - "support.", - "t.umblr.com", - "tel:", - "tg://", - "whatsapp://", - "zendesk", -) +logger.info(res.stdout) class FakeTraffic: @@ -85,133 +26,84 @@ def __init__( country="US", language="en-US", category="h", - min_wait=1, - max_wait=10, headless=True, ): - """Imitating an Internet user by mimicking popular web traffic (internet traffic generator). + """Internet traffic generator. country = country code ISO 3166-1 Alpha-2 code (https://www.iso.org/obp/ui/), language = country-language code ISO-639 and ISO-3166 (https://www.fincher.org/Utilities/CountryLanguageList.shtml), category = category of interest of a user (defaults to 'h'): 'all' (all), 'b' (business), 'e' (entertainment), 'm' (health), 's' (sports), 't' (sci/tech), 'h' (top stories); - min_wait = minimal delay between requests (defaults to 1), - max_wait = maximum delay between requests (defaults to 10), headless = True/False (defaults to True). 
""" self.country = country self.language = language self.category = category - self.min_wait = min_wait - self.max_wait = max_wait self.headless = headless - self.urls_queue = deque() - self.trends = set() - self.page = self.initialize_browser() - - @staticmethod - def url_in_blacklist(url): - if any(x in url for x in BLACKLIST): - logging.info(f"{url}, STATUS: in BLACKLIST") - return True + self.browser = None - @staticmethod - def url_fix(url): - if "https://" not in url and "http://" not in url: - url = f"https://{url}" - url = url.split("#")[0].split("?")[0] - return url + async def abrowse(self, url): + async with SEMAPHORE: + page = await self.browser.new_page() + await stealth_async(page) + try: + resp = await page.goto(url, wait_until="load") + logger.info(f"{resp.status} {resp.url}") + except Exception as ex: + logger.warning(f"{type(ex).__name__}: {ex}") + await page.close() - def initialize_browser(self): - """Initialize browser""" - try: - p = sync_playwright().__enter__() - browser = p.chromium.launch( + async def acrawl(self): + async with async_playwright() as p: + browser = await p.chromium.launch( args=["--disable-blink-features=AutomationControlled"], headless=self.headless, - slow_mo=100, ) - context = browser.new_context( + context = await browser.new_context( locale=self.language, viewport={"width": 1920, "height": 1080}, ) - page = context.new_page() - stealth_sync(page) - return page - except Exception as ex: - logging.warning(f"{type(ex).__name__}: {ex}") - - def get_url(self, url): - url = self.url_fix(url) - if not self.url_in_blacklist(url): - try: - resp = self.page.goto(url, wait_until="load") - logging.info(f"{resp.url} {resp.status}") - return self.page - except Exception as ex: - logging.warning(f"{url} {type(ex).__name__}: {ex}") - - def google_search(self, query): - self.page.goto("https://www.google.com") - self.page.fill('textarea[name="q"]', query) - self.page.press('textarea[name="q"]', "Enter") - self.page.wait_for_load_state("load") - result_urls = self.page.query_selector_all( - "//div[starts-with(@class, 'g ')]//span/a[@href]" - ) - result_urls = [link.get_attribute("href") for link in result_urls] - logging.info(f"google_search() {query=} GOT {len(result_urls)} results") - return result_urls - - def google_trends(self): - url = f"https://trends.google.com/trends/trendingsearches/realtime?geo={self.country}&hl={self.language}&category={self.category}" - self.page.goto(url, wait_until="load") - elements = self.page.query_selector_all("//div[@class='title']") - trends = [x for e in elements for x in e.inner_text().split(" • ")] - logging.info(f"google_trends() GOT {len(trends)} trends") - - for e in elements: - e.click() - self.page.wait_for_selector("//div[@class='carousel-wrapper']") - related_urls_elements = self.page.query_selector_all("//div[@class='carousel-wrapper']//a") - related_urls = [link.get_attribute("href") for link in related_urls_elements] - self.urls_queue.extend(related_urls) - return trends - - def parse_urls(self, page, base_url): - try: - elements = page.query_selector_all("a") - urls = [ - urljoin(base_url, x) for e in elements if (x := e.get_attribute("href")) - ] - return urls - except Exception as ex: - logging.warning(f"parse_urls() {type(ex).__name__}: {ex}") - return [] - - def recursive_browse(self, url, depth): - if depth: - resp = self.get_url(url) - if resp: - urls = self.parse_urls(resp, resp.url) - if urls: - url = choice(urls) - sleep(uniform(self.min_wait, self.max_wait)) - self.recursive_browse(url, depth - 1) + 
diff --git a/fake_traffic/version.py b/fake_traffic/version.py
index 5a6bc65..dac7778 100644
--- a/fake_traffic/version.py
+++ b/fake_traffic/version.py
@@ -1 +1 @@
-__version__ = "2.0.0"
\ No newline at end of file
+__version__ = "2.1.0"
\ No newline at end of file
diff --git a/pyproject.toml b/pyproject.toml
index b1f5721..29dd4f0 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -4,11 +4,11 @@ build-backend = "setuptools.build_meta"
 
 [project]
 name = "fake_traffic"
-description = "Imitating an Internet user by mimicking popular web traffic."
+description = "Internet traffic generator."
 readme = "README.md"
 requires-python = ">=3.8"
 license = {text = "MIT License"}
-keywords = ["python", "traffic generator"]
+keywords = ["python", "traffic generator", "fake traffic"]
 authors = [
     {name = "deedy5"}
 ]
@@ -33,7 +33,7 @@ dependencies = [
 dynamic = ["version"]
 
 [project.urls]  # Optional
-"Homepage" = "https://github.com/deedy5/duckduckgo_search"
+"Homepage" = "https://github.com/deedy5/fake_traffic"
 
 [project.scripts]
 fake_traffic = "fake_traffic.cli:fake_traffic"
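Since `[project.scripts]` already exposes `fake_traffic.cli:fake_traffic` as a console script, the version bump ships as usual; assuming a standard PyPI release, installation and a quick smoke test look like:

```python3
pip install -U fake_traffic
fake_traffic -c us -l en-us -ca h
```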