Proxy support and Firefox as default engine

raznem · Aug 16, 2024 · 1b594ae
1 parent 9deb134
commit 1b594ae
Show file tree

Hide file tree

Showing 3 changed files with 36 additions and 9 deletions.
diff --git a/parsera/main.py b/parsera/main.py
@@ -14,16 +14,27 @@ def __init__(self, model: BaseChatModel | None = None):
         else:
             self.model = model
 
-    async def _run(self, url: str, elements: dict) -> dict:
-        content = await fetch_page_content(url=url)
+    async def _run(
+        self, url: str, elements: dict, proxy_settings: dict | None = None
+    ) -> dict:
+        if proxy_settings:
+            content = await fetch_page_content(url=url, proxy_settings=proxy_settings)
+        else:
+            content = await fetch_page_content(url=url)
         extractor = TabularExtractor(
             elements=elements, model=self.model, content=content
         )
         result = await extractor.run()
         return result
 
-    def run(self, url: str, elements: dict) -> dict:
-        return asyncio.run(self._run(url=url, elements=elements))
+    def run(self, url: str, elements: dict, proxy_settings: dict | None = None) -> dict:
+        return asyncio.run(
+            self._run(url=url, elements=elements, proxy_settings=proxy_settings)
+        )
 
-    async def arun(self, url: str, elements: dict) -> dict:
-        return await self._run(url=url, elements=elements)
+    async def arun(
+        self, url: str, elements: dict, proxy_settings: dict | None = None
+    ) -> dict:
+        return await self._run(
+            url=url, elements=elements, proxy_settings=proxy_settings
+        )
diff --git a/parsera/page.py b/parsera/page.py
@@ -1,11 +1,27 @@
+from typing import TypedDict
+
 from playwright.async_api import async_playwright
 from playwright_stealth import stealth_async
 
 
-async def fetch_page_content(url: str) -> str:
+class ProxySettings(TypedDict, total=False):
+    server: str
+    bypass: str | None = None
+    username: str | None = None
+    password: str | None = None
+
+
+async def fetch_page_content(
+    url: str,
+    proxy_settings: ProxySettings | None = None,
+    browser: str = "firefox",
+) -> str:
     async with async_playwright() as p:
         # Launch the browser
-        browser = await p.chromium.launch(headless=True)
+        if browser == "firefox":
+            browser = await p.firefox.launch(headless=True, proxy=proxy_settings)
+        else:
+            browser = await p.chromium.launch(headless=True, proxy=proxy_settings)
         # Open a new browser context
         context = await browser.new_context()
         # Open a new page

diff --git a/pyproject.toml b/pyproject.toml
@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "parsera"
-version = "0.1.2"
+version = "0.1.3"
 description = "Lightweight library for scraping web-sites with LLMs"
 authors = ["Mikhail Zanka <raznem@gmail.com>"]
 license = "GPL-2.0-or-later"