Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Close inactive contexts #206

Open
wants to merge 36 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
36 commits
Select commit Hold shift + click to select a range
c297346
scrapy_playwright._utils module
elacuesta Feb 9, 2023
43710fb
PLAYWRIGHT_CLOSE_INACTIVE_CONTEXT_INTERVAL setting
elacuesta Feb 9, 2023
92ad7f1
Simplify logging
elacuesta Feb 10, 2023
863d2b4
Simplify setting processing
elacuesta Feb 10, 2023
bf4fbf2
Readme: add PLAYWRIGHT_CLOSE_INACTIVE_CONTEXT_INTERVAL
elacuesta Feb 10, 2023
78fcc5b
Merge branch 'main' into close-inactive-contexts
elacuesta Feb 10, 2023
95b1c5e
Reading PLAYWRIGHT_CLOSE_INACTIVE_CONTEXT_INTERVAL setting
elacuesta Feb 17, 2023
a4ff860
Rename setting
elacuesta Feb 19, 2023
bd17f9c
Test closing inactive contexts
elacuesta Feb 19, 2023
ef0be70
Remove unused import
elacuesta Feb 19, 2023
e18cd70
Rename context lock
elacuesta Feb 19, 2023
f14e751
Update close inactive context test
elacuesta Feb 19, 2023
690a207
Waiting event
elacuesta Feb 19, 2023
2f543af
Removed unused _async_delay helper
elacuesta Feb 19, 2023
6fdc229
Remove irrelevant comment in tests
elacuesta Feb 19, 2023
5acff74
Only attempt to close context if it's still working
elacuesta Jul 13, 2023
8035e27
Merge remote-tracking branch 'origin/main' into close-inactive-contexts
elacuesta Aug 7, 2023
c779e19
Merge remote-tracking branch 'origin/main' into close-inactive-contexts
elacuesta Aug 16, 2023
65104db
Adapt test for caplog fixture as instance attribute
elacuesta Aug 16, 2023
d97c6f3
Merge branch 'main' into close-inactive-contexts
elacuesta Aug 24, 2023
01649ce
Rename util method, add tests
elacuesta Aug 24, 2023
a609ae5
except Exception
elacuesta Aug 25, 2023
12eb537
Merge remote-tracking branch 'origin/main' into close-inactive-contexts
elacuesta Aug 29, 2023
a1a040d
Test zero value
elacuesta Aug 29, 2023
baf4f57
Merge remote-tracking branch 'origin/main' into close-inactive-contexts
elacuesta Nov 29, 2023
3dd74c9
Make black & flake8 happy
elacuesta Nov 29, 2023
a87e0bb
Merge remote-tracking branch 'origin/main' into close-inactive-contexts
elacuesta Dec 9, 2023
1b22c81
Merge remote-tracking branch 'origin/main' into close-inactive-contexts
elacuesta Dec 11, 2023
77b6721
Merge remote-tracking branch 'origin/main' into close-inactive-contexts
elacuesta Jun 14, 2024
53365c3
Merge remote-tracking branch 'origin/main' into close-inactive-contexts
elacuesta Jun 21, 2024
98ba7bf
Remove pytest.mark.asyncio
elacuesta Jun 21, 2024
27c4509
Merge remote-tracking branch 'origin/main' into close-inactive-contexts
elacuesta Jun 28, 2024
996105c
Remove experimental notes from the readme
elacuesta Jun 28, 2024
5ed0764
Allow Windows test for closing inactive contexts
elacuesta Jun 28, 2024
b1079d7
Merge remote-tracking branch 'origin/main' into close-inactive-contexts
elacuesta Jul 3, 2024
891c95b
Merge remote-tracking branch 'origin/main' into close-inactive-contexts
elacuesta Jul 11, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
17 changes: 17 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -345,6 +345,17 @@ See the [notes about leaving unclosed pages](#receiving-page-objects-in-callback
PLAYWRIGHT_MAX_PAGES_PER_CONTEXT = 4
```

### `PLAYWRIGHT_CLOSE_CONTEXT_INTERVAL`
Type `Optional[float]`, default `None`

If set to a non-zero value, browser contexts will be automatically closed after
spending the specified number of seconds without open pages. Set to `None`
(the default) to disable, i.e. contexts remain open until explicitly closed.

```python
PLAYWRIGHT_CLOSE_CONTEXT_INTERVAL = 5 * 60 # 5 minutes
```

### `PLAYWRIGHT_ABORT_REQUEST`
Type `Optional[Union[Callable, str]]`, default `None`

Expand Down Expand Up @@ -692,6 +703,12 @@ yield scrapy.Request(
Please note that if a context with the specified name already exists,
that context is used and `playwright_context_kwargs` are ignored.

### Automatically closing inactive contexts

Setting [`PLAYWRIGHT_CLOSE_CONTEXT_INTERVAL`](#playwright_close_context_interval)
to a non-zero number of seconds enables automatically closing browser contexts
that have had no open pages for that long.

### Closing contexts while crawling

After [receiving the Page object in your callback](#receiving-page-objects-in-callbacks),
Expand Down
37 changes: 35 additions & 2 deletions scrapy_playwright/handler.py
Original file line number Diff line number Diff line change
Expand Up @@ -62,8 +62,10 @@
@dataclass
class BrowserContextWrapper:
context: BrowserContext
semaphore: asyncio.Semaphore
persistent: bool
semaphore: asyncio.Semaphore # limit amount of pages
inactive: asyncio.Event
waiting_close: asyncio.Event


@dataclass
Expand Down Expand Up @@ -92,6 +94,7 @@ class Config:
startup_context_kwargs: dict
navigation_timeout: Optional[float]
restart_disconnected_browser: bool
close_context_interval: Optional[float]

@classmethod
def from_settings(cls, settings: Settings) -> "Config":
Expand All @@ -115,6 +118,9 @@ def from_settings(cls, settings: Settings) -> "Config":
restart_disconnected_browser=settings.getbool(
"PLAYWRIGHT_RESTART_DISCONNECTED_BROWSER", default=True
),
close_context_interval=_get_float_setting(
settings, "PLAYWRIGHT_CLOSE_CONTEXT_INTERVAL"
),
)
cfg.cdp_kwargs.pop("endpoint_url", None)
cfg.connect_kwargs.pop("ws_endpoint", None)
Expand Down Expand Up @@ -264,12 +270,34 @@ async def _create_browser_context(
context.set_default_navigation_timeout(self.config.navigation_timeout)
self.context_wrappers[name] = BrowserContextWrapper(
context=context,
semaphore=asyncio.Semaphore(value=self.config.max_pages_per_context),
persistent=persistent,
semaphore=asyncio.Semaphore(value=self.config.max_pages_per_context),
inactive=asyncio.Event(),
waiting_close=asyncio.Event(),
)
if self.config.close_context_interval is not None:
asyncio.create_task(self._maybe_close_inactive_context(name=name, spider=spider))
self._set_max_concurrent_context_count()
return self.context_wrappers[name]

async def _maybe_close_inactive_context(
    self, name: str, spider: Optional[Spider] = None
) -> None:
    """Close the context called ``name`` once it has had no pages for a while.

    Meant to run as a background task watching a single context. On each
    cycle it waits for the wrapper's ``inactive`` event, sets ``waiting_close``
    and sleeps for ``self.config.close_context_interval`` seconds. The context
    is closed only if, after the sleep, ``waiting_close`` is still set and the
    context has no pages. The task ends after closing the context, or as soon
    as the wrapper it was watching is no longer the one registered under
    ``name`` in ``self.context_wrappers``.

    Args:
        name: key of the context in ``self.context_wrappers``.
        spider: only passed along in the ``extra`` dict of the log record.
    """
    while name in self.context_wrappers:
        context_wrapper = self.context_wrappers[name]
        await context_wrapper.inactive.wait()
        context_wrapper.waiting_close.set()
        await asyncio.sleep(self.config.close_context_interval)  # type: ignore [arg-type]
        if self.context_wrappers.get(name) is not context_wrapper:
            # The wrapper was removed (or replaced by a new context with the
            # same name) while sleeping: do not log about, or close again,
            # a context that is no longer tracked under this name.
            break
        if context_wrapper.waiting_close.is_set() and not context_wrapper.context.pages:
            logger.info(
                "[Context=%s] Closing inactive browser context",
                name,
                extra={"spider": spider, "context_name": name},
            )
            await context_wrapper.context.close()
            break

async def _create_page(self, request: Request, spider: Spider) -> Page:
"""Create a new page in a context, also creating a new context if necessary."""
context_name = request.meta.setdefault("playwright_context", DEFAULT_CONTEXT_NAME)
Expand All @@ -285,6 +313,8 @@ async def _create_page(self, request: Request, spider: Spider) -> Page:
)

await ctx_wrapper.semaphore.acquire()
ctx_wrapper.inactive.clear()
ctx_wrapper.waiting_close.clear()
page = await ctx_wrapper.context.new_page()
self.stats.inc_value("playwright/page_count")
total_page_count = self._get_total_page_count()
Expand Down Expand Up @@ -339,6 +369,7 @@ def close(self) -> Deferred:
_ThreadedLoopAdapter.stop()

async def _close(self) -> None:
logger.info("Closing %i contexts", len(self.context_wrappers))
await asyncio.gather(*[ctx.context.close() for ctx in self.context_wrappers.values()])
self.context_wrappers.clear()
if hasattr(self, "browser"):
Expand Down Expand Up @@ -626,6 +657,8 @@ def _make_close_page_callback(self, context_name: str) -> Callable:
def close_page_callback() -> None:
    """Release the context's page slot; mark the context inactive if no pages remain."""
    wrapper = self.context_wrappers.get(context_name)
    if wrapper is None:
        # The context is not registered (anymore): nothing to release.
        return
    wrapper.semaphore.release()
    if not wrapper.context.pages:
        wrapper.inactive.set()

return close_page_callback

Expand Down
35 changes: 34 additions & 1 deletion tests/tests_asyncio/test_browser_contexts.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
import asyncio
import logging
import platform
import tempfile
from pathlib import Path
Expand All @@ -11,10 +12,15 @@
from scrapy_playwright.page import PageMethod

from tests import allow_windows, make_handler
from tests.mockserver import StaticMockServer
from tests.mockserver import MockServer, StaticMockServer


class MixinTestCaseMultipleContexts:
@pytest.fixture(autouse=True)
def inject_fixtures(self, caplog):
    """Store pytest's ``caplog`` fixture on the instance as ``self._caplog``.

    The capture level is set to DEBUG. The fixture is ``autouse``, so it is
    applied without being requested explicitly.
    """
    caplog.set_level(logging.DEBUG)
    self._caplog = caplog

@allow_windows
async def test_context_kwargs(self):
settings_dict = {
Expand Down Expand Up @@ -224,6 +230,33 @@ async def test_contexts_dynamic(self):
assert cookie["value"] == "qwerty"
assert cookie["domain"] == "example.org"

@allow_windows
async def test_close_inactive_context(self):
    """Check that a context is closed after the configured interval.

    With ``PLAYWRIGHT_CLOSE_CONTEXT_INTERVAL`` set to 0.5 seconds, the test
    asserts that one context wrapper exists after the first request, and
    that none remain 0.7 seconds after a second request has finished. It
    also checks for the INFO log record announcing the close.
    """
    spider = Spider("foo")
    async with make_handler(
        {
            "PLAYWRIGHT_BROWSER_TYPE": self.browser_type,
            "PLAYWRIGHT_CLOSE_CONTEXT_INTERVAL": 0.5,
        }
    ) as handler:
        # No context exists before the first request.
        assert len(handler.context_wrappers) == 0
        with MockServer() as server:
            await handler._download_request(
                Request(server.urljoin("/headers"), meta={"playwright": True}), spider
            )
            assert len(handler.context_wrappers) == 1
            # Sleep for less than the 0.5 s interval before issuing another request.
            await asyncio.sleep(0.3)
            # NOTE(review): "/delay/1" presumably makes the mock server respond
            # after about one second - confirm against tests.mockserver.
            await handler._download_request(
                Request(server.urljoin("/delay/1"), meta={"playwright": True}), spider
            )
            # Sleep for longer than the 0.5 s interval with no further requests.
            await asyncio.sleep(0.7)
            assert len(handler.context_wrappers) == 0
        assert (
            "scrapy-playwright",
            logging.INFO,
            "[Context=default] Closing inactive browser context",
        ) in self._caplog.record_tuples


class TestCaseMultipleContextsChromium(IsolatedAsyncioTestCase, MixinTestCaseMultipleContexts):
browser_type = "chromium"
Expand Down
Loading