Skip to content

Commit

Permalink
Merge pull request #98 from Gallaecio/serp
Browse files Browse the repository at this point in the history
Add Serp
  • Loading branch information
kmike authored Sep 4, 2024
2 parents 7cc0936 + c3abb74 commit f93ab09
Show file tree
Hide file tree
Showing 12 changed files with 211 additions and 1 deletion.
3 changes: 3 additions & 0 deletions docs/reference/components.rst
Original file line number Diff line number Diff line change
Expand Up @@ -76,6 +76,9 @@ tied to any specific item type.
.. autoclass:: zyte_common_items.Request(**kwargs)
:members:

.. autoclass:: zyte_common_items.SerpOrganicResult(**kwargs)
:members:

.. autoclass:: zyte_common_items.SocialMediaPostAuthor(**kwargs)
:members:

Expand Down
11 changes: 11 additions & 0 deletions docs/reference/items.rst
Original file line number Diff line number Diff line change
Expand Up @@ -109,6 +109,17 @@ Job posting
:members: dateDownloaded, probability, validationMessages


Search engine results
=====================

.. autoclass:: zyte_common_items.Serp(**kwargs)
:members:
:inherited-members:

.. autoclass:: zyte_common_items.SerpMetadata(**kwargs)
:members: dateDownloaded, displayedQuery, searchedQuery, totalOrganicResults, validationMessages


Social media post
=================

Expand Down
12 changes: 12 additions & 0 deletions docs/reference/pages.rst
Original file line number Diff line number Diff line change
Expand Up @@ -114,6 +114,18 @@ Job posting
.. autoclass:: zyte_common_items.AutoJobPostingPage(**kwargs)
:show-inheritance:

Search engine results
=====================

.. autoclass:: zyte_common_items.BaseSerpPage(**kwargs)
:show-inheritance:

.. autoclass:: zyte_common_items.SerpPage(**kwargs)
:show-inheritance:

.. autoclass:: zyte_common_items.AutoSerpPage(**kwargs)
:show-inheritance:

Social media post
=================

Expand Down
10 changes: 10 additions & 0 deletions tests/test_components.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@
Reactions,
RealEstateArea,
Request,
SerpOrganicResult,
SocialMediaPostAuthor,
SocialMediaPostMetadata,
StarRating,
Expand Down Expand Up @@ -199,6 +200,15 @@ def test_reactions():
Reactions(reposts=1, likes=2, dislikes=3)


def test_serp_organic_result():
    """Check that SerpOrganicResult can be built with all of its fields."""
    all_fields: dict = {
        "description": "used as metasyntactic variables and placeholder names in computer programming or computer-related documentation.",
        "name": "Foobar",
        "url": "https://en.wikipedia.org/wiki/Foobar",
        "rank": 1,
    }
    SerpOrganicResult(**all_fields)


def test_social_media_post_author():
SocialMediaPostAuthor(
numberOfFollowers=5,
Expand Down
51 changes: 51 additions & 0 deletions tests/test_items.py
Original file line number Diff line number Diff line change
Expand Up @@ -49,6 +49,9 @@
RealEstateArea,
RealEstateMetadata,
Request,
Serp,
SerpMetadata,
SerpOrganicResult,
SocialMediaPost,
SocialMediaPostAuthor,
SocialMediaPostMetadata,
Expand Down Expand Up @@ -498,6 +501,30 @@
),
}

# Smallest set of keyword arguments accepted by Serp: only the required URL.
_SERP_MIN_KWARGS: dict = {
    "url": "https://example.com/search?q=foo+bar",
}

# Keyword arguments that populate every Serp field. The required "url" is
# supplied by the _SERP_MIN_KWARGS spread, so it is not repeated here: a
# second literal would silently shadow any later change to the minimal
# fixture.
_SERP_ALL_KWARGS: dict = {
    **_SERP_MIN_KWARGS,
    "organicResults": [
        SerpOrganicResult(
            description="used as metasyntactic variables and placeholder names in computer programming or computer-related documentation.",
            name="Foobar",
            url="https://en.wikipedia.org/wiki/Foobar",
            rank=1,
        ),
    ],
    "pageNumber": 1,
    "metadata": SerpMetadata(
        dateDownloaded="2022-12-31T13:01:54Z",
        displayedQuery="foo bar",
        searchedQuery="foo bar",
        totalOrganicResults=999_999_999_999,
    ),
}

_SOCIAL_MEDIA_POST_MIN_KWARGS: dict = {
"url": "https://example.com/viewjob/12345",
}
Expand Down Expand Up @@ -837,6 +864,28 @@ def test_job_posting_missing_fields():
JobPosting(**incomplete_kwargs)


def test_serp_all_fields():
    """Every value passed to Serp must be readable back from the item."""
    item = Serp(**_SERP_ALL_KWARGS)
    for name, expected in _SERP_ALL_KWARGS.items():
        assert getattr(item, name) == expected


def test_serp_min_fields():
    """Fields left out of the minimal kwargs must be None on the item."""
    item = Serp(**_SERP_MIN_KWARGS)
    omitted = [name for name in _SERP_ALL_KWARGS if name not in _SERP_MIN_KWARGS]
    for name in omitted:
        assert getattr(item, name) is None


def test_serp_missing_fields():
    """Leaving out any required field must make Serp raise TypeError."""
    for omitted in _SERP_MIN_KWARGS:
        # Same mapping as the minimal kwargs, minus the field under test.
        partial_kwargs: dict = {
            key: value for key, value in _SERP_MIN_KWARGS.items() if key != omitted
        }
        with pytest.raises(TypeError):
            Serp(**partial_kwargs)


def test_social_media_post_all_fields():
social_media_post = SocialMediaPost(**_SOCIAL_MEDIA_POST_ALL_KWARGS)
for field in list(_SOCIAL_MEDIA_POST_ALL_KWARGS):
Expand Down Expand Up @@ -874,6 +923,8 @@ def test_social_media_post_missing_fields():
(ProductNavigation, False),
(ProductVariant, False),
(RealEstate, True),
(Serp, False),
(SocialMediaPost, True),
),
)
def test_get_probability_request(cls, has_proba):
Expand Down
7 changes: 7 additions & 0 deletions tests/test_pages.py
Original file line number Diff line number Diff line change
Expand Up @@ -220,6 +220,13 @@ def test_matching_items():
"ProductNavigation": {"dateDownloaded", "validationMessages"},
"RealEstate": {"dateDownloaded", "probability", "validationMessages"},
"JobPosting": {"dateDownloaded", "probability", "searchText", "validationMessages"},
"Serp": {
"dateDownloaded",
"displayedQuery",
"searchedQuery",
"totalOrganicResults",
"validationMessages",
},
"SocialMediaPost": {
"dateDownloaded",
"probability",
Expand Down
2 changes: 1 addition & 1 deletion tox.ini
Original file line number Diff line number Diff line change
Expand Up @@ -74,7 +74,7 @@ commands = mypy zyte_common_items tests
basepython = python3
deps =
twine==5.1.1
build==0.10.0
build==1.2.1
commands =
python -m build --sdist
twine check dist/*
Expand Down
6 changes: 6 additions & 0 deletions zyte_common_items/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -61,6 +61,9 @@
RealEstateMetadata,
SearchRequestTemplate,
SearchRequestTemplateMetadata,
Serp,
SerpMetadata,
SerpOrganicResult,
SocialMediaPost,
SocialMediaPostMetadata,
)
Expand All @@ -77,6 +80,7 @@
AutoProductNavigationPage,
AutoProductPage,
AutoRealEstatePage,
AutoSerpPage,
AutoSocialMediaPostPage,
BaseArticleListPage,
BaseArticleNavigationPage,
Expand All @@ -88,6 +92,7 @@
BaseProductNavigationPage,
BaseProductPage,
BaseRealEstatePage,
BaseSerpPage,
BaseSocialMediaPostPage,
BusinessPlacePage,
HasMetadata,
Expand All @@ -99,5 +104,6 @@
ProductPage,
RealEstatePage,
SearchRequestTemplatePage,
SerpPage,
SocialMediaPostPage,
)
1 change: 1 addition & 0 deletions zyte_common_items/items/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,4 +13,5 @@
SearchRequestTemplate,
SearchRequestTemplateMetadata,
)
from .serp import Serp, SerpMetadata, SerpOrganicResult
from .social_media_post import SocialMediaPost, SocialMediaPostMetadata
69 changes: 69 additions & 0 deletions zyte_common_items/items/serp.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,69 @@
from typing import List, Optional

import attrs

from zyte_common_items.base import Item
from zyte_common_items.components import ListMetadata
from zyte_common_items.converters import (
to_metadata_optional,
url_to_str,
url_to_str_optional,
)


@attrs.define(kw_only=True)
class SerpOrganicResult:
    """Data from a non-paid result of a search engine results page.

    Every field is optional and defaults to ``None``.
    """

    #: Result excerpt.
    description: Optional[str] = None

    #: Result title.
    name: Optional[str] = None

    #: Result URL.
    url: Optional[str] = attrs.field(
        default=None, converter=url_to_str_optional, kw_only=True
    )

    #: Result position among other organic results from the same search engine
    #: results page.
    #:
    #: This is the rank within a specific page, not within an entire search.
    #: That is, the first result of any page, even if it is not the first page
    #: of a search, must be 1.
    rank: Optional[int] = None


@attrs.define(kw_only=True)
class SerpMetadata(ListMetadata):
    """Metadata class for :data:`zyte_common_items.Serp.metadata`.

    Adds the search-specific fields below to those inherited from
    ``ListMetadata``; each of them is optional and defaults to ``None``.
    """

    #: Search query as seen in the webpage.
    displayedQuery: Optional[str] = None

    #: Search query as specified in the input URL.
    searchedQuery: Optional[str] = None

    #: Total number of organic results reported by the search engine.
    totalOrganicResults: Optional[int] = None


@attrs.define(kw_only=True)
class Serp(Item):
    """Data from a `search engine results page
    <https://en.wikipedia.org/wiki/Search_engine_results_page>`_.

    ``url`` is the only field declared without a default; every other field
    defaults to ``None``.
    """

    #: List of search results excluding paid results.
    organicResults: Optional[List[SerpOrganicResult]] = None

    # Declared without a default, so it must always be passed. It may follow a
    # defaulted field only because the class is keyword-only.
    #: Search URL.
    url: str = attrs.field(converter=url_to_str)

    #: Page number.
    pageNumber: Optional[int] = None

    #: Contains metadata about the data extraction process.
    metadata: Optional[SerpMetadata] = attrs.field(
        default=None, converter=to_metadata_optional(SerpMetadata), kw_only=True  # type: ignore[misc]
    )
1 change: 1 addition & 0 deletions zyte_common_items/pages/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@
)
from .real_estate import AutoRealEstatePage, BaseRealEstatePage, RealEstatePage
from .search_request_template import SearchRequestTemplatePage
from .serp import AutoSerpPage, BaseSerpPage, SerpPage
from .social_media_post import (
AutoSocialMediaPostPage,
BaseSocialMediaPostPage,
Expand Down
39 changes: 39 additions & 0 deletions zyte_common_items/pages/serp.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,39 @@
from typing import List, Optional

import attrs
from web_poet import Returns

from zyte_common_items.fields import auto_field
from zyte_common_items.items import Serp, SerpMetadata, SerpOrganicResult

from .base import BasePage, Page
from .mixins import HasMetadata


class BaseSerpPage(BasePage, Returns[Serp], HasMetadata[SerpMetadata]):
    """``BasePage`` subclass declared to return
    :class:`~zyte_common_items.Serp` items, parametrized with
    :class:`~zyte_common_items.SerpMetadata` through ``HasMetadata``."""

    pass


class SerpPage(Page, Returns[Serp], HasMetadata[SerpMetadata]):
    """``Page`` subclass declared to return :class:`~zyte_common_items.Serp`
    items, parametrized with :class:`~zyte_common_items.SerpMetadata` through
    ``HasMetadata``."""

    pass


@attrs.define
class AutoSerpPage(BaseSerpPage):
    """:class:`BaseSerpPage` subclass whose fields return the matching
    attributes of its ``serp`` item unchanged."""

    # Source item that every field below reads from.
    serp: Serp

    @auto_field
    def organicResults(self) -> Optional[List[SerpOrganicResult]]:
        return self.serp.organicResults

    @auto_field
    def url(self) -> str:
        return self.serp.url

    @auto_field
    def pageNumber(self) -> Optional[int]:
        return self.serp.pageNumber

    @auto_field
    def metadata(self) -> Optional[SerpMetadata]:
        return self.serp.metadata

0 comments on commit f93ab09

Please sign in to comment.