Skip to content

Commit

Permalink
Change the DropLowProbabilityItemPipeline logic for nested items.
Browse files Browse the repository at this point in the history
  • Loading branch information
wRAR committed Dec 24, 2024
1 parent 70f6204 commit 719b841
Showing 1 changed file with 32 additions and 26 deletions.
58 changes: 32 additions & 26 deletions zyte_common_items/pipelines.py
Original file line number Diff line number Diff line change
Expand Up @@ -115,36 +115,42 @@ def get_threshold_for_item(self, item, spider):
# Returns the class name of the given item. The pipeline uses this name as the
# suffix of its per-item-type stats keys, e.g.
# "drop_low_probability_item/processed/<name>".
def get_item_name(self, item):
return item.__class__.__name__

def process_item(self, item, spider):
from scrapy.exceptions import DropItem

if isinstance(item, dict):
# support for custom attrs
for item_type, item_instance in item.items():
if item_type is not CustomAttributes:
real_item = item_instance
break
else:
return item
else:
real_item = item

item_name = self.get_item_name(real_item)
item_proba = real_item.get_probability()
threshold = self.get_threshold_for_item(real_item, spider)

def _process_probability(self, item, threshold):
item_name = self.get_item_name(item)
item_proba = item.get_probability()
self.stats.inc_value("drop_low_probability_item/processed")
self.stats.inc_value(f"drop_low_probability_item/processed/{item_name}")

if item_proba is None or item_proba >= threshold:
self.stats.inc_value("drop_low_probability_item/kept")
self.stats.inc_value(f"drop_low_probability_item/kept/{item_name}")
return item
return True

Check warning on line 126 in zyte_common_items/pipelines.py

View check run for this annotation

Codecov / codecov/patch

zyte_common_items/pipelines.py#L126

Added line #L126 was not covered by tests
else:
self.stats.inc_value("drop_low_probability_item/dropped")
self.stats.inc_value(f"drop_low_probability_item/dropped/{item_name}")
return False

Check warning on line 130 in zyte_common_items/pipelines.py

View check run for this annotation

Codecov / codecov/patch

zyte_common_items/pipelines.py#L128-L130

Added lines #L128 - L130 were not covered by tests

self.stats.inc_value("drop_low_probability_item/dropped")
self.stats.inc_value(f"drop_low_probability_item/dropped/{item_name}")
def process_item(self, item, spider):
from scrapy.exceptions import DropItem

Check warning on line 133 in zyte_common_items/pipelines.py

View check run for this annotation

Codecov / codecov/patch

zyte_common_items/pipelines.py#L133

Added line #L133 was not covered by tests

raise DropItem(
f"This item is dropped since the probability ({item_proba}) "
f"is below the threshold ({threshold}):\n{item!r}"
)
if isinstance(item, dict):

Check warning on line 135 in zyte_common_items/pipelines.py

View check run for this annotation

Codecov / codecov/patch

zyte_common_items/pipelines.py#L135

Added line #L135 was not covered by tests
# for nested items remove sub-items that have low probability
# instead of dropping the whole result
new_item = {}
for item_type, sub_item in item.items():
if item_type is CustomAttributes:
continue
threshold = self.get_threshold_for_item(sub_item, spider)
if self._process_probability(sub_item, threshold):
new_item[item_type] = sub_item
if not new_item:

Check warning on line 145 in zyte_common_items/pipelines.py

View check run for this annotation

Codecov / codecov/patch

zyte_common_items/pipelines.py#L138-L145

Added lines #L138 - L145 were not covered by tests
# everything has been removed
raise DropItem
return new_item

Check warning on line 148 in zyte_common_items/pipelines.py

View check run for this annotation

Codecov / codecov/patch

zyte_common_items/pipelines.py#L147-L148

Added lines #L147 - L148 were not covered by tests
else:
threshold = self.get_threshold_for_item(item, spider)
if self._process_probability(item, threshold):
return item
raise DropItem(

Check warning on line 153 in zyte_common_items/pipelines.py

View check run for this annotation

Codecov / codecov/patch

zyte_common_items/pipelines.py#L150-L153

Added lines #L150 - L153 were not covered by tests
f"This item is dropped since the probability ({item.get_probability()}) "
f"is below the threshold ({threshold}):\n{item!r}"
)

0 comments on commit 719b841

Please sign in to comment.