Skip to content

Commit

Permalink
bug 1909875: improve CrashIdsFailedToPublish information
Browse files Browse the repository at this point in the history
When a user uses the Reprocessing API, they send some number of crash
ids to be reprocessed. If something happens when publishing the crash
ids to PubSub, the crash id is added to a "failed" list and the
exception is logged. Then at the end if there were any failures, a
CrashIdsFailedToPublish exception is thrown with the list of failures.

That causes Django to return an HTTP 500 to the user and sentry captures
the CrashIdsFailedToPublish exception, but the event isn't very
helpful--we have no idea what happened.

This attempts to alleviate that in a couple of ways:

1. It uses sentry_sdk.capture_exception() to capture each problem
   enountered when publishing to PubSub.

   We'll be able to see what's going on in Sentry events.

2. It keeps track of the errors along with the crash ids and wraps that
   in the CrashIdsFailedToPublish exception message so we're not ending
   up with just a list of crash ids--we get the errors, too.
  • Loading branch information
willkg committed Oct 30, 2024
1 parent a1e14a5 commit 654c7ff
Show file tree
Hide file tree
Showing 2 changed files with 36 additions and 7 deletions.
12 changes: 6 additions & 6 deletions socorro/external/pubsub/crashqueue.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
from google.cloud.pubsub_v1 import PublisherClient, SubscriberClient
from google.cloud.pubsub_v1.types import BatchSettings, PublisherOptions
from more_itertools import chunked
import sentry_sdk

from socorro.external.crashqueue_base import CrashQueueBase

Expand Down Expand Up @@ -260,15 +261,14 @@ def publish(self, queue, crash_ids):
for i, future in enumerate(futures):
try:
future.result()
except Exception:
except Exception as exc:
sentry_sdk.capture_exception(exc)
logger.exception(
"Crashid failed to publish: %s %s",
"crashid failed to publish: %s %s",
queue,
batch[i],
)
failed.append(batch[i])
failed.append((repr(exc), batch[i]))

if failed:
raise CrashIdsFailedToPublish(
f"Crashids failed to publish: {','.join(failed)}"
)
raise CrashIdsFailedToPublish(f"Crashids failed to publish: {failed!r}")
31 changes: 30 additions & 1 deletion socorro/tests/external/pubsub/test_crashqueue.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,9 +8,11 @@

import pytest

from socorro import settings
from socorro.external.pubsub.crashqueue import CrashIdsFailedToPublish
from socorro.libclass import build_instance_from_settings
from socorro.lib.libooid import create_new_ooid
from socorro import settings


# Amount of time to sleep between publish and pull so messages are available
PUBSUB_DELAY_PULL = 0.5
Expand Down Expand Up @@ -108,3 +110,30 @@ def test_publish_many(self, pubsub_helper, queue):

published_crash_ids = pubsub_helper.get_published_crashids(queue)
assert set(published_crash_ids) == {crash_id_1, crash_id_2, crash_id_3}

def test_publish_with_error(self, pubsub_helper, sentry_helper):
queue = "reprocessing"
crash_id = create_new_ooid()

crashqueue = build_instance_from_settings(settings.QUEUE_PUBSUB)

# Run teardown_queues in the helper so there's no queue. That will cause an
# error to get thrown by PubSub.
pubsub_helper.teardown_queues()

with sentry_helper.init() as sentry_client:
try:
crashqueue.publish(queue, [crash_id])
except CrashIdsFailedToPublish as exc:
print(exc)

# wait for published messages to become available before pulling
time.sleep(PUBSUB_DELAY_PULL)

(envelope,) = sentry_client.envelope_payloads
errors = [
f"{error['type']} {error['value']}"
for error in envelope["exception"]["values"]
]

assert "NotFound Topic not found" in errors

0 comments on commit 654c7ff

Please sign in to comment.