Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
15 changes: 13 additions & 2 deletions src/sentry/integrations/utils/issue_summary_for_alerts.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,12 +3,13 @@
from typing import Any

import sentry_sdk
from django.contrib.auth.models import AnonymousUser

from sentry import features, options
from sentry.issues.grouptype import GroupCategory
from sentry.models.group import Group
from sentry.seer.autofix.constants import SeerAutomationSource
from sentry.seer.autofix.issue_summary import get_issue_summary
from sentry.seer.autofix.issue_summary import get_issue_summary, run_automation
from sentry.seer.autofix.utils import is_seer_scanner_rate_limited
from sentry.utils.concurrent import ContextPropagatingThreadPoolExecutor

Expand Down Expand Up @@ -52,9 +53,19 @@ def fetch_issue_summary(group: Group) -> dict[str, Any] | None:
future = executor.submit(
get_issue_summary, group, source=SeerAutomationSource.ALERT
)
summary_result, status_code = future.result(timeout=timeout)
summary_result, status_code, event = future.result(timeout=timeout)

if status_code == 200:
if event is not None:
try:
run_automation(
group, AnonymousUser(), event, SeerAutomationSource.ALERT
)
except Exception:
logger.exception(
"seer.automation.run_automation_failed",
extra={"group_id": group.id},
)
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Alert path runs automation without timeout protection

Medium Severity

The run_automation call, which makes HTTP requests to the Seer API, now runs synchronously on the main thread after the timed issue summary generation. This work is no longer covered by the timeout, allowing slow Seer responses to block the alert notification thread.

Fix in Cursor Fix in Web

Reviewed by Cursor Bugbot for commit 901ed71. Configure here.

return summary_result
return None
except concurrent.futures.TimeoutError:
Expand Down
46 changes: 16 additions & 30 deletions src/sentry/seer/autofix/issue_summary.py
Original file line number Diff line number Diff line change
Expand Up @@ -520,15 +520,13 @@ def _generate_summary(
group: Group,
user: User | RpcUser | AnonymousUser,
force_event_id: str | None,
source: SeerAutomationSource,
cache_key: str,
should_run_automation: bool = True,
) -> tuple[dict[str, Any], int]:
) -> tuple[dict[str, Any], int, GroupEvent | None]:
Comment thread
cursor[bot] marked this conversation as resolved.
"""Core logic to generate and cache the issue summary."""
serialized_event, event = _get_event(group, user, provided_event_id=force_event_id)

if not serialized_event or not event:
return {"detail": "Could not find an event for the issue"}, 400
return {"detail": "Could not find an event for the issue"}, 400, None

trace_tree = None
if event:
Expand Down Expand Up @@ -571,15 +569,7 @@ def _generate_summary(
summary_dict["event_id"] = event.event_id
cache.set(cache_key, summary_dict, timeout=int(timedelta(days=7).total_seconds()))

if should_run_automation:
try:
run_automation(group, user, event, source)
except Exception:
logger.exception(
"Error auto-triggering autofix from issue summary", extra={"group_id": group.id}
)

return summary_dict, 200
return summary_dict, 200, event
Copy link
Copy Markdown
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Moved the automation responsibility to the calling spot, it was getting too complicated tracing all the various paths and implicit behavior.

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We can't remove this here. The old Seer SKU (not per seat orgs) still use this flow.
It is the "old Seer" in this diagram - https://miro.com/app/board/uXjVJqn1-fQ=/?focusWidget=3458764648325594380

Copy link
Copy Markdown
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think that should still be preserved.

def kick_off_seer_automation(job: PostProcessJob) -> None:
  ...
    if not is_seer_seat_based_tier_enabled(group.organization):
        generate_summary_and_run_automation.delay(group.id, trigger_path="old_seer_automation")

def generate_summary_and_run_automation(group_id: int, **kwargs) -> None:
    if status_code == 200 and event is not None:
        run_automation(group, AnonymousUser(), event, SeerAutomationSource.POST_PROCESS)

The generate_summary_and_run_automation() func now explicitly calls run_automation() instead of relying on get_issue_summary() to call it.



def _log_seer_scanner_billing_event(group: Group, source: SeerAutomationSource):
Expand All @@ -604,8 +594,7 @@ def get_issue_summary(
user: User | RpcUser | AnonymousUser | None = None,
force_event_id: str | None = None,
source: SeerAutomationSource = SeerAutomationSource.ISSUE_DETAILS,
should_run_automation: bool = True,
) -> tuple[dict[str, Any], int]:
) -> tuple[dict[str, Any], int, GroupEvent | None]:
Comment thread
cursor[bot] marked this conversation as resolved.
"""
Generate an AI summary for an issue.

Expand All @@ -614,18 +603,17 @@ def get_issue_summary(
user: The user requesting the summary
force_event_id: Optional event ID to force summarizing a specific event
source: The source triggering the summary generation
should_run_automation: Whether to trigger automation after generating summary
Comment thread
cursor[bot] marked this conversation as resolved.

Returns:
A tuple containing (summary_data, status_code)
A tuple containing (summary_data, status_code, event)
"""
if user is None:
user = AnonymousUser()
if not features.has("organizations:gen-ai-features", group.organization, actor=user):
return {"detail": "Feature flag not enabled"}, 400
return {"detail": "Feature flag not enabled"}, 400, None

if group.organization.get_option("sentry:hide_ai_features"):
return {"detail": "AI features are disabled for this organization."}, 403
return {"detail": "AI features are disabled for this organization."}, 403, None

cache_key = get_issue_summary_cache_key(group.id)
lock_key, lock_name = get_issue_summary_lock_key(group.id)
Expand All @@ -634,15 +622,13 @@ def get_issue_summary(

# if force_event_id is set, we always generate a new summary
if force_event_id:
summary_dict, status_code = _generate_summary(
group, user, force_event_id, source, cache_key, should_run_automation
)
summary_dict, status_code, event = _generate_summary(group, user, force_event_id, cache_key)
_log_seer_scanner_billing_event(group, source)
return convert_dict_key_case(summary_dict, snake_to_camel_case), status_code
return convert_dict_key_case(summary_dict, snake_to_camel_case), status_code, event

# 1. Check cache first
if cached_summary := cache.get(cache_key):
return convert_dict_key_case(cached_summary, snake_to_camel_case), 200
return convert_dict_key_case(cached_summary, snake_to_camel_case), 200, None

# 2. Try to acquire lock
try:
Expand All @@ -653,17 +639,17 @@ def get_issue_summary(
# Re-check cache after acquiring lock, in case another process finished
# while we were waiting for the lock.
if cached_summary := cache.get(cache_key):
return convert_dict_key_case(cached_summary, snake_to_camel_case), 200
return convert_dict_key_case(cached_summary, snake_to_camel_case), 200, None

# Lock acquired and cache is still empty, proceed with generation
summary_dict, status_code = _generate_summary(
group, user, force_event_id, source, cache_key, should_run_automation
summary_dict, status_code, event = _generate_summary(
group, user, force_event_id, cache_key
)
_log_seer_scanner_billing_event(group, source)
return convert_dict_key_case(summary_dict, snake_to_camel_case), status_code
return convert_dict_key_case(summary_dict, snake_to_camel_case), status_code, event

except UnableToAcquireLock:
# Failed to acquire lock within timeout. Check cache one last time.
if cached_summary := cache.get(cache_key):
return convert_dict_key_case(cached_summary, snake_to_camel_case), 200
return {"detail": "Timeout waiting for summary generation lock"}, 503
return convert_dict_key_case(cached_summary, snake_to_camel_case), 200, None
return {"detail": "Timeout waiting for summary generation lock"}, 503, None
2 changes: 1 addition & 1 deletion src/sentry/seer/endpoints/group_ai_summary.py
Original file line number Diff line number Diff line change
Expand Up @@ -43,7 +43,7 @@ def post(self, request: Request, group: Group) -> Response:
data = orjson.loads(request.body) if request.body else {}
force_event_id = data.get("event_id", None)

summary_data, status_code = get_issue_summary(
summary_data, status_code, _ = get_issue_summary(
group=group,
user=request.user,
force_event_id=force_event_id,
Expand Down
4 changes: 2 additions & 2 deletions src/sentry/tasks/post_process.py
Original file line number Diff line number Diff line change
Expand Up @@ -1529,7 +1529,7 @@ def kick_off_seer_automation(job: PostProcessJob) -> None:
is_seer_seat_based_tier_enabled,
)
from sentry.tasks.seer.autofix import (
generate_issue_summary_only,
generate_summary_and_fixability_score,
generate_summary_and_run_automation,
run_automation_only_task,
)
Expand All @@ -1555,7 +1555,7 @@ def kick_off_seer_automation(job: PostProcessJob) -> None:
"seer.automation.filtered", tags={"reason": skip_reason, "tier": "seat_based"}
)
if skip_reason == "below_occurrence_threshold":
generate_issue_summary_only.delay(group.id)
generate_summary_and_fixability_score.delay(group.id)
return

# Check if summary exists in cache
Expand Down
17 changes: 11 additions & 6 deletions src/sentry/tasks/seer/autofix.py
Original file line number Diff line number Diff line change
Expand Up @@ -74,7 +74,9 @@ def check_autofix_status(run_id: int, organization_id: int) -> None:
retry=Retry(times=1),
)
def generate_summary_and_run_automation(group_id: int, **kwargs) -> None:
from sentry.seer.autofix.issue_summary import get_issue_summary
from django.contrib.auth.models import AnonymousUser

from sentry.seer.autofix.issue_summary import get_issue_summary, run_automation
Comment on lines +77 to +79
Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

do these imports need to be inlined or can they go to top of file?

Copy link
Copy Markdown
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

post_process.py is very picky about imports which is why this is all inlined.


trigger_path = kwargs.get("trigger_path", "unknown")
sentry_sdk.set_tag("trigger_path", trigger_path)
Expand All @@ -98,7 +100,12 @@ def generate_summary_and_run_automation(group_id: int, **kwargs) -> None:
)
)

get_issue_summary(group=group, source=SeerAutomationSource.POST_PROCESS)
_, status_code, event = get_issue_summary(group=group, source=SeerAutomationSource.POST_PROCESS)
if status_code == 200 and event is not None:
try:
run_automation(group, AnonymousUser(), event, SeerAutomationSource.POST_PROCESS)
except Exception:
logger.exception("seer.automation.run_automation_failed", extra={"group_id": group.id})
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think this issue warden caught is correct.

Copy link
Copy Markdown
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

From what I can tell this issue is present in the old code as well. get_issue_summary() checks:

    # 1. Check cache first
    if cached_summary := cache.get(cache_key):
        return convert_dict_key_case(cached_summary, snake_to_camel_case), 200

which early returns before the automation gets triggered. We can fix if you think this is a bug but trying to keep this PR having no behavior changes.

Copy link
Copy Markdown
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Although to Warden's point maybe this should be named generate_summary_and_maybe_run_automation()

Comment thread
sentry[bot] marked this conversation as resolved.


@instrumented_task(
Expand All @@ -107,7 +114,7 @@ def generate_summary_and_run_automation(group_id: int, **kwargs) -> None:
processing_deadline_duration=35,
retry=Retry(times=3, delay=3, on=(Exception,)),
)
def generate_issue_summary_only(group_id: int) -> None:
def generate_summary_and_fixability_score(group_id: int) -> None:
Copy link
Copy Markdown
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can revert the naming if you find it confusing to not match the task name, but the _only suffix was really confusing to read when it clearly does not only generate the summary lol.

"""
Generate issue summary WITHOUT triggering automation.
Used for triage signals flow when event count < AUTOFIX_AUTOMATION_OCCURRENCE_THRESHOLD or when summary doesn't exist yet.
Expand Down Expand Up @@ -137,9 +144,7 @@ def generate_issue_summary_only(group_id: int) -> None:
)

# Generate and cache the summary
get_issue_summary(
group=group, source=SeerAutomationSource.POST_PROCESS, should_run_automation=False
)
get_issue_summary(group=group, source=SeerAutomationSource.POST_PROCESS)

get_and_update_group_fixability_score(group, force_generate=True)

Expand Down
10 changes: 5 additions & 5 deletions tests/sentry/integrations/slack/test_message_builder.py
Original file line number Diff line number Diff line change
Expand Up @@ -946,7 +946,7 @@ def test_build_group_block_with_ai_summary(self) -> None:
patch(patch_path) as mock_get_summary,
patch(serializer_path, serializer_mock),
):
mock_get_summary.return_value = (mock_summary, 200)
mock_get_summary.return_value = (mock_summary, 200, None)

blocks = SlackIssuesMessageBuilder(group).build()

Expand Down Expand Up @@ -1037,7 +1037,7 @@ def test_build_group_block_with_ai_summary_text_truncation(self) -> None:
patch(patch_path) as mock_get_summary,
patch(serializer_path, serializer_mock),
):
mock_get_summary.return_value = (mock_summary, 200)
mock_get_summary.return_value = (mock_summary, 200, None)
blocks = SlackIssuesMessageBuilder(group1, event1.for_group(group1)).build()
title_text = blocks["blocks"][0]["text"]["text"]

Expand All @@ -1049,7 +1049,7 @@ def test_build_group_block_with_ai_summary_text_truncation(self) -> None:
patch(patch_path) as mock_get_summary,
patch(serializer_path, serializer_mock),
):
mock_get_summary.return_value = (mock_summary, 200)
mock_get_summary.return_value = (mock_summary, 200, None)
blocks = SlackIssuesMessageBuilder(group2, event2.for_group(group2)).build()
title_text = blocks["blocks"][0]["text"]["text"]

Expand Down Expand Up @@ -1092,7 +1092,7 @@ def test_build_group_block_with_ai_summary_text_truncation(self) -> None:
patch(patch_path) as mock_get_summary,
patch(serializer_path, serializer_mock),
):
mock_get_summary.return_value = (mock_summary, 200)
mock_get_summary.return_value = (mock_summary, 200, None)
blocks = SlackIssuesMessageBuilder(group_lb, event_lb.for_group(group_lb)).build()
title_block = blocks["blocks"][0]["text"]["text"]
assert f": {expected_headline_part}*>" in title_block, f"Failed for {name}"
Expand Down Expand Up @@ -1184,7 +1184,7 @@ def test_compact_alerts_with_ai_summary(self) -> None:
patch(patch_path) as mock_get_summary,
patch(serializer_path, serializer_mock),
):
mock_get_summary.return_value = (mock_summary, 200)
mock_get_summary.return_value = (mock_summary, 200, None)

blocks = SlackIssuesMessageBuilder(group).build()

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -67,7 +67,7 @@ def test_fetch_issue_summary_with_hide_ai_features_disabled(
"whatsWrong": "Something went wrong",
"possibleCause": "Test cause",
}
mock_get_issue_summary.return_value = (mock_summary, 200)
mock_get_issue_summary.return_value = (mock_summary, 200, None)

result = fetch_issue_summary(self.group)

Expand Down Expand Up @@ -174,7 +174,7 @@ def test_fetch_issue_summary_api_error(
mock_has_budget.return_value = True

# Mock error response
mock_get_issue_summary.return_value = (None, 500)
mock_get_issue_summary.return_value = (None, 500, None)

result = fetch_issue_summary(self.group)

Expand Down
Loading
Loading