Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
22 changes: 22 additions & 0 deletions cloud_pipelines_backend/instrumentation/error_normalization.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,16 +9,30 @@
import json
import re

try:
from ..launchers.interfaces import LauncherError as _LauncherError

_LAUNCHER_ERROR_AVAILABLE = True
except ImportError:
_LauncherError = None # type: ignore[assignment,misc]
_LAUNCHER_ERROR_AVAILABLE = False

# Pod-style names: a `task-`, `tangle-`, or `tangle-ce-` prefix followed by two
# hyphen-separated alphanumeric segments (e.g. `task-abc-xyz`).
_POD_NAME_PATTERN = re.compile(r"(?:task|tangle(?:-ce)?)-[a-zA-Z0-9]+-[a-zA-Z0-9]+")
# Default-style object reprs of the form `<... object at 0x...>`.
_OBJECT_REPR_PATTERN = re.compile(r"<[^>]+ object at 0x[0-9a-fA-F]+>")
# Standalone `0x`-prefixed hexadecimal literals, delimited by word boundaries.
_HEX_ADDRESS_PATTERN = re.compile(r"\b0x[0-9a-fA-F]+\b")
# 8-4-4-4-12 hexadecimal UUID strings, matched case-insensitively.
_UUID_PATTERN = re.compile(
r"[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}", re.IGNORECASE
)
# Any unbroken run of 16 or more ASCII letters/digits, delimited by word boundaries.
_LONG_ALNUM_ID_PATTERN = re.compile(r"\b[a-zA-Z0-9]{16,}\b")
# Matches from the first `{` that is followed (after optional whitespace) by a
# single or double quote, through the end of the string; DOTALL lets the match
# span newlines.
# Both the embedded dict/JSON literal and any trailing message text are replaced
# with `{...}` — the greedy match is intentional: anything after a runtime-data
# dict in an error message is typically also variable and should not affect grouping.
_JSON_OBJECT_PATTERN = re.compile(r"\{\s*['\"].*", re.DOTALL)


def _strip_generic(*, message: str) -> str:
message = _JSON_OBJECT_PATTERN.sub("{...}", message)

Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This should replace the text matched by _JSON_OBJECT_PATTERN with the literal {...} (braces included), not just ..., correct?

Copy link
Copy Markdown
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

🤖 Yes — the pattern replaces the matched span with {...}, which includes the opening {. So the result is {...} not ....

Also improved the pattern to handle optional whitespace after { (e.g. { "key": ...}) and updated the comment to document that the greedy match intentionally strips everything from the first dict/JSON literal to end-of-string.

message = _OBJECT_REPR_PATTERN.sub("{object}", message)
message = _HEX_ADDRESS_PATTERN.sub("{addr}", message)
message = _UUID_PATTERN.sub("{uuid}", message)
Expand Down Expand Up @@ -85,13 +99,21 @@ def _normalize_orchestrator_error(*, exception: BaseException) -> str | None:
return f"OrchestratorError: {message}"


def _normalize_launcher_error(*, exception: BaseException) -> str | None:
    """Build the grouping string for a ``LauncherError``.

    Args:
        exception: The exception being normalized.

    Returns:
        ``"LauncherError: <message>"`` where any embedded dict/JSON literal
        (and everything after it) has been collapsed to ``{...}`` and the
        surrounding whitespace trimmed; ``None`` when ``LauncherError`` could
        not be imported or *exception* is not an instance of it.
    """
    # The availability flag is checked first so that isinstance() is never
    # called with the ``None`` placeholder used when the import failed.
    is_launcher_error = _LAUNCHER_ERROR_AVAILABLE and isinstance(
        exception, _LauncherError
    )
    if not is_launcher_error:
        return None

    raw_text = str(exception)
    collapsed = _JSON_OBJECT_PATTERN.sub("{...}", raw_text)
    return f"LauncherError: {collapsed.strip()}"


def normalize_error_message(*, exception: BaseException) -> str:
"""Return a stable normalized string for error grouping."""
for normalizer in (
_normalize_k8s_api_exception,
_normalize_max_retry_error,
_normalize_unicode_decode_error,
_normalize_orchestrator_error,
_normalize_launcher_error,
):
result = normalizer(exception=exception)
if result is not None:
Expand Down
56 changes: 56 additions & 0 deletions tests/instrumentation/test_error_normalization.py
Original file line number Diff line number Diff line change
Expand Up @@ -184,6 +184,52 @@ def test_strips_object_repr(self):
)


class TestNormalizeLauncherError:
    """Checks how ``normalize_error_message`` renders ``LauncherError`` instances."""

    def _make_launcher_error(
        self, message: str, cause: BaseException | None = None
    ) -> Exception:
        """Create a ``LauncherError`` carrying *message*.

        When *cause* is given, the error is raised ``from`` it and caught again
        so the returned instance has its cause chain populated. The calling
        test is skipped if ``LauncherError`` cannot be imported.
        """
        try:
            from cloud_pipelines_backend.launchers.interfaces import LauncherError
        except ImportError:
            pytest.skip("LauncherError not importable")

        if not cause:
            return LauncherError(message)

        # Raise-and-catch lets the interpreter attach the cause for us.
        try:
            raise LauncherError(message) from cause
        except LauncherError as chained:
            return chained

    def test_strips_pod_spec_json(self):
        """A dict-literal pod spec in the message is collapsed to ``{...}``."""
        pod_spec = (
            "{'apiVersion': 'v1', 'kind': 'Pod', 'metadata': {'name': 'task-abc-xyz'}}"
        )
        error = self._make_launcher_error(f"Failed to create pod: {pod_spec}")
        normalized = error_normalization.normalize_error_message(exception=error)
        expected = "LauncherError: Failed to create pod: {...}"
        assert normalized == expected

    def test_with_timeout_cause(self):
        """A chained ``TimeoutError`` cause does not alter the normalized text."""
        timeout = TimeoutError("The read operation timed out")
        error = self._make_launcher_error(
            "Failed to create pod: {'apiVersion': 'v1'}", cause=timeout
        )
        normalized = error_normalization.normalize_error_message(exception=error)
        expected = "LauncherError: Failed to create pod: {...}"
        assert normalized == expected

    def test_no_colon_in_message(self):
        """A plain message without colons passes through unchanged."""
        error = self._make_launcher_error("launch failed")
        normalized = error_normalization.normalize_error_message(exception=error)
        expected = "LauncherError: launch failed"
        assert normalized == expected

    def test_multi_colon_diagnostic_preserved(self):
        """Diagnostic text after the first colon is kept, not truncated."""
        error = self._make_launcher_error(
            "creating pod: spec invalid: missing field 'name'"
        )
        normalized = error_normalization.normalize_error_message(exception=error)
        expected = "LauncherError: creating pod: spec invalid: missing field 'name'"
        assert normalized == expected


Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

🤖 This is an AI-generated code review comment.

Consider adding a test for multi-colon LauncherError messages.

The three new tests cover JSON suffix, cause chain, and no-colon. None covers the case where there is real diagnostic info after the first colon — exactly the scenario where the verb-phrase truncation silently degrades grouping quality (see the other comment on _normalize_launcher_error).

Adding:

def test_multiple_colons_in_message(self):
    exc = self._make_launcher_error("creating pod: spec invalid: missing field 'name'")
    result = error_normalization.normalize_error_message(exception=exc)
    assert result == "LauncherError: creating pod"  # or whatever the desired behavior is

…either pins the truncation as intentional or surfaces the regression so the policy gets revisited.

Copy link
Copy Markdown
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

🤖 Added test_multi_colon_diagnostic_preserved which asserts creating pod: spec invalid: missing field 'name' passes through unmodified, confirming that diagnostic colons are kept now that we no longer truncate at the first colon.

class TestFallback:
def test_strips_hex_address(self):
exc = ValueError("object at 0xdeadbeef failed")
Expand All @@ -204,3 +250,13 @@ def test_stable_message_unchanged(self):
exc = AttributeError("'NoneType' object has no attribute 'encode'")
result = error_normalization.normalize_error_message(exception=exc)
assert result == "AttributeError: 'NoneType' object has no attribute 'encode'"

def test_strips_json_object(self):
    """A single-quoted dict literal in the message collapses to ``{...}``."""
    error = RuntimeError("operation failed: {'key': 'value', 'nested': {'a': 1}}")
    expected = "RuntimeError: operation failed: {...}"
    normalized = error_normalization.normalize_error_message(exception=error)
    assert normalized == expected

def test_strips_json_object_double_quotes(self):
    """A double-quoted JSON object in the message collapses to ``{...}``."""
    error = RuntimeError('operation failed: {"key": "value"}')
    expected = "RuntimeError: operation failed: {...}"
    normalized = error_normalization.normalize_error_message(exception=error)
    assert normalized == expected
Loading