diff --git a/cloud_pipelines_backend/instrumentation/error_normalization.py b/cloud_pipelines_backend/instrumentation/error_normalization.py
index 41d2977..a08517c 100644
--- a/cloud_pipelines_backend/instrumentation/error_normalization.py
+++ b/cloud_pipelines_backend/instrumentation/error_normalization.py
@@ -9,6 +9,14 @@
 import json
 import re
 
+try:
+    from ..launchers.interfaces import LauncherError as _LauncherError
+
+    _LAUNCHER_ERROR_AVAILABLE = True
+except ImportError:
+    _LauncherError = None  # type: ignore[assignment,misc]
+    _LAUNCHER_ERROR_AVAILABLE = False
+
 _POD_NAME_PATTERN = re.compile(r"(?:task|tangle(?:-ce)?)-[a-zA-Z0-9]+-[a-zA-Z0-9]+")
 _OBJECT_REPR_PATTERN = re.compile(r"<[^>]+ object at 0x[0-9a-fA-F]+>")
 _HEX_ADDRESS_PATTERN = re.compile(r"\b0x[0-9a-fA-F]+\b")
@@ -16,9 +24,15 @@
     r"[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}", re.IGNORECASE
 )
 _LONG_ALNUM_ID_PATTERN = re.compile(r"\b[a-zA-Z0-9]{16,}\b")
+# Matches from the first `{"`, `{'`, or `{ "` / `{ '` to end of string.
+# Both the embedded dict/JSON literal and any trailing message text are replaced
+# with `{...}` — the greedy match is intentional: anything after a runtime-data
+# dict in an error message is typically also variable and should not affect grouping.
+_JSON_OBJECT_PATTERN = re.compile(r"\{\s*['\"].*", re.DOTALL)
 
 
 def _strip_generic(*, message: str) -> str:
+    message = _JSON_OBJECT_PATTERN.sub("{...}", message)
     message = _OBJECT_REPR_PATTERN.sub("{object}", message)
     message = _HEX_ADDRESS_PATTERN.sub("{addr}", message)
     message = _UUID_PATTERN.sub("{uuid}", message)
@@ -85,6 +99,20 @@ def _normalize_orchestrator_error(*, exception: BaseException) -> str | None:
     return f"OrchestratorError: {message}"
 
 
+def _normalize_launcher_error(*, exception: BaseException) -> str | None:
+    """Return a grouping key for a LauncherError, or None if not applicable.
+
+    None is returned when the exception is not a LauncherError or when the
+    LauncherError class could not be imported. Otherwise the embedded dict/JSON
+    payload (and everything after it) is collapsed to `{...}` and the rest of
+    the message is kept as-is, apart from surrounding whitespace.
+    """
+    if not _LAUNCHER_ERROR_AVAILABLE or not isinstance(exception, _LauncherError):
+        return None
+    message = _JSON_OBJECT_PATTERN.sub("{...}", str(exception))
+    return f"LauncherError: {message.strip()}"
+
+
 def normalize_error_message(*, exception: BaseException) -> str:
     """Return a stable normalized string for error grouping."""
     for normalizer in (
@@ -92,6 +120,7 @@ def normalize_error_message(*, exception: BaseException) -> str:
         _normalize_max_retry_error,
         _normalize_unicode_decode_error,
         _normalize_orchestrator_error,
+        _normalize_launcher_error,
     ):
         result = normalizer(exception=exception)
         if result is not None:
diff --git a/tests/instrumentation/test_error_normalization.py b/tests/instrumentation/test_error_normalization.py
index dc01b55..f7845a8 100644
--- a/tests/instrumentation/test_error_normalization.py
+++ b/tests/instrumentation/test_error_normalization.py
@@ -184,6 +184,53 @@ def test_strips_object_repr(self):
         )
 
 
+class TestNormalizeLauncherError:
+    def _make_launcher_error(
+        self, message: str, cause: BaseException | None = None
+    ) -> Exception:
+        """Return a LauncherError, raised from `cause` when one is given."""
+        try:
+            from cloud_pipelines_backend.launchers.interfaces import LauncherError
+        except ImportError:
+            pytest.skip("LauncherError not importable")
+        if cause is not None:
+            try:
+                raise LauncherError(message) from cause
+            except LauncherError as exc:
+                return exc
+        return LauncherError(message)
+
+    def test_strips_pod_spec_json(self):
+        pod_spec = (
+            "{'apiVersion': 'v1', 'kind': 'Pod', 'metadata': {'name': 'task-abc-xyz'}}"
+        )
+        exc = self._make_launcher_error(f"Failed to create pod: {pod_spec}")
+        result = error_normalization.normalize_error_message(exception=exc)
+        assert result == "LauncherError: Failed to create pod: {...}"
+
+    def test_with_timeout_cause(self):
+        cause = TimeoutError("The read operation timed out")
+        exc = self._make_launcher_error(
+            "Failed to create pod: {'apiVersion': 'v1'}", cause=cause
+        )
+        result = error_normalization.normalize_error_message(exception=exc)
+        assert result == "LauncherError: Failed to create pod: {...}"
+
+    def test_no_colon_in_message(self):
+        exc = self._make_launcher_error("launch failed")
+        result = error_normalization.normalize_error_message(exception=exc)
+        assert result == "LauncherError: launch failed"
+
+    def test_multi_colon_diagnostic_preserved(self):
+        exc = self._make_launcher_error(
+            "creating pod: spec invalid: missing field 'name'"
+        )
+        result = error_normalization.normalize_error_message(exception=exc)
+        assert (
+            result == "LauncherError: creating pod: spec invalid: missing field 'name'"
+        )
+
+
 class TestFallback:
     def test_strips_hex_address(self):
         exc = ValueError("object at 0xdeadbeef failed")
@@ -204,3 +251,13 @@ def test_stable_message_unchanged(self):
         exc = AttributeError("'NoneType' object has no attribute 'encode'")
         result = error_normalization.normalize_error_message(exception=exc)
         assert result == "AttributeError: 'NoneType' object has no attribute 'encode'"
+
+    def test_strips_json_object(self):
+        exc = RuntimeError("operation failed: {'key': 'value', 'nested': {'a': 1}}")
+        result = error_normalization.normalize_error_message(exception=exc)
+        assert result == "RuntimeError: operation failed: {...}"
+
+    def test_strips_json_object_double_quotes(self):
+        exc = RuntimeError('operation failed: {"key": "value"}')
+        result = error_normalization.normalize_error_message(exception=exc)
+        assert result == "RuntimeError: operation failed: {...}"