From 50de609ea6d5dc3209d937b461df965558e0f10b Mon Sep 17 00:00:00 2001 From: RoyLin Date: Sat, 27 Jun 2026 10:48:15 +0800 Subject: [PATCH] feat(retry): retry transient network errors, not just HTTP status codes MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The retry path only retried HTTP statuses (429/5xx/529); a network error (timeout, connection reset, mid-flight drop — common on throttled endpoints like GLM) was classified Fatal and failed the turn outright. Add is_transient_error and, in the streaming request path (anthropic + openai-compatible/GLM), classify transient network errors as Retryable (synthetic 503) so they back off + retry like Claude Code. Real errors (bad key, model not found) still bail. Bumps 4.2.8. --- Cargo.lock | 2 +- core/Cargo.toml | 2 +- core/src/llm/anthropic.rs | 18 ++++++- core/src/llm/openai.rs | 18 ++++++- core/src/retry.rs | 47 +++++++++++++++++++ sdk/node/Cargo.lock | 4 +- sdk/node/Cargo.toml | 4 +- sdk/node/examples/package-lock.json | 14 +++--- sdk/node/package-lock.json | 16 +++---- sdk/node/package.json | 14 +++--- sdk/node/ptc_soak.mjs | 4 +- sdk/python-bootstrap/pyproject.toml | 2 +- .../src/a3s_code/_bootstrap.py | 2 +- sdk/python/Cargo.toml | 4 +- sdk/python/pyproject.toml | 2 +- 15 files changed, 117 insertions(+), 36 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 5fb950b6..3488168b 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -37,7 +37,7 @@ dependencies = [ [[package]] name = "a3s-code-core" -version = "4.2.7" +version = "4.2.8" dependencies = [ "a3s-acl 0.2.0", "a3s-ahp", diff --git a/core/Cargo.toml b/core/Cargo.toml index c6d8d8f2..de7d4005 100644 --- a/core/Cargo.toml +++ b/core/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "a3s-code-core" -version = "4.2.7" +version = "4.2.8" edition = "2021" authors = ["A3S Lab Team"] license = "MIT" diff --git a/core/src/llm/anthropic.rs b/core/src/llm/anthropic.rs index b1ba650a..5be11b60 100644 --- a/core/src/llm/anthropic.rs 
+++ b/core/src/llm/anthropic.rs @@ -356,7 +356,23 @@ impl AnthropicClient { match result { Ok(r) => r, Err(e) => { - return AttemptOutcome::Fatal(anyhow::anyhow!("HTTP request failed: {}", e)); + // A transient network error (timeout, reset, + // mid-flight drop — common on throttled + // endpoints) carries no HTTP status. Retry it + // with backoff like 429/5xx instead of failing + // the turn; a real fatal error still bails. + return if crate::retry::is_transient_error(&e) { + AttemptOutcome::Retryable { + status: reqwest::StatusCode::SERVICE_UNAVAILABLE, + body: format!("network error: {e}"), + retry_after: None, + } + } else { + AttemptOutcome::Fatal(anyhow::anyhow!( + "HTTP request failed: {}", + e + )) + }; } } } diff --git a/core/src/llm/openai.rs b/core/src/llm/openai.rs index 4d6e2187..38c54ee1 100644 --- a/core/src/llm/openai.rs +++ b/core/src/llm/openai.rs @@ -573,7 +573,23 @@ impl OpenAiClient { match result { Ok(r) => r, Err(e) => { - return AttemptOutcome::Fatal(anyhow::anyhow!("HTTP request failed: {}", e)); + // Transient network error (timeout, reset, + // mid-flight drop — common on throttled + // endpoints): retry with backoff like 429/5xx + // instead of failing the turn. GLM and other + // OpenAI-compatible endpoints hit this most. + return if crate::retry::is_transient_error(&e) { + AttemptOutcome::Retryable { + status: reqwest::StatusCode::SERVICE_UNAVAILABLE, + body: format!("network error: {e}"), + retry_after: None, + } + } else { + AttemptOutcome::Fatal(anyhow::anyhow!( + "HTTP request failed: {}", + e + )) + }; } } } diff --git a/core/src/retry.rs b/core/src/retry.rs index 8e7fccb7..648cf2b7 100644 --- a/core/src/retry.rs +++ b/core/src/retry.rs @@ -183,12 +183,59 @@ where ) } +/// Heuristic: is this a transient *network* error worth retrying — a timeout, +/// connection reset/refused/closed, broken pipe, DNS failure, or a request that +/// dropped mid-flight? 
These carry no HTTP status (so `is_retryable_status` +/// can't see them), yet Claude Code retries them just like 429/5xx. We only have +/// the error's rendered text (a `CodeError`/`anyhow::Error` chain) to classify. +pub fn is_transient_error<E: std::fmt::Display>(e: &E) -> bool { + let m = e.to_string().to_lowercase(); + [ + "timed out", + "timeout", + "connection reset", + "connection refused", + "connection closed", + "connection aborted", + "connection error", + "broken pipe", + "reset by peer", + "error sending request", + "incomplete message", + "unexpected eof", + "dns error", + "unreachable", + "tls handshake", + "request error", + "body error", + "decoding response", + "channel closed", + "stream closed", + ] + .iter() + .any(|p| m.contains(p)) +} + #[cfg(test)] mod tests { use super::*; use std::sync::atomic::{AtomicU32, Ordering}; use std::sync::Arc; + #[test] + fn transient_error_classification() { + let t = |s: &str| is_transient_error(&anyhow::anyhow!("{s}")); + // Transient network errors → retry. + assert!(t("error sending request for url: operation timed out")); + assert!(t("connection reset by peer")); + assert!(t("LLM error: connection closed before message completed")); + assert!(t("tls handshake eof")); + // Real application errors → do NOT retry. 
+ assert!(!t("invalid api key")); + assert!(!t("model not found")); + assert!(!t("context length exceeded")); + } + // ======================================================================== // RetryConfig unit tests // ======================================================================== diff --git a/sdk/node/Cargo.lock b/sdk/node/Cargo.lock index c1f9d3f7..3b631b22 100644 --- a/sdk/node/Cargo.lock +++ b/sdk/node/Cargo.lock @@ -37,7 +37,7 @@ dependencies = [ [[package]] name = "a3s-code-core" -version = "4.2.6" +version = "4.2.7" dependencies = [ "a3s-acl 0.2.0", "a3s-ahp", @@ -92,7 +92,7 @@ dependencies = [ [[package]] name = "a3s-code-node" -version = "4.2.6" +version = "4.2.7" dependencies = [ "a3s-code-core", "anyhow", diff --git a/sdk/node/Cargo.toml b/sdk/node/Cargo.toml index 70eb1e44..774b5263 100644 --- a/sdk/node/Cargo.toml +++ b/sdk/node/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "a3s-code-node" -version = "4.2.7" +version = "4.2.8" edition = "2021" authors = ["A3S Lab Team"] license = "MIT" @@ -11,7 +11,7 @@ description = "A3S Code Node.js bindings - Native addon via napi-rs" crate-type = ["cdylib"] [dependencies] -a3s-code-core = { version = "4.2.7", path = "../../core", features = ["ahp", "s3", "serve"] } +a3s-code-core = { version = "4.2.8", path = "../../core", features = ["ahp", "s3", "serve"] } napi = { version = "2", features = ["async", "napi6", "serde-json"] } napi-derive = "2" tokio = { version = "1.35", features = ["full"] } diff --git a/sdk/node/examples/package-lock.json b/sdk/node/examples/package-lock.json index 81204245..c81b526f 100644 --- a/sdk/node/examples/package-lock.json +++ b/sdk/node/examples/package-lock.json @@ -18,7 +18,7 @@ }, "..": { "name": "@a3s-lab/code", - "version": "4.2.7", + "version": "4.2.8", "license": "MIT", "devDependencies": { "@napi-rs/cli": "^2", @@ -27,12 +27,12 @@ "typescript": "^5.9.3" }, "optionalDependencies": { - "@a3s-lab/code-darwin-arm64": "4.2.7", - "@a3s-lab/code-linux-arm64-gnu": "4.2.7", - 
"@a3s-lab/code-linux-arm64-musl": "4.2.7", - "@a3s-lab/code-linux-x64-gnu": "4.2.7", - "@a3s-lab/code-linux-x64-musl": "4.2.7", - "@a3s-lab/code-win32-x64-msvc": "4.2.7" + "@a3s-lab/code-darwin-arm64": "4.2.8", + "@a3s-lab/code-linux-arm64-gnu": "4.2.8", + "@a3s-lab/code-linux-arm64-musl": "4.2.8", + "@a3s-lab/code-linux-x64-gnu": "4.2.8", + "@a3s-lab/code-linux-x64-musl": "4.2.8", + "@a3s-lab/code-win32-x64-msvc": "4.2.8" } }, "node_modules/@a3s-lab/code": { diff --git a/sdk/node/package-lock.json b/sdk/node/package-lock.json index d69609dd..5fa0ba98 100644 --- a/sdk/node/package-lock.json +++ b/sdk/node/package-lock.json @@ -1,12 +1,12 @@ { "name": "@a3s-lab/code", - "version": "4.2.7", + "version": "4.2.8", "lockfileVersion": 3, "requires": true, "packages": { "": { "name": "@a3s-lab/code", - "version": "4.2.7", + "version": "4.2.8", "license": "MIT", "devDependencies": { "@napi-rs/cli": "^2", @@ -15,12 +15,12 @@ "typescript": "^5.9.3" }, "optionalDependencies": { - "@a3s-lab/code-darwin-arm64": "4.2.7", - "@a3s-lab/code-linux-arm64-gnu": "4.2.7", - "@a3s-lab/code-linux-arm64-musl": "4.2.7", - "@a3s-lab/code-linux-x64-gnu": "4.2.7", - "@a3s-lab/code-linux-x64-musl": "4.2.7", - "@a3s-lab/code-win32-x64-msvc": "4.2.7" + "@a3s-lab/code-darwin-arm64": "4.2.8", + "@a3s-lab/code-linux-arm64-gnu": "4.2.8", + "@a3s-lab/code-linux-arm64-musl": "4.2.8", + "@a3s-lab/code-linux-x64-gnu": "4.2.8", + "@a3s-lab/code-linux-x64-musl": "4.2.8", + "@a3s-lab/code-win32-x64-msvc": "4.2.8" } }, "node_modules/@a3s-lab/code-darwin-arm64": { diff --git a/sdk/node/package.json b/sdk/node/package.json index b9dba450..06729ff5 100644 --- a/sdk/node/package.json +++ b/sdk/node/package.json @@ -1,6 +1,6 @@ { "name": "@a3s-lab/code", - "version": "4.2.7", + "version": "4.2.8", "description": "A3S Code - Native Node.js bindings for the coding-agent runtime", "main": "index.js", "types": "index.d.ts", @@ -43,11 +43,11 @@ "test:helpers": "node test-helpers.mjs" }, "optionalDependencies": { - 
"@a3s-lab/code-darwin-arm64": "4.2.7", - "@a3s-lab/code-linux-x64-gnu": "4.2.7", - "@a3s-lab/code-linux-x64-musl": "4.2.7", - "@a3s-lab/code-linux-arm64-gnu": "4.2.7", - "@a3s-lab/code-linux-arm64-musl": "4.2.7", - "@a3s-lab/code-win32-x64-msvc": "4.2.7" + "@a3s-lab/code-darwin-arm64": "4.2.8", + "@a3s-lab/code-linux-x64-gnu": "4.2.8", + "@a3s-lab/code-linux-x64-musl": "4.2.8", + "@a3s-lab/code-linux-arm64-gnu": "4.2.8", + "@a3s-lab/code-linux-arm64-musl": "4.2.8", + "@a3s-lab/code-win32-x64-msvc": "4.2.8" } } diff --git a/sdk/node/ptc_soak.mjs b/sdk/node/ptc_soak.mjs index 747737e8..1531fcb9 100644 --- a/sdk/node/ptc_soak.mjs +++ b/sdk/node/ptc_soak.mjs @@ -25,13 +25,15 @@ function script(n) { return JSON.stringify(res); }`; } +// NO explicit limits → exercises the DEFAULT script timeout. With 50-word +// subtasks (each >30s on this model), 4.2.6's 30s default would time out; 4.2.7 +// gives delegation-capable scripts 10min, so it should complete. const promptFor = (n) => 'Call the `program` tool exactly once, now, with these arguments, then stop.\n\nArguments:\n' + JSON.stringify({ type: 'script', language: 'javascript', source: script(n), - limits: { timeoutMs: 180000, maxToolCalls: 20 }, }); const agent = await Agent.create(CONFIG); diff --git a/sdk/python-bootstrap/pyproject.toml b/sdk/python-bootstrap/pyproject.toml index 6e04494f..09026735 100644 --- a/sdk/python-bootstrap/pyproject.toml +++ b/sdk/python-bootstrap/pyproject.toml @@ -7,7 +7,7 @@ name = "a3s-code" # Keep in sync with crates/code core release. The bootstrap loader fetches # the matching native wheel from `https://github.com/AI45Lab/Code/releases/tag/v` # at import time. 
-version = "4.2.7" +version = "4.2.8" description = "A3S Code Python SDK — pure-Python bootstrap that fetches the native wheel from GitHub Releases" readme = "README.md" license = {text = "MIT"} diff --git a/sdk/python-bootstrap/src/a3s_code/_bootstrap.py b/sdk/python-bootstrap/src/a3s_code/_bootstrap.py index 3b07b76f..7af6a178 100644 --- a/sdk/python-bootstrap/src/a3s_code/_bootstrap.py +++ b/sdk/python-bootstrap/src/a3s_code/_bootstrap.py @@ -31,7 +31,7 @@ # Version is the bootstrap's own version, which equals the matching native # wheel version on GH Releases. Bumped by the release workflow. -__version__ = "4.2.7" +__version__ = "4.2.8" _DEFAULT_BASE_URL = "https://github.com/AI45Lab/Code/releases/download" _REQUEST_TIMEOUT_S = 120 diff --git a/sdk/python/Cargo.toml b/sdk/python/Cargo.toml index 17eaab03..294f1fba 100644 --- a/sdk/python/Cargo.toml +++ b/sdk/python/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "a3s-code-py" -version = "4.2.7" +version = "4.2.8" edition = "2021" authors = ["A3S Lab Team"] license = "MIT" @@ -12,7 +12,7 @@ name = "a3s_code" crate-type = ["cdylib"] [dependencies] -a3s-code-core = { version = "4.2.7", path = "../../core", features = ["ahp", "s3", "serve"] } +a3s-code-core = { version = "4.2.8", path = "../../core", features = ["ahp", "s3", "serve"] } pyo3 = "0.23" tokio = { version = "1.35", features = ["full"] } serde_json = "1.0" diff --git a/sdk/python/pyproject.toml b/sdk/python/pyproject.toml index 220318d6..7e262d90 100644 --- a/sdk/python/pyproject.toml +++ b/sdk/python/pyproject.toml @@ -4,7 +4,7 @@ build-backend = "maturin" [project] name = "a3s-code" -version = "4.2.7" +version = "4.2.8" description = "A3S Code - Native Python bindings for the coding-agent runtime" readme = "README.md" license = {text = "MIT"}