Skip to content

Commit 8c72d41

Browse files
authored
fix: add retry with exponential backoff to schema fetch for transient HTTP errors (#2582)
Gateway startup fails when `raw.githubusercontent.com` rate-limits the JSON config schema fetch (HTTP 429). Any transient error was treated as fatal with no retry. ## Changes **`internal/config/validation_schema.go`** - Added `isTransientHTTPError()` identifying 429, 503, and all 5xx codes - `fetchAndFixSchema()` now retries up to `maxSchemaFetchRetries` (3) times with exponential backoff (1s → 2s) for transient errors; permanent errors (4xx except 429) fail immediately - `schemaFetchRetryDelay` and `schemaHTTPClientTimeout` are package-level vars so tests can zero them out without sleeps **`internal/config/validation_schema_fetch_test.go`** - `init()` sets zero delay + 200ms HTTP timeout for all tests in the file - `TestFetchAndFixSchema_HTTPError` extended with 429 case and per-error-type request-count assertions (1 request for permanent, 3 for transient) - Added `TestFetchAndFixSchema_RetrySucceedsAfterTransientError`: 2×429 → 200 succeeds - Added `TestFetchAndFixSchema_ExponentialBackoffDelays`: verifies exactly `maxSchemaFetchRetries` requests are made on persistent transient failure **Note**: The secrecy-vs-integrity notice labeling fix (Problem 2 from the issue) is already present in the codebase (`difcPolicyLabel()` + `IsSecrecyViolation` in `difc_log.go` / `difc/resource.go`). > [!WARNING] > > <details> > <summary>Firewall rules blocked me from connecting to one or more addresses (expand for details)</summary> > > #### I tried to connect to the following addresses, but was blocked by firewall rules: > > - `example.com` > - Triggering command: `/tmp/go-build3732201295/b330/launcher.test /tmp/go-build3732201295/b330/launcher.test -test.testlogfile=/tmp/go-build3732201295/b330/testlog.txt -test.paniconexit0 -test.timeout=10m0s -test.v=true g_.a --global git pull.rebase abis` (dns block) > - Triggering command: `/tmp/go-build63582137/b334/launcher.test /tmp/go-build63582137/b334/launcher.test -test.testlogfile=/tmp/go-build63582137/b334/testlog.txt -test.paniconexit0 -test.timeout=10m0s` (dns block) > - `invalid-host-that-does-not-exist-12345.com` > - Triggering command: `/tmp/go-build3732201295/b315/config.test /tmp/go-build3732201295/b315/config.test -test.testlogfile=/tmp/go-build3732201295/b315/testlog.txt -test.paniconexit0 -test.timeout=10m0s -test.v=true -c=4 -nolocalimports -importcfg /tmp/go-build3732201295/b287/importcfg -pack /home/REDACTED/go/pkg/mod/github.com/modelcontextprotocol/go-sdk@v1.4.1/auth/auth.go /home/REDACTED/go/pkg/mod/github.com/modelcontextprotocol/go-sdk@v1.4.1/auth/client.go ortc�� go 9p-W5oE9v ache/go/1.25.8/x64/pkg/tool/linu-Werror credential.helpe/opt/hostedtoolcache/go/1.25.8/x64/pkg/tool/linux_amd64/compile` (dns block) > - Triggering command: `/tmp/go-build2566417517/b001/config.test /tmp/go-build2566417517/b001/config.test -test.testlogfile=/tmp/go-build2566417517/b001/testlog.txt -test.paniconexit0 -test.timeout=10m0s -test.run=TestFetchAndFixSchema -test.v=true ache/go/1.25.8/x64/src/crypto/ec-ifaceassert x_amd64/vet -p internal/reflectdocker-cli-plugin-metadata -lang=go1.25 x_amd64/vet ap 2201295/b171=/tmp/go-build .cfg 64/pkg/tool/linux_amd64/vet -p ache/go/1.25.8/x-c -lang=go1.25 64/pkg/tool/linux_amd64/vet` (dns block) > - Triggering command: `/tmp/go-build1075949314/b001/config.test /tmp/go-build1075949314/b001/config.test -test.testlogfile=/tmp/go-build1075949314/b001/testlog.txt -test.paniconexit0 -test.run=TestFetchAndFixSchema -test.v=true -test.timeout=30s SkhS/x3QNW0bkE1zmain 64/pkg/tool/linu-lang=go1.25 -I /tmp/go-build373docker-cli-plugin-metadata -I 64/pkg/tool/linu-dwarf=false 2201�� azero@v1.11.0/ingo1.25.8 azero@v1.11.0/in-c=4 64/pkg/tool/linu-nolocalimports 2201295/b165/ /dev/null ctor 64/pkg/tool/linu/tmp/go-build3732201295/b345/_testmain.go` (dns block) > - `nonexistent.local` > - Triggering command: `/tmp/go-build3732201295/b330/launcher.test /tmp/go-build3732201295/b330/launcher.test -test.testlogfile=/tmp/go-build3732201295/b330/testlog.txt -test.paniconexit0 -test.timeout=10m0s -test.v=true g_.a --global git pull.rebase abis` (dns block) > - Triggering command: `/tmp/go-build63582137/b334/launcher.test /tmp/go-build63582137/b334/launcher.test -test.testlogfile=/tmp/go-build63582137/b334/testlog.txt -test.paniconexit0 -test.timeout=10m0s` (dns block) > - `slow.example.com` > - Triggering command: `/tmp/go-build3732201295/b330/launcher.test /tmp/go-build3732201295/b330/launcher.test -test.testlogfile=/tmp/go-build3732201295/b330/testlog.txt -test.paniconexit0 -test.timeout=10m0s -test.v=true g_.a --global git pull.rebase abis` (dns block) > - Triggering command: `/tmp/go-build63582137/b334/launcher.test /tmp/go-build63582137/b334/launcher.test -test.testlogfile=/tmp/go-build63582137/b334/testlog.txt -test.paniconexit0 -test.timeout=10m0s` (dns block) > - `this-host-does-not-exist-12345.com` > - Triggering command: `/tmp/go-build3732201295/b339/mcp.test /tmp/go-build3732201295/b339/mcp.test -test.testlogfile=/tmp/go-build3732201295/b339/testlog.txt -test.paniconexit0 -test.timeout=10m0s -test.v=true 64/src/runtime/cgo committer.name x_amd64/vet` (dns block) > - Triggering command: `/tmp/go-build63582137/b343/mcp.test /tmp/go-build63582137/b343/mcp.test -test.testlogfile=/tmp/go-build63582137/b343/testlog.txt -test.paniconexit0 -test.timeout=10m0s ortc�� TOKEN&#34;; }; f get TOKEN&#34;; }; f get ache/go/1.25.8/x64/pkg/tool/linux_amd64/vet -pthread telabs/wazero -fmessage-length=0 ache/go/1.25.8/x64/pkg/tool/linux_amd64/vet -I eNqb/Hpbq2t4bcSrSRPw9eNqb -I x_amd64/vet --gdwarf-5 --64 -o x_amd64/vet` (dns block) > > If you need me to access, download, or install something from one of these locations, you can either: > > - Configure [Actions setup steps](https://gh.io/copilot/actions-setup-steps) to set up my environment, which run before the firewall is enabled > - Add the appropriate URLs or hosts to the custom allowlist in this repository's [Copilot coding agent settings](https://github.com/github/gh-aw-mcpg/settings/copilot/coding_agent) (admins only) > > </details> <!-- START COPILOT CODING AGENT TIPS --> --- 📍 Connect Copilot coding agent with [Jira](https://gh.io/cca-jira-docs), [Azure Boards](https://gh.io/cca-azure-boards-docs) or [Linear](https://gh.io/cca-linear-docs) to delegate work to Copilot in one click without leaving your project management tool.
2 parents 77757ca + 7cb2046 commit 8c72d41

2 files changed

Lines changed: 159 additions & 28 deletions

File tree

internal/config/validation_schema.go

Lines changed: 67 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,28 @@ import (
1616
"github.com/santhosh-tekuri/jsonschema/v5"
1717
)
1818

19+
const (
20+
// maxSchemaFetchRetries is the number of fetch attempts before giving up.
21+
maxSchemaFetchRetries = 3
22+
)
23+
24+
// schemaFetchRetryDelay is the base delay between retry attempts using exponential
25+
// backoff (1×, 2×, 4×, …). It is a variable so tests can override it to zero for
26+
// fast execution.
27+
var schemaFetchRetryDelay = time.Second
28+
29+
// schemaHTTPClientTimeout is the per-attempt HTTP request timeout. It is a variable
30+
// so tests can shorten it to avoid long waits when testing timeout behaviour.
31+
var schemaHTTPClientTimeout = 10 * time.Second
32+
33+
// isTransientHTTPError returns true for status codes that indicate a temporary
34+
// server-side condition (rate-limiting or transient failure) worth retrying.
35+
func isTransientHTTPError(statusCode int) bool {
36+
return statusCode == http.StatusTooManyRequests ||
37+
statusCode == http.StatusServiceUnavailable ||
38+
(statusCode >= 500 && statusCode < 600)
39+
}
40+
1941
var (
2042
// Compile regex patterns from schema for additional validation
2143
containerPattern = regexp.MustCompile(`^[a-zA-Z0-9][a-zA-Z0-9./_-]*(:([a-zA-Z0-9._-]+|latest))?$`)
@@ -83,22 +105,56 @@ func fetchAndFixSchema(url string) ([]byte, error) {
83105
logSchema.Printf("Fetching schema from URL: %s", url)
84106

85107
client := &http.Client{
86-
Timeout: 10 * time.Second,
108+
Timeout: schemaHTTPClientTimeout,
87109
}
88110

89-
fetchStart := time.Now()
90-
resp, err := client.Get(url)
91-
if err != nil {
92-
logSchema.Printf("Schema fetch failed after %v: %v", time.Since(fetchStart), err)
93-
return nil, fmt.Errorf("failed to fetch schema from %s: %w", url, err)
111+
var resp *http.Response
112+
var lastErr error
113+
114+
for attempt := 1; attempt <= maxSchemaFetchRetries; attempt++ {
115+
if attempt > 1 {
116+
delay := schemaFetchRetryDelay << uint(attempt-2) // 1×, 2×, 4× base delay
117+
logSchema.Printf("Retrying schema fetch (attempt %d/%d) after %v: %v", attempt, maxSchemaFetchRetries, delay, lastErr)
118+
time.Sleep(delay)
119+
}
120+
121+
fetchStart := time.Now()
122+
var err error
123+
resp, err = client.Get(url)
124+
if err != nil {
125+
logSchema.Printf("Schema fetch attempt %d failed after %v: %v", attempt, time.Since(fetchStart), err)
126+
lastErr = fmt.Errorf("failed to fetch schema from %s: %w", url, err)
127+
resp = nil
128+
continue
129+
}
130+
logSchema.Printf("HTTP request attempt %d completed in %v with status %d", attempt, time.Since(fetchStart), resp.StatusCode)
131+
132+
if resp.StatusCode == http.StatusOK {
133+
lastErr = nil
134+
break
135+
}
136+
137+
if isTransientHTTPError(resp.StatusCode) {
138+
lastErr = fmt.Errorf("failed to fetch schema: HTTP %d", resp.StatusCode)
139+
logSchema.Printf("Schema fetch attempt %d returned transient error: HTTP %d, will retry", attempt, resp.StatusCode)
140+
resp.Body.Close()
141+
resp = nil
142+
continue
143+
}
144+
145+
// Permanent HTTP error (404, 403, 401, etc.) — do not retry.
146+
lastErr = fmt.Errorf("failed to fetch schema: HTTP %d", resp.StatusCode)
147+
logSchema.Printf("Schema fetch returned permanent error: HTTP %d", resp.StatusCode)
148+
resp.Body.Close()
149+
resp = nil
150+
break
94151
}
95-
defer resp.Body.Close()
96-
logSchema.Printf("HTTP request completed in %v", time.Since(fetchStart))
97152

98-
if resp.StatusCode != http.StatusOK {
99-
logSchema.Printf("Schema fetch returned non-OK status: %d", resp.StatusCode)
100-
return nil, fmt.Errorf("failed to fetch schema: HTTP %d", resp.StatusCode)
153+
if resp == nil {
154+
return nil, lastErr
101155
}
156+
defer resp.Body.Close()
157+
logSchema.Printf("HTTP request completed in %v", time.Since(startTime))
102158

103159
readStart := time.Now()
104160
schemaBytes, err := io.ReadAll(resp.Body)

internal/config/validation_schema_fetch_test.go

Lines changed: 92 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -6,13 +6,21 @@ import (
66
"net/http"
77
"net/http/httptest"
88
"strings"
9+
"sync/atomic"
910
"testing"
1011
"time"
1112

1213
"github.com/stretchr/testify/assert"
1314
"github.com/stretchr/testify/require"
1415
)
1516

17+
func init() {
18+
// Speed up all retry-related tests by disabling inter-attempt delays and
19+
// shortening the per-attempt HTTP timeout so timeout tests don't take 30 s.
20+
schemaFetchRetryDelay = 0
21+
schemaHTTPClientTimeout = 200 * time.Millisecond
22+
}
23+
1624
// TestFetchAndFixSchema_SuccessfulFetch tests the happy path where schema is fetched successfully
1725
func TestFetchAndFixSchema_SuccessfulFetch(t *testing.T) {
1826
// Create a minimal valid schema for testing
@@ -52,35 +60,48 @@ func TestFetchAndFixSchema_SuccessfulFetch(t *testing.T) {
5260
// TestFetchAndFixSchema_HTTPError tests handling of HTTP error responses
5361
func TestFetchAndFixSchema_HTTPError(t *testing.T) {
5462
tests := []struct {
55-
name string
56-
statusCode int
57-
wantErr string
63+
name string
64+
statusCode int
65+
wantErr string
66+
wantRequests int // expected number of HTTP requests (1 = no retry, 3 = full retry)
5867
}{
5968
{
60-
name: "404 Not Found",
61-
statusCode: http.StatusNotFound,
62-
wantErr: "failed to fetch schema: HTTP 404",
69+
name: "404 Not Found",
70+
statusCode: http.StatusNotFound,
71+
wantErr: "failed to fetch schema: HTTP 404",
72+
wantRequests: 1, // permanent error, no retry
6373
},
6474
{
65-
name: "500 Internal Server Error",
66-
statusCode: http.StatusInternalServerError,
67-
wantErr: "failed to fetch schema: HTTP 500",
75+
name: "500 Internal Server Error",
76+
statusCode: http.StatusInternalServerError,
77+
wantErr: "failed to fetch schema: HTTP 500",
78+
wantRequests: maxSchemaFetchRetries, // transient, retried
6879
},
6980
{
70-
name: "403 Forbidden",
71-
statusCode: http.StatusForbidden,
72-
wantErr: "failed to fetch schema: HTTP 403",
81+
name: "403 Forbidden",
82+
statusCode: http.StatusForbidden,
83+
wantErr: "failed to fetch schema: HTTP 403",
84+
wantRequests: 1, // permanent error, no retry
7385
},
7486
{
75-
name: "503 Service Unavailable",
76-
statusCode: http.StatusServiceUnavailable,
77-
wantErr: "failed to fetch schema: HTTP 503",
87+
name: "503 Service Unavailable",
88+
statusCode: http.StatusServiceUnavailable,
89+
wantErr: "failed to fetch schema: HTTP 503",
90+
wantRequests: maxSchemaFetchRetries, // transient, retried
91+
},
92+
{
93+
name: "429 Too Many Requests",
94+
statusCode: http.StatusTooManyRequests,
95+
wantErr: "failed to fetch schema: HTTP 429",
96+
wantRequests: maxSchemaFetchRetries, // transient, retried
7897
},
7998
}
8099

81100
for _, tt := range tests {
82101
t.Run(tt.name, func(t *testing.T) {
102+
var requestCount atomic.Int32
83103
server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
104+
requestCount.Add(1)
84105
w.WriteHeader(tt.statusCode)
85106
}))
86107
defer server.Close()
@@ -90,6 +111,8 @@ func TestFetchAndFixSchema_HTTPError(t *testing.T) {
90111
assert.Error(t, err)
91112
assert.Nil(t, result)
92113
assert.Contains(t, err.Error(), tt.wantErr)
114+
assert.Equal(t, int32(tt.wantRequests), requestCount.Load(),
115+
"expected %d HTTP request(s) for status %d", tt.wantRequests, tt.statusCode)
93116
})
94117
}
95118
}
@@ -108,9 +131,11 @@ func TestFetchAndFixSchema_NetworkError(t *testing.T) {
108131

109132
// TestFetchAndFixSchema_Timeout tests handling of request timeouts
110133
func TestFetchAndFixSchema_Timeout(t *testing.T) {
111-
// Create a server that delays longer than the client timeout
134+
// Create a server that delays longer than the configured client timeout.
135+
// The init() in this test file sets schemaHTTPClientTimeout = 200ms, so any
136+
// delay > 200ms will trigger a timeout on each attempt.
112137
server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
113-
time.Sleep(15 * time.Second) // fetchAndFixSchema has 10 second timeout
138+
time.Sleep(500 * time.Millisecond)
114139
w.WriteHeader(http.StatusOK)
115140
}))
116141
defer server.Close()
@@ -589,3 +614,53 @@ func TestFetchAndFixSchema_LargeSchema(t *testing.T) {
589614
require.True(t, ok)
590615
assert.Equal(t, 100, len(properties), "Should preserve all 100 properties")
591616
}
617+
618+
// TestFetchAndFixSchema_RetrySucceedsAfterTransientError verifies that a transient
619+
// error on the first attempt is retried and the eventual success is returned.
620+
func TestFetchAndFixSchema_RetrySucceedsAfterTransientError(t *testing.T) {
621+
validSchema := map[string]interface{}{
622+
"$schema": "http://json-schema.org/draft-07/schema#",
623+
"type": "object",
624+
}
625+
schemaJSON, err := json.Marshal(validSchema)
626+
require.NoError(t, err)
627+
628+
var requestCount atomic.Int32
629+
server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
630+
n := requestCount.Add(1)
631+
if n < 3 {
632+
// First two attempts return 429
633+
w.WriteHeader(http.StatusTooManyRequests)
634+
return
635+
}
636+
w.WriteHeader(http.StatusOK)
637+
w.Write(schemaJSON)
638+
}))
639+
defer server.Close()
640+
641+
result, err := fetchAndFixSchema(server.URL)
642+
643+
require.NoError(t, err)
644+
assert.NotNil(t, result)
645+
assert.Equal(t, int32(3), requestCount.Load(), "should have made 3 requests (2 failures + 1 success)")
646+
}
647+
648+
// TestFetchAndFixSchema_ExponentialBackoffDelays verifies that all retries are attempted
649+
// when transient errors occur, and that the function eventually gives up after
650+
// maxSchemaFetchRetries attempts.
651+
func TestFetchAndFixSchema_ExponentialBackoffDelays(t *testing.T) {
652+
var requestCount atomic.Int32
653+
654+
server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
655+
requestCount.Add(1)
656+
w.WriteHeader(http.StatusTooManyRequests)
657+
}))
658+
defer server.Close()
659+
660+
_, err := fetchAndFixSchema(server.URL)
661+
662+
require.Error(t, err)
663+
assert.Equal(t, int32(maxSchemaFetchRetries), requestCount.Load(),
664+
"should make exactly maxSchemaFetchRetries requests before giving up")
665+
assert.Contains(t, err.Error(), "failed to fetch schema: HTTP 429")
666+
}

0 commit comments

Comments
 (0)