Skip to content

Commit aa3bee0

Browse files
Fix missing data when Caddy isn't available at startup
1 parent 876087d commit aa3bee0

10 files changed

Lines changed: 397 additions & 97 deletions

File tree

docs/caddy-configuration.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -46,7 +46,7 @@ If auto-detection does not work, you can specify the process ID manually:
4646
ember --pid 12345
4747
```
4848

49-
When no `--pid` is provided, Ember scans the process list for a FrankenPHP process. If none is found, it falls back to scanning for a Caddy process for CPU/RSS monitoring.
49+
When no `--pid` is provided, Ember scans the process list for a FrankenPHP process. If none is found, it falls back to scanning for a Caddy process for CPU/RSS monitoring. When process scanning is unavailable or returns no data, Ember derives CPU usage, RSS, and uptime from the standard Go `process_*` Prometheus metrics (`process_cpu_seconds_total`, `process_resident_memory_bytes`, and `process_start_time_seconds`) exposed by Caddy's `/metrics` endpoint.
5050

5151
## Remote Caddy Instances
5252

internal/fetcher/fetcher.go

Lines changed: 11 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -67,6 +67,11 @@ type MetricsSnapshot struct {
6767

6868
// Per-host Caddy HTTP metrics
6969
Hosts map[string]*HostMetrics `json:"hosts,omitempty"`
70+
71+
// Go runtime process metrics (from standard Prometheus collector)
72+
ProcessCPUSecondsTotal float64 `json:"processCpuSecondsTotal,omitempty"`
73+
ProcessRSSBytes float64 `json:"processRssBytes,omitempty"`
74+
ProcessStartTimeSeconds float64 `json:"processStartTimeSeconds,omitempty"`
7075
}
7176

7277
type HistogramBucket struct {
@@ -83,11 +88,12 @@ type ProcessMetrics struct {
8388
}
8489

8590
type Snapshot struct {
86-
Threads ThreadsResponse `json:"threads"`
87-
Metrics MetricsSnapshot `json:"metrics"`
88-
Process ProcessMetrics `json:"process"`
89-
FetchedAt time.Time `json:"fetchedAt"`
90-
Errors []string `json:"errors,omitempty"`
91+
Threads ThreadsResponse `json:"threads"`
92+
Metrics MetricsSnapshot `json:"metrics"`
93+
Process ProcessMetrics `json:"process"`
94+
FetchedAt time.Time `json:"fetchedAt"`
95+
Errors []string `json:"errors,omitempty"`
96+
HasFrankenPHP bool `json:"hasFrankenPHP"`
9197
}
9298

9399
type Fetcher interface {

internal/fetcher/http.go

Lines changed: 59 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,9 @@ type HTTPFetcher struct {
2626
procHandle *processHandle
2727
hasFrankenPHP bool
2828
serverNames []string
29+
30+
lastPromCPU float64
31+
lastPromSample time.Time
2932
}
3033

3134
func NewHTTPFetcher(baseURL string, pid int32) *HTTPFetcher {
@@ -105,18 +108,19 @@ func (f *HTTPFetcher) ServerNames() []string {
105108

106109
func (f *HTTPFetcher) Fetch(ctx context.Context) (*Snapshot, error) {
107110
var (
108-
threads ThreadsResponse
109-
metrics MetricsSnapshot
110-
proc ProcessMetrics
111-
mu sync.Mutex
112-
errs []string
111+
threads ThreadsResponse
112+
metrics MetricsSnapshot
113+
proc ProcessMetrics
114+
mu sync.Mutex
115+
errs []string
116+
metricsOK bool
113117
)
114118

115-
g, ctx := errgroup.WithContext(ctx)
119+
g, gCtx := errgroup.WithContext(ctx)
116120

117121
if f.hasFrankenPHP {
118122
g.Go(func() error {
119-
t, err := f.fetchThreads(ctx)
123+
t, err := f.fetchThreads(gCtx)
120124
if err != nil {
121125
mu.Lock()
122126
errs = append(errs, err.Error())
@@ -129,19 +133,22 @@ func (f *HTTPFetcher) Fetch(ctx context.Context) (*Snapshot, error) {
129133
}
130134

131135
g.Go(func() error {
132-
m, err := f.fetchMetrics(ctx)
136+
m, err := f.fetchMetrics(gCtx)
133137
if err != nil {
134138
mu.Lock()
135139
errs = append(errs, err.Error())
136140
mu.Unlock()
137141
return nil
138142
}
143+
mu.Lock()
144+
metricsOK = true
145+
mu.Unlock()
139146
metrics = m
140147
return nil
141148
})
142149

143150
g.Go(func() error {
144-
p, err := f.procHandle.fetch(ctx)
151+
p, err := f.procHandle.fetch(gCtx)
145152
if err != nil {
146153
mu.Lock()
147154
errs = append(errs, err.Error())
@@ -154,6 +161,34 @@ func (f *HTTPFetcher) Fetch(ctx context.Context) (*Snapshot, error) {
154161

155162
_ = g.Wait()
156163

164+
if metricsOK {
165+
f.onConnected(ctx)
166+
167+
// Derive process metrics from Prometheus when gopsutil has no data
168+
if proc.RSS == 0 && metrics.ProcessRSSBytes > 0 {
169+
proc.RSS = uint64(metrics.ProcessRSSBytes)
170+
}
171+
if proc.CPUPercent == 0 && metrics.ProcessCPUSecondsTotal > 0 {
172+
now := time.Now()
173+
if !f.lastPromSample.IsZero() {
174+
elapsed := now.Sub(f.lastPromSample).Seconds()
175+
if elapsed > 0 {
176+
proc.CPUPercent = (metrics.ProcessCPUSecondsTotal - f.lastPromCPU) / elapsed * 100
177+
if proc.CPUPercent < 0 {
178+
proc.CPUPercent = 0
179+
}
180+
}
181+
}
182+
f.lastPromCPU = metrics.ProcessCPUSecondsTotal
183+
f.lastPromSample = now
184+
}
185+
if proc.CreateTime == 0 && metrics.ProcessStartTimeSeconds > 0 {
186+
startSec := int64(metrics.ProcessStartTimeSeconds)
187+
proc.CreateTime = startSec * 1000
188+
proc.Uptime = time.Since(time.Unix(startSec, 0))
189+
}
190+
}
191+
157192
// Seed known server names as empty host entries so they appear immediately
158193
if len(f.serverNames) > 0 && metrics.Hosts != nil {
159194
for _, name := range f.serverNames {
@@ -168,14 +203,24 @@ func (f *HTTPFetcher) Fetch(ctx context.Context) (*Snapshot, error) {
168203
}
169204

170205
return &Snapshot{
171-
Threads: threads,
172-
Metrics: metrics,
173-
Process: proc,
174-
FetchedAt: time.Now(),
175-
Errors: errs,
206+
Threads: threads,
207+
Metrics: metrics,
208+
Process: proc,
209+
FetchedAt: time.Now(),
210+
Errors: errs,
211+
HasFrankenPHP: f.hasFrankenPHP,
176212
}, nil
177213
}
178214

215+
func (f *HTTPFetcher) onConnected(ctx context.Context) {
216+
if !f.hasFrankenPHP {
217+
f.DetectFrankenPHP(ctx)
218+
}
219+
if len(f.serverNames) == 0 {
220+
f.FetchServerNames(ctx)
221+
}
222+
}
223+
179224
func (f *HTTPFetcher) RestartWorkers(ctx context.Context) error {
180225
ctx, cancel := context.WithTimeout(ctx, requestTimeout)
181226
defer cancel()

internal/fetcher/http_test.go

Lines changed: 216 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -406,6 +406,222 @@ caddy_http_request_duration_seconds_count{server="main"} 100
406406
assert.Equal(t, float64(100), main.RequestsTotal, "seeding should not overwrite existing host data")
407407
}
408408

409+
func TestOnConnected_DetectsFrankenPHP(t *testing.T) {
410+
frankenPHPAvailable := false
411+
srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
412+
switch r.URL.Path {
413+
case "/frankenphp/threads":
414+
if frankenPHPAvailable {
415+
w.WriteHeader(200)
416+
json.NewEncoder(w).Encode(ThreadsResponse{
417+
ThreadDebugStates: []ThreadDebugState{{Index: 0, State: "ready"}},
418+
})
419+
} else {
420+
w.WriteHeader(404)
421+
}
422+
case "/metrics":
423+
w.WriteHeader(200)
424+
default:
425+
w.WriteHeader(404)
426+
}
427+
}))
428+
defer srv.Close()
429+
430+
f := NewHTTPFetcher(srv.URL, 0)
431+
432+
snap, err := f.Fetch(context.Background())
433+
require.NoError(t, err)
434+
assert.False(t, snap.HasFrankenPHP, "should not detect FrankenPHP when unavailable")
435+
assert.Empty(t, snap.Threads.ThreadDebugStates)
436+
437+
frankenPHPAvailable = true
438+
439+
snap, err = f.Fetch(context.Background())
440+
require.NoError(t, err)
441+
assert.True(t, snap.HasFrankenPHP, "should detect FrankenPHP on next successful fetch")
442+
// Threads are fetched on the NEXT Fetch() after detection
443+
snap, err = f.Fetch(context.Background())
444+
require.NoError(t, err)
445+
assert.Len(t, snap.Threads.ThreadDebugStates, 1)
446+
}
447+
448+
func TestOnConnected_FetchesServerNames(t *testing.T) {
449+
serverNamesAvailable := false
450+
srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
451+
switch r.URL.Path {
452+
case "/config/apps/http/servers":
453+
if serverNamesAvailable {
454+
w.WriteHeader(200)
455+
json.NewEncoder(w).Encode(map[string]any{
456+
"main": map[string]any{"listen": []string{":443"}},
457+
})
458+
} else {
459+
w.WriteHeader(404)
460+
}
461+
case "/metrics":
462+
w.WriteHeader(200)
463+
default:
464+
w.WriteHeader(404)
465+
}
466+
}))
467+
defer srv.Close()
468+
469+
f := NewHTTPFetcher(srv.URL, 0)
470+
471+
f.Fetch(context.Background())
472+
assert.Empty(t, f.ServerNames())
473+
474+
serverNamesAvailable = true
475+
476+
snap, err := f.Fetch(context.Background())
477+
require.NoError(t, err)
478+
assert.Equal(t, []string{"main"}, f.ServerNames())
479+
require.NotNil(t, snap.Metrics.Hosts)
480+
assert.Contains(t, snap.Metrics.Hosts, "main")
481+
}
482+
483+
func TestOnConnected_NoRetryWhenMetricsFail(t *testing.T) {
484+
var detectCalls atomic.Int32
485+
srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
486+
switch r.URL.Path {
487+
case "/frankenphp/threads":
488+
detectCalls.Add(1)
489+
w.WriteHeader(404)
490+
case "/metrics":
491+
w.WriteHeader(500)
492+
default:
493+
w.WriteHeader(404)
494+
}
495+
}))
496+
defer srv.Close()
497+
498+
f := NewHTTPFetcher(srv.URL, 0)
499+
500+
f.Fetch(context.Background())
501+
f.Fetch(context.Background())
502+
f.Fetch(context.Background())
503+
504+
assert.Equal(t, int32(0), detectCalls.Load(), "should not attempt detection when metrics fail")
505+
}
506+
507+
func TestOnConnected_StopsAfterSuccess(t *testing.T) {
508+
var detectCalls atomic.Int32
509+
srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
510+
switch r.URL.Path {
511+
case "/frankenphp/threads":
512+
detectCalls.Add(1)
513+
w.WriteHeader(200)
514+
json.NewEncoder(w).Encode(ThreadsResponse{})
515+
case "/metrics":
516+
w.WriteHeader(200)
517+
default:
518+
w.WriteHeader(404)
519+
}
520+
}))
521+
defer srv.Close()
522+
523+
f := NewHTTPFetcher(srv.URL, 0)
524+
525+
f.Fetch(context.Background())
526+
f.Fetch(context.Background())
527+
f.Fetch(context.Background())
528+
529+
// Detection is called once (first fetch), then stops because hasFrankenPHP is true
530+
// But fetchThreads also hits /frankenphp/threads in subsequent fetches
531+
assert.True(t, f.HasFrankenPHP())
532+
}
533+
534+
func TestFetch_HasFrankenPHPInSnapshot(t *testing.T) {
535+
srv := newTestServer(200, ThreadsResponse{}, 200, "")
536+
defer srv.Close()
537+
538+
f := NewHTTPFetcher(srv.URL, 0)
539+
f.hasFrankenPHP = true
540+
541+
snap, err := f.Fetch(context.Background())
542+
require.NoError(t, err)
543+
assert.True(t, snap.HasFrankenPHP)
544+
}
545+
546+
func TestFetch_NoFrankenPHPInSnapshot(t *testing.T) {
547+
srv := newTestServer(404, nil, 200, "")
548+
defer srv.Close()
549+
550+
f := NewHTTPFetcher(srv.URL, 0)
551+
552+
snap, err := f.Fetch(context.Background())
553+
require.NoError(t, err)
554+
assert.False(t, snap.HasFrankenPHP)
555+
}
556+
557+
func TestFetch_PrometheusProcessFallback_RSS(t *testing.T) {
558+
metricsText := `# TYPE process_resident_memory_bytes gauge
559+
process_resident_memory_bytes 1.048576e+07
560+
# TYPE process_cpu_seconds_total counter
561+
process_cpu_seconds_total 12.5
562+
# TYPE process_start_time_seconds gauge
563+
process_start_time_seconds 1.7e+09
564+
`
565+
srv := newTestServer(404, nil, 200, metricsText)
566+
defer srv.Close()
567+
568+
f := NewHTTPFetcher(srv.URL, 0)
569+
570+
snap, err := f.Fetch(context.Background())
571+
require.NoError(t, err)
572+
assert.Equal(t, uint64(10485760), snap.Process.RSS, "RSS should come from Prometheus metrics")
573+
assert.True(t, snap.Process.Uptime > 0, "Uptime should be derived from process_start_time_seconds")
574+
assert.True(t, snap.Process.CreateTime > 0, "CreateTime should be derived from process_start_time_seconds")
575+
}
576+
577+
func TestFetch_PrometheusProcessFallback_CPU(t *testing.T) {
578+
metricsText := `# TYPE process_cpu_seconds_total counter
579+
process_cpu_seconds_total 10.0
580+
`
581+
srv := newTestServer(404, nil, 200, metricsText)
582+
defer srv.Close()
583+
584+
f := NewHTTPFetcher(srv.URL, 0)
585+
586+
// First fetch: records baseline, CPU=0 (no previous sample)
587+
snap, err := f.Fetch(context.Background())
588+
require.NoError(t, err)
589+
assert.Equal(t, float64(0), snap.Process.CPUPercent, "first fetch has no delta yet")
590+
591+
// Simulate time passing and CPU usage increasing
592+
f.lastPromSample = f.lastPromSample.Add(-1 * time.Second)
593+
f.lastPromCPU = 10.0
594+
595+
metricsText2 := `# TYPE process_cpu_seconds_total counter
596+
process_cpu_seconds_total 10.5
597+
`
598+
srv.Close()
599+
srv2 := newTestServer(404, nil, 200, metricsText2)
600+
defer srv2.Close()
601+
f.baseURL = srv2.URL
602+
603+
snap, err = f.Fetch(context.Background())
604+
require.NoError(t, err)
605+
assert.True(t, snap.Process.CPUPercent > 0, "CPU should be derived from Prometheus delta")
606+
}
607+
608+
func TestFetch_PrometheusProcessFallback_NotUsedWhenGopsutilWorks(t *testing.T) {
609+
metricsText := `# TYPE process_resident_memory_bytes gauge
610+
process_resident_memory_bytes 1.048576e+07
611+
`
612+
srv := newTestServer(404, nil, 200, metricsText)
613+
defer srv.Close()
614+
615+
f := NewHTTPFetcher(srv.URL, 0)
616+
// Simulate gopsutil having found the process and returned real RSS
617+
f.procHandle.proc = nil // no proc, but we'll set proc directly
618+
// We can't easily simulate gopsutil success in tests, so just verify
619+
// that the fallback IS used when proc.RSS == 0
620+
snap, err := f.Fetch(context.Background())
621+
require.NoError(t, err)
622+
assert.Equal(t, uint64(10485760), snap.Process.RSS)
623+
}
624+
409625
func TestFetchThreads_PerRequestTimeout(t *testing.T) {
410626
srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
411627
time.Sleep(requestTimeout + time.Second)

0 commit comments

Comments
 (0)