ogx-ai
diff --git a/‎benchmarking/__init__.py‎
Lines changed: 5 additions & 0 deletions b/‎benchmarking/__init__.py‎
Lines changed: 5 additions & 0 deletions
diff --git a/‎benchmarking/api_latency_comparison/README.md‎
Lines changed: 80 additions & 0 deletions b/‎benchmarking/api_latency_comparison/README.md‎
Lines changed: 80 additions & 0 deletions
diff --git a/‎benchmarking/api_latency_comparison/configs/stack-config-benchmark.yaml‎
Lines changed: 70 additions & 0 deletions b/‎benchmarking/api_latency_comparison/configs/stack-config-benchmark.yaml‎
Lines changed: 70 additions & 0 deletions
diff --git a/‎benchmarking/api_latency_comparison/experiment/__init__.py‎
Lines changed: 5 additions & 0 deletions b/‎benchmarking/api_latency_comparison/experiment/__init__.py‎
Lines changed: 5 additions & 0 deletions
@@ -0,0 +1,5 @@
+# Copyright (c) The OGX Contributors.
+# All rights reserved.
+#
+# This source code is licensed under the terms described in the LICENSE file in
+# the root directory of this source tree.
@@ -0,0 +1,80 @@
+# API Latency Comparison Benchmark
+
+Measures the per-request latency of two OGX versions under a controlled
+agentic workload. It compares an older release against a newer commit by
+running both against the same mocked backend and recording
+per-request response times.
+
+Analysis and model fitting will be added in a follow-up PR.
+
+## Overview
+
+The experiment is a randomized complete block design with three trials
+(treatment combinations). Each trial is replicated multiple times, and
+each replicate is a **run**: one row in the design matrix. The design
+matrix generator randomizes run order to guard against temporal
+confounding.
+
+The three trials are:
+
+- **Baseline**: the older version (e.g., latest release tag)
+- **Comparison**: the newer version under test
+- **Comparison control**: same commit as comparison, run independently as
+  a negative control for false positive detection
+
+Each run starts a fresh OGX server against a mock backend, sends
+agentic requests (with `web_search` tool calls) via Locust for a fixed
+duration, and records per-request latencies. The comparison control
+acts as a false-positive check: it runs the same code as the comparison
+independently, so any difference measured between the two is spurious
+and points to noise in the experiment itself.
+
+Components:
+
+- **Mock server** (`experiment/mock_server.py`): canned OpenAI + Brave Search responses
+- **Locust** (`experiment/locustfile_responses.py`): load generator, 1 concurrent user
+- **Experiment orchestrator** (`experiment/benchmark.py`): run execution with CPU pinning
+- **Worktree setup** (`experiment/setup-worktree.sh`): isolated git worktrees per version
+- **Design matrix** (`experiment/generate_design_matrix.py`): randomized experiment design
+
+## Prerequisites
+
+```bash
+# Benchmark experiment dependencies (Locust, mirakuru)
+uv sync --group api-latency-comparison
+```
+
+## Quick Start
+
+The orchestrator handles worktree setup, matrix generation, and
+experiment execution in one command:
+
+```bash
+uv run python -m benchmarking.api_latency_comparison.experiment.benchmark \
+  --baseline-ref v1.1.0 --comparison-ref HEAD --replicates 5
+```
+
+Output lands in an auto-timestamped directory under `results/`.
+
+## Configuration
+
+| Environment variable | Default | Description |
+|---|---|---|
+| `RESULTS_DIR` | auto-timestamped | Where to write results |
+| `MATRIX_CSV` | `$RESULTS_DIR/experiment-matrix.csv` | Experiment matrix |
+| `RUN_DURATION` | 10 | Seconds per run |
+| `MOCK_PORT` | 8080 | Mock server port |
+| `STACK_PORT` | 8321 | OGX server port |
+| `CPU_OGX` | 0 | Core for OGX server |
+| `CPU_LOCUST` | 1 | Core for Locust |
+| `CPU_MOCK` | 2 | Core for mock server |
+
+## Implementation Notes
+
+**CPU pinning**: Processes are pinned via `os.sched_setaffinity()` in
+`preexec_fn` callbacks, applied at fork before exec. Pinning is verified
+per run via `os.sched_getaffinity(pid)` after each server start.
+
+**Brave Search patching**: Older OGX versions don't have the `base_url`
+field on `BraveSearchToolConfig`. The setup script patches the field in
+via `sed` so the mock server can serve search results locally.
@@ -0,0 +1,70 @@
+version: 2
+distro_name: vertical-scaling-responses-agentic
+apis:
+- inference
+- responses
+- vector_io
+- tool_runtime
+- files
+providers:
+  inference:
+  - provider_id: openai
+    provider_type: remote::openai
+    config:
+      api_key: ${env.OPENAI_API_KEY:=fake-token}
+      base_url: ${env.OPENAI_BASE_URL:=http://localhost:8080/v1}
+  responses:
+  - provider_id: builtin
+    provider_type: inline::builtin
+    config:
+      persistence:
+        agent_state:
+          namespace: agents
+          backend: kv_default
+        responses:
+          table_name: responses
+          backend: sql_default
+  vector_io:
+  - provider_id: faiss
+    provider_type: inline::faiss
+    config:
+      kvstore:
+        namespace: vector_io::faiss
+        backend: kv_default
+      persistence:
+        namespace: vector_io::faiss_persistence
+        backend: kv_default
+  tool_runtime:
+  - provider_id: brave-search
+    provider_type: remote::brave-search
+    config:
+      api_key: "fake-benchmark-key"
+      max_results: 1
+      base_url: ${env.MOCK_SEARCH_URL:=http://localhost:8080}
+  files:
+  - provider_id: builtin-files
+    provider_type: inline::localfs
+    config:
+      storage_dir: /tmp/ogx-benchmark/files
+      metadata_store:
+        table_name: files_metadata
+        backend: sql_default
+storage:
+  backends:
+    kv_default:
+      type: kv_sqlite
+      db_path: /tmp/ogx-benchmark/kvstore.db
+    sql_default:
+      type: sql_sqlite
+      db_path: /tmp/ogx-benchmark/sql_store.db
+  stores:
+    metadata:
+      namespace: registry
+      backend: kv_default
+registered_resources:
+  models:
+  - model_id: mock-model
+    provider_id: openai
+    model_type: llm
+server:
+  port: 8321
@@ -0,0 +1,5 @@
+# Copyright (c) The OGX Contributors.
+# All rights reserved.
+#
+# This source code is licensed under the terms described in the LICENSE file in
+# the root directory of this source tree.