Commit 3e61c8e

Authored by mdr223, tli2, chjuncn, vitaglianog, and sivaprasadsudhir
Add Support For Azure OpenAI Models; Deprecate Llama 3.2 3B (#293)
* Fix broken dependencies (#227)
* Move DataRecord Internal Fields to Have Leading Underscore (#229)
* update README
* 1. support add_columns in Dataset; 2. support run().to_df(); 3. add demo in df-newinterface.py (#78)
* Support add_columns in Dataset; support demo in df-newinterface.py. Currently we have to do:

      records, _ = qr3.run()
      outputDf = DataRecord.to_df(records)

  I'll try to make qr3.run().to_df() work in another PR.
* ruff check --fix
* Support run().to_df(). Update run() to return a DataRecordCollection, so that it will be easier for users to support more features for run() output. We support to_df() in this change. I'll send out following commits to update the other demos.
* ruff check --fix
* fix typo in DataRecordCollection
* Update records.py
* fix tiny bug in mab processor. The code will run into an issue if we don't return any stats for this function in:

      max_quality_record_set = self.pick_highest_quality_output(all_source_record_sets)
      if (
          not prev_logical_op_is_filter
          or (
              prev_logical_op_is_filter
              and max_quality_record_set.record_op_stats[0].passed_operator
          )
      )

* update record.to_df interface to record.to_df(records: list[DataRecord], project_cols: list[str] | None = None), which is consistent with the other functions in this class.
* Update demo for the new execute() output format
* better way to get plan from output.run()
* fix getting plan from DataRecordCollection. People used to get the plan from execute() of the streaming processor, which is not a good practice. I updated plan_str to plan_stats, and they need to get the physical plan from the processor. Consider better ways to provide the executed physical plan to DataRecordCollection, possibly from stats.
* Update df-newinterface.py
* update code based on comments from Matt: 1. add cardinality param in add_columns; 2. remove extra testdata files; 3. add __iter__ in DataRecordCollection to help iterate over streaming output.
* see if copilot just saved me 20 minutes
* fix package name
* use sed to get version from pyproject.toml
* bump project version; keep docs behind to test ci pipeline
* bumping docs version to match code version
* use new __iter__ method in demos where possible
* add type hint for output of __iter__; use __iter__ in unit tests
* Update download-testdata.sh (#89): added enron-tiny.csv
* Clean up the retrieve API (#79)
* Clean up the retrieve operator interface
* fix comments
* Update to the new to_df() API
* Code update for #84 (#101)
* Create chat.rst (#96)
* Create chat.rst
* Update pyproject.toml: hotfix for chat
* Update conf.py: hotfix for chat.rst
* code update for #84. This implementation basically resolves #84. One implementation detail differs from #84:

      .add_columns(
          cols=[
              {"name": "sender", "type": "string", "udf": compute_sender},
              ...
          ]
      )

  If add_columns() used cols, udf, and types as params, it would make this function confusing again. Instead, if users need to specify different udfs for different columns, they should just call add_columns() multiple times for different columns.
* changed types to make use of Python type system; updated use of types in tests; updated docs and README
* update test to match no longer allowing None default

---------

Co-authored-by: Gerardo Vitagliano <vitaglianog@gmail.com>
Co-authored-by: Matthew Russo <mdrusso@mit.edu>

* Skip an operator if this is a duplicate op instead of raise error (#102)
* Create chat.rst (#96)
* Create chat.rst
* Update pyproject.toml: hotfix for chat
* Update conf.py: hotfix for chat.rst
* Skip an operator when it doesn't need any logical op instead of raising an error.

  # Final Effects
  1. Dataset() init only has one responsibility: wrap a datasource into a Dataset. I think this is a better interface.
  2. No extra convert() will be added to the plan.
  3. When users add the same op multiple times, e.g. dataset.convert(File).convert(File), the system will just dedup the same op instead of raising an error.

  # Issue
  Currently, Dataset(src, schema) initiation has 2 responsibilities: 1. read the source; 2. convert the source to the schema. When we use a default schema for Dataset init(source, schema=DefaultSchema) for users, the code works like: 1. Read the source into the schema that the DataSource provides. This schema is derived by the system, so the users don't know it (and don't need to know). 2. Convert the source schema to DefaultSchema. So every time, the system will make one more convert call to convert SourceSchema to DefaultSchema, which is definitely wrong.

  # Solution
  1. We use the schema from the Datasource if it exists, which is reasonable.
  2. If we do 1, then we'll get a dataset node with no actual op, as its input_schema == output_schema, so I updated a line in the optimizer to just skip the node if it doesn't do anything instead of raising an error.

  # Real Examples
  ## Before
  Generated plan:
  0. MarshalAndScanDataOp -> PDFFile
  1. PDFFile -> LLMConvertBonded -> DefaultSchema
     (contents, filename, text_conte) -> (value)
     Model: Model.GPT_4o
     Prompt Strategy: PromptStrategy.COT_QA
  2. DefaultSchema -> MixtureOfAgentsConvert -> ScientificPaper
     (value) -> (contents, filename, paper_auth)
     Prompt Strategy: None
     Proposer Models: [GPT_4o]
     Temperatures: [0.0]
     Aggregator Model: Model.GPT_4o
     Proposer Prompt Strategy: chain-of-thought-mixture-of-agents-proposer
     Aggregator Prompt Strategy: chain-of-thought-mixture-of-agents-aggregation
  3. ScientificPaper -> LLMFilter -> ScientificPaper
     (contents, filename, paper_auth) -> (contents, filename, paper_auth)
     Model: Model.GPT_4o
     Filter: The paper mentions phosphorylation of Exo1
  4. ScientificPaper -> MixtureOfAgentsConvert -> Reference
     (contents, filename, paper_auth) -> (reference_first_author, refere)
     Prompt Strategy: None
     Proposer Models: [GPT_4o]
     Temperatures: [0.8]
     Aggregator Model: Model.GPT_4o
     Proposer Prompt Strategy: chain-of-thought-mixture-of-agents-proposer
     Aggregator Prompt Strategy: chain-of-thought-mixture-of-agents-aggregation

  ## After
  Generated plan:
  0. MarshalAndScanDataOp -> PDFFile
  1. PDFFile -> LLMConvertBonded -> ScientificPaper
     (contents, filename, text_conte) -> (contents, filename, paper_auth)
     Model: Model.GPT_4o
     Prompt Strategy: PromptStrategy.COT_QA
  2. ScientificPaper -> LLMFilter -> ScientificPaper
     (contents, filename, paper_auth) -> (contents, filename, paper_auth)
     Model: Model.GPT_4o
     Filter: The paper mentions phosphorylation of Exo1
  3. ScientificPaper -> MixtureOfAgentsConvert -> Reference
     (contents, filename, paper_auth) -> (reference_first_author, refere)
     Prompt Strategy: None
     Proposer Models: [GPT_4o]
     Temperatures: [0.8]
     Aggregator Model: Model.GPT_4o
     Proposer Prompt Strategy: chain-of-thought-mixture-of-agents-proposer
     Aggregator Prompt Strategy: chain-of-thought-mixture-of-agents-aggregation

* make equality check for new field names a bit more explicit
* fix fixture usage
* update all plans within the code base to explicitly convert when needed; removed unnecessary schemas for reading from datasource

---------

Co-authored-by: Gerardo Vitagliano <vitaglianog@gmail.com>
Co-authored-by: Matthew Russo <mdrusso@mit.edu>

* Refactor demos to use .sem_add_columns or .add_columns instead of convert(), remove Schema from demos when possible. (#104)
* Create chat.rst (#96)
* Create chat.rst
* Update pyproject.toml: hotfix for chat
* Update conf.py: hotfix for chat.rst
* code update for #84. This implementation basically resolves #84. One implementation detail differs from #84:

      .add_columns(
          cols=[
              {"name": "sender", "type": "string", "udf": compute_sender},
              ...
          ]
      )

  If add_columns() used cols, udf, and types as params, it would make this function confusing again. Instead, if users need to specify different udfs for different columns, they should just call add_columns() multiple times for different columns.
* use field_values instead of field_types, since field_values have the actual key-value pairs, while field_types just contain fields and their types. records[0].schema is the schema of the output, which doesn't mean we already populated the schema into the record.
* Remove .convert() and use .sem_add_columns or .add_columns instead. This change is based on #101 and #102; please review them first, then this change. 1. This refactors all demos to use .sem_add_columns or .add_columns, and removes .convert(). 2. Remove Schema from demos, except demos using ValidationDataSource and dataset.retrieve() that need schema for now. We can refactor these cases later.
* ruff check --fix
* fix unittest
* demos fixed and unit tests running
* fix add_columns --> sem_add_columns in demo
* update quickstart to reflect code changes; shorten text as much as possible
* passing unit tests
* remove convert() everywhere
* fixes to correct errors in demos; update quickstart and docs

---------

Co-authored-by: Gerardo Vitagliano <vitaglianog@gmail.com>
Co-authored-by: Matthew Russo <mdrusso@mit.edu>

* Simplify Datasource (#103)

  ## Summary of PR changes

  **Note 1:** I did not change anything related to val_datasource (including tangential functions like Dataset._set_data_source()) as that will all be modified in a subsequent PR to reflect our discussion re: validation data.

  **Note 2:** I have completely commented out datamanager.py and config.py; for now I am willing to leave the code around in case we desperately need it for PalimpChat. However, my hope is that PalimpChat can be tweaked to work without the data manager and those files can be deleted before merging dev into main.

  **Note 3:** Despite the branch name, fixing the progress managers will be part of a separate PR.
  - Collapsed all four `DataSource` classes down to a single `DataReader` class
  - Limit the number of methods the user needs to implement to just `__len__()` and `__getitem__()`
  - (Switched from using `get_item()` --> `__getitem__()` in `DataReader`)
  - Provided `DataReader` directly to scan operators (also renamed `DataSourcePhysicalOp` --> `ScanPhysicalOp`)
  - Removed `DataDirectory()` from `src/` entirely; this included commenting out things which made use of the cache (e.g. caching computed `DataRecords` and codegen examples)
  - Got rid of `dataset_id` everywhere (which tracks with the previous bullet)
  - Removed the `Config` class, which was a relic of a bygone era (and also intertwined with the `DataDirectory()`)
  - Updated all demos to use `import palimpzest as pz` to make the import statement(s) more welcoming
  - Fixed one bug resulting from converts now producing union schemas. Instead of including the `output_schema` in an operator's `get_id_params()`, we simply report the `generated_fields`.
  - Changed `source_id` --> `source_idx` everywhere (this eliminated some weird renaming logic)
  - Finally, I added a large set of documentation for the DataSource class(es)

* Multi-LLM Refinement Pipeline for Query Output Validation (#118)
* Multi-LLM Refinement Pipeline for Query Output Validation (#92)

  ## Summary of PR
  This PR contains the work to add a new `CriticConvert` physical operator to PZ. At a high level, this operator runs a bonded convert, and then asks a critic model if the answer produced by the bonded convert can be improved upon. The original output and the critique are then fed into a refinement model, which produces the improved output. The work to implement this includes:
  1. Defining the physical operator in `src/palimpzest/query/operators/critique_and_refine_convert.py`
  2. Adding an implementation rule for this physical operator in `src/palimpzest/query/optimizer/rules.py`
  3. Adding boolean flag(s) to enable allowing / disallowing this physical optimization
  4. Adding base prompts for the critique and refinement generations

  One other change which this work spawned was an attempt to improve the management and construction of our prompts -- and to decouple this logic from the `BaseGenerator` class. On the management side, I split our single `prompts.py` file into a set of files. On the construction side, I created a `PromptFactory` class which templates prompts based on the `prompt_strategy` and input record. The `PromptFactory` is not a perfect solution, but I think it is a step in the right direction. Finally, I fixed an error which previously filtered out `RAGConvert` operators from being considered by the `Optimizer`, and I made 2-3 more miscellaneous small tweaks.

---------

Co-authored-by: Yash Agarwal <yash94404@gmail.com>
Co-authored-by: Yash Agarwal <yashaga@Yashs-Air.attlocal.net>

* MkDocs Site for Palimpzest API Documentation (#116)

  ## Summary of PR Changes
  1. Changed `docs` to use [MkDocs](https://www.mkdocs.org/) instead of Sphinx
  2. Created initial `Getting Started` content
  3. Created placeholders for `User Guide` content (to follow in a subsequent PR)
  4. Added autogenerated docs for our most user-facing code (we will need to add docstrings to our code in a subsequent PR)
  5. Made small tweaks to `src/` to allow users to specify policy using kwargs in `.run()`
  6. Renamed the `testdata/enron-tiny/` files so that they're not so damn weird

---------

Co-authored-by: Yash Agarwal <yash94404@gmail.com>
Co-authored-by: Yash Agarwal <yashaga@Yashs-Air.attlocal.net>

* remove registration of sources from CI; only check version bump if there is a code change
* remove filter for only checking version bump when src files changed
* Rename `nocache` --> `cache` everywhere (#128)
* first commit
* Removed myenv
* added to git ignore
* addressed the comments in review
* flip one minor comment
* minor spacing fix
* fix spaces in a few more spots

---------

Co-authored-by: Bari LeBari <barilebari@dhcp-10-29-207-160.dyn.MIT.EDU>
Co-authored-by: muhamed <muhamed@mit.edu>
Co-authored-by: Matthew Russo <mdrusso@mit.edu>

* adding citation (and making 'others' explicit) (#136)
* Make Generator thread-safe (#139)
* fix moa prompt
* fix moa prompt aggregator
* update version
* make generator thread-safe
* update generator to return messages
* address comments
* Begin Process of Improving Index Abstraction(s) in PZ (#138)
* quick and dirty implementation which tracks retrieve costs
* bug fixes and currently unused index code
* add default search func which I forgot to implement and add chromadb to pyproject.toml
* leaving TODO
* hotfix to add cost for retrieve operation
* another hotfix to add ragatouille dependency
* Add logger for PZ (#134)
* add logger for PZ: 1. when verbose=True, we save all logs to log_file and print them on console; 2. when verbose=False, we only save ERROR+ logs to file and print ERROR+. I just added logging where I think it might be important for the execution; we can always add/remove more or less. Also, I might update the logging messages based on my later annotation work, but this PR should set up the logging mechanism for now.
* ruff fix
* update code based on comments: 1. not logging output_records; 2. not logging plan_stats; 3. move the log files to ".pz_logs"

---------

Co-authored-by: Matthew Russo <mdrusso@mit.edu>

* fix merge bug (#141)
* ruff fix
* update log dir and fix tiny bug
* fix merge bug
* Use a singleton API client for operators (#140)
* fix moa prompt
* fix moa prompt aggregator
* update version
* make generator thread-safe
* update generator to return messages
* address comments
* create a singleton API client
* fix linting
* fix logging in generators
* also create parent dir. if missing
* CUAD benchmark (#143)
* fix moa prompt
* fix moa prompt aggregator
* update version
* make generator thread-safe
* update generator to return messages
* address comments
* create a singleton API client
* fix linting
* fix logging in generators
* fix CUAD benchmark
* fix type
* minor fixes
* Limit the Scope of Logging within the Optimizer (#144)
* making it possible to set log level based on env. variable; adding time limit on seven filters test
* deleting instead of commenting out
* Remove Conventional LLM Convert; Update Bonded LLM Convert retry logic (#145)
* use NullHandler in __init__ and let application control logging config (#146)
* use NullHandler in __init__ and let application control logging config
* ruff fix
* Fix Progress Manager and Simplify `execute_plan` methods (#148)
* modifying ProgressManager class to allow for dynamically adding tasks
* beginning to use new progress manager
* initial rewrite of execute_plan methods with new progress manager
* unit tests passing
* trim a few lines
* unit tests passing; changes applied everywhere; MAB and Random coming in a separate PR
* enable final operator to show progress in parallel
* address comments
* The great deletion (#149)
* Adding Preliminary Work on Abacus and MAB Sentinel Execution (#147)
* updating models to avoid llama3
* fix parsing bugs and some generation errors
* don't require json for proposer and code synth generations; fix prompt format instruction for proposers
* fix typo/bug
* fix bugs in generator prep for field_answers; fix bug in filter impl.; other improvements
* adding new file for abacus workload
* fix len
* fix errors with dataset copy; prompt construction; and more
* remove JSON instruction from MOA proposer
* fixed bugs in optimizer configuration, llama 3.3 generation, and filter generation
* clean up demos; fix missing base prompt from map
* add one more missing base prompt
* prepare demo for full run; get embedding cost info from RAGConvert; use reasoning output from Critique
* add script to generate text-embedding-3-small reaction embeddings
* write to .chroma
* run full scale generation
* compute embeddings slowly and add progress bar
* add sleep
* fix import
* add total iters
* create embeddings before ingesting
* fix index start and finish
* load embeddings and insert directly
* make chroma use cosine sim.; finish initial search fcn. for biodex workload; naming tweak in rag convert
* capturing gen stats in Retrieve
* added UDF map operator; rewrote biodex pipeline to match docetl impl.; switched to using __name__ for functions instead of str()
* add optimizations back in
* write data to csv in demo
* limit to same model choice(s) as docetl and lotus
* fix punctuation error(s)
* try run without filter
* remove unused demo file
* remove print
* remove prints
* remove costed_phys_op_ids which were used for debugging
* try slightly diff. approach
* remove temp changes while branch is in PR review
* remove depends_on for map
* fix iteration bug in sentinel processors
* one more hotfix
* fix more errors w/SentinelPlanStats and sentinel processors
* remove logger lib to reduce confusion (#159)
* Update research.md (#160): AISD @ NAACL 2025
* Add Pneuma-Palimpzest Integration Demo (#158)
* Add Pneuma demo
* Remove dataset semantic column addition
* Fix progress managers episode 2 attack of the clones (#156)
* modifying ProgressManager class to allow for dynamically adding tasks
* beginning to use new progress manager
* initial rewrite of execute_plan methods with new progress manager
* unit tests passing
* trim a few lines
* unit tests passing; changes applied everywhere; MAB and Random coming in a separate PR
* enable final operator to show progress in parallel
* initial work to refactor sentinel processors
* passing unit tests
* checking in minor changes
* remove use of setup_logger inside library
* stuff seems to be working
* big print
* turn off rag for test
* try debugging exception
* checking in code before changes to scoring
* finished initial refactoring of mab sentinel execution strategy
* get random sampling execution working with changes
* passing unit tests
* nosentinel progress looks good
* eyeball test is working for progress bars
* remove the old gods
* revert small change
* pull up progress manager logic in parallel execution
* catch errors in generating embeddings
* fix comments
* Merging in Changes for Sentinel Progress Bars; Split Convert (off by default); `demos/enron-demo.py`; and MMQA Benchmark (#163)
* modifying ProgressManager class to allow for dynamically adding tasks
* beginning to use new progress manager
* initial rewrite of execute_plan methods with new progress manager
* unit tests passing
* trim a few lines
* unit tests passing; changes applied everywhere; MAB and Random coming in a separate PR
* enable final operator to show progress in parallel
* initial work to refactor sentinel processors
* passing unit tests
* checking in minor changes
* remove use of setup_logger inside library
* stuff seems to be working
* big print
* turn off rag for test
* try debugging exception
* checking in code before changes to scoring
* finished initial refactoring of mab sentinel execution strategy
* get random sampling execution working with changes
* passing unit tests
* nosentinel progress looks good
* eyeball test is working for progress bars
* remove the old gods
* revert small change
* pull up progress manager logic in parallel execution
* adding prints to generator; turn progress off in favor of verbose for now
* catch errors in generating embeddings
* inspect frontier updates
* remove args.workload
* fix num_inputs in selectivity computation
* pdb in score
* fixed score fn issue
* use execution cache to avoid unnecessary computation; use sentinel stats for updating frontier
* fix progress counter
* debug
* fix empty stats
* only count stats from newly computed results
* fix tuple unpacking
* only update sample counts for llm ops
* de-dup duplicate record
* ugh
* dont forget to increment
* plz
* more plz
* increment
* recycle ops back onto reservoir so they may be reconsidered in the future
* remove pdb
* add progress to script args
* try without rag
* use term recall
* just check in on term recall
* make it easier to turn off progress
* remove pdb
* try to get re-rank to keep all inputs
* try to generate more reactions
* track total LLM calls
* 10x parallelism
* try retrieve directly on fulltext
* up max workers
* adding enron-demo w/optimization
* remove config option
* adding recall and precision to output
* allow operators to be recycled back onto frontier
* revert to using reactions instead of fulltext for similarity
* better cycling of off-frontier operators
* safety check on reservoir ops
* remove pdb
* fixing 5 results per query
* investigate sampling behavior
* check on seeds
* remove pdb
* test SplitConvert
* debug chunking
* fix bug in rag and split convert
* run with chunks
* test chunking logic
* fix chunking logic
* sum list
* remove split merge for now
* minor fixes to CUAD script
* add embedding scripts for mmqa tables and image titles
* address issue with empty titles and title collisions
* prepare script for using clip embeddings for images
* fix bug
* get full space of possible extensions
* debug
* weird bug fix?
* more debug
* fix idiotic mistake
* handle corrupted images and minor things
* add another corrupted image
* another one
* anotha
* more bad images
* last disallow file
* prepare cuad for runs
* specify execution strategy
* up samples
* add sentinel execution strategy to output name
* adding plan str and more stats
* specify no prior
* verbose=False
* fix comment; comment out prints
* make split merge optional for now
* addressing comments
* applying syntax changes to pneuma demo and supporting strings within retrieve
* bump version; fix lint; fix docs
* more docs tweaks; tweaking dependencies
* fix install issues
* one more version fix
* one more version fix
* one more version fix
* one more version fix
* last try
* change runner python version
* actually changing runner python version
* increase time limit for runners
* increase time limit for runners
* Merge in Changes From Final Abacus Work (WIP) (#173)
* modifying ProgressManager class to allow for dynamically adding tasks
* beginning to use new progress manager
* initial rewrite of execute_plan methods with new progress manager
* unit tests passing
* trim a few lines
* unit tests passing; changes applied everywhere; MAB and Random coming in a separate PR
* enable final operator to show progress in parallel
* initial work to refactor sentinel processors
* passing unit tests
* checking in minor changes
* remove use of setup_logger inside library
* stuff seems to be working
* big print
* turn off rag for test
* try debugging exception
* checking in code before changes to scoring
* finished initial refactoring of mab sentinel execution strategy
* get random sampling execution working with changes
* passing unit tests
* nosentinel progress looks good
* eyeball test is working for progress bars
* remove the old gods
* revert small change
* pull up progress manager logic in parallel execution
* adding prints to generator; turn progress off in favor of verbose for now
* catch errors in generating embeddings
* inspect frontier updates
* remove args.workload
* fix num_inputs in selectivity computation
* pdb in score
* fixed score fn issue
* use execution cache to avoid unnecessary computation; use sentinel stats for updating frontier
* fix progress counter
* debug
* fix empty stats
* only count stats from newly computed results
* fix tuple unpacking
* only update sample counts for llm ops
* de-dup duplicate record
* ugh
* dont forget to increment
* plz
* more plz
* increment
* recycle ops back onto reservoir so they may be reconsidered in the future
* remove pdb
* add progress to script args
* try without rag
* use term recall
* just check in on term recall
* make it easier to turn off progress
* remove pdb
* try to get re-rank to keep all inputs
* try to generate more reactions
* track total LLM calls
* 10x parallelism
* try retrieve directly on fulltext
* up max workers
* adding enron-demo w/optimization
* remove config option
* adding recall and precision to output
* allow operators to be recycled back onto frontier
* revert to using reactions instead of fulltext for similarity
* better cycling of off-frontier operators
* safety check on reservoir ops
* remove pdb
* fixing 5 results per query
* investigate sampling behavior
* check on seeds
* remove pdb
* test SplitConvert
* debug chunking
* fix bug in rag and split convert
* run with chunks
* test chunking logic
* fix chunking logic
* sum list
* remove split merge for now
* minor fixes to CUAD script
* add embedding scripts for mmqa tables and image titles
* address issue with empty titles and title collisions
* prepare script for using clip embeddings for images
* fix bug
* get full space of possible extensions
* debug
* weird bug fix?
* more debug
* fix idiotic mistake
* handle corrupted images and minor things
* add another corrupted image
* another one
* anotha
* more bad images
* last disallow file
* prepare cuad for runs
* specify execution strategy
* up samples
* add sentinel execution strategy to output name
* adding plan str and more stats
* specify no prior
* verbose=False
* fix comment; comment out prints
* make split merge optional for now
* addressing comments
* applying syntax changes to pneuma demo and supporting strings within retrieve
* add prints
* debug sample sets
* checking in code before tweaks to mab
* state of repo after running final Abacus experiments
* revert to opt-profiling-data
* removing print statement
* remove prints
* final fixes
* removing ragatouille dependency
* fix ruff lint checks
* bump version
* passing tests locally
* remove pdb
* fix complaint about match
* Move Abacus Research Scripts into Separate Folder (#175)
* re-organizing abacus research-related scripts
* fix model selection and other tweaks
* add data download script
* bump version
* remove scripts from root
* removing python files which were merged back in from main
* Fixed Issue(s) with Aggregate Operator Computation for Movie Queries (WIP) (#182)
* queries 1-4 working for movies
* removing RandomSampling
* Create `Context` Class + `compute` and `search` operators (#186)
* checking in changes
* refactored Dataset
* checking in
* checking in
* checking in
* queries extract final answer now
* checking in changes w/search operator
* adding changes to agents
* add isinstance checks to all executors
* removing script
* remove tools; include in future PR
* Remove `pz.Schema` in Favor of Using `pydantic.BaseModel` (#188)
* made changes throughout codebase and updated unit tests
* checking in; debugging failure with image use case
* simple demo / paper demos working
* eliminate caching features (#195)
* removing all code synthesis (#198)
* removing all code synthesis
* remove unused import
* Using LiteLLM to Manage Generator Clients / Completion APIs (#200)
* use LiteLLM for generators
* remove unused function; add TODO
* Added Anthropic Support; Simplified Rules; Removed Redundant Model Helpers (#202)
* changes after simplifying rules
* passing unit tests; removed unnecessary model helpers
* simplified primitives slightly
* fixing the assertion which used FieldInfo instead of FieldInfo.annotation (#204)
* add support for o4-mini, gemini-2.5-pro, gemini-2.0-flash, llama-4-maverick (#205)
* Adding Semantic Join Operator (#206)
* initial changes to support validator class; fixed bug in generator for images
* adding validator based optimization
* validator agent example working
* using o1 model; made validation more efficient
* added initial nested loops join implementation
* passing tests
* unit tests passing
* unit tests passing
* enron-demo.py working
* join demos in place
* parallel join and other bugfixes (#207)
* audio-demo (#208)
* remove pdb
* adding option to only use gemini models in audio demo
* adding parallelism; fixed bug w/unique_logical_op_id (#209)
* fixed issue which removed pipelined execution of operators in parallel setting (#210)
* Movie bugfixes (#211)
* fixed error in cost computation for gemini models; tested join on movie queries
* make join count monotonic
* removing progress bar updates for join for now
* adding reasoning effort (#212)
* made progress manager more efficient; made join op calculations accurate (#213)
* make groupby ignore None values
* make it possible to specify schema for MemoryDataset; reasoning model fixes
* adding audio-only match in substitution (#214)
* quick fix for audio prompt missing in MoA
* support passing in gemini/vertex credentials path; fix minor bugs in audio generation (#216)
* adding Distinct operator to PZ (#217)
* masking filepaths for sembench; fix audio pricing (#218)
* make GroupBySig a pz. import
* remove email demo
* reproduce abacus results
* add notes about deprecation to scripts for generating priors
* remove unsupported demos
* sem_add_columns -> sem_map
* Dev staging (#220)
* edit cuad abacus scripts to use local data
* edit cuad abacus scripts to use local data
* edit cuad abacus scripts to use local data
* fix: cuad data loader doesn't work via huggingface anymore (#215)
* edit cuad abacus scripts to use local data
* edit cuad abacus scripts to use local data
* edit cuad abacus scripts to use local data

---------

Co-authored-by: mdr223 <mdrusso@mit.edu>

---------

Co-authored-by: Shreya Shankar <ss.shankar505@gmail.com>

* adding early support for vllm models
* changes to appease linter
* remove models now that we have access to gpt-5
* only perform time check on local; CI runners are slow
* Support google api and desc (#222)
* support shreya models and re-support desc
* adding gpt-5-nano to gpt-5 models
* bump version
* fixed merge error
* fixing bug where id column in schema overrides DataRecord.id

---------

Co-authored-by: Jun <130543538+chjuncn@users.noreply.github.com>
Co-authored-by: Gerardo Vitagliano <vitaglianog@gmail.com>
Co-authored-by: Sivaprasad Sudhir <sivaprasad2626@gmail.com>
Co-authored-by: Yash Agarwal <yash94404@gmail.com>
Co-authored-by: Yash Agarwal <yashaga@Yashs-Air.attlocal.net>
Co-authored-by: Bari Bo LeBari <143016395+lilbarbar@users.noreply.github.com>
Co-authored-by: Bari LeBari <barilebari@dhcp-10-29-207-160.dyn.MIT.EDU>
Co-authored-by: muhamed <muhamed@mit.edu>
Co-authored-by: Tranway1 <tranway@qq.com>
Co-authored-by: Luthfi Balaka <luthfibalaka@gmail.com>
Co-authored-by: Shreya Shankar <ss.shankar505@gmail.com>

* Add Optimizations for Filter and Join Operators (#230)
* rename files to reflect that they will contain filter and map physical operators
* passing map unit tests
* passing filter tests
* finished tests
* adding tests for joins and initial embedding join
* adding vllm test
* fixed embedding join
* filter for filepaths instead of assert
* add embedding cost
* fixed full hashes bug with deep copy
* bump version
* undo linting change
* Reorder bug (#232)
* fixing map/filter/join tests for CI which doesn't have GEMINI access; adding test for real estate bug
* added exploration to re-order converts
* separate lack of gemini from ci tests
* Data Record Refactor (#233)
* Refactor DataRecord to hold data in the BaseModel member instead of separately.
* Some type fixes
* local unit tests passing
* enforce data record id uses list of schema fields
* remove unused code from copy
* use function instead of class internals

---------

Co-authored-by: Tianyu Li <litianyu@mit.edu>

* Updating Website to Use Docusaurus (#234)
* adding docusaurus website; still haven't updated doc content and home page
* fix links at bottom of page
* updated pages for website; docs are still not auto-rendered
* updating ci pipelines
* update path to package
* update node version
* update package
* fix build commands
* fix trigger
* fix runner and import
* fix some DataRecord inits
* switch to running llms w/separate flag b/c one test can fail due to bad generation
* changes to be more flexible on types for abacus scripts
* guessing at fix for build path
* removing old website
* remove commented ci code
* remove mkdocs from pyproject
* remove prints
* fix location of CNAME file
* Opt fixes (#236)
* fixed errors in optimizer
* added palimpchat page
* passing unit tests
* also relax types on train datasets
* bump version
* try lowercasing c
* fixed route
* eliminate slowdown from stringifying sentinel plan(s)
* bump version
* allow enron demo to swap filters w/convert
* remove print statements in validator and fix bug introduced for bytes fields
* bump version
* adding min and max
* fixing assertion error
* fix no reasoning prompt templating issue(s)
* add semantic aggregation operator
* bump version
* fix mock call in unit test
* add google analytics tracking
* Updated Website User Guide(s); Renamed `retrieve()` --> `sem_topk()` (#244)
* checking in in-flight changes
* adding code for unmatched records in left/right/outer joins
* optimization stuck
* new mmqa script is functional
* minor bugfixes
* fix naive estimates with new operators
* updated website user guides; renamed retrieve --> top-k
* fix defaults for join op
* bumping version
* fix documentation links
* Add Cost-Based Sample Budget; Fix RAGConvert/Filter for `str | Any` Types (#247)
* checking in in-flight changes
* adding code for unmatched records in left/right/outer joins
* optimization stuck
* new mmqa script is functional
* minor bugfixes
* fix naive estimates with new operators
* updated website user guides; renamed retrieve --> top-k
* add cost-based sample budget; fix rag convert and filter for str | Any fields
* Fix missing comma causing vLLM completions to break (#246)
* bumping version
* Final Changes from Revision for Abacus (#250)
* checking in in-flight changes
* adding code for unmatched records in left/right/outer joins
* optimization stuck
* new mmqa script is functional
* minor bugfixes
* fix naive estimates with new operators
* updated website user guides; renamed retrieve --> top-k
* add cost-based sample budget; fix rag convert and filter for str | Any fields
* pushing local mmqa experiment
* try n=20
* preparing final runs for table 2
* fix thread safety issue w/EmbeddingJoin
* adding full ablation study
* bugfixes in operators
* adding final revision work from local
* updated readme
* adding changes from berners-lee
* remove comments
* fix linting and bump version
* Blebari task 131 (#241)
* .
* .
* minor tweaks
* add embedding costs to RecordOpStats
* minor tweaks
* change comment

---------

Co-authored-by: Bari LeBari <barilebari@dhcp-10-29-128-127.dyn.mit.edu>
Co-authored-by: Bari LeBari <barilebari@Baris-MacBook-Pro.local>
Co-authored-by: Matthew Russo <mdrusso@mit.edu>

* adding real-estate-eval-100 to download script
* adding real-estate-demo
* jczhang add model checks (#254)
* adding checks that user has support for models they need
* check if available models is empty
* trying to resolve dependency
* bump version
* gemini studio api issue (#257)
* recreating the issue
* fixing model provider for google AI studio
* add try-except back

---------

Co-authored-by: Matthew Russo <mdrusso@mit.edu>

* bump version
* fix model check
* Fix no reasoning (#270)
* enforce that setting reasoning effort to None turns off reasoning prompts; fix config copy error
* bump version
* update constants to reflect the cached-input token costs
* update GenerationStats
* update GenerationStats to include cache token/cost
* fix typo
* update stats in GenerationStats
* prompt caching implementation
* split cache tokens into read and creation
* restructure prompt caching into PromptCacheManager class
* update CacheManager class
* caching demo
* add claude sonnet 4.0 (temporary)
* fix pretty print error for anthropic
* propagate cache-related stats end-to-end
* fix bug for gemini model
* claude-3-7 deprecated
* fix formatting issues
* fix formatting issues
* fixing comments
* update token/cost logic to be disjoint for input and cache
* update demo
* Generalize Support for LiteLLM Models #265 (#272)
* model_info (Model -> ConfiguredModel in constants) - 265
* predictor function for unknown spec
* update full list of API keys
* add gemini3 and gpt5.2 to constants
* return models based on opt obj when models is None
* reorganize functions in model info/helper
* add tests and update model references and imports
* move validation from config to query processor
* add json file for model score/latency and update predictor function
* update model references and imports
* update dependencies and related test cases
* update Model to have both string and enum
* model_info -> model_helper
* update model usage in query config
* rollback import changes for CuratedModel -> Model
* ModelProvider class
* update all switch cases to ModelProvider when applicable
* reverted CuratedModel changes
* add test cases
* add additional test cases
* fix formatting issues
* add prompt caching stats for #262
* restructure Model class
* fix Model enum issue
* add sorting logic to model class
* use singular json file for info fetching
* expand model list and update curated_model_info file
* restructure model info fetching, update Model class and test cases
* script to update pz_models_information and update get_optimal_models
* is_deepseek_model
* add audio cache read/creation
* remove claude sonnet 3.5 (retired)
* add deepseek-chat
* add .json files to pyproject.toml so they are packaged too
* revert uvicorn dependency
* some small tweaks
* passing tests

---------

Co-authored-by: joycequ <joycequ@mit.edu>
Co-authored-by: Matthew Russo <mdrusso@mit.edu>

* fixed model function calls
* clean up duplicate code to help with summing field stats
* update fields for classes in models.py, update usage in generators.py
* add test generation file
* add test generation file
* generator messages
* update anthropic stats
* update input/cache token stats
* remove generator messages from github repo
* update generator test cases and implement initial gemini wrapper class
* delete output audio tokens and update gemini client class
* ruff lint for test cases
* fix gemini reasoning effort bug
* fix cost and image issues
* incorporate all pr comments
* make anthropic version more flexible
* Revert "make anthropic version more flexible" (this reverts commit 8eeed67)
* floatify everything
* all but two tests passing
* bump version and relax tests
* Local Model Execution (vLLM) #266 (#282)
* local vllm execution implementation
* update vllm local specs (predictors)
* more robust detection of local model capabilities
* fix formatting
* test script formatting update
* adding placeholder for vllm cache tokens
* remove prints
* remove print
* reverted type
* fix type annotation
* tests passing

---------

Co-authored-by: Matthew Russo <mdrusso@mit.edu>

* Allowing providers other than OpenAI for embeddings (#283)
* Removing hard-coded TEXT_EMBEDDING_3_SMALL in RAG and JOIN operators
* remove whitespace
* fixed embedding access in RAGFilter
* fix id/op_params for RAG ops and EmbeddingJoin; update rules to enforce CLIP cannot be used for text-only
* fix value
* unit tests passing

---------

Co-authored-by: Matthew Russo <mdrusso@mit.edu>

* fixed issue #286 and bumped version
* fix linter errors
* quest evals
* adding support for azure openai models (#292) (see the usage sketch after this changelog)
* adding support for azure openai models
* added warning in comment
* bump version
* fix typos
* fix linter error

---------

Co-authored-by: Tianyu Li <litianyu@mit.edu>
Co-authored-by: Jun <130543538+chjuncn@users.noreply.github.com>
Co-authored-by: Gerardo Vitagliano <vitaglianog@gmail.com>
Co-authored-by: Sivaprasad Sudhir <sivaprasad2626@gmail.com>
Co-authored-by: Yash Agarwal <yash94404@gmail.com>
Co-authored-by: Yash Agarwal <yashaga@Yashs-Air.attlocal.net>
Co-authored-by: Bari Bo LeBari <143016395+lilbarbar@users.noreply.github.com>
Co-authored-by: Bari LeBari <barilebari@dhcp-10-29-207-160.dyn.MIT.EDU>
Co-authored-by: muhamed <muhamed@mit.edu>
Co-authored-by: Tranway1 <tranway@qq.com>
Co-authored-by: Luthfi Balaka <luthfibalaka@gmail.com>
Co-authored-by: Shreya Shankar <ss.shankar505@gmail.com>
Co-authored-by: Griffin Roupe <31631417+frostyfan109@users.noreply.github.com>
Co-authored-by: Bari LeBari <barilebari@dhcp-10-29-128-127.dyn.mit.edu>
Co-authored-by: Bari LeBari <barilebari@Baris-MacBook-Pro.local>
Co-authored-by: Jerry Zhang <122544742+xqlcn@users.noreply.github.com>
Co-authored-by: joycequ <joycequ2016@gmail.com>
Co-authored-by: joycequu <65379523+joycequu@users.noreply.github.com>
Co-authored-by: joycequ <joycequ@mit.edu>
Co-authored-by: SoTrx <11771975+SoTrx@users.noreply.github.com>
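The headline feature of this commit is Azure OpenAI support. A minimal, untested sketch of what selecting the new model might look like: Model.AZURE_GPT_4o and the AZURE_* environment variables come from this commit's diffs, while the palimpzest.constants import path and the available_models kwarg on QueryProcessorConfig are assumptions that may differ from the released API.

    import os
    import palimpzest as pz
    from palimpzest.constants import Model  # import path is an assumption

    # Credentials read by the Azure code paths added in this commit
    os.environ["AZURE_API_KEY"] = "<your-azure-openai-key>"
    os.environ["AZURE_API_BASE"] = "https://<your-resource>.openai.azure.com"
    os.environ["AZURE_API_VERSION"] = "2024-12-01-preview"

    # Tiny in-memory dataset, mirroring the style of evals/quest/eval.py below
    dataset = pz.MemoryDataset(
        id="azure-demo",
        vals=[{"text": "hello there"}],
        schema=[{"name": "text", "type": str, "desc": "raw text"}],
    )
    plan = dataset.sem_filter("The text is a greeting.", depends_on=["text"])

    # available_models is a guess at how to pin the optimizer to the Azure model
    config = pz.QueryProcessorConfig(available_models=[Model.AZURE_GPT_4o])
    output = plan.run(config)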
Parent: 738b698

17 files changed

Lines changed: 504 additions & 26 deletions

evals/quest/eval.py

Lines changed: 159 additions & 0 deletions
@@ -0,0 +1,159 @@

import argparse
import copy
import json
import os
import random
import time

import palimpzest as pz


def prepare_docs_for_query(items: list, gt_docs: list) -> list:
    # Shuffle a copy of the corpus, keep every ground-truth doc, then pad
    # with other docs until we have 1,000 candidates (or run out).
    items = copy.copy(items)
    random.shuffle(items)
    final_items = [doc for doc in items if doc["title"] in gt_docs]
    while len(final_items) < 1000 and len(items) > 0:
        item = items.pop(0)
        if item not in final_items:
            final_items.append(item)
    return final_items


# Returns (predicted titles, runtime in seconds, cost in dollars).
def palimpzest_run_query(query: dict, documents: list) -> tuple[list[str], float, float]:
    gt_docs = query["docs"]
    items = prepare_docs_for_query(documents, gt_docs)

    schema = [
        {"name": "title", "type": str, "desc": "Document title"},
        {"name": "text", "type": str, "desc": "Document content"},
    ]

    dataset = pz.MemoryDataset(
        id="quest-docs",
        vals=items,
        schema=schema,
    )

    query_text = query["query"]
    plan = dataset.sem_filter(
        f'This document is relevant to the entity-seeking query: "{query_text}". '
        "Return True if the document helps answer the query, False otherwise.",
        depends_on=["text"],
    ).project(["title"])

    config = pz.QueryProcessorConfig(
        policy=pz.MaxQuality(),
        execution_strategy="parallel",
        progress=True,
    )
    output = plan.run(config)
    execution_stats = output.execution_stats
    time_secs = execution_stats.total_execution_time if execution_stats else 0.0
    cost = execution_stats.total_execution_cost if execution_stats else 0.0
    return [record["title"] for record in output], time_secs, cost


def main():
    parser = argparse.ArgumentParser(description="Evaluate Palimpzest on QUEST")
    parser.add_argument(
        "--domain",
        type=str,
        required=True,
        choices=["films", "books"],
        help="The domain to evaluate.",
    )
    parser.add_argument(
        "--queries",
        type=str,
        required=True,
        help="Path to the file containing the queries (e.g. test.jsonl).",
    )
    parser.add_argument(
        "--documents",
        type=str,
        default="data/documents.jsonl",
        help="Path to documents.jsonl (QUEST format: title, text per line).",
    )
    parser.add_argument(
        "--limit",
        type=int,
        default=None,
        help="Limit number of queries to evaluate (for debugging).",
    )
    parser.add_argument(
        "--seed",
        type=int,
        default=42,
        help="Random seed for document shuffling.",
    )
    args = parser.parse_args()

    random.seed(args.seed)

    if not os.path.exists(args.documents):
        raise FileNotFoundError(f"Documents file not found: {args.documents}")
    with open(args.documents) as f:
        documents = [json.loads(line) for line in f]

    queries = []
    with open(args.queries) as f:
        for line in f:
            d = json.loads(line)
            if d["metadata"]["domain"] == args.domain:
                queries.append(d)

    if args.limit:
        queries = queries[: args.limit]

    results = []
    for i, query in enumerate(queries):
        print(f"[{i + 1}/{len(queries)}] Executing query: {query['query']}")
        pred_docs, cur_time, cur_cost = palimpzest_run_query(query, documents)

        # Score predicted titles against the ground-truth document set
        gt_docs = query["docs"]
        preds = set(pred_docs)
        labels = set(gt_docs)

        tp = sum(1 for pred in preds if pred in labels)
        fp = len(preds) - tp
        fn = sum(1 for label in labels if label not in preds)

        precision = tp / (tp + fp) if (tp + fp) > 0 else 0
        recall = tp / (tp + fn) if (tp + fn) > 0 else 0
        f1 = 2 * (precision * recall) / (precision + recall) if (precision + recall) > 0 else 0

        result = {
            "query": query["query"],
            "predicted_docs": pred_docs,
            "ground_truth_docs": gt_docs,
            "precision": precision,
            "recall": recall,
            "f1_score": f1,
            "time": cur_time,
            "cost": cur_cost,
        }
        results.append(result)

    ts = int(time.time())
    out_path = f"results_{args.domain}_{ts}.json"
    with open(out_path, "w") as f:
        json.dump(results, f, indent=4)
    print(f"\nResults saved to {out_path}")

    n = len(results)
    avg_precision = sum(r["precision"] for r in results) / n
    avg_recall = sum(r["recall"] for r in results) / n
    avg_f1 = sum(r["f1_score"] for r in results) / n
    avg_time = sum(r["time"] for r in results) / n
    avg_cost = sum(r["cost"] for r in results) / n

    print(f"Average Precision: {avg_precision:.4f}")
    print(f"Average Recall: {avg_recall:.4f}")
    print(f"Average F1 Score: {avg_f1:.4f}")
    print(f"Average Time: {avg_time:.4f}s")
    print(f"Average Cost: ${avg_cost:.4f}")


if __name__ == "__main__":
    main()
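For reference, the new eval script is driven entirely by its CLI flags; a typical (hypothetical) invocation would be `python evals/quest/eval.py --domain films --queries data/test.jsonl --limit 5`, which evaluates the first five film-domain queries and writes per-query precision/recall/F1 (plus running time and cost) to a timestamped results_films_<ts>.json file, printing corpus-level averages at the end.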

pyproject.toml

Lines changed: 1 addition & 1 deletion
@@ -1,6 +1,6 @@
 [project]
 name = "palimpzest"
-version = "1.4.0"
+version = "1.5.0"
 description = "Palimpzest is a system which enables anyone to process AI-powered analytical queries simply by defining them in a declarative language"
 readme = "README.md"
 requires-python = ">=3.12"

scripts/capture_litellm_stats.py

Lines changed: 7 additions & 2 deletions
@@ -18,6 +18,7 @@
 - Google/Gemini: gemini-2.5-flash (all seven modality combinations)
 - OpenAI: gpt-4o-2024-08-06 (text, image, text+image)
 - OpenAI: gpt-4o-audio-preview (text+audio, audio)
+- Azure: gpt-4o via Azure OpenAI (text, image, text+image)

 Output files are saved to: scripts/litellm_stats/
 """
@@ -168,6 +169,10 @@ def get_captured_data(self) -> dict[str, Any]:
             "text-image-audio",
         ],
     },
+    "azure": {
+        "model": Model.AZURE_GPT_4o,
+        "supported_modalities": ["text-only", "image-only", "text-image"],
+    },
 }


@@ -305,7 +310,7 @@ def call_litellm_api(

     # Apply provider-specific caching configuration
     # Messages from generator_messages already have cache_control markers for Anthropic
-    if model.is_provider_openai() and cache_key:
+    if (model.is_provider_openai() or model.is_provider_azure()) and cache_key:
         # OpenAI: Use prompt_cache_key for sticky routing to the same cache shard
         # https://platform.openai.com/docs/guides/prompt-caching
         completion_kwargs["extra_body"] = {"prompt_cache_key": cache_key}
@@ -395,7 +400,7 @@ def capture_stats_for_provider(
     """
     # Generate a unique cache key for OpenAI (ensures both requests hit the same cache shard)
     # Reference: capture_provider_stats.py and PromptManager.__init__
-    openai_cache_key = f"pz-test-{uuid.uuid4().hex[:12]}" if provider in ("openai", "openai-audio") else None
+    openai_cache_key = f"pz-test-{uuid.uuid4().hex[:12]}" if provider in ("openai", "openai-audio", "azure") else None

     print("    First request...")
     first_stats = call_litellm_api(messages, model, provider, cache_key=openai_cache_key)
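For context, the patched call_litellm_api path for an Azure model boils down to something like the following sketch. litellm.completion and its extra_body passthrough are real LiteLLM features, but the azure/ deployment name here is illustrative and this exact call has not been run; LiteLLM also expects the AZURE_API_KEY, AZURE_API_BASE, and AZURE_API_VERSION environment variables to be set.

    import litellm

    # Route the request to an Azure OpenAI deployment via LiteLLM; extra_body is
    # forwarded to the underlying OpenAI-compatible API, so prompt_cache_key gives
    # sticky routing to the same prompt-cache shard across repeated requests.
    response = litellm.completion(
        model="azure/gpt-4o-2024-08-06",  # hypothetical deployment name
        messages=[{"role": "user", "content": "ping"}],
        temperature=0.0,
        extra_body={"prompt_cache_key": "pz-test-demo"},
    )
    print(response.usage)  # repeat calls on a long, identical prompt should show cached tokens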

scripts/capture_provider_stats.py

Lines changed: 82 additions & 2 deletions
@@ -17,6 +17,7 @@
 - Google/Vertex AI: gemini-2.5-flash (all seven modality combinations)
 - OpenAI: gpt-4o-2024-08-06 (text, image, text+image)
 - OpenAI: gpt-4o-audio-preview (text+audio, audio)
+- Azure: gpt-4o-2024-08-06 via Azure OpenAI (text, image, text+image)

 Output files are saved to: tests/pytest/scripts/provider_stats/
 """
@@ -108,6 +109,10 @@ def detect_image_media_type(base64_data: str) -> str:
             "text-image-audio",
         ],
     },
+    "azure": {
+        "model": "gpt-4o-2024-08-06",
+        "supported_modalities": ["text-only", "image-only", "text-image"],
+    },
 }


@@ -437,6 +442,77 @@ def call_openai_api(messages: list[dict], model: str, cache_key: str | None = None) -> dict[str, Any]:
     }


+# NOTE: this function was generated speculatively and has not been tested, so it may have errors
+def call_azure_api(messages: list[dict], model: str, cache_key: str | None = None) -> dict[str, Any]:
+    """
+    Call Azure OpenAI API directly and return usage statistics.
+
+    Uses the same message format as OpenAI, but routes through Azure endpoints.
+
+    Args:
+        messages: List of message dicts
+        model: Model name (deployment name)
+        cache_key: Optional prompt_cache_key for sticky routing to same cache shard
+
+    Returns dict with:
+    - completion_tokens
+    - prompt_tokens
+    - prompt_tokens_details (cached_tokens, text_tokens, image_tokens, audio_tokens)
+    - total_tokens
+    """
+    import openai
+
+    api_key = os.environ.get("AZURE_API_KEY") or os.environ.get("AZURE_OPENAI_API_KEY")
+    azure_endpoint = os.environ.get("AZURE_API_BASE")
+    api_version = os.environ.get("AZURE_API_VERSION", "2024-12-01-preview")
+
+    if not api_key:
+        raise ValueError("AZURE_API_KEY or AZURE_OPENAI_API_KEY must be set")
+    if not azure_endpoint:
+        raise ValueError("AZURE_API_BASE must be set")
+
+    client = openai.AzureOpenAI(
+        api_key=api_key,
+        azure_endpoint=azure_endpoint,
+        api_version=api_version,
+    )
+
+    openai_messages = transform_messages_for_openai(messages)
+
+    kwargs = {"model": model, "messages": openai_messages, "temperature": 0.0}
+
+    # Add prompt_cache_key for caching (ensures requests route to same cache shard)
+    if cache_key:
+        kwargs["extra_body"] = {"prompt_cache_key": cache_key}
+
+    response = client.chat.completions.create(**kwargs)
+
+    # Extract complete usage stats
+    usage_dict = {}
+    if response.usage:
+        usage_dict = response.usage.model_dump()
+
+    # Get response text safely
+    try:
+        response_text = response.choices[0].message.content[:200] if response.choices and response.choices[0].message.content else None
+    except Exception:
+        response_text = None
+
+    # Serialize the full response
+    try:
+        raw_response = response.model_dump()
+    except Exception:
+        raw_response = str(response)
+
+    return {
+        "provider": "azure",
+        "model": model,
+        "usage": usage_dict,
+        "response_content": response_text,
+        "raw_response": raw_response,
+    }
+
+
 def call_anthropic_api(messages: list[dict], model: str) -> dict[str, Any]:
     """
     Call Anthropic API directly and return usage statistics.
@@ -602,12 +678,14 @@ def capture_stats_for_provider(
     - first_request: stats from first request
     - second_request: stats from second request (should show cache hits)
     """
-    # Generate a unique cache key for OpenAI (ensures both requests hit the same cache shard)
-    openai_cache_key = f"pz-test-{uuid.uuid4().hex[:12]}" if provider in ("openai", "openai-audio") else None
+    # Generate a unique cache key for OpenAI/Azure (ensures both requests hit the same cache shard)
+    openai_cache_key = f"pz-test-{uuid.uuid4().hex[:12]}" if provider in ("openai", "openai-audio", "azure") else None

     print("    First request...")
     if provider == "openai" or provider == "openai-audio":
         first_stats = call_openai_api(messages, model, cache_key=openai_cache_key)
+    elif provider == "azure":
+        first_stats = call_azure_api(messages, model, cache_key=openai_cache_key)
     elif provider == "anthropic":
         first_stats = call_anthropic_api(messages, model)
     elif provider == "gemini":
@@ -625,6 +703,8 @@ def capture_stats_for_provider(
     print("    Second request (should show cache hits)...")
     if provider == "openai" or provider == "openai-audio":
         second_stats = call_openai_api(messages, model, cache_key=openai_cache_key)
+    elif provider == "azure":
+        second_stats = call_azure_api(messages, model, cache_key=openai_cache_key)
     elif provider == "anthropic":
         second_stats = call_anthropic_api(messages, model)
     elif provider == "gemini":
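A hypothetical smoke test for call_azure_api, assuming the AZURE_* variables point at a live deployment and that transform_messages_for_openai passes standard chat messages through unchanged; like the function itself (per its own NOTE), this has not been executed.

    import os

    # Credentials consumed by call_azure_api (values are placeholders)
    os.environ.setdefault("AZURE_API_KEY", "<your-azure-openai-key>")
    os.environ.setdefault("AZURE_API_BASE", "https://<your-resource>.openai.azure.com")

    messages = [{"role": "user", "content": "Say hello."}]
    first = call_azure_api(messages, model="gpt-4o-2024-08-06", cache_key="pz-test-demo")
    second = call_azure_api(messages, model="gpt-4o-2024-08-06", cache_key="pz-test-demo")

    # With prompt caching working, the repeat request may report cached tokens
    # once the prompt is long enough to be cacheable.
    print(first["usage"].get("prompt_tokens_details"))
    print(second["usage"].get("prompt_tokens_details"))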

scripts/generate_test_messages.py

Lines changed: 5 additions & 0 deletions
@@ -12,6 +12,7 @@
 - OpenAI-Audio: audio-only, text-audio
 - Gemini: all 7 modality combinations
 - Vertex AI: all 7 modality combinations
+- Azure: text-only, image-only, text-image

 Output files are saved to: tests/pytest/data/generator_messages/
 Format: {modality}_{provider}.json (e.g., text-only_anthropic.json)
@@ -244,6 +245,10 @@ class OutputSchema(BaseModel):
             "text-image", "text-audio", "image-audio", "text-image-audio",
         ],
     },
+    "azure": {
+        "model": Model.AZURE_GPT_4o,
+        "supported_modalities": ["text-only", "image-only", "text-image"],
+    },
 }

scripts/update_model_info.py

Lines changed: 2 additions & 1 deletion
@@ -68,6 +68,7 @@
 # API key environment variable mapping
 API_KEY_MAPPING = {
     "openai": "OPENAI_API_KEY",
+    "azure": "AZURE_API_KEY",
     "anthropic": "ANTHROPIC_API_KEY",
     "vertex_ai": "GOOGLE_APPLICATION_CREDENTIALS",
     "gemini": "GEMINI_API_KEY",
@@ -126,7 +127,7 @@ def extract_provider(model_id: str) -> str:
     model_lower = model_id.lower()

     # OpenAI
-    if any(x in model_lower for x in ["gpt", "o1-", "o3-", "dall-e", "whisper"]):
+    if any(x in model_lower for x in ["gpt", "o1-", "o3-", "o4-", "dall-e", "whisper"]):
         return "openai"

     # Anthropic
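As a quick sanity check of the broadened detection (assuming the OpenAI branch shown above is the first one that can match these IDs):

    # Expected behavior of the patched extract_provider; "o4-mini" previously
    # fell through because only "o1-" and "o3-" were listed.
    assert extract_provider("o4-mini") == "openai"
    assert extract_provider("gpt-4o-2024-08-06") == "openai"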
