51 changes: 25 additions & 26 deletions README.md
@@ -71,37 +71,35 @@ This isn't just a terminal in the cloud. Running coding agents on Databricks giv

## MLflow Tracing

- Every Claude Code session is **automatically traced** to a Databricks MLflow experiment — zero configuration required.
+ Claude Code and Codex sessions can both be **automatically traced** to a single Databricks MLflow experiment — flip one switch to turn them on.

### Turning it on

Set **`MLFLOW_TRACING_ENABLED=true`** in `app.yaml` (or your shell for local dev). That single variable enables tracing for both CLIs. Tracing is **off by default** to keep deploys lightweight — opt in when you want it.

```yaml
# app.yaml
env:
- name: MLFLOW_TRACING_ENABLED
value: "true"
```
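Both setup scripts read this switch the same way (the expression below mirrors the check in `setup_codex.py` and `setup_mlflow.py`); as a standalone sketch:

```python
import os

# Any value other than "true" (case-insensitive) leaves tracing off,
# matching the default-off behavior described above.
def tracing_enabled() -> bool:
    return os.environ.get("MLFLOW_TRACING_ENABLED", "false").lower() == "true"

os.environ["MLFLOW_TRACING_ENABLED"] = "true"
print(tracing_enabled())  # True
```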

### How it works

```
- Claude Code session starts
+ MLFLOW_TRACING_ENABLED=true
Environment vars set automatically:
  MLFLOW_TRACKING_URI=databricks
  MLFLOW_EXPERIMENT_NAME=/Users/{you}/{app-name}
+ ├──► Claude Code: Stop hook fires on session end →
+ │      mlflow.claude_code.hooks.stop_hook_handler() logs the transcript
- You work normally — code, debug, deploy
- Session ends → Stop hook fires
- Full session transcript logged as an MLflow trace
-   at /Users/{you}/{app-name} in your workspace
+ └──► Codex: @mlflow/codex notify hook fires after each turn →
+        trace appended to the experiment
```

### What gets traced

- When a Claude Code session ends, the **Stop hook** automatically calls `mlflow.claude_code.hooks.stop_hook_handler()`, which captures the full session transcript — your prompts, agent actions, tool calls, and outputs — and logs it as an MLflow trace.
+ Both land in the same MLflow experiment, so you can compare runs across agents side by side.

### Where traces live

Traces are stored in a Databricks MLflow experiment at:

```
/Users/{your-email}/{app-name}
```
@@ -116,17 +114,17 @@ View them in the Databricks UI: **Workspace > Machine Learning > Experiments**.

### Configuration

- Tracing is configured during app startup by `setup_mlflow.py`, which merges the following into `~/.claude/settings.json`:
+ Tracing is wired up during app startup:

| Setting | Value | Purpose |
|---------|-------|---------|
- | `MLFLOW_CLAUDE_TRACING_ENABLED` | `true` | Enables Claude Code tracing |
- | `MLFLOW_TRACKING_URI` | `databricks` | Routes traces to Databricks backend |
+ | `MLFLOW_TRACING_ENABLED` | `true`/`false` (default `false`) | Master switch for Claude + Codex |
+ | `MLFLOW_CLAUDE_TRACING_ENABLED` | mirrors `MLFLOW_TRACING_ENABLED` | Gates Claude's Stop hook at runtime |
+ | `MLFLOW_TRACKING_URI` | `databricks` | Routes traces to the Databricks backend |
| `MLFLOW_EXPERIMENT_NAME` | `/Users/{owner}/{app}` | Target experiment path |
| `OTEL_EXPORTER_OTLP_ENDPOINT` | `""` | Overrides container OTEL to prevent trace loss |
| Stop hook | `uv run python -c "from mlflow.claude_code.hooks import stop_hook_handler; stop_hook_handler()"` | Fires on session end |
+ | `MLFLOW_EXPERIMENT_ID` | resolved from name | Set in `~/.codex/.env` (Codex needs an ID) |
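With the flag on, the merged `~/.claude/settings.json` comes out shaped roughly like this — a sketch with a hypothetical owner; your file keeps whatever other settings it already had:

```json
{
  "env": {
    "MLFLOW_CLAUDE_TRACING_ENABLED": "true",
    "MLFLOW_TRACKING_URI": "databricks",
    "MLFLOW_EXPERIMENT_NAME": "/Users/jane@company.com/coding-agents",
    "OTEL_EXPORTER_OTLP_ENDPOINT": ""
  },
  "hooks": {
    "Stop": [
      {
        "hooks": [
          {
            "type": "command",
            "command": "uv run python -c \"from mlflow.claude_code.hooks import stop_hook_handler; stop_hook_handler()\""
          }
        ]
      }
    ]
  }
}
```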

- Tracing is skipped gracefully if `APP_OWNER` is not set (e.g., local dev without Databricks).
+ Tracing setup is skipped gracefully when `APP_OWNER` is not set (e.g., local dev without Databricks) or when `MLFLOW_TRACING_ENABLED` is left at its default `false`.

---

@@ -288,6 +286,7 @@ This template repo opens that vision up for every Databricks user — no IDE set
| `CODEX_MODEL` | No | Codex model name (default: `databricks-gpt-5-5`) |
| `GEMINI_MODEL` | No | Gemini model name (default: `databricks-gemini-2-5-pro`) |
| `DATABRICKS_GATEWAY_HOST` | No | AI Gateway URL override. Auto-discovered from `DATABRICKS_WORKSPACE_ID` if unset |
| `MLFLOW_TRACING_ENABLED` | No | Set to `"true"` to enable MLflow tracing for Claude and Codex in one switch (default `"false"`) |

### Security Model

5 changes: 5 additions & 0 deletions app.yaml.template
@@ -26,3 +26,8 @@ env:
# completes the rollout and auto memory is on by default, this can be removed entirely.
- name: CLAUDE_CODE_DISABLE_AUTO_MEMORY
value: 0
# Set MLFLOW_TRACING_ENABLED=true to auto-trace Claude, Codex, and Gemini
# sessions into /Users/{app_owner}/{app_name}. Default off to keep installs
# lightweight — opt in when you want observability across all three agents.
- name: MLFLOW_TRACING_ENABLED
value: "false"
68 changes: 63 additions & 5 deletions setup_codex.py
@@ -8,12 +8,19 @@
Config: ~/.codex/config.toml with custom model_providers for Databricks.
Auth: Bearer token via DATABRICKS_TOKEN environment variable.
"""
import json
import os
import shutil
import subprocess
from pathlib import Path

- from utils import adapt_instructions_file, ensure_https, get_gateway_host, get_npm_version
+ from utils import (
+     adapt_instructions_file,
+     ensure_https,
+     get_gateway_host,
+     get_npm_version,
+     resolve_mlflow_experiment_id,
+ )

# Set HOME if not properly set
if not os.environ.get("HOME") or os.environ["HOME"] == "/":
@@ -101,6 +108,12 @@
shutil.copyfile(catalog_src, catalog_dst)
print(f"Codex model catalog copied: {catalog_dst}")

# Optional: MLflow tracing notify hook (one switch enables Claude + Codex)
tracing_enabled = os.environ.get("MLFLOW_TRACING_ENABLED", "false").lower() == "true"
notify_line = ""
if tracing_enabled:
notify_line = 'notify = ["mlflow-codex", "notify-hook"]\n'

# Codex CLI uses TOML config with custom model_providers
config_content = f"""# Databricks Model Serving Configuration for Codex CLI
# Generated by setup_codex.py
@@ -113,6 +126,7 @@
# Disable web_search - not supported by Databricks Responses API
web_search = "disabled"

{notify_line}
# Databricks custom provider
[model_providers.databricks]
name = "Databricks Model Serving"
@@ -128,12 +142,56 @@
# 4. Write OPENAI_API_KEY to shell profile for Codex to pick up
# Codex reads from env_key specified in config (OPENAI_API_KEY)
# We set this via the environment, but also write a .env file as backup
- env_content = f"""# Databricks token for Codex CLI (OpenAI-compatible endpoint)
- OPENAI_API_KEY={auth_token}
- """
+ env_lines = [
+     "# Databricks token for Codex CLI (OpenAI-compatible endpoint)",
+     f"OPENAI_API_KEY={auth_token}",
+ ]

# MLflow tracing env vars (read by @mlflow/codex notify hook)
app_owner = os.environ.get("APP_OWNER", "")
app_name = os.environ.get("DATABRICKS_APP_NAME", "coding-agents")
experiment_name = f"/Users/{app_owner}/{app_name}" if app_owner else ""

if tracing_enabled and experiment_name:
experiment_id = resolve_mlflow_experiment_id(host, token, experiment_name)

# Install @mlflow/codex (provides the `mlflow-codex` binary used by the notify hook)
mlflow_codex_bin = local_bin / "mlflow-codex"
if not mlflow_codex_bin.exists():
npm_prefix = str(home / ".local")
print("Installing @mlflow/codex for MLflow tracing...")
result = subprocess.run(
["npm", "install", "-g", f"--prefix={npm_prefix}", "@mlflow/codex"],
capture_output=True, text=True,
env={**os.environ, "HOME": str(home)},
)
if result.returncode == 0:
print(f"@mlflow/codex installed to {mlflow_codex_bin}")
else:
print(f"WARNING: @mlflow/codex install failed (rc={result.returncode}): {result.stderr.strip()[:300]}")

# Pass MLflow connection details via env (override mlflow-tracing.json).
# DATABRICKS_HOST/TOKEN are inherited from the app process — no need to
# re-write them here (and that avoids stale-token bugs on PAT rotation).
env_lines.extend([
"",
"# MLflow tracing (enabled by MLFLOW_TRACING_ENABLED=true)",
"MLFLOW_TRACKING_URI=databricks",
])
if experiment_id:
env_lines.append(f"MLFLOW_EXPERIMENT_ID={experiment_id}")
else:
env_lines.append(f"MLFLOW_EXPERIMENT_NAME={experiment_name}")

# mlflow-tracing.json as a fallback for the notify hook when env isn't loaded
tracing_cfg = {"trackingUri": "databricks"}
if experiment_id:
tracing_cfg["experimentId"] = experiment_id
(codex_dir / "mlflow-tracing.json").write_text(json.dumps(tracing_cfg, indent=2))
print(f"Codex MLflow tracing configured: experiment_id={experiment_id or 'unresolved'}")

env_path = codex_dir / ".env"
- env_path.write_text(env_content)
+ env_path.write_text("\n".join(env_lines) + "\n")
env_path.chmod(0o600)
print(f"Codex CLI env configured: {env_path}")
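Assuming tracing is on and the experiment ID resolves, the generated `~/.codex/.env` ends up shaped roughly like this (placeholder token and ID — the real values come from `OPENAI_API_KEY` resolution and the experiment lookup):

```
# Databricks token for Codex CLI (OpenAI-compatible endpoint)
OPENAI_API_KEY=dapi...

# MLflow tracing (enabled by MLFLOW_TRACING_ENABLED=true)
MLFLOW_TRACKING_URI=databricks
MLFLOW_EXPERIMENT_ID=1234567890
```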

34 changes: 24 additions & 10 deletions setup_mlflow.py
@@ -1,8 +1,8 @@
- """Configure MLflow tracing for Claude Code sessions.
+ """Wire up Claude Code's Stop hook for MLflow tracing.

- Merges MLflow env vars and a Stop hook into ~/.claude/settings.json so that
- every Claude Code session automatically logs traces to a Databricks MLflow
- experiment at /Users/{app_owner}/{app_name}.
+ Gated on MLFLOW_TRACING_ENABLED — the same switch enables Codex and Gemini
+ tracing in their respective setup scripts. Traces land in
+ /Users/{app_owner}/{app_name}.
"""

import os
@@ -31,17 +31,23 @@

experiment_name = f"/Users/{app_owner}/{app_name}"

- # Merge MLflow env vars
+ # Single switch that controls tracing for Claude, Codex, and Gemini.
+ # Defaults to "false" so opt-in requires explicit configuration.
+ tracing_enabled = os.environ.get("MLFLOW_TRACING_ENABLED", "false").lower() == "true"
+
+ # Merge MLflow env vars (always written so flipping the flag at runtime works
+ # without rerunning setup — Claude reads MLFLOW_CLAUDE_TRACING_ENABLED on launch).
settings.setdefault("env", {})
- settings["env"]["MLFLOW_CLAUDE_TRACING_ENABLED"] = "false"
+ settings["env"]["MLFLOW_CLAUDE_TRACING_ENABLED"] = "true" if tracing_enabled else "false"
settings["env"]["MLFLOW_TRACKING_URI"] = "databricks"
settings["env"]["MLFLOW_EXPERIMENT_NAME"] = experiment_name
# Override container-level OTEL endpoint so MLflow uses its native MlflowV3SpanExporter
# instead of sending traces to a non-existent localhost:4314 OTLP collector
settings["env"]["OTEL_EXPORTER_OTLP_ENDPOINT"] = ""

- # Add Stop hook (processes full transcript at session end)
- # Use `uv run python` so mlflow resolves correctly regardless of venv paths
+ # Add Stop hook (processes full transcript at session end).
+ # The hook is harmless when MLFLOW_CLAUDE_TRACING_ENABLED=false — mlflow's
+ # stop_hook_handler short-circuits if tracing isn't enabled.
python_cmd = "uv run python"
mlflow_hook = {
"hooks": [
@@ -54,11 +60,19 @@

existing_hooks = settings.get("hooks", {})
stop_hooks = existing_hooks.get("Stop", [])
- stop_hooks.append(mlflow_hook)
+ # Avoid duplicating the hook if setup runs multiple times
+ already_present = any(
+     "stop_hook_handler" in h.get("hooks", [{}])[0].get("command", "")
+     for h in stop_hooks if isinstance(h, dict)
+ )
+ if not already_present:
+     stop_hooks.append(mlflow_hook)
existing_hooks["Stop"] = stop_hooks
settings["hooks"] = existing_hooks

settings_path.write_text(json.dumps(settings, indent=2))
- print(f"MLflow tracing enabled: experiment={experiment_name}")
+ print(f"MLflow tracing {'ENABLED' if tracing_enabled else 'disabled'}: experiment={experiment_name}")
print(f" Tracking URI: databricks")
print(f" Settings updated: {settings_path}")
if not tracing_enabled:
print(" Set MLFLOW_TRACING_ENABLED=true (in app.yaml) to enable Claude + Codex + Gemini tracing.")
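The dedup guard added above can be exercised in isolation — a self-contained sketch of the same check against a plain dict (the hook command string matches the one setup writes; the `add_stop_hook` helper name is illustrative, not part of the repo):

```python
HOOK_CMD = ('uv run python -c "from mlflow.claude_code.hooks import '
            'stop_hook_handler; stop_hook_handler()"')

def add_stop_hook(settings: dict) -> dict:
    """Append the MLflow Stop hook unless an equivalent entry already exists."""
    hooks = settings.setdefault("hooks", {})
    stop_hooks = hooks.setdefault("Stop", [])
    # Same duplicate check as setup_mlflow.py: look for the handler name
    # inside the first command of each existing Stop-hook entry.
    already_present = any(
        "stop_hook_handler" in h.get("hooks", [{}])[0].get("command", "")
        for h in stop_hooks if isinstance(h, dict)
    )
    if not already_present:
        stop_hooks.append({"hooks": [{"type": "command", "command": HOOK_CMD}]})
    return settings

s = add_stop_hook({})
s = add_stop_hook(s)  # second run is a no-op
print(len(s["hooks"]["Stop"]))  # 1
```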
12 changes: 11 additions & 1 deletion tests/test_mlflow_tracing.py
@@ -60,13 +60,23 @@ def read_settings(tmp_path):
class TestMlflowEnvVars:
"""Verify MLflow environment variables are added to settings.json."""

- def test_tracing_enabled(self, tmp_path):
+ def test_tracing_disabled_by_default(self, tmp_path):
write_existing_settings(tmp_path, {"env": {"ANTHROPIC_MODEL": "test"}})
result = run_setup_mlflow(tmp_path, {"APP_OWNER": "jane@company.com"})
assert result.returncode == 0
settings = read_settings(tmp_path)
assert settings["env"]["MLFLOW_CLAUDE_TRACING_ENABLED"] == "false"

def test_tracing_enabled_via_master_switch(self, tmp_path):
write_existing_settings(tmp_path, {"env": {"ANTHROPIC_MODEL": "test"}})
result = run_setup_mlflow(tmp_path, {
"APP_OWNER": "jane@company.com",
"MLFLOW_TRACING_ENABLED": "true",
})
assert result.returncode == 0
settings = read_settings(tmp_path)
assert settings["env"]["MLFLOW_CLAUDE_TRACING_ENABLED"] == "true"

def test_tracking_uri(self, tmp_path):
write_existing_settings(tmp_path, {"env": {}})
result = run_setup_mlflow(tmp_path, {"APP_OWNER": "jane@company.com"})
27 changes: 27 additions & 0 deletions utils.py
@@ -226,3 +226,30 @@ def ensure_https(url: str) -> str:
if not url.startswith(("http://", "https://")):
return f"https://{url}"
return url


def resolve_mlflow_experiment_id(host: str, token: str, experiment_name: str) -> str | None:
"""Look up (or create) a Databricks MLflow experiment by name and return its ID.

Used by Codex and Gemini CLI tracing setup — both need an experiment *ID*,
not name, in their config files / OTLP headers.

Returns None on any failure so callers can degrade gracefully.
"""
if not host or not token or not experiment_name:
return None
try:
from databricks.sdk import WorkspaceClient
from databricks.sdk.errors import ResourceDoesNotExist

w = WorkspaceClient(host=ensure_https(host.rstrip("/")), token=token)
try:
exp = w.experiments.get_by_name(experiment_name=experiment_name)
if exp and exp.experiment:
return exp.experiment.experiment_id
except ResourceDoesNotExist:
pass # fall through to create
return w.experiments.create_experiment(name=experiment_name).experiment_id
except Exception as exc:
logger.warning(f"Could not resolve MLflow experiment '{experiment_name}': {exc}")
return None