Merged
4 changes: 3 additions & 1 deletion .github/workflows/update-benchmarks.yml
Original file line number Diff line number Diff line change
@@ -50,8 +50,10 @@ jobs:
- name: Build native chad
run: node dist/chad-node.js build src/chad-native.ts -o .build/chad --target-cpu=x86-64
- name: Run benchmarks
env:
BENCH_RUNS: 10
run: bash benchmarks/run-ci.sh
timeout-minutes: 10
timeout-minutes: 40
- name: Create benchmark update PR
run: |
git config user.name "github-actions[bot]"
34 changes: 18 additions & 16 deletions README.md
@@ -71,27 +71,29 @@ Hono-style API, C-level performance. One binary, no node_modules. See [`examples

## Benchmarks

ChadScript compiles through LLVM, the same backend behind C and Rust — so it gets the same optimization passes. Compared against C, Go, and Node.js on Ubuntu (CI):

| Benchmark | ChadScript | Node.js | vs Node | C |
| -------------- | ---------- | ------- | -------- | ------ |
| Cold Start | **0.6ms** | 21.8ms | **36x** | 0.6ms |
| Monte Carlo Pi | **0.398s** | 1.474s | **3.7x** | 0.400s |
| File I/O | **0.089s** | 0.315s | **3.5x** | 0.088s |
| JSON Parse | **0.005s** | 0.015s | **3.0x** | 0.004s |
| Fibonacci | **1.424s** | 2.842s | **2.0x** | 0.725s |
| Sieve | **0.038s** | 0.054s | **1.4x** | 0.027s |
| N-Body Sim | **1.852s** | 2.296s | **1.2x** | 1.453s |
| Quicksort | **0.202s** | 0.249s | **1.2x** | 0.170s |
| SQLite | **0.374s** | 0.437s | **1.2x** | 0.314s |

[Full benchmarks dashboard](https://cs01.github.io/ChadScript/benchmarks) (updated on every PR)
ChadScript compiles through LLVM, the same backend behind C and Rust — so it gets the same optimization passes. Benchmarks below compare it against C, Go, and Node.js on Apple Silicon. **Median of N=10 runs**; full 95% bootstrap confidence intervals on the [benchmarks dashboard](https://cs01.github.io/ChadScript/benchmarks).

| Benchmark | ChadScript | Node.js | vs Node | C |
| --------------- | ---------- | ------- | -------- | ------ |
| SQLite | **0.079s** | 0.165s | **2.1x** | 0.080s |
| JSON Parse | **0.002s** | 0.004s | **2.0x** | 0.002s |
| Monte Carlo Pi | **0.264s** | 2.486s | **9.4x** | 0.265s |
| Matrix Multiply | **0.109s** | 0.137s | **1.3x** | 0.099s |
| Fibonacci | **0.516s** | 1.502s | **2.9x** | 0.442s |
| Sieve | **0.012s** | 0.025s | **2.1x** | 0.008s |
| Quicksort | **0.140s** | 0.159s | **1.1x** | 0.121s |
| N-Body Sim | **0.824s** | 1.089s | **1.3x** | 0.774s |
| File I/O | **0.054s** | 0.072s | **1.3x** | 0.027s |
| Binary Trees | **0.604s** | 0.368s | 0.6x | 0.854s |
| Cold Start | **5.9ms** | 27.4ms | **4.6x** | 6.8ms |

**Statistically tied with C on 3 benchmarks** (SQLite, JSON, Monte Carlo — 95% CIs overlap). **Beats both C and Go on Binary Trees**, but loses to Node's V8 JIT, which eliminates allocations via escape analysis. **Matches Go within 5% on Matrix Multiply, N-Body, Monte Carlo, and Sieve.**
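The tie criterion reduces to an inclusive interval-overlap test. A minimal sketch, with illustrative names and CI endpoints rather than measured values:

```python
# Two results are "tied" when their 95% confidence intervals overlap.
# Names and CI endpoints below are illustrative, not measured values.

def is_tied(a_lo, a_hi, b_lo, b_hi):
    # Inclusive overlap check, mirroring the dashboard's tie criterion.
    return max(a_lo, b_lo) <= min(a_hi, b_hi)

# Overlapping CIs (e.g. 0.079s vs 0.080s with ~1% halos): a statistical tie.
print(is_tied(0.078, 0.081, 0.079, 0.082))  # True
# Disjoint CIs (e.g. cold start, ~6ms vs ~27ms): clearly distinct.
print(is_tied(5.7, 6.1, 26.9, 27.8))        # False
```

Only when the check returns `False` does one language count as ahead of another in the rankings.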

---

## It's Fast

Your code goes through the same LLVM optimization passes as C and Rust — not a JIT, not an interpreter. 0.8ms cold start, native execution speed.
Your code goes through the same LLVM optimization passes as C and Rust — not a JIT, not an interpreter. Ties hand-written C on SQLite, JSON, and Monte Carlo. Native execution speed.

## It's Familiar

207 changes: 171 additions & 36 deletions benchmarks/assemble_json.py
@@ -1,11 +1,41 @@
#!/usr/bin/env python3
import json, os, sys
"""
Assembles per-benchmark sample files into the published benchmarks.json.

Input format (one line per language per benchmark, written by `bench_compute` in
benchmarks/run.sh and benchmarks/run-ci.sh):

lang|comma-separated-samples|raw-label

For example:

chadscript|0.596,0.584,0.601,0.593,0.599|Time: 0.596s

For each language, we parse all samples and compute:
- median → the reported point estimate
- ci_lo / ci_hi → 95% bootstrap confidence interval of the median
- samples → full list, preserved for transparency / future analysis

Ranking is tie-aware: two languages are considered "tied" when their 95%
confidence intervals overlap. This is a conservative non-parametric
heuristic: non-overlapping 95% CIs imply a statistically significant
difference, while overlapping CIs are treated as indistinguishable.
Spurious medal-flipping from runner jitter is eliminated because a 2-3%
gap between two noisy measurements lands inside both CIs and is
correctly called a tie.

For N<3 (e.g. the startup benchmark, which internally averages 50 launches
and emits one aggregate number), the bootstrap cannot produce a meaningful
interval, so we fall back to a 5% heuristic halo around the value and
ranking still works.
"""
import json, os, random, sys
from datetime import datetime, timezone

json_dir = sys.argv[1]
outfile = sys.argv[2]
startup_runs = int(sys.argv[3]) if len(sys.argv) > 3 else 50

# Reproducible bootstrap sampling — same inputs produce same CI every time.
random.seed(0xC4AD)

META = {
"startup": {"name": "Cold Start", "desc": f"Time to print 'Hello, World!' and exit. Average of {startup_runs} runs.", "metric": "ms", "lower_is_better": True},
"sqlite": {"name": "SQLite", "desc": "100K SELECT queries on a 100-row in-memory table.", "metric": "s", "lower_is_better": True},
@@ -27,6 +57,86 @@
"clihex": {"name": "Hex Dump", "desc": "chex vs xxd — hex dump a 5MB binary file.", "metric": "s", "lower_is_better": True, "category": "cli"},
}

N_BOOTSTRAP = 2000
CONFIDENCE = 0.95
# For N=1 or N=2 (bootstrap can't compute a meaningful CI), use a 5% halo
# around the point estimate. This keeps the ranking code functional for
# single-sample benchmarks like `startup`.
FALLBACK_HALO = 0.05
# For N>=3, trust the bootstrap. But enforce a minimum CI width of 1% of the
# point estimate — protects against "all samples identical" giving a
# degenerate (zero-width) CI, which would make microsecond-level differences
# look statsig.
MIN_CI_WIDTH = 0.01


def median(values):
if not values:
return 0.0
s = sorted(values)
n = len(s)
if n % 2 == 1:
return s[n // 2]
return (s[n // 2 - 1] + s[n // 2]) / 2.0


def bootstrap_ci(samples):
"""Returns (point_estimate, ci_lo, ci_hi) where point_estimate is the
median of the observed samples and (ci_lo, ci_hi) is the bootstrap 95% CI
of the median.

For N>=3 samples, does proper bootstrap resampling and enforces a minimum
CI width (MIN_CI_WIDTH) so degenerate identical-sample cases don't produce
zero-width intervals. For N<3, falls back to a fixed halo around the
point estimate (FALLBACK_HALO) since bootstrap can't produce a meaningful
distribution from 1-2 samples.
"""
n = len(samples)
if n == 0:
return 0.0, 0.0, 0.0
point = median(samples)
if n < 3:
halo = abs(point) * FALLBACK_HALO
return point, point - halo, point + halo
medians = []
for _ in range(N_BOOTSTRAP):
resample = [random.choice(samples) for _ in range(n)]
medians.append(median(resample))
medians.sort()
lo_idx = int(N_BOOTSTRAP * (1 - CONFIDENCE) / 2)
hi_idx = int(N_BOOTSTRAP * (1 + CONFIDENCE) / 2) - 1
ci_lo = medians[lo_idx]
ci_hi = medians[hi_idx]
# Enforce a minimum CI width so very-tight bootstrap results don't treat
# sub-percent differences as statistically significant. Each side is
# widened to at least MIN_CI_WIDTH / 2 of the point estimate, so the
# interval is never narrower than MIN_CI_WIDTH overall.
min_half_width = abs(point) * MIN_CI_WIDTH / 2
ci_lo = min(ci_lo, point - min_half_width)
ci_hi = max(ci_hi, point + min_half_width)
return point, ci_lo, ci_hi


def format_value(value, metric):
rounded = round(value, 4)
if metric == "ms":
return f"{rounded}ms"
if metric in ("req/s", "msg/s"):
return f"{int(rounded)} {metric}"
return f"{rounded:.3f}s"


def format_ci(value, ci_lo, ci_hi, metric):
main = format_value(value, metric)
lo = format_value(ci_lo, metric)
hi = format_value(ci_hi, metric)
return f"{main} ({lo}–{hi})"


def ci_overlap(a_lo, a_hi, b_lo, b_hi):
"""Returns True if two intervals overlap (inclusive)."""
return max(a_lo, b_lo) <= min(a_hi, b_hi)


all_benchmarks = {}
filtered_benchmarks = {}

Expand All @@ -36,38 +146,46 @@
bkey = fname[:-5]
filepath = os.path.join(json_dir, fname)
results = {}
chad_val = None
chad_stats = None
for line in open(filepath):
line = line.strip()
if not line:
continue
parts = line.split("|")
if len(parts) < 3:
continue
lang, value, label = parts[0], parts[1], parts[2]
try:
rounded = round(float(value), 3)
meta_info = META.get(bkey, {"metric": "s"})
metric = meta_info.get("metric", "s")
if metric == "ms":
clean_label = f"{rounded}ms"
elif metric in ("req/s", "msg/s"):
clean_label = f"{int(rounded)} {metric}"
else:
clean_label = f"{rounded:.3f}s"
results[lang] = {"value": rounded, "label": clean_label}
except ValueError:
lang, samples_csv, raw_label = parts[0], parts[1], parts[2]
samples = []
for s in samples_csv.split(","):
s = s.strip()
if not s:
continue
try:
samples.append(float(s))
except ValueError:
pass
if not samples:
continue
meta_info = META.get(bkey, {"metric": "s"})
metric = meta_info.get("metric", "s")
point, ci_lo, ci_hi = bootstrap_ci(samples)
results[lang] = {
"value": round(point, 4),
"ci_lo": round(ci_lo, 4),
"ci_hi": round(ci_hi, 4),
"n": len(samples),
"label": format_value(point, metric),
"ci_label": format_ci(point, ci_lo, ci_hi, metric),
}
if lang == "chadscript":
chad_val = float(value)
chad_stats = (point, ci_lo, ci_hi)

if chad_val is None:
if chad_stats is None:
print(f" Skipped: {bkey} (no ChadScript result)")
continue

meta = META.get(bkey, {"name": bkey, "desc": "", "metric": "s", "lower_is_better": True})
lower = meta["lower_is_better"]

is_cli = meta.get("category") == "cli"

entry = {
@@ -82,34 +200,51 @@

all_benchmarks[bkey] = entry

if is_cli:
langs_ahead = sum(1 for l, r in results.items() if l != "chadscript" and ((lower and r["value"] < chad_val) or (not lower and r["value"] > chad_val)))
place = 1 + langs_ahead
else:
langs_ahead = 0
for lang, r in results.items():
if lang in ("chadscript", "c"):
continue
if lower and r["value"] < chad_val:
# Per-benchmark visibility switch. A benchmark can be excluded from the
# published dashboard by setting `hide_from_dashboard: True` in its META
# entry (useful when the workload doesn't measure what it claims, or the
# story is too muddy to present). Hidden benchmarks still appear in
# benchmarks-all.json so PR comments and reproducibility are unaffected.
# No benchmarks currently use this — it's available for future use.
if meta.get("hide_from_dashboard"):
print(f" Hidden from dashboard: {meta['name']} (kept in benchmarks-all.json)")
continue

# Tie-aware ranking via CI overlap. A language only counts as "ahead of
# chad" if its 95% CI is entirely on the winning side of chad's 95% CI.
# Overlapping CIs ⇒ not statsig different ⇒ tied.
chad_point, chad_lo, chad_hi = chad_stats
langs_ahead = 0
for lang, r in results.items():
if lang == "chadscript":
continue
v_point = r["value"]
v_lo = r["ci_lo"]
v_hi = r["ci_hi"]
if ci_overlap(chad_lo, chad_hi, v_lo, v_hi):
continue
# CIs don't overlap → one is statsig ahead of the other.
if lower:
if v_point < chad_point:
langs_ahead += 1
if not lower and r["value"] > chad_val:
else:
if v_point > chad_point:
langs_ahead += 1
place = 1 + (1 if any(r["value"] < chad_val if lower else r["value"] > chad_val for l, r in results.items() if l == "c") else 0) + langs_ahead
place = 1 + langs_ahead

if place > 3:
print(f" Filtered from docs: {meta['name']} (ChadScript #{place})")
else:
entry["place"] = place
filtered_benchmarks[bkey] = entry
# No rank-based filtering — publish every benchmark that produced a
# ChadScript result. An honest dashboard shows weaknesses too.
entry["place"] = place
filtered_benchmarks[bkey] = entry

ts = datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ")

os.makedirs(os.path.dirname(outfile), exist_ok=True)
with open(outfile, "w") as f:
json.dump({"timestamp": ts, "benchmarks": filtered_benchmarks}, f, indent=2)
print(f" Wrote {len(filtered_benchmarks)} benchmarks to {outfile} (docs, filtered)")
print(f" Wrote {len(filtered_benchmarks)} benchmarks to {outfile}")

all_outfile = outfile.replace(".json", "-all.json")
with open(all_outfile, "w") as f:
json.dump({"timestamp": ts, "benchmarks": all_benchmarks}, f, indent=2)
print(f" Wrote {len(all_benchmarks)} benchmarks to {all_outfile} (PR comments, unfiltered)")
print(f" Wrote {len(all_benchmarks)} benchmarks to {all_outfile}")
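The `lang|samples|label` contract documented in the module docstring can be exercised in isolation. A sketch of the parsing side, using a hypothetical helper rather than the script itself:

```python
# Parse one `lang|comma-separated-samples|raw-label` line, the input
# format bench_compute writes and assemble_json.py consumes.
# parse_sample_line is a hypothetical helper, not part of the script.

def parse_sample_line(line):
    lang, samples_csv, raw_label = line.strip().split("|", 2)
    samples = [float(s) for s in samples_csv.split(",") if s.strip()]
    return lang, samples, raw_label

lang, samples, label = parse_sample_line(
    "chadscript|0.596,0.584,0.601,0.593,0.599|Time: 0.596s"
)
print(lang, len(samples), label)  # chadscript 5 Time: 0.596s
```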
32 changes: 24 additions & 8 deletions benchmarks/run-ci.sh
Expand Up @@ -26,15 +26,31 @@ json_add_result() {
bench_compute() {
local bench="$1" lang="$2" display="$3" metric_key="$4"
shift 4
echo " $display"
local output
output=$("$@" 2>&1) || true
echo "$output" | sed 's/^/ /'
local runs="${BENCH_RUNS:-1}"
echo " $display (N=$runs)"
local samples=""
local last_output=""
for i in $(seq 1 $runs); do
local output
output=$("$@" 2>&1) || true
local raw value
raw=$(extract_metric "$metric_key" "$output")
value=$(echo "$raw" | sed 's/[^0-9.]//g')
[ -z "$value" ] && continue
if [ -z "$samples" ]; then
samples="$value"
else
samples="${samples},${value}"
fi
last_output="$output"
done
echo "$last_output" | sed 's/^/ /'
echo ""
local raw value
raw=$(extract_metric "$metric_key" "$output")
value=$(echo "$raw" | sed 's/[^0-9.]//g')
[ -n "$value" ] && json_add_result "$bench" "$lang" "$value" "$raw"
if [ -n "$samples" ]; then
local last_raw
last_raw=$(extract_metric "$metric_key" "$last_output")
json_add_result "$bench" "$lang" "$samples" "$last_raw"
fi
}

bench_startup() {