diff --git a/.gitignore b/.gitignore
index e5344c3..128a042 100644
--- a/.gitignore
+++ b/.gitignore
@@ -4,3 +4,4 @@ coverage
 .DS_Store
 *.tsbuildinfo
 .agent-knowledge/index.json
+data/
diff --git a/package.json b/package.json
index 1b6e141..a92e158 100644
--- a/package.json
+++ b/package.json
@@ -43,6 +43,11 @@
       "types": "./dist/profiles/index.d.ts",
       "import": "./dist/profiles/index.js",
       "default": "./dist/profiles/index.js"
+    },
+    "./autodata": {
+      "types": "./dist/autodata/index.d.ts",
+      "import": "./dist/autodata/index.js",
+      "default": "./dist/autodata/index.js"
     }
   },
   "bin": {
@@ -65,7 +70,8 @@
     "test:watch": "vitest",
     "typecheck": "tsc --noEmit",
     "lint": "biome check src tests",
-    "format": "biome format --write src tests"
+    "format": "biome format --write src tests",
+    "autodata": "tsx src/autodata/run.ts"
   },
   "dependencies": {
     "@tangle-network/agent-eval": "^0.100.0",
@@ -78,6 +84,7 @@
     "@tangle-network/sandbox": "^0.8.0",
     "@types/node": "^25.6.0",
     "tsup": "^8.0.0",
+    "tsx": "^4.22.4",
     "typescript": "^5.7.0",
     "vitest": "^3.0.0"
   },
diff --git a/pnpm-lock.yaml b/pnpm-lock.yaml
index 8442f74..cc04568 100644
--- a/pnpm-lock.yaml
+++ b/pnpm-lock.yaml
@@ -32,13 +32,16 @@ importers:
         version: 25.6.0
       tsup:
         specifier: ^8.0.0
-        version: 8.5.1(postcss@8.5.13)(typescript@5.9.3)(yaml@2.8.4)
+        version: 8.5.1(postcss@8.5.13)(tsx@4.22.4)(typescript@5.9.3)(yaml@2.8.4)
+      tsx:
+        specifier: ^4.22.4
+        version: 4.22.4
       typescript:
         specifier: ^5.7.0
         version: 5.9.3
       vitest:
         specifier: ^3.0.0
-        version: 3.2.4(@types/node@25.6.0)(yaml@2.8.4)
+        version: 3.2.4(@types/node@25.6.0)(tsx@4.22.4)(yaml@2.8.4)
 
 packages:
 
@@ -118,156 +121,312 @@ packages:
     cpu: [ppc64]
     os: [aix]
 
+  '@esbuild/aix-ppc64@0.28.1':
+    resolution: {integrity: sha512-Svl7tq8k/08+p6CXPpRjQ1fKX+1odH/BQbb48fV6fj3CWHhsoIOoY87w1oHXm0qEpkIK3ZfVgp0hed3XBXzXMQ==}
+    engines: {node: '>=18'}
+    cpu: [ppc64]
+    os: [aix]
+
   '@esbuild/android-arm64@0.27.7':
     resolution: {integrity: sha512-62dPZHpIXzvChfvfLJow3q5dDtiNMkwiRzPylSCfriLvZeq0a1bWChrGx/BbUbPwOrsWKMn8idSllklzBy+dgQ==}
     engines: {node: '>=18'}
     cpu: [arm64]
     os: [android]
 
+  '@esbuild/android-arm64@0.28.1':
+    resolution: {integrity: sha512-34EGEbCIAgosYz6goLcopX6Mo7NyGv9tfwEM2/7Ce2VcVRk568iSvniGWcUXIy7wEDR1wzolcxcriFVrWYcwBg==}
+    engines: {node: '>=18'}
+    cpu: [arm64]
+    os: [android]
+
   '@esbuild/android-arm@0.27.7':
     resolution: {integrity: sha512-jbPXvB4Yj2yBV7HUfE2KHe4GJX51QplCN1pGbYjvsyCZbQmies29EoJbkEc+vYuU5o45AfQn37vZlyXy4YJ8RQ==}
     engines: {node: '>=18'}
     cpu: [arm]
     os: [android]
 
+  '@esbuild/android-arm@0.28.1':
+    resolution: {integrity: sha512-0k2F129Xdio1TdJfzJ8sy1Q47vUD2NnwdhiAf7drUN1EBTfPf4hsFCtmMgu/6m8JSzsBrlmVjudMBQqOfG8usQ==}
+    engines: {node: '>=18'}
+    cpu: [arm]
+    os: [android]
+
   '@esbuild/android-x64@0.27.7':
     resolution: {integrity: sha512-x5VpMODneVDb70PYV2VQOmIUUiBtY3D3mPBG8NxVk5CogneYhkR7MmM3yR/uMdITLrC1ml/NV1rj4bMJuy9MCg==}
     engines: {node: '>=18'}
     cpu: [x64]
     os: [android]
 
+  '@esbuild/android-x64@0.28.1':
+    resolution: {integrity: sha512-dbwY7ltSMDWsRatcRpCnES4F+im88OCUgGZjy52shC7GqHRE/cYlxNbB4Z4UpJswpcc4Qxd2oE/ufM0p61IKng==}
+    engines: {node: '>=18'}
+    cpu: [x64]
+    os: [android]
+
   '@esbuild/darwin-arm64@0.27.7':
     resolution: {integrity: sha512-5lckdqeuBPlKUwvoCXIgI2D9/ABmPq3Rdp7IfL70393YgaASt7tbju3Ac+ePVi3KDH6N2RqePfHnXkaDtY9fkw==}
     engines: {node: '>=18'}
     cpu: [arm64]
     os: [darwin]
 
+  '@esbuild/darwin-arm64@0.28.1':
+    resolution: {integrity: sha512-TZbWkQY7kvTAXbXUT7uVACR5cMHsDiSz9z7ZKAX/RTq/WJEk3QyRr0wZpNhBDX+/0CtdqUIJlOiodQcta6tY3Q==}
+    engines: {node: '>=18'}
+    cpu: [arm64]
+    os: [darwin]
+
   '@esbuild/darwin-x64@0.27.7':
     resolution: {integrity: sha512-rYnXrKcXuT7Z+WL5K980jVFdvVKhCHhUwid+dDYQpH+qu+TefcomiMAJpIiC2EM3Rjtq0sO3StMV/+3w3MyyqQ==}
     engines: {node: '>=18'}
     cpu: [x64]
     os: [darwin]
 
+  '@esbuild/darwin-x64@0.28.1':
+    resolution: {integrity: sha512-zfdzgK9ACBNZLI/CyHTOx81SyNbM6YXn7rxSgX97VjyiPl9W1i4Ka4fgKECEoFCKGpvBj5qArWIGgQjOwkgskQ==}
+    engines: {node: '>=18'}
+    cpu: [x64]
+    os: [darwin]
+
   '@esbuild/freebsd-arm64@0.27.7':
     resolution: {integrity: sha512-B48PqeCsEgOtzME2GbNM2roU29AMTuOIN91dsMO30t+Ydis3z/3Ngoj5hhnsOSSwNzS+6JppqWsuhTp6E82l2w==}
     engines: {node: '>=18'}
     cpu: [arm64]
     os: [freebsd]
 
+  '@esbuild/freebsd-arm64@0.28.1':
+    resolution: {integrity: sha512-wG2EA8ENdEI0qhkSZMjfqrdY+ziCYCPMmtZjjIwOmXFjmyzEHn+UUxk5of+SYsjtfs3VpnlC7QLzSI5hY/rOAw==}
+    engines: {node: '>=18'}
+    cpu: [arm64]
+    os: [freebsd]
+
   '@esbuild/freebsd-x64@0.27.7':
     resolution: {integrity: sha512-jOBDK5XEjA4m5IJK3bpAQF9/Lelu/Z9ZcdhTRLf4cajlB+8VEhFFRjWgfy3M1O4rO2GQ/b2dLwCUGpiF/eATNQ==}
     engines: {node: '>=18'}
     cpu: [x64]
     os: [freebsd]
 
+  '@esbuild/freebsd-x64@0.28.1':
+    resolution: {integrity: sha512-i7dZ9vQgnvSCzi/rYCXNgtF/U+eKZNJBzu3eTQbRgHnM7tNSizLOkRFAl3qzVc/Op/u5YkHHa4pf/3DOYHthLQ==}
+    engines: {node: '>=18'}
+    cpu: [x64]
+    os: [freebsd]
+
   '@esbuild/linux-arm64@0.27.7':
     resolution: {integrity: sha512-RZPHBoxXuNnPQO9rvjh5jdkRmVizktkT7TCDkDmQ0W2SwHInKCAV95GRuvdSvA7w4VMwfCjUiPwDi0ZO6Nfe9A==}
     engines: {node: '>=18'}
     cpu: [arm64]
     os: [linux]
 
+  '@esbuild/linux-arm64@0.28.1':
+    resolution: {integrity: sha512-yHs+0uc8+nvEAfAfxrWQKK5peSNzBc4PegcMO0EJ2hT71uA7vB8Ihg2e77R2P7SG5uYjPbHlLLmve4LLLRCf0g==}
+    engines: {node: '>=18'}
+    cpu: [arm64]
+    os: [linux]
+
   '@esbuild/linux-arm@0.27.7':
     resolution: {integrity: sha512-RkT/YXYBTSULo3+af8Ib0ykH8u2MBh57o7q/DAs3lTJlyVQkgQvlrPTnjIzzRPQyavxtPtfg0EopvDyIt0j1rA==}
     engines: {node: '>=18'}
     cpu: [arm]
     os: [linux]
 
+  '@esbuild/linux-arm@0.28.1':
+    resolution: {integrity: sha512-qVXBOHQS+d5Y722GwJzJUtOLlX7km3CraOaGormF1pDtPd2C/l1SHRPgjLunLGe51Sh5YYWKMFDyV4SxgMQYTQ==}
+    engines: {node: '>=18'}
+    cpu: [arm]
+    os: [linux]
+
   '@esbuild/linux-ia32@0.27.7':
     resolution: {integrity: sha512-GA48aKNkyQDbd3KtkplYWT102C5sn/EZTY4XROkxONgruHPU72l+gW+FfF8tf2cFjeHaRbWpOYa/uRBz/Xq1Pg==}
     engines: {node: '>=18'}
     cpu: [ia32]
     os: [linux]
 
+  '@esbuild/linux-ia32@0.28.1':
+    resolution: {integrity: sha512-d1z4ZuP0ajrfz/FhGT4vv278rX8KnPPJx8i5+AtK7TYbx9Le9F1hyzurZpkEyjkGa9dUGhQow4C1NmeGvqxN2w==}
+    engines: {node: '>=18'}
+    cpu: [ia32]
+    os: [linux]
+
   '@esbuild/linux-loong64@0.27.7':
     resolution: {integrity: sha512-a4POruNM2oWsD4WKvBSEKGIiWQF8fZOAsycHOt6JBpZ+JN2n2JH9WAv56SOyu9X5IqAjqSIPTaJkqN8F7XOQ5Q==}
     engines: {node: '>=18'}
     cpu: [loong64]
     os: [linux]
 
+  '@esbuild/linux-loong64@0.28.1':
+    resolution: {integrity: sha512-M5sRjUVZrkm1OAPR3dlOYzNmN+loZKGVi1VUQGrwuqLcbR6qeAz+famMhjASeH3YVKvZz+zT1jlh/keC3Rj/lg==}
+    engines: {node: '>=18'}
+    cpu: [loong64]
+    os: [linux]
+
   '@esbuild/linux-mips64el@0.27.7':
     resolution: {integrity: sha512-KabT5I6StirGfIz0FMgl1I+R1H73Gp0ofL9A3nG3i/cYFJzKHhouBV5VWK1CSgKvVaG4q1RNpCTR2LuTVB3fIw==}
     engines: {node: '>=18'}
     cpu: [mips64el]
     os: [linux]
 
+  '@esbuild/linux-mips64el@0.28.1':
+    resolution: {integrity: sha512-mRObBZeHh2OxcBFPWE/FjylkRgZdYuiTR3vaTozquCGOH14iP9oN4x4Ge81CoIDYQrXmIxpFumJBu5MtZpnQJQ==}
+    engines: {node: '>=18'}
+    cpu: [mips64el]
+    os: [linux]
+
   '@esbuild/linux-ppc64@0.27.7':
     resolution: {integrity: sha512-gRsL4x6wsGHGRqhtI+ifpN/vpOFTQtnbsupUF5R5YTAg+y/lKelYR1hXbnBdzDjGbMYjVJLJTd2OFmMewAgwlQ==}
     engines: {node: '>=18'}
     cpu: [ppc64]
     os: [linux]
 
+  '@esbuild/linux-ppc64@0.28.1':
+    resolution: {integrity: sha512-slScBsMAb3GFDcdrCgLwZtPYRoH2H/youv10QiZyRjmsP48fznoveWytSgCI/R0ZcUgpc0ZhIUEx6LHts8yrfQ==}
+    engines: {node: '>=18'}
+    cpu: [ppc64]
+    os: [linux]
+
   '@esbuild/linux-riscv64@0.27.7':
     resolution: {integrity: sha512-hL25LbxO1QOngGzu2U5xeXtxXcW+/GvMN3ejANqXkxZ/opySAZMrc+9LY/WyjAan41unrR3YrmtTsUpwT66InQ==}
     engines: {node: '>=18'}
     cpu: [riscv64]
     os: [linux]
 
+  '@esbuild/linux-riscv64@0.28.1':
+    resolution: {integrity: sha512-kw0owk1o0GFETUJyW0jc0G4Yzs0BHZn0JDZ8JRT088vjJYX777BAs1fDGxAC+q831qOs2DTC96mNsG2opdfyyQ==}
+    engines: {node: '>=18'}
+    cpu: [riscv64]
+    os: [linux]
+
   '@esbuild/linux-s390x@0.27.7':
     resolution: {integrity: sha512-2k8go8Ycu1Kb46vEelhu1vqEP+UeRVj2zY1pSuPdgvbd5ykAw82Lrro28vXUrRmzEsUV0NzCf54yARIK8r0fdw==}
     engines: {node: '>=18'}
     cpu: [s390x]
     os: [linux]
 
+  '@esbuild/linux-s390x@0.28.1':
+    resolution: {integrity: sha512-/lAIjX8aYFRByhh6L5rYtPEDRqa9de/4V/juOXcta5frjvzXO4/sqEtyytse0g3zZFuWu5cDN0MkLz2qRDD2Ag==}
+    engines: {node: '>=18'}
+    cpu: [s390x]
+    os: [linux]
+
   '@esbuild/linux-x64@0.27.7':
     resolution: {integrity: sha512-hzznmADPt+OmsYzw1EE33ccA+HPdIqiCRq7cQeL1Jlq2gb1+OyWBkMCrYGBJ+sxVzve2ZJEVeePbLM2iEIZSxA==}
     engines: {node: '>=18'}
     cpu: [x64]
     os: [linux]
 
+  '@esbuild/linux-x64@0.28.1':
+    resolution: {integrity: sha512-u/anNYF2mmVOEDwLtnQ1wOr3EZ9sTNGLWrsYGYwHWzGA3Si84IOkHXlbWTD1NB+9/1lcnweYKO54uhxZydNzfA==}
+    engines: {node: '>=18'}
+    cpu: [x64]
+    os: [linux]
+
   '@esbuild/netbsd-arm64@0.27.7':
     resolution: {integrity: sha512-b6pqtrQdigZBwZxAn1UpazEisvwaIDvdbMbmrly7cDTMFnw/+3lVxxCTGOrkPVnsYIosJJXAsILG9XcQS+Yu6w==}
     engines: {node: '>=18'}
     cpu: [arm64]
     os: [netbsd]
 
+  '@esbuild/netbsd-arm64@0.28.1':
+    resolution: {integrity: sha512-oks0DYbLwWMmaakTsCb+zL4E+aHRVLom9IJZOAthMQEPiQmydXHkziYEsGYRx0uNV/IjEKGAV941JzH02pflqw==}
+    engines: {node: '>=18'}
+    cpu: [arm64]
+    os: [netbsd]
+
   '@esbuild/netbsd-x64@0.27.7':
     resolution: {integrity: sha512-OfatkLojr6U+WN5EDYuoQhtM+1xco+/6FSzJJnuWiUw5eVcicbyK3dq5EeV/QHT1uy6GoDhGbFpprUiHUYggrw==}
     engines: {node: '>=18'}
     cpu: [x64]
     os: [netbsd]
 
+  '@esbuild/netbsd-x64@0.28.1':
+    resolution: {integrity: sha512-aeL6lAnN89Hz43Mlh1G8ARasbuoYvSITDEx0tHh5b7jJnHcssqgjy9Yx430GDpmCa6OyrKoS0aNRjKundRizGg==}
+    engines: {node: '>=18'}
+    cpu: [x64]
+    os: [netbsd]
+
   '@esbuild/openbsd-arm64@0.27.7':
     resolution: {integrity: sha512-AFuojMQTxAz75Fo8idVcqoQWEHIXFRbOc1TrVcFSgCZtQfSdc1RXgB3tjOn/krRHENUB4j00bfGjyl2mJrU37A==}
     engines: {node: '>=18'}
     cpu: [arm64]
     os: [openbsd]
 
+  '@esbuild/openbsd-arm64@0.28.1':
+    resolution: {integrity: sha512-MEFJe5C3R8pwXdZ5Y21oo6m7ePiS0d9pWucn99O/wvyJZChoIQKrQDxKrGeW8F5+T0okTHesAmDeiHDTIq0V/Q==}
+    engines: {node: '>=18'}
+    cpu: [arm64]
+    os: [openbsd]
+
   '@esbuild/openbsd-x64@0.27.7':
     resolution: {integrity: sha512-+A1NJmfM8WNDv5CLVQYJ5PshuRm/4cI6WMZRg1by1GwPIQPCTs1GLEUHwiiQGT5zDdyLiRM/l1G0Pv54gvtKIg==}
     engines: {node: '>=18'}
     cpu: [x64]
     os: [openbsd]
 
+  '@esbuild/openbsd-x64@0.28.1':
+    resolution: {integrity: sha512-i/ZLIOafE0Z8cI/XANJAixoJL/uRAoS2xOA3rb0xN+KK0K177cMAsQYkzHtBrtMXAKuAc7HGgcWiZ/sRC1Nxgw==}
+    engines: {node: '>=18'}
+    cpu: [x64]
+    os: [openbsd]
+
   '@esbuild/openharmony-arm64@0.27.7':
     resolution: {integrity: sha512-+KrvYb/C8zA9CU/g0sR6w2RBw7IGc5J2BPnc3dYc5VJxHCSF1yNMxTV5LQ7GuKteQXZtspjFbiuW5/dOj7H4Yw==}
     engines: {node: '>=18'}
     cpu: [arm64]
     os: [openharmony]
 
+  '@esbuild/openharmony-arm64@0.28.1':
+    resolution: {integrity: sha512-ge+Z7EXFNt2BO1oAMsVpiQ8EwndV9i1xXerAeTIK7AtPs3bKFXQM7nlRxDSIUIMeueR1CNXxqztLzdNeReKBJg==}
+    engines: {node: '>=18'}
+    cpu: [arm64]
+    os: [openharmony]
+
   '@esbuild/sunos-x64@0.27.7':
     resolution: {integrity: sha512-ikktIhFBzQNt/QDyOL580ti9+5mL/YZeUPKU2ivGtGjdTYoqz6jObj6nOMfhASpS4GU4Q/Clh1QtxWAvcYKamA==}
     engines: {node: '>=18'}
     cpu: [x64]
     os: [sunos]
 
+  '@esbuild/sunos-x64@0.28.1':
+    resolution: {integrity: sha512-BEjgtECkL3vY+SaSQ6nzVfiALUeFxpawyp8Jmf5PtYhf1Ug40N1h/hxlhts+f1FvSvarEigdxS3BlSMI2PJLcQ==}
+    engines: {node: '>=18'}
+    cpu: [x64]
+    os: [sunos]
+
   '@esbuild/win32-arm64@0.27.7':
     resolution: {integrity: sha512-7yRhbHvPqSpRUV7Q20VuDwbjW5kIMwTHpptuUzV+AA46kiPze5Z7qgt6CLCK3pWFrHeNfDd1VKgyP4O+ng17CA==}
     engines: {node: '>=18'}
     cpu: [arm64]
     os: [win32]
 
+  '@esbuild/win32-arm64@0.28.1':
+    resolution: {integrity: sha512-lCv9eK/H6ZJWbE7bh2nw54CZ9M2nupBxJcTsdk/QQnWkdSjKGuxmmH8/GWrlT1eMmZfn4dGcCjRte397WqfQXA==}
+    engines: {node: '>=18'}
+    cpu: [arm64]
+    os: [win32]
+
   '@esbuild/win32-ia32@0.27.7':
     resolution: {integrity: sha512-SmwKXe6VHIyZYbBLJrhOoCJRB/Z1tckzmgTLfFYOfpMAx63BJEaL9ExI8x7v0oAO3Zh6D/Oi1gVxEYr5oUCFhw==}
     engines: {node: '>=18'}
     cpu: [ia32]
     os: [win32]
 
+  '@esbuild/win32-ia32@0.28.1':
+    resolution: {integrity: sha512-zvb/mB2bSCoJOpoCBgYKKpX6YM6mJBlBUVUtVj41DlZJVEB6/0CKlRYxP5wWl1C1ILiCoAU5wZZ4q1P3qeS6Eg==}
+    engines: {node: '>=18'}
+    cpu: [ia32]
+    os: [win32]
+
   '@esbuild/win32-x64@0.27.7':
     resolution: {integrity: sha512-56hiAJPhwQ1R4i+21FVF7V8kSD5zZTdHcVuRFMW0hn753vVfQN8xlx4uOPT4xoGH0Z/oVATuR82AiqSTDIpaHg==}
     engines: {node: '>=18'}
     cpu: [x64]
     os: [win32]
 
+  '@esbuild/win32-x64@0.28.1':
+    resolution: {integrity: sha512-bm4Mowrv+GXMlpWX++EcXw/iLyd1o3+bJkC2DkWXYVvgZCqD/bSj9ctZeAMC3cIxgjRVR2Dufaiu4YPxr5gW1A==}
+    engines: {node: '>=18'}
+    cpu: [x64]
+    os: [win32]
+
   '@hono/node-server@2.0.1':
     resolution: {integrity: sha512-jI9yMDyFpqBeSighf/zlXnQG/nl9AyBc6aAgy4XtxJMyt/CNyJpvPfzDD+bCc2zAOmhhqtF6TnmIaY+xV4mIrw==}
     engines: {node: '>=20'}
@@ -651,6 +810,11 @@ packages:
     engines: {node: '>=18'}
     hasBin: true
 
+  esbuild@0.28.1:
+    resolution: {integrity: sha512-HrJrvZv5ayxBzPfwphOoNzkzOIIlifzk0KJrGK2c8R4+LKpMtpYLQeUdjnwjWv/LZlkH2laZk+4w78pi99D4Vw==}
+    engines: {node: '>=18'}
+    hasBin: true
+
   estree-walker@3.0.3:
     resolution: {integrity: sha512-7RUKfXgSMMkzt6ZuXmqapOurLGPPfgj6l9uRZ7lRGolvk0y2yocc35LdcxKC5PQZdn2DMqioAQ2NoWcrTKmm6g==}
 
@@ -876,6 +1040,11 @@ packages:
       typescript:
         optional: true
 
+  tsx@4.22.4:
+    resolution: {integrity: sha512-X8EX+XV4QR5xCsrgxaED954zTDfY8KqlDtskKEL0cHhyS/P8b4IFOvGDQpsC9Q1XnLq915wEfwwY/zzskCtmhg==}
+    engines: {node: '>=18.0.0'}
+    hasBin: true
+
   typescript@5.9.3:
     resolution: {integrity: sha512-jl1vZzPDinLr9eUt3J/t7V6FgNEw9QjvBPdysz9KfQDD41fQrC2Y4vKQdiaUpFT4bXlb1RHhLpp8wtm6M5TgSw==}
     engines: {node: '>=14.17'}
@@ -1050,81 +1219,159 @@ snapshots:
   '@esbuild/aix-ppc64@0.27.7':
     optional: true
 
+  '@esbuild/aix-ppc64@0.28.1':
+    optional: true
+
   '@esbuild/android-arm64@0.27.7':
     optional: true
 
+  '@esbuild/android-arm64@0.28.1':
+    optional: true
+
   '@esbuild/android-arm@0.27.7':
     optional: true
 
+  '@esbuild/android-arm@0.28.1':
+    optional: true
+
   '@esbuild/android-x64@0.27.7':
     optional: true
 
+  '@esbuild/android-x64@0.28.1':
+    optional: true
+
   '@esbuild/darwin-arm64@0.27.7':
     optional: true
 
+  '@esbuild/darwin-arm64@0.28.1':
+    optional: true
+
   '@esbuild/darwin-x64@0.27.7':
     optional: true
 
+  '@esbuild/darwin-x64@0.28.1':
+    optional: true
+
   '@esbuild/freebsd-arm64@0.27.7':
     optional: true
 
+  '@esbuild/freebsd-arm64@0.28.1':
+    optional: true
+
   '@esbuild/freebsd-x64@0.27.7':
     optional: true
 
+  '@esbuild/freebsd-x64@0.28.1':
+    optional: true
+
   '@esbuild/linux-arm64@0.27.7':
     optional: true
 
+  '@esbuild/linux-arm64@0.28.1':
+    optional: true
+
   '@esbuild/linux-arm@0.27.7':
     optional: true
 
+  '@esbuild/linux-arm@0.28.1':
+    optional: true
+
   '@esbuild/linux-ia32@0.27.7':
     optional: true
 
+  '@esbuild/linux-ia32@0.28.1':
+    optional: true
+
   '@esbuild/linux-loong64@0.27.7':
     optional: true
 
+  '@esbuild/linux-loong64@0.28.1':
+    optional: true
+
   '@esbuild/linux-mips64el@0.27.7':
     optional: true
 
+  '@esbuild/linux-mips64el@0.28.1':
+    optional: true
+
   '@esbuild/linux-ppc64@0.27.7':
     optional: true
 
+  '@esbuild/linux-ppc64@0.28.1':
+    optional: true
+
   '@esbuild/linux-riscv64@0.27.7':
     optional: true
 
+  '@esbuild/linux-riscv64@0.28.1':
+    optional: true
+
   '@esbuild/linux-s390x@0.27.7':
     optional: true
 
+  '@esbuild/linux-s390x@0.28.1':
+    optional: true
+
   '@esbuild/linux-x64@0.27.7':
     optional: true
 
+  '@esbuild/linux-x64@0.28.1':
+    optional: true
+
   '@esbuild/netbsd-arm64@0.27.7':
     optional: true
 
+  '@esbuild/netbsd-arm64@0.28.1':
+    optional: true
+
   '@esbuild/netbsd-x64@0.27.7':
     optional: true
 
+  '@esbuild/netbsd-x64@0.28.1':
+    optional: true
+
   '@esbuild/openbsd-arm64@0.27.7':
     optional: true
 
+  '@esbuild/openbsd-arm64@0.28.1':
+    optional: true
+
   '@esbuild/openbsd-x64@0.27.7':
     optional: true
 
+  '@esbuild/openbsd-x64@0.28.1':
+    optional: true
+
   '@esbuild/openharmony-arm64@0.27.7':
     optional: true
 
+  '@esbuild/openharmony-arm64@0.28.1':
+    optional: true
+
   '@esbuild/sunos-x64@0.27.7':
     optional: true
 
+  '@esbuild/sunos-x64@0.28.1':
+    optional: true
+
   '@esbuild/win32-arm64@0.27.7':
     optional: true
 
+  '@esbuild/win32-arm64@0.28.1':
+    optional: true
+
   '@esbuild/win32-ia32@0.27.7':
     optional: true
 
+  '@esbuild/win32-ia32@0.28.1':
+    optional: true
+
   '@esbuild/win32-x64@0.27.7':
     optional: true
 
+  '@esbuild/win32-x64@0.28.1':
+    optional: true
+
   '@hono/node-server@2.0.1(hono@4.12.16)':
     dependencies:
       hono: 4.12.16
@@ -1338,13 +1585,13 @@ snapshots:
       chai: 5.3.3
       tinyrainbow: 2.0.0
 
-  '@vitest/mocker@3.2.4(vite@7.3.2(@types/node@25.6.0)(yaml@2.8.4))':
+  '@vitest/mocker@3.2.4(vite@7.3.2(@types/node@25.6.0)(tsx@4.22.4)(yaml@2.8.4))':
     dependencies:
       '@vitest/spy': 3.2.4
       estree-walker: 3.0.3
       magic-string: 0.30.21
     optionalDependencies:
-      vite: 7.3.2(@types/node@25.6.0)(yaml@2.8.4)
+      vite: 7.3.2(@types/node@25.6.0)(tsx@4.22.4)(yaml@2.8.4)
 
   '@vitest/pretty-format@3.2.4':
     dependencies:
@@ -1451,6 +1698,35 @@ snapshots:
       '@esbuild/win32-ia32': 0.27.7
       '@esbuild/win32-x64': 0.27.7
 
+  esbuild@0.28.1:
+    optionalDependencies:
+      '@esbuild/aix-ppc64': 0.28.1
+      '@esbuild/android-arm': 0.28.1
+      '@esbuild/android-arm64': 0.28.1
+      '@esbuild/android-x64': 0.28.1
+      '@esbuild/darwin-arm64': 0.28.1
+      '@esbuild/darwin-x64': 0.28.1
+      '@esbuild/freebsd-arm64': 0.28.1
+      '@esbuild/freebsd-x64': 0.28.1
+      '@esbuild/linux-arm': 0.28.1
+      '@esbuild/linux-arm64': 0.28.1
+      '@esbuild/linux-ia32': 0.28.1
+      '@esbuild/linux-loong64': 0.28.1
+      '@esbuild/linux-mips64el': 0.28.1
+      '@esbuild/linux-ppc64': 0.28.1
+      '@esbuild/linux-riscv64': 0.28.1
+      '@esbuild/linux-s390x': 0.28.1
+      '@esbuild/linux-x64': 0.28.1
+      '@esbuild/netbsd-arm64': 0.28.1
+      '@esbuild/netbsd-x64': 0.28.1
+      '@esbuild/openbsd-arm64': 0.28.1
+      '@esbuild/openbsd-x64': 0.28.1
+      '@esbuild/openharmony-arm64': 0.28.1
+      '@esbuild/sunos-x64': 0.28.1
+      '@esbuild/win32-arm64': 0.28.1
+      '@esbuild/win32-ia32': 0.28.1
+      '@esbuild/win32-x64': 0.28.1
+
   estree-walker@3.0.3:
     dependencies:
       '@types/estree': 1.0.8
@@ -1548,11 +1824,12 @@ snapshots:
       mlly: 1.8.2
       pathe: 2.0.3
 
-  postcss-load-config@6.0.1(postcss@8.5.13)(yaml@2.8.4):
+  postcss-load-config@6.0.1(postcss@8.5.13)(tsx@4.22.4)(yaml@2.8.4):
     dependencies:
       lilconfig: 3.1.3
     optionalDependencies:
       postcss: 8.5.13
+      tsx: 4.22.4
       yaml: 2.8.4
 
   postcss@8.5.13:
@@ -1647,7 +1924,7 @@ snapshots:
 
   ts-interface-checker@0.1.13: {}
 
-  tsup@8.5.1(postcss@8.5.13)(typescript@5.9.3)(yaml@2.8.4):
+  tsup@8.5.1(postcss@8.5.13)(tsx@4.22.4)(typescript@5.9.3)(yaml@2.8.4):
     dependencies:
       bundle-require: 5.1.0(esbuild@0.27.7)
       cac: 6.7.14
@@ -1658,7 +1935,7 @@ snapshots:
       fix-dts-default-cjs-exports: 1.0.1
       joycon: 3.1.1
       picocolors: 1.1.1
-      postcss-load-config: 6.0.1(postcss@8.5.13)(yaml@2.8.4)
+      postcss-load-config: 6.0.1(postcss@8.5.13)(tsx@4.22.4)(yaml@2.8.4)
       resolve-from: 5.0.0
       rollup: 4.60.2
       source-map: 0.7.6
@@ -1675,6 +1952,12 @@ snapshots:
       - tsx
       - yaml
 
+  tsx@4.22.4:
+    dependencies:
+      esbuild: 0.28.1
+    optionalDependencies:
+      fsevents: 2.3.3
+
   typescript@5.9.3: {}
 
   ufo@1.6.4: {}
@@ -1698,13 +1981,13 @@ snapshots:
       - utf-8-validate
       - zod
 
-  vite-node@3.2.4(@types/node@25.6.0)(yaml@2.8.4):
+  vite-node@3.2.4(@types/node@25.6.0)(tsx@4.22.4)(yaml@2.8.4):
     dependencies:
       cac: 6.7.14
       debug: 4.4.3
       es-module-lexer: 1.7.0
       pathe: 2.0.3
-      vite: 7.3.2(@types/node@25.6.0)(yaml@2.8.4)
+      vite: 7.3.2(@types/node@25.6.0)(tsx@4.22.4)(yaml@2.8.4)
     transitivePeerDependencies:
       - '@types/node'
       - jiti
@@ -1719,7 +2002,7 @@ snapshots:
       - tsx
       - yaml
 
-  vite@7.3.2(@types/node@25.6.0)(yaml@2.8.4):
+  vite@7.3.2(@types/node@25.6.0)(tsx@4.22.4)(yaml@2.8.4):
     dependencies:
       esbuild: 0.27.7
       fdir: 6.5.0(picomatch@4.0.4)
@@ -1730,13 +2013,14 @@ snapshots:
     optionalDependencies:
       '@types/node': 25.6.0
       fsevents: 2.3.3
+      tsx: 4.22.4
       yaml: 2.8.4
 
-  vitest@3.2.4(@types/node@25.6.0)(yaml@2.8.4):
+  vitest@3.2.4(@types/node@25.6.0)(tsx@4.22.4)(yaml@2.8.4):
     dependencies:
       '@types/chai': 5.2.3
       '@vitest/expect': 3.2.4
-      '@vitest/mocker': 3.2.4(vite@7.3.2(@types/node@25.6.0)(yaml@2.8.4))
+      '@vitest/mocker': 3.2.4(vite@7.3.2(@types/node@25.6.0)(tsx@4.22.4)(yaml@2.8.4))
       '@vitest/pretty-format': 3.2.4
       '@vitest/runner': 3.2.4
       '@vitest/snapshot': 3.2.4
@@ -1754,8 +2038,8 @@ snapshots:
       tinyglobby: 0.2.16
       tinypool: 1.1.1
       tinyrainbow: 2.0.0
-      vite: 7.3.2(@types/node@25.6.0)(yaml@2.8.4)
-      vite-node: 3.2.4(@types/node@25.6.0)(yaml@2.8.4)
+      vite: 7.3.2(@types/node@25.6.0)(tsx@4.22.4)(yaml@2.8.4)
+      vite-node: 3.2.4(@types/node@25.6.0)(tsx@4.22.4)(yaml@2.8.4)
       why-is-node-running: 2.3.0
     optionalDependencies:
       '@types/node': 25.6.0
diff --git a/src/autodata/build-dataset.ts b/src/autodata/build-dataset.ts
new file mode 100644
index 0000000..a2e1853
--- /dev/null
+++ b/src/autodata/build-dataset.ts
@@ -0,0 +1,172 @@
+/**
+ * Build a discriminative QA dataset from a real source document with REAL two-tier solvers.
+ *
+ * Grounds on the document (or takes an already-grounded excerpt), runs `createDataCreationLoop` with
+ * the live router roles, writes the accepted examples as JSONL, and returns the numbers the
+ * calibration depends on: the per-example strong/weak gap for the LOOP-ACCEPTED (agentic) examples
+ * AND for the challenger's FIRST drafts (plain), plus the cost ledger split by role.
+ */
+
+import { mkdir, writeFile } from 'node:fs/promises'
+import { dirname } from 'node:path'
+import { CostLedger } from '@tangle-network/agent-eval'
+import {
+  createDataCreationLoop,
+  discriminativeAcceptRule,
+  type ExampleEvaluation,
+} from './data-creation-loop'
+import { type GroundedDoc, groundDoc } from './grounding'
+import { buildAutodataRoles, type RouterCallRecord } from './router-roles'
+
+export interface DiscriminativeThresholds {
+  minStrong?: number // accept-rule override; presumably the minimum strong score — TODO confirm
+  maxWeak?: number // accept-rule override; presumably the weak-score ceiling — TODO confirm
+  minGap?: number // accept-rule override; presumably the minimum strong−weak gap — TODO confirm
+}
+
+export interface AutodataDatasetConfig {
+  apiKey: string // forwarded to buildAutodataRoles
+  baseUrl?: string // forwarded to buildAutodataRoles
+  /** A grounded excerpt, or a spec to fetch + chunk one from a real URL. */
+  source: GroundedDoc | { url: string; focus?: string; cacheDir?: string }
+  /** Where to write the JSONL dataset; missing parent directories are created. */
+  outPath: string
+  target?: number // loop `target`; 3 when omitted
+  samples?: number // loop `samples`; 3 when omitted
+  maxRetries?: number // loop `maxRetries`; 4 when omitted
+  thresholds?: DiscriminativeThresholds // spread over the accept rule's input
+  models?: { challenger?: string; weak?: string; strong?: string; judge?: string }
+  signal?: AbortSignal // handed to groundDoc and to the data-creation loop
+}
+
+/** One accepted, discriminating example + scores + provenance; written as one JSONL line. */
+export interface DatasetRow {
+  context: string // `context`, `question`, `reference`, `rubric` are copied from the example
+  question: string
+  reference: string
+  rubric: readonly string[]
+  weakScore: number // copied from the loop's evaluation of the accepted example
+  strongScore: number
+  gap: number // copied from the evaluation; presumably strongScore − weakScore — TODO confirm
+  source: { url: string; headingPath: string; chunkIndex: number } // from the grounded doc
+}
+
+export interface AutodataDatasetResult {
+  source: GroundedDoc // the excerpt the loop ran on (passed in, or fetched via groundDoc)
+  accepted: ExampleEvaluation[] // the loop's accepted evaluations, passed through unmodified
+  rows: DatasetRow[] // one per accepted evaluation; exactly what was written to outPath
+  /** Mean strong−weak gap on the challenger's first-draft (plain-generation) questions. */
+  plainGapMean: number | null
+  /** Mean strong−weak gap on the loop-accepted (agentic) questions. */
+  agenticGapMean: number | null
+  /** Mean of the BEST gap the refinement reached per slot (accepted or not) — the informative
+   *  comparison against `plainGapMean` even when nothing clears the accept bar. */
+  refinedGapMean: number | null
+  plainGaps: number[] // raw per-example gaps; each mean is null when its list is empty
+  agenticGaps: number[]
+  refinedGaps: number[]
+  cost: CostLedger // the ledger returned by the loop
+  costPerExampleUsd: number | null // `cost.costPerCompletedTask()`
+  /** How many router calls were priced by the router vs rate-estimated. */
+  callProvenance: { router: number; estimated: number }
+  outPath: string // echo of `config.outPath`
+}
+
+function mean(xs: number[]): number | null {
+  return xs.length > 0 ? xs.reduce((sum, x) => sum + x, 0) / xs.length : null // null = no samples
+}
+
+function isGrounded(s: AutodataDatasetConfig['source']): s is GroundedDoc {
+  return 'doc' in s && typeof s.doc === 'string' // the URL spec has no `doc` field
+}
+
+function challengerInstruction(doc: string): string {
+  const task = [
+    'Write ONE hard exam question grounded in this excerpt. It must require multi-step reasoning ',
+    'over the excerpt (a small model should get it wrong, a strong model right), never a verbatim ',
+    'lookup. Return STRICT JSON: {"context": string, "question": string, "reference": string, ',
+    '"rubric": string[] }.',
+  ].join('') // task + reply contract, glued without separators
+  return `SOURCE DOCUMENT EXCERPT:\n\n${doc}\n\n${task}`
+}
+
+/** Ground the source if needed, run the loop, write JSONL to outPath; returns gap stats + cost. */
+export async function buildAutodataDataset(
+  config: AutodataDatasetConfig,
+): Promise<AutodataDatasetResult> {
+  // A pre-grounded excerpt is used as-is; a URL spec is fetched + chunked first.
+  let source: GroundedDoc
+  if (isGrounded(config.source)) {
+    source = config.source
+  } else {
+    const { url, focus, cacheDir } = config.source
+    source = await groundDoc({ url, focus, cacheDir, signal: config.signal })
+  }
+
+  const provenance = { router: 0, estimated: 0 }
+  const onCall = (rec: RouterCallRecord): void => {
+    provenance[rec.costSource === 'router' ? 'router' : 'estimated'] += 1
+  }
+
+  // One ledger instance is handed to both the roles and the loop.
+  const ledger = new CostLedger()
+  const roles = buildAutodataRoles({
+    apiKey: config.apiKey,
+    baseUrl: config.baseUrl,
+    challengerModel: config.models?.challenger,
+    weakModel: config.models?.weak,
+    strongModel: config.models?.strong,
+    judgeModel: config.models?.judge,
+    ledger,
+    onCall,
+  })
+
+  const result = await createDataCreationLoop({
+    doc: source.doc,
+    baseInstruction: challengerInstruction,
+    challenger: roles.challenger,
+    weakSolver: roles.weakSolver,
+    strongSolver: roles.strongSolver,
+    judge: roles.judge,
+    // Caller thresholds are spread last, so they win over same-named keys of the rule input.
+    accept: (scores) => discriminativeAcceptRule({ ...scores, ...config.thresholds }),
+    target: config.target ?? 3,
+    samples: config.samples ?? 3,
+    maxRetries: config.maxRetries ?? 4,
+    cost: ledger,
+    signal: config.signal,
+  })
+
+  const rows: DatasetRow[] = result.accepted.map(({ example, weakScore, strongScore, gap }) => ({
+    context: example.context,
+    question: example.question,
+    reference: example.reference,
+    rubric: example.rubric,
+    weakScore,
+    strongScore,
+    gap,
+    source: { url: source.url, headingPath: source.headingPath, chunkIndex: source.chunkIndex },
+  }))
+
+  // mkdir first so a fresh output directory works; every row ends with "\n" (JSONL).
+  await mkdir(dirname(config.outPath), { recursive: true })
+  const jsonl = rows.map((row) => `${JSON.stringify(row)}\n`).join('')
+  await writeFile(config.outPath, jsonl)
+
+  const { cost, plainGaps, agenticGaps, refinedGaps } = result
+  return {
+    source,
+    accepted: result.accepted,
+    rows,
+    plainGapMean: mean(plainGaps),
+    agenticGapMean: mean(agenticGaps),
+    refinedGapMean: mean(refinedGaps),
+    plainGaps,
+    agenticGaps,
+    refinedGaps,
+    cost,
+    costPerExampleUsd: cost.costPerCompletedTask(),
+    callProvenance: provenance,
+    outPath: config.outPath,
+  }
+}
diff --git a/src/autodata/data-creation-loop.test.ts b/src/autodata/data-creation-loop.test.ts
new file mode 100644
index 0000000..df917cc
--- /dev/null
+++ b/src/autodata/data-creation-loop.test.ts
@@ -0,0 +1,130 @@
+import { describe, expect, it } from 'vitest'
+import {
+  createDataCreationLoop,
+  discriminativeAcceptRule,
+  qualityCheck,
+} from './data-creation-loop'
+import {
+  baseInstruction,
+  buildRubricJudge,
+  challengerClient,
+  groundingDoc,
+  solverClient,
+} from './offline-fixtures'
+import { parseDataExample } from './router-roles'
+
+describe('discriminativeAcceptRule (the new piece)', () => {
+  it('accepts an example that separates strong from weak', () => {
+    const { accept, reason } = discriminativeAcceptRule({ strongScore: 0.77, weakScore: 0.46 })
+    expect(accept).toBe(true)
+    expect(reason).toContain('discriminates')
+  })
+
+  it('rejects "too easy" when the weak solver passes', () => {
+    const { accept, reason } = discriminativeAcceptRule({ strongScore: 0.86, weakScore: 0.84 })
+    expect(accept).toBe(false)
+    expect(reason).toContain('too easy')
+  })
+
+  it('rejects "too hard" when even the strong solver misses', () => {
+    const { accept, reason } = discriminativeAcceptRule({ strongScore: 0.55, weakScore: 0.3 })
+    expect(accept).toBe(false)
+    expect(reason).toContain('too hard')
+  })
+
+  it('rejects when the gap is below minGap even if both thresholds hold', () => {
+    const { accept, reason } = discriminativeAcceptRule({ strongScore: 0.66, weakScore: 0.48 })
+    expect(accept).toBe(false)
+    expect(reason).toContain('not discriminative')
+  })
+
+  it('honors custom thresholds', () => {
+    const scores = { strongScore: 0.77, weakScore: 0.46, minGap: 0.4 } // gap ≈ 0.31 < 0.4
+    expect(discriminativeAcceptRule(scores).accept).toBe(false)
+  })
+})
+
+describe('qualityCheck', () => {
+  it('rejects a reference that leaks verbatim into the context', () => {
+    const { ok, reason } = qualityCheck({
+      context: 'The answer is 42 and nothing else matters.',
+      question: 'What is the answer?',
+      reference: 'The answer is 42',
+      rubric: ['a', 'b'],
+    })
+    expect(ok).toBe(false)
+    expect(reason).toContain('leaked')
+  })
+
+  it('rejects a thin rubric', () => {
+    const res = qualityCheck({ context: 'c', question: 'q', reference: 'r', rubric: ['only one'] })
+    expect(res.ok).toBe(false)
+    expect(res.reason).toContain('thin rubric')
+  })
+
+  it('passes a clean example', () => {
+    const { ok } = qualityCheck({
+      context: 'Some grounding context that does not contain the answer phrasing.',
+      question: 'Why does it matter?',
+      reference: 'Because of a distinct reasoning chain.',
+      rubric: ['states X', 'explains Y'],
+    })
+    expect(ok).toBe(true)
+  })
+})
+
+describe('parseDataExample (challenger JSON parsing)', () => {
+  it('parses a bare JSON object', () => {
+    const ex = parseDataExample('{"context":"c","question":"q","reference":"r","rubric":["a","b"]}')
+    expect(ex.question).toBe('q') // fields are read back under their JSON keys
+    expect(ex.rubric).toHaveLength(2)
+  })
+
+  it('parses JSON wrapped in a ```json fence with surrounding prose', () => {
+    const ex = parseDataExample(
+      'Sure, here is the example:\n```json\n{"context":"c","question":"q","reference":"r","rubric":["a","b"]}\n```\nDone.',
+    )
+    expect(ex.reference).toBe('r') // prose around the fence must not break parsing
+  })
+
+  it('throws loud when no JSON object is present', () => {
+    expect(() => parseDataExample('no json here')).toThrow()
+  })
+
+  it('throws loud when a required field is missing', () => {
+    expect(() => parseDataExample('{"context":"c","question":"q"}')).toThrow() // no reference
+  })
+})
+
+describe('createDataCreationLoop (offline)', () => {
+  it('manufactures discriminating examples and separates plain from agentic gaps', async () => {
+    const { accepted, plainGaps, agenticGaps, corpus, cost } = await createDataCreationLoop({
+      doc: groundingDoc,
+      baseInstruction,
+      challenger: challengerClient(),
+      weakSolver: solverClient('weak'),
+      strongSolver: solverClient('strong'),
+      judge: buildRubricJudge(),
+      target: 2,
+      samples: 3,
+      maxRetries: 4,
+    })
+
+    expect(accepted).toHaveLength(2)
+    for (const { decision, gap } of accepted) {
+      expect(decision.accept).toBe(true)
+      expect(gap).toBeGreaterThanOrEqual(0.2)
+    }
+
+    const average = (gaps: number[]) => gaps.reduce((sum, g) => sum + g, 0) / gaps.length
+    const plainMean = average(plainGaps)
+    const agenticMean = average(agenticGaps)
+    expect(plainMean).toBeLessThan(0.1)
+    expect(agenticMean).toBeGreaterThan(0.25)
+    expect(agenticMean - plainMean).toBeGreaterThanOrEqual(0.15)
+
+    const stored = await corpus.query({ area: 'training-data' })
+    expect(stored).toHaveLength(2)
+    expect(cost.summary().totalCostUsd).toBeGreaterThan(0)
+  })
+})
diff --git a/src/autodata/data-creation-loop.ts b/src/autodata/data-creation-loop.ts
new file mode 100644
index 0000000..9022cbb
--- /dev/null
+++ b/src/autodata/data-creation-loop.ts
@@ -0,0 +1,491 @@
+/**
+ * The Autodata / Agentic Self-Instruct INNER loop: an agent MANUFACTURES hard training examples
+ * from a grounding doc and keeps only the ones that DISCRIMINATE a strong solver from a weak one.
+ *
+ * PROVENANCE — this loop is vendored verbatim from agent-runtime
+ * `examples/agentic-data-creation/agentic-data-creation.ts` (branch
+ * `examples/agentic-data-creation`). It is an EXAMPLE in agent-runtime, not a published runtime
+ * export — examples are not shipped in the npm dist — so agent-knowledge cannot import it and
+ * vendors it here (the "copy with a note" path). Every primitive it COMPOSES is reused from the
+ * published packages, nothing is re-implemented: the judge is `llmJudge` (agent-eval), the loop
+ * kernel is `runLoop` (agent-runtime/loops), the store is `InMemoryCorpus` (agent-runtime/loops),
+ * the cost accounting is `CostLedger` (agent-eval). The REAL grounding (arXiv ingestion) + the REAL
+ * two-tier router solvers live in the sibling files; this file stays domain- and transport-agnostic.
+ *
+ * The whole method is four roles + one accept rule:
+ *   1. CHALLENGER  writes a candidate {context, question, reference, rubric} from the doc.
+ *   2. WEAK solver and STRONG solver each attempt it, sampled N× to average out variance.
+ *   3. JUDGE      scores every attempt against the rubric (one `llmJudge` call per attempt).
+ *   4. ACCEPT     keeps the example ONLY IF it discriminates: strong >= hi, weak < lo, gap >= g —
+ *                 plus a quality check (no context leakage, a real rubric).
+ *   On reject, the CHALLENGER driver FOLDS the reject reason into its next prompt and retries.
+ *   Accepted examples accrete into a `Corpus`.
+ *
+ * The ONE genuinely new piece is `discriminativeAcceptRule` — the paper's reward, written as a
+ * small Validator-shaped accept/reject. It is a lift candidate for agent-eval (next to
+ * `blendHeldout` / `HeldOutGate`) if it proves out across real domains; it lives here until then.
+ */
+
+import { CostLedger } from '@tangle-network/agent-eval'
+import type { JudgeConfig, Scenario } from '@tangle-network/agent-eval/campaign'
+import {
+  type AgentRunSpec,
+  type Corpus,
+  type CorpusRecord,
+  type Driver,
+  InMemoryCorpus,
+  type OutputAdapter,
+  runLoop,
+  type SandboxClient,
+  type Validator,
+} from '@tangle-network/agent-runtime/loops'
+import type { AgentProfile, SandboxEvent } from '@tangle-network/sandbox'
+
+// ── The four-role data shapes ─────────────────────────────────────────────────────────────
+
+/** One manufactured training example, grounded in `context` excerpted from the doc. */
+export interface DataExample {
+  /** The grounding excerpt the question is answerable from. */
+  readonly context: string
+  readonly question: string
+  /** The reference answer the rubric is graded against. */
+  readonly reference: string
+  /** Scoring criteria the judge applies (>= 2 for a usable example). */
+  readonly rubric: readonly string[]
+}
+
+/** What the judge scores: a solver's `answer` to one `example`. */
+export interface SolverArtifact {
+  readonly example: DataExample
+  readonly answer: string
+}
+
+/** The accept rule's verdict — keep this example, and why (or why not). */
+export interface AcceptDecision {
+  readonly accept: boolean
+  readonly reason: string
+}
+
+// ═══════════════════════════════════════════════════════════════════════════════════════════
+// THE ONE NEW PIECE — the paper's discriminative reward, as a small Validator-shaped rule.
+// ═══════════════════════════════════════════════════════════════════════════════════════════
+//
+// Autodata keeps an example ONLY IF it separates a strong solver from a weak one: the strong
+// solver should mostly get it (>= minStrong), the weak solver should mostly miss it (< maxWeak),
+// and the margin between them (the "gap") must clear minGap. That is the whole objective, so the
+// rule is the LITERAL accept criterion, never softened. The three threshold reject reasons map
+// one-to-one onto the challenger's next-prompt fold; non-finite scores are always rejected.
+export function discriminativeAcceptRule(input: {
+  /** Strong solver's mean rubric score, [0,1]. */
+  strongScore: number
+  /** Weak solver's mean rubric score, [0,1]. */
+  weakScore: number
+  /** Strong must reach at least this (else the example is unfair / too hard). Default 0.65. */
+  minStrong?: number
+  /** Weak must stay strictly below this (else the example is too easy). Default 0.5. */
+  maxWeak?: number
+  /** strong − weak must be at least this (else it does not discriminate). Default 0.2. */
+  minGap?: number
+}): AcceptDecision {
+  const { strongScore, weakScore } = input
+  const minStrong = input.minStrong ?? 0.65
+  const maxWeak = input.maxWeak ?? 0.5
+  const minGap = input.minGap ?? 0.2
+  const gap = strongScore - weakScore
+  const reject = (reason: string): AcceptDecision => ({ accept: false, reason })
+
+  // NaN compares false against every threshold, so without this guard a NaN mean would fall
+  // through all three reject branches below and be ACCEPTED into the training set.
+  if (!Number.isFinite(strongScore) || !Number.isFinite(weakScore)) {
+    return reject(`invalid scores: strong ${strongScore}, weak ${weakScore} (must be finite)`)
+  }
+  if (strongScore < minStrong) {
+    return reject(`too hard: strong solver reached only ${pct(strongScore)} (< ${pct(minStrong)})`)
+  }
+  if (weakScore >= maxWeak) {
+    return reject(`too easy: weak solver reached ${pct(weakScore)} (>= ${pct(maxWeak)})`)
+  }
+  if (gap < minGap) {
+    return reject(`not discriminative: gap ${pct(gap)} (< ${pct(minGap)})`)
+  }
+  return {
+    accept: true,
+    reason: `discriminates: strong ${pct(strongScore)} >= ${pct(minStrong)}, weak ${pct(weakScore)} < ${pct(maxWeak)}, gap ${pct(gap)} >= ${pct(minGap)}`,
+  }
+}
+
+/**
+ * Deterministic (no LLM) quality gate applied alongside the gap rule. Fails an example whose
+ * trimmed reference occurs verbatim inside its context, or whose rubric has fewer than 2 items.
+ */
+export function qualityCheck(ex: DataExample): { ok: boolean; reason: string } {
+  const fail = (reason: string) => ({ ok: false, reason })
+  const answer = ex.reference.trim()
+  // A verbatim hit means a copy-paste solver could answer without any reasoning.
+  const leaked = answer !== '' && ex.context.includes(answer)
+  if (leaked) return fail('leaked: the reference answer appears verbatim in the context')
+  // Rubric size is checked second, so a leaky example always reports the leak.
+  if (ex.rubric.length < 2) return fail('thin rubric: an example needs >= 2 scoring criteria')
+  return { ok: true, reason: 'clean' }
+}
+
+// ── Tasks + output adapters (the worker seam) ──────────────────────────────────────────────
+
+/** The challenger's task: ground on `doc`, run the `prompt` the driver authored this round. */
+interface ChallengerTask {
+  readonly doc: string
+  /** The instruction for THIS round — the refine driver rewrites it from the last reject. */
+  readonly prompt: string
+}
+
+/** One solver attempt over an `example`; `sampleIndex` distinguishes the N parallel samples. */
+interface SolverTask {
+  readonly example: DataExample
+  readonly sampleIndex: number
+}
+
+const challengerOutput: OutputAdapter<DataExample> = {
+  parse(events) {
+    const payload = resultPayload(events)
+    // A missing or malformed example is a real defect — surface it instead of passing silently.
+    if (!isDataExample(payload)) throw new Error('challenger produced no parseable DataExample')
+    return payload
+  },
+}
+
+const solverOutput: OutputAdapter<{ answer: string }> = {
+  // Accepts only an object payload carrying a string `answer`; every other shape is an error.
+  parse(events) {
+    const payload = resultPayload(events)
+    if (typeof payload !== 'object' || payload === null || !('answer' in payload)) {
+      throw new Error('solver produced no answer')
+    }
+
+    const { answer } = payload as { answer: unknown }
+    if (typeof answer !== 'string') throw new Error('solver produced no answer')
+    // Return a fresh object so extra payload fields never travel into the loop.
+    return { answer }
+  },
+}
+
+// ── N× solver sampling = an inline FANOUT driver over runLoop ────────────────────────────────
+//
+// A "round" returns N independent solver tasks (no fold between them) → the kernel runs all N,
+// the `llmJudge`-as-validator scores each against the rubric, and we AVERAGE the N scores (the
+// variance-reduced estimate the accept rule compares — not argmax). runLoop already aggregated
+// the N calls' cost, so we roll its total into the ledger under this solver's channel.
+async function sampleSolverScore(args: {
+  solver: SandboxClient
+  solverSpec: AgentRunSpec<SolverTask>
+  example: DataExample
+  judge: JudgeConfig<SolverArtifact>
+  samples: number
+  channel: string
+  ledger: CostLedger
+  signal?: AbortSignal
+}): Promise<number> {
+  const { solver, solverSpec, example, judge, samples, channel, ledger } = args
+  // Adapt the judge to the kernel's Validator seam: one judge call per solver answer.
+  const validator: Validator<{ answer: string }> = {
+    async validate(out, ctx) {
+      const score = await judge.score({
+        artifact: { example, answer: out.answer },
+        scenario: solveScenario,
+        signal: ctx.signal,
+      })
+      return { valid: !score.failed, score: score.composite, notes: score.notes }
+    },
+  }
+  // One round only: the first plan emits all N samples; any later plan returns no tasks.
+  const fanout: Driver<SolverTask, { answer: string }, 'done'> = {
+    name: `${channel}/sample-x${samples}`,
+    plan: async (task, history) =>
+      history.length === 0
+        ? Array.from({ length: samples }, (_, i) => ({ ...task, sampleIndex: i }))
+        : [],
+    decide: () => 'done',
+  }
+  // Run every sample through runLoop; iterations and concurrency are both capped at N.
+  const result = await runLoop<SolverTask, { answer: string }, 'done'>({
+    driver: fanout,
+    agentRun: solverSpec,
+    output: solverOutput,
+    validator,
+    task: { example, sampleIndex: 0 },
+    ctx: { sandboxClient: solver, signal: args.signal },
+    maxIterations: samples,
+    maxConcurrency: samples,
+  })
+  // Record the whole run's token usage and cost as a single ledger entry for this channel.
+  ledger.record({
+    model: solverSpec.profile.name ?? channel,
+    channel,
+    usage: { inputTokens: result.tokenUsage.input, outputTokens: result.tokenUsage.output },
+    actualCostUsd: result.costUsd,
+    tags: { role: channel },
+  })
+  // Average only iterations that carry a verdict; a verdict with no score counts as 0.
+  const scored = result.iterations.filter((it) => it.verdict).map((it) => it.verdict?.score ?? 0)
+  if (scored.length === 0)
+    throw new Error(`${channel}: every solver sample errored — no score to average`)
+  return scored.reduce((a, b) => a + b, 0) / scored.length
+}
+
+// ── The challenger refine driver — the FOLD ──────────────────────────────────────────────────
+
+type ChallengerDecision = 'refine' | 'accept' | 'reject'
+
+function challengerDriver(
+  maxRetries: number,
+  baseInstruction: (doc: string) => string,
+): Driver<ChallengerTask, DataExample, ChallengerDecision> {
+  return {
+    name: 'challenger-refine',
+    async plan(task, history) {
+      const attempts = history.length
+      if (attempts === 0) return [task] // nothing tried yet: send the task's own prompt
+      const verdict = history[attempts - 1]?.verdict
+      // Stop planning once the latest draft was accepted or the retry budget is used up.
+      if (verdict?.valid || attempts >= maxRetries) return []
+      // The fold: append the latest reject reason to a freshly built base instruction.
+      const why = verdict?.notes ?? 'rejected'
+      const prompt = `${baseInstruction(task.doc)}\n\nYour previous example was REJECTED: ${why}. Write a new example that fixes exactly that.`
+      return [{ ...task, prompt }]
+    },
+    decide(history) {
+      if (history.some((it) => it.verdict?.valid)) return 'accept'
+      return history.length >= maxRetries ? 'reject' : 'refine'
+    },
+  }
+}
+
+// ── One example's full evaluation (used for both the accept loop and calibration) ─────────────
+
+export interface ExampleEvaluation {
+  readonly example: DataExample // the candidate example the challenger produced
+  readonly weakScore: number // mean weak-solver score; 0 when the quality check failed
+  readonly strongScore: number // mean strong-solver score; 0 when the quality check failed
+  readonly gap: number // strongScore − weakScore
+  readonly decision: AcceptDecision // accept-rule verdict, or the quality-check rejection
+}
+
+// ── The loop ────────────────────────────────────────────────────────────────────────────────
+
+export interface DataCreationConfig {
+  /** The grounding document the challenger writes examples from. */
+  readonly doc: string
+  /** The challenger worker (prompt → DataExample). The driver authors each round's prompt. */
+  readonly challenger: SandboxClient
+  /** The weak + strong solver workers (rendered example → answer). */
+  readonly weakSolver: SandboxClient
+  readonly strongSolver: SandboxClient
+  /** The rubric judge — an `llmJudge` `JudgeConfig`. */
+  readonly judge: JudgeConfig<SolverArtifact>
+  /** The challenger's base instruction over the doc (the un-folded prompt). */
+  readonly baseInstruction: (doc: string) => string
+  /** How a solver sees one example. Default: context + question + numbered rubric + sample tag. */
+  readonly renderSolverPrompt?: (example: DataExample, sampleIndex: number) => string
+  /** Profiles materialized for each worker (names surface in traces + the cost ledger). */
+  readonly challengerProfile?: AgentProfile
+  readonly weakSolverProfile?: AgentProfile
+  readonly strongSolverProfile?: AgentProfile
+  /** The accept rule. Defaults to `discriminativeAcceptRule` at its paper thresholds. */
+  readonly accept?: (input: { strongScore: number; weakScore: number }) => AcceptDecision
+  /** How many accepted examples to manufacture. Default 3. */
+  readonly target?: number
+  /** Solver samples per example (variance reduction). Default 3. */
+  readonly samples?: number
+  /** Refine budget per example. Default 4. */
+  readonly maxRetries?: number
+  /** Where accepted examples accrete. Default a fresh `InMemoryCorpus`. */
+  readonly corpus?: Corpus
+  /** Cost ledger to record into. Default a fresh `CostLedger`. */
+  readonly cost?: CostLedger
+  readonly signal?: AbortSignal
+}
+
+/** Builds the default solver prompt: instruction, context, question, 1-based rubric, sample tag. */
+function defaultRenderSolverPrompt(example: DataExample, sampleIndex: number): string {
+  return [
+    'Answer the QUESTION using only the CONTEXT.\n',
+    `CONTEXT:\n${example.context}\n`,
+    `QUESTION:\n${example.question}\n`,
+    `RUBRIC (you are graded on each):\n${example.rubric.map((r, i) => `${i + 1}. ${r}`).join('\n')}`,
+    `[sample ${sampleIndex}]`,
+  ].join('\n')
+}
+
+export interface DataCreationResult {
+  /** The accepted, discriminating examples (the manufactured training set). */
+  readonly accepted: ExampleEvaluation[]
+  /** The `gap` of each accepted example — large by construction (the agentic arm). */
+  readonly agenticGaps: number[]
+  /** The `gap` of each FIRST (un-refined) draft — the plain-generation baseline for calibration. */
+  readonly plainGaps: number[]
+  /** Per slot, the BEST gap the refinement reached (max over the budget), accepted or not. Lets the
+   *  plain-vs-refined calibration stay informative even when no example clears the accept bar. */
+  readonly refinedGaps: number[]
+  readonly corpus: Corpus
+  readonly cost: CostLedger
+}
+
+/**
+ * Run the Autodata inner loop: manufacture `target` discriminating examples from `doc`, refining
+ * each via the challenger fold until it is accepted (or its retry budget runs out). Returns the
+ * accepted set, the per-example gap for the accepted (agentic) AND the first-draft (plain) examples
+ * for calibration, the corpus they accreted into, and the cost ledger.
+ */
+export async function createDataCreationLoop(
+  config: DataCreationConfig,
+): Promise<DataCreationResult> {
+  const corpus = config.corpus ?? new InMemoryCorpus()
+  const cost = config.cost ?? new CostLedger()
+  const accept = config.accept ?? ((i) => discriminativeAcceptRule(i))
+  const target = config.target ?? 3
+  const samples = config.samples ?? 3
+  const maxRetries = config.maxRetries ?? 4
+  const renderSolverPrompt = config.renderSolverPrompt ?? defaultRenderSolverPrompt
+
+  // Build the three worker specs once (task → prompt + the profile the substrate materializes).
+  const challengerSpec: AgentRunSpec<ChallengerTask> = {
+    profile: config.challengerProfile ?? ({ name: 'challenger' } as AgentProfile),
+    taskToPrompt: (t) => t.prompt,
+  }
+  const weakSolverSpec: AgentRunSpec<SolverTask> = {
+    profile: config.weakSolverProfile ?? ({ name: 'weak-solver' } as AgentProfile),
+    taskToPrompt: (t) => renderSolverPrompt(t.example, t.sampleIndex),
+  }
+  const strongSolverSpec: AgentRunSpec<SolverTask> = {
+    profile: config.strongSolverProfile ?? ({ name: 'strong-solver' } as AgentProfile),
+    taskToPrompt: (t) => renderSolverPrompt(t.example, t.sampleIndex),
+  }
+  // Per-run accumulators; each one becomes a field of the returned DataCreationResult.
+  const accepted: ExampleEvaluation[] = []
+  const agenticGaps: number[] = []
+  const plainGaps: number[] = []
+  const refinedGaps: number[] = []
+
+  for (let i = 0; i < target; i++) {
+    // The challenger validator evaluates a candidate example: sample both solvers, judge each, then
+    // apply the accept rule. It stashes each iteration's evaluation so the loop can read back the
+    // ACCEPTED one (the agentic arm) and the FIRST draft (the plain calibration baseline).
+    const evaluations = new Map<number, ExampleEvaluation>()
+    const validator: Validator<DataExample> = {
+      async validate(example, ctx) {
+        const quality = qualityCheck(example)
+        const weakScore = quality.ok
+          ? await sampleSolverScore({
+              solver: config.weakSolver,
+              solverSpec: weakSolverSpec,
+              example,
+              judge: config.judge,
+              samples,
+              channel: 'weak-solver',
+              ledger: cost,
+              signal: ctx.signal,
+            })
+          : 0
+        const strongScore = quality.ok
+          ? await sampleSolverScore({
+              solver: config.strongSolver,
+              solverSpec: strongSolverSpec,
+              example,
+              judge: config.judge,
+              samples,
+              channel: 'strong-solver',
+              ledger: cost,
+              signal: ctx.signal,
+            })
+          : 0
+        const decision = quality.ok
+          ? accept({ strongScore, weakScore })
+          : { accept: false, reason: quality.reason }
+        const gap = strongScore - weakScore
+        evaluations.set(ctx.iteration, { example, weakScore, strongScore, gap, decision })
+        return { valid: decision.accept, score: gap, notes: decision.reason }
+      },
+    }
+    // Drive the challenger: one first draft, then refinements folded from each reject reason.
+    const result = await runLoop<ChallengerTask, DataExample, ChallengerDecision>({
+      driver: challengerDriver(maxRetries, config.baseInstruction),
+      agentRun: challengerSpec,
+      output: challengerOutput,
+      validator,
+      task: { doc: config.doc, prompt: config.baseInstruction(config.doc) },
+      ctx: { sandboxClient: config.challenger, signal: config.signal },
+      maxIterations: maxRetries + 1,
+    })
+    // Ledger entry for the challenger's own usage and cost in this slot, as reported by runLoop.
+    cost.record({
+      model: challengerSpec.profile.name ?? 'challenger',
+      channel: 'challenger',
+      usage: { inputTokens: result.tokenUsage.input, outputTokens: result.tokenUsage.output },
+      actualCostUsd: result.costUsd,
+      tags: { role: 'challenger' },
+    })
+    // Iteration 0 is the un-refined first draft — its gap is the plain-generation baseline.
+    const plain = evaluations.get(0)
+    if (plain) plainGaps.push(plain.gap)
+    // Best gap among every iteration evaluated for this slot, accepted or not.
+    const slotGaps = [...evaluations.values()].map((e) => e.gap)
+    if (slotGaps.length > 0) refinedGaps.push(Math.max(...slotGaps))
+
+    // ONLY a genuinely-accepted winner counts. `defaultSelectWinner` falls back to the best-scoring
+    // iteration when none is valid, so `result.winner` is set even when the accept rule rejected
+    // every candidate — with real solvers that frequently happens (no question separated the tiers
+    // inside the budget). Gate on `verdict.valid` so the manufactured set never includes a rejected
+    // example; a target slot that never produced a discriminating example is simply left unfilled.
+    if (result.winner?.verdict?.valid) {
+      const winnerEval = evaluations.get(result.winner.iterationIndex)
+      if (!winnerEval) throw new Error('internal: accepted iteration has no recorded evaluation')
+      const append = await corpus.append(toCorpusRecord(winnerEval, i))
+      if (!append.succeeded) throw new Error(`corpus append failed: ${append.error}`)
+      accepted.push(winnerEval)
+      agenticGaps.push(winnerEval.gap)
+      cost.markCompleted()
+    }
+  }
+
+  return { accepted, agenticGaps, plainGaps, refinedGaps, corpus, cost }
+}
+
+// ── Helpers ───────────────────────────────────────────────────────────────────────────────
+
+const solveScenario: Scenario = { id: 'agentic-data-creation', kind: 'solve' }
+
+function toCorpusRecord(evalRec: ExampleEvaluation, index: number): CorpusRecord {
+  const { example, decision, gap } = evalRec
+  return {
+    schemaVersion: '1.0.0',
+    id: `example-${index}`,
+    runId: 'agentic-data-creation',
+    producedAt: new Date().toISOString(),
+    area: 'training-data',
+    claim: JSON.stringify(example),
+    rationale: decision.reason,
+    tags: ['discriminative', `gap:${gap.toFixed(2)}`],
+    confidence: Math.max(0, Math.min(1, gap)), // the gap doubles as confidence, kept within [0,1]
+  }
+}
+
+function resultPayload(events: SandboxEvent[]): unknown {
+  // Only the FIRST `result` event counts; later ones are ignored, and no event yields undefined.
+  const first = events.find((ev) => ev.type === 'result')
+  if (first === undefined) return undefined
+  return (first as { data?: { result?: unknown } }).data?.result
+}
+
+/** Type guard for a challenger payload: three string fields plus a rubric of strings only. */
+function isDataExample(value: unknown): value is DataExample {
+  if (typeof value !== 'object' || value === null) return false
+  const { context, question, reference, rubric } = value as Record<string, unknown>
+  const isText = (x: unknown): boolean => typeof x === 'string'
+  // Check the rubric ENTRIES too: `Array.isArray` alone admits e.g. [1, {}], which would pass
+  // the guard yet render as "[object Object]" in the solver prompt and break the string[] type.
+  if (!Array.isArray(rubric) || !rubric.every(isText)) return false
+  return isText(context) && isText(question) && isText(reference)
+}
+
+function pct(x: number): string {
+  return x.toFixed(2) // two-decimal fraction (0.65 → "0.65"), not a percentage despite the name
+}
diff --git a/src/autodata/grounding.ts b/src/autodata/grounding.ts
new file mode 100644
index 0000000..130a177
--- /dev/null
+++ b/src/autodata/grounding.ts
@@ -0,0 +1,97 @@
+/**
+ * Ground the Autodata loop on a REAL source document, reusing agent-knowledge's ingestion utils
+ * (`politeFetch` → `htmlToText` → `chunkMarkdown`). Fetches the page, strips it to text, chunks it,
+ * and selects ONE content-rich chunk as the grounding excerpt the challenger writes questions from.
+ *
+ * The default source is the "Attention Is All You Need" paper via ar5iv (arXiv's LaTeX→HTML service),
+ * a stable real paper with multi-step-reasoning content that affords genuinely discriminating
+ * questions. Any arXiv / ar5iv URL works; pass a `focus` term to bias chunk selection toward a section.
+ */
+
+import { chunkMarkdown } from '../chunking'
+import { htmlToText } from '../sources/html'
+import { politeFetch } from '../sources/http'
+
+/** A stable real arXiv paper (Transformer / "Attention Is All You Need") rendered to HTML by ar5iv. */
+export const DEFAULT_SOURCE_URL = 'https://ar5iv.labs.arxiv.org/html/1706.03762'
+
+export interface GroundDocOptions {
+  url: string
+  cacheDir?: string
+  /** Bias chunk selection toward chunks mentioning this term (case-insensitive). */
+  focus?: string
+  /** Chunk size ceiling. Default 1800 chars — a paragraph or two of grounding context. */
+  maxChars?: number
+  /** Minimum letters a chunk must have to be eligible (skips nav / citation scraps). Default 400. */
+  minLetters?: number
+  signal?: AbortSignal
+}
+
+export interface GroundedDoc {
+  url: string
+  sourceUpdatedAt: string
+  /** The selected grounding excerpt — the `doc` passed to the loop. */
+  doc: string
+  chunkIndex: number
+  headingPath: string
+  totalChunks: number
+}
+
+function letterCount(s: string): number {
+  return s.replace(/[^a-zA-Z]/g, '').length // ASCII letters only: strip the rest, measure what is left
+}
+
+/** True for chunks that read as a reference list or acknowledgements rather than prose. */
+function looksLikeReferences(headingPath: string, text: string): boolean {
+  const headingHit = /references|bibliography|acknowledg/i.test(headingPath)
+  // Body heuristic: five or more "[n]" / "et al." / "arXiv:" / "doi:" markers in one chunk.
+  const markers = text.match(/\[\d+\]|et al\.|arXiv:|doi:/gi)
+  return headingHit || (markers !== null && markers.length >= 5)
+}
+
+/**
+ * Fetch + chunk + select a grounding excerpt from a real document: the eligible chunk with the
+ * highest score (earliest on ties). Fails loud if the fetch is unverifiable or no chunk is usable.
+ */
+export async function groundDoc(opts: GroundDocOptions): Promise<GroundedDoc> {
+  const res = await politeFetch(opts.url, { cacheDir: opts.cacheDir, signal: opts.signal })
+  if (!res.verifiable) {
+    throw new Error(`source not verifiable (${opts.url}): ${res.unverifiableReason ?? 'unknown'}`)
+  }
+  const text = htmlToText(res.body)
+  const maxChars = opts.maxChars ?? 1800
+  const chunks = chunkMarkdown(text, { maxChars, targetChars: Math.round(maxChars * 0.8) })
+  const minLetters = opts.minLetters ?? 400
+
+  const eligible = chunks.filter(
+    (c) =>
+      !c.oversized &&
+      letterCount(c.text) >= minLetters &&
+      !looksLikeReferences(c.headingPath, c.text),
+  )
+  if (eligible.length === 0) {
+    throw new Error(
+      `no usable prose chunk from ${opts.url} (${chunks.length} chunks, none eligible)`,
+    )
+  }
+  // Compile the escaped focus matcher ONCE — it is invariant across chunks. `String#match` with
+  // a /g regex always scans from index 0, so sharing one instance carries no lastIndex state.
+  const focus = opts.focus?.toLowerCase()
+  const focusRe = focus ? new RegExp(focus.replace(/[.*+?^${}()|[\]\\]/g, '\\$&'), 'g') : undefined
+  const score = (chunkText: string): number => {
+    const hits = focusRe ? (chunkText.toLowerCase().match(focusRe) ?? []).length : 0
+    return hits * 2000 + letterCount(chunkText)
+  }
+  // Score every chunk exactly once; strict `>` keeps the earliest chunk when scores tie.
+  const scored = eligible.map((chunk) => ({ chunk, value: score(chunk.text) }))
+  const selected = scored.reduce((best, s) => (s.value > best.value ? s : best)).chunk
+
+  return {
+    url: opts.url,
+    sourceUpdatedAt: res.sourceUpdatedAt,
+    doc: selected.text,
+    chunkIndex: selected.index,
+    headingPath: selected.headingPath,
+    totalChunks: chunks.length,
+  }
+}
diff --git a/src/autodata/index.ts b/src/autodata/index.ts
new file mode 100644
index 0000000..e211f33
--- /dev/null
+++ b/src/autodata/index.ts
@@ -0,0 +1,52 @@
+/**
+ * Autodata — the LIVE dataset builder: ground on a real source document, run the agentic
+ * data-creation loop with REAL two-tier solver models (the paper's Qwen tiers via the Tangle
+ * router), and emit a discriminative QA dataset + the empirical strong/weak gap.
+ *
+ * The inner loop (`createDataCreationLoop` / `discriminativeAcceptRule`) is vendored from the
+ * agent-runtime example and composes only published substrate primitives. This package adds the
+ * real grounding (`grounding`), the real router roles (`router-roles`), and the pipeline +
+ * JSONL writer (`build-dataset`).
+ */
+
+export {
+  type AutodataDatasetConfig,
+  type AutodataDatasetResult,
+  buildAutodataDataset,
+  type DatasetRow,
+  type DiscriminativeThresholds,
+} from './build-dataset'
+export {
+  type AcceptDecision,
+  createDataCreationLoop,
+  type DataCreationConfig,
+  type DataCreationResult,
+  type DataExample,
+  discriminativeAcceptRule,
+  type ExampleEvaluation,
+  qualityCheck,
+  type SolverArtifact,
+} from './data-creation-loop'
+export {
+  DEFAULT_SOURCE_URL,
+  type GroundDocOptions,
+  type GroundedDoc,
+  groundDoc,
+} from './grounding'
+export {
+  type AutodataRoles,
+  buildAutodataRoles,
+  CHALLENGER_MODEL,
+  DEFAULT_BASE_URL,
+  JUDGE_MODEL,
+  parseDataExample,
+  type RouterCallRecord,
+  type RouterChatInput,
+  type RouterChatResult,
+  type RouterRolesConfig,
+  routerChat,
+  type SmokeResult,
+  STRONG_SOLVER_MODEL,
+  smokeTestModels,
+  WEAK_SOLVER_MODEL,
+} from './router-roles'
diff --git a/src/autodata/offline-fixtures.ts b/src/autodata/offline-fixtures.ts
new file mode 100644
index 0000000..77e0c9c
--- /dev/null
+++ b/src/autodata/offline-fixtures.ts
@@ -0,0 +1,200 @@
+/**
+ * Credentialless offline stand-ins so the Autodata loop runs in CI with ZERO creds and reproducible
+ * numbers: scripted challenger/solvers + a mock-transport judge. None of this is the lesson — it is
+ * the minimum that lets the wiring be tested offline. The LIVE roles (real router models) live in
+ * `router-roles.ts`; the scores here are tuned to reproduce the paper's Table 1 separation (an EASY
+ * first-draft example barely separates the two solvers, a HARD loop-accepted one separates widely).
+ *
+ * Ported from agent-runtime `examples/agentic-data-creation/offline-fixtures.ts`.
+ */
+
+import { createChatClient, llmJudge } from '@tangle-network/agent-eval'
+import type { JudgeConfig } from '@tangle-network/agent-eval/campaign'
+import { inProcessSandboxClient, type SandboxClient } from '@tangle-network/agent-runtime/loops'
+import type { SandboxEvent } from '@tangle-network/sandbox'
+import type { DataExample, SolverArtifact } from './data-creation-loop'
+
+export const groundingDoc = `Idempotency in the Payments API
+
+Every write to POST /charges may carry an Idempotency-Key header. The server stores the first
+response under that key for 24 hours. A retry with the SAME key and the SAME request body replays
+the stored response instead of charging again. A retry with the SAME key but a DIFFERENT body is a
+conflict: the server rejects it with 422 Unprocessable Entity and creates no second charge.
+Idempotency keys are scoped per merchant account.`
+
+export const baseInstruction = (doc: string): string =>
+  `You are writing ONE training example from the document below. Produce a context excerpt, a ` +
+  `question answerable from it, a reference answer, and a 2-3 item rubric.\n\nDOCUMENT:\n${doc}`
+
+const easyExample: DataExample = {
+  context: 'Every write to POST /charges may carry an Idempotency-Key header.',
+  question: 'Which HTTP header carries the idempotency key on a POST /charges write?',
+  reference: 'The request uses an Idempotency-Key header.',
+  rubric: ['Names the Idempotency-Key header', 'Ties it to a POST /charges write'],
+}
+
+const hardExamples: DataExample[] = [
+  {
+    context:
+      'A retry with the same key but a different body is a conflict: the server rejects it with 422 ' +
+      'Unprocessable Entity and creates no second charge.',
+    question:
+      'Why must the server reject a same-key, different-body retry with 422 instead of replaying the ' +
+      'stored response, and what failure does that prevent?',
+    reference:
+      'Replaying the stored response would apply it to a different request; rejecting with 422 surfaces ' +
+      'the mismatch and prevents a double or incorrect charge.',
+    rubric: [
+      'States the server rejects the retry with 422',
+      'Explains replaying the stored response would be wrong for a different body',
+      'Identifies the prevented failure: a double or incorrect charge',
+    ],
+  },
+  {
+    context:
+      'The server stores the first response under the idempotency key for 24 hours and replays it on a ' +
+      'retry with the same key and the same body.',
+    question:
+      'Why does replaying the stored response on a same-key, same-body retry matter, and what failure ' +
+      'does it prevent when a client retries after a dropped connection?',
+    reference:
+      'The original request may already have charged; replaying returns that one result so a network ' +
+      'retry does not create a second charge.',
+    rubric: [
+      'Explains the first request may have already succeeded',
+      'States the stored response is replayed instead of re-charging',
+      'Identifies the prevented failure: a duplicate charge on retry',
+    ],
+  },
+  {
+    context: 'Idempotency keys are scoped per merchant account.',
+    question:
+      'Why are idempotency keys scoped per merchant account, and what would break if they were global ' +
+      'across all merchants?',
+    reference:
+      'Per-merchant scoping isolates key spaces; a global scope would let one merchant key collide ' +
+      'with another and replay the wrong merchant charge.',
+    rubric: [
+      'States keys are isolated per merchant account',
+      'Explains a global scope risks cross-merchant key collisions',
+      'Identifies the failure: replaying the wrong merchant charge',
+    ],
+  },
+]
+
+const hardQuestionPattern = /\b(why|explain|under what|what happens if|reason)\b/i
+
+/**
+ * Offline challenger stand-in. A prompt that mentions neither "rejected" nor "too easy"
+ * (case-insensitive) gets the EASY example; any other prompt gets the next HARD example,
+ * cycling through `hardExamples` via a per-client counter so repeated rejects differ.
+ */
+export function challengerClient(): SandboxClient {
+  let nextHardIndex = 0
+  const pickExample = (prompt: string): DataExample => {
+    if (!/rejected|too easy/i.test(prompt)) return easyExample
+    const picked = hardExamples[nextHardIndex % hardExamples.length]
+    nextHardIndex += 1
+    return picked ?? easyExample
+  }
+  return inProcessSandboxClient({
+    onPrompt: (prompt): SandboxEvent[] => {
+      const usage = { model: 'offline-challenger', tokensIn: 320, tokensOut: 90, costUsd: 0.0006 }
+      return [
+        { type: 'llm_call', data: usage },
+        { type: 'result', data: { result: pickExample(prompt) } },
+      ]
+    },
+  })
+}
+
+/**
+ * Offline solver stand-in for one tier. The reply is a fixed sentence per `strength` followed by a
+ * `<<grade:STRENGTH:DIFFICULTY:sN>>` marker: DIFFICULTY is `hard` when the prompt matches
+ * `hardQuestionPattern`, else `easy`; N is read from a `[sample N]` tag in the prompt (0 if absent).
+ */
+export function solverClient(strength: 'weak' | 'strong'): SandboxClient {
+  const model = `offline-${strength}-solver`
+  let body = 'A short, partial answer.'
+  if (strength === 'strong') {
+    body = 'A complete, rubric-covering answer grounded in the context.'
+  }
+  return inProcessSandboxClient({
+    onPrompt: (prompt): SandboxEvent[] => {
+      const difficulty = hardQuestionPattern.test(prompt) ? 'hard' : 'easy'
+      const sampleTag = /\[sample (\d+)\]/.exec(prompt)
+      const sampleIndex = Number(sampleTag?.[1] ?? '0')
+      const answer = `${body} <<grade:${strength}:${difficulty}:s${sampleIndex}>>`
+      const llmCall: SandboxEvent = {
+        type: 'llm_call',
+        data: { model, tokensIn: 140, tokensOut: 30, costUsd: 0.0003 },
+      }
+      const result: SandboxEvent = {
+        type: 'result',
+        data: { result: { answer } },
+      }
+      return [llmCall, result]
+    },
+  })
+}
+
+/**
+ * A real `llmJudge` wired to a MOCK chat transport that scores from the solver's grade marker.
+ */
+export function buildRubricJudge(): JudgeConfig<SolverArtifact> {
+  const chat = createChatClient({
+    transport: 'mock',
+    defaultModel: 'offline-judge',
+    handler: async (req) => {
+      const transcript = req.messages
+        .map((msg) => (typeof msg.content === 'string' ? msg.content : ''))
+        .join('\n')
+      const marker = /<<grade:(strong|weak):(hard|easy):s(\d+)>>/.exec(transcript)
+      const [, strength, difficulty, sampleIndex] = marker ?? []
+      if (!strength || !difficulty || sampleIndex === undefined) {
+        throw new Error('offline judge: answer carried no grade marker')
+      }
+      // Scripted mean per (difficulty, strength): close on easy, far apart on hard.
+      let base: number
+      if (difficulty === 'hard') base = strength === 'strong' ? 0.77 : 0.46
+      else base = strength === 'strong' ? 0.86 : 0.84
+      // Sample indices 0,1,2 shift the score by −0.02, 0, +0.02; the sum is clamped to [0,1].
+      const jittered = base + (Number(sampleIndex) - 1) * 0.02
+      const score = Math.min(1, Math.max(0, jittered))
+      const verdict = {
+        dimensions: { rubric_coverage: score, correctness: score },
+        notes: `offline: ${strength} solver on ${difficulty} example (sample ${sampleIndex})`,
+      }
+      return {
+        content: JSON.stringify(verdict),
+        usage: { promptTokens: 130, completionTokens: 25, totalTokens: 155 },
+        costUsd: 0.0001,
+        model: 'offline-judge',
+        durationMs: 1,
+        raw: {},
+      }
+    },
+  })
+
+  const systemPrompt =
+    'Score the candidate ANSWER against the example RUBRIC. Return JSON ' +
+    '{"dimensions":{"rubric_coverage":N,"correctness":N},"notes":"..."} with each score in [0,1].'
+  return llmJudge<SolverArtifact>('rubric-judge', systemPrompt, {
+    chat,
+    dimensions: [
+      {
+        key: 'rubric_coverage',
+        description: 'fraction of the rubric criteria the answer satisfies',
+      },
+      {
+        key: 'correctness',
+        description: 'agreement with the reference answer',
+      },
+    ],
+    scale: 'unit',
+    renderUser: ({ artifact }) => {
+      const criteria = artifact.example.rubric.map((item, idx) => `${idx + 1}. ${item}`)
+      return `RUBRIC:\n${criteria.join('\n')}\n\nANSWER:\n${artifact.answer}`
+    },
+  })
+}
diff --git a/src/autodata/router-roles.ts b/src/autodata/router-roles.ts
new file mode 100644
index 0000000..6ba7413
--- /dev/null
+++ b/src/autodata/router-roles.ts
@@ -0,0 +1,442 @@
+/**
+ * The REAL two-tier roles for the Autodata loop, over the Tangle router.
+ *
+ * One transport seam — `routerChat` — POSTs `/chat/completions` and returns content + exact token
+ * usage + a per-call USD cost (the router's own cost when it returns one, else a documented
+ * rate-table estimate over the exact token counts; the source is flagged, never silently faked).
+ * The four roles are materialized on top of it:
+ *   • challenger (glm-5.2) → an `inProcessSandboxClient` that asks for ONE JSON example and parses it
+ *   • weak solver (glm-4.5-air) / strong solver (glm-5.2) → `inProcessSandboxClient` answer workers
+ *   • judge (glm-5.2) → an `llmJudge` `JudgeConfig` whose transport is a `sandbox-sdk` ChatClient
+ *     wrapping `routerChat`; the judge's own spend is recorded into the same `CostLedger` (the loop
+ *     only aggregates challenger + solver spend, so the judge channel would otherwise be invisible).
+ *
+ * glm-5.2 returns empty content unless `max_tokens` is generous, so every glm call is floored and the
+ * judge is built with an explicit `maxTokens`.
+ */
+
+import {
+  type ChatCallOpts,
+  type ChatRequest,
+  type ChatResponse,
+  type CostLedger,
+  createChatClient,
+  llmJudge,
+} from '@tangle-network/agent-eval'
+import type { JudgeConfig } from '@tangle-network/agent-eval/campaign'
+import { inProcessSandboxClient, type SandboxClient } from '@tangle-network/agent-runtime/loops'
+import type { SandboxEvent } from '@tangle-network/sandbox'
+import type { DataExample, SolverArtifact } from './data-creation-loop'
+
+export const DEFAULT_BASE_URL = 'https://router.tangle.tools/v1'
+
+// A genuine small-vs-large tier in one model family. The brief specified the Qwen tier
+// (`qwen/qwen-2.5-7b-instruct` weak, `qwen/qwen3-235b-a22b` strong), but on the live Tangle router
+// EVERY Qwen id 401s `No API key configured for model` for this key — the Qwen upstream is not
+// provisioned (verified by probing `/v1/chat/completions` across the `/v1/models` catalog). The
+// GLM family IS served, so the real tier here is the smallest GLM (`glm-4.5-air`) as the weak solver
+// vs the latest (`glm-5.2`) as the strong solver. Same family, a real generational/size gap; swap
+// these constants back to the Qwen ids once the router provisions that upstream.
+export const WEAK_SOLVER_MODEL = 'glm-4.5-air'
+export const STRONG_SOLVER_MODEL = 'glm-5.2'
+export const CHALLENGER_MODEL = 'glm-5.2'
+export const JUDGE_MODEL = 'glm-5.2'
+
+interface ModelPrice {
+  /** USD per 1M input tokens. */
+  inputPerM: number
+  /** USD per 1M output tokens. */
+  outputPerM: number
+}
+
+/**
+ * Rate table for the $ estimate. The TOKEN COUNTS are exact (read from the router's `usage`); these
+ * rates are the documented basis for converting them to dollars WHEN the router returns no per-call
+ * cost. They are estimates, not invoices — `routerChat` flags every call's `costSource` so a report
+ * can say how many calls were router-priced vs rate-estimated.
+ */
+const PRICE_TABLE: Record<string, ModelPrice> = {
+  'glm-4.5-air': { inputPerM: 0.2, outputPerM: 0.6 },
+  'glm-5.2': { inputPerM: 0.95, outputPerM: 3.0 },
+}
+
+/** Per-call usage record surfaced to an optional sink for cost-provenance reporting. */
+export interface RouterCallRecord {
+  model: string
+  promptTokens: number
+  completionTokens: number
+  costUsd: number
+  costSource: 'router' | 'estimated'
+  finishReason: string | null
+}
+
+export interface RouterChatInput {
+  apiKey: string
+  baseUrl?: string
+  model: string
+  messages: { role: 'system' | 'user' | 'assistant'; content: string }[]
+  maxTokens: number
+  temperature?: number
+  jsonMode?: boolean
+  signal?: AbortSignal
+  onCall?: (rec: RouterCallRecord) => void
+}
+
+export interface RouterChatResult {
+  content: string
+  promptTokens: number
+  completionTokens: number
+  costUsd: number
+  costSource: 'router' | 'estimated'
+  finishReason: string | null
+  raw: Record<string, unknown>
+}
+
+/** `max_tokens` floor: 1500 for glm ids (empty content on small budgets), 512 otherwise. */
+function maxTokensFloor(model: string): number {
+  return model.search(/glm/i) === -1 ? 512 : 1500
+}
+
+/** First finite, positive number among `_response_cost`, `cost`, `usage.cost`, `usage.total_cost`. */
+function routerReportedCost(body: Record<string, unknown>): number | null {
+  const usage = (body.usage ?? {}) as Record<string, unknown>
+  const isUsableCost = (value: unknown): value is number =>
+    typeof value === 'number' && Number.isFinite(value) && value > 0
+  const found = [body._response_cost, body.cost, usage.cost, usage.total_cost].find(isUsableCost)
+  // No field carried a usable cost → null, so the caller can fall back to a rate estimate.
+  return found ?? null
+}
+
+function estimateCostUsd(model: string, promptTokens: number, completionTokens: number): number {
+  // Fail loud on unpriced models (no fake 0). Own entries only, so 'constructor' cannot match.
+  const price = Object.entries(PRICE_TABLE).find(([id]) => id === model)?.[1]
+  if (!price) {
+    throw new Error(`no price-table entry for model '${model}' — add it before routing live spend`)
+  }
+  return (promptTokens * price.inputPerM + completionTokens * price.outputPerM) / 1_000_000
+}
+
+/**
+ * Single `/chat/completions` POST against the Tangle router. `max_tokens` is the larger of
+ * `input.maxTokens` and the model's floor. A non-2xx status throws with up to 400 chars of the
+ * response body. Cost is the router-reported figure when present, otherwise a rate-table estimate;
+ * `costSource` says which. `onCall`, when given, receives the per-call record before the return.
+ */
+export async function routerChat(input: RouterChatInput): Promise<RouterChatResult> {
+  const { apiKey, model, messages, signal, jsonMode } = input
+  const root = (input.baseUrl ?? DEFAULT_BASE_URL).replace(/\/$/, '')
+  const payload = {
+    model,
+    messages,
+    max_tokens: Math.max(input.maxTokens, maxTokensFloor(model)),
+    temperature: input.temperature ?? 0.2,
+    stream: false,
+    ...(jsonMode ? { response_format: { type: 'json_object' } } : {}),
+  }
+  const res = await fetch(`${root}/chat/completions`, {
+    method: 'POST',
+    headers: { 'Content-Type': 'application/json', Authorization: `Bearer ${apiKey}` },
+    signal,
+    body: JSON.stringify(payload),
+  })
+  if (!res.ok) {
+    const detail = await res.text().catch(() => res.statusText)
+    throw new Error(`router ${res.status} for ${model}: ${detail.slice(0, 400)}`)
+  }
+
+  const body = (await res.json()) as Record<string, unknown>
+  type Choice = { message?: { content?: string }; finish_reason?: string }
+  const choice = (body.choices as Choice[] | undefined)?.[0]
+  const usage = (body.usage ?? {}) as { prompt_tokens?: number; completion_tokens?: number }
+  const promptTokens = usage.prompt_tokens ?? 0
+  const completionTokens = usage.completion_tokens ?? 0
+  const finishReason = choice?.finish_reason ?? null
+
+  // Router-reported cost wins; the rate table is consulted only when the body carried none.
+  const routerCost = routerReportedCost(body)
+  const costSource: 'router' | 'estimated' = routerCost === null ? 'estimated' : 'router'
+  const costUsd = routerCost ?? estimateCostUsd(model, promptTokens, completionTokens)
+  input.onCall?.({ model, promptTokens, completionTokens, costUsd, costSource, finishReason })
+
+  return {
+    content: choice?.message?.content ?? '',
+    promptTokens,
+    completionTokens,
+    costUsd,
+    costSource,
+    finishReason,
+    raw: body,
+  }
+}
+
+// ── Parsing the challenger's JSON example ─────────────────────────────────────────────────────
+
+/** Scan `body` from its first `{`; return the balanced object text, or null if it never closes. */
+function balancedObjectIn(body: string): string | null {
+  const start = body.indexOf('{')
+  if (start < 0) return null
+  let depth = 0
+  let inString = false
+  let escaped = false
+  for (let i = start; i < body.length; i++) {
+    const ch = body[i]
+    // A char after a backslash is skipped; quotes toggle string mode; braces count outside strings.
+    if (escaped) escaped = false
+    else if (ch === '\\') escaped = true
+    else if (ch === '"') inString = !inString
+    else if (!inString && ch === '{') depth++
+    else if (!inString && ch === '}' && --depth === 0) return body.slice(start, i + 1)
+  }
+  return null
+}
+
+/**
+ * Extract the first balanced top-level JSON object from a model response. The content of a
+ * ``` / ```json fence is tried first; when it holds no balanced object (a non-JSON fence, or
+ * backticks inside a JSON string value), the whole response is scanned instead.
+ */
+function extractJsonObject(text: string): string | null {
+  const fenced = /```(?:json)?\s*([\s\S]*?)```/i.exec(text)?.[1]
+  const fromFence = fenced === undefined ? null : balancedObjectIn(fenced)
+  return fromFence ?? balancedObjectIn(text)
+}
+
+/** Parse a challenger reply into a `DataExample`; throws (with the stage named) on any failure. */
+export function parseDataExample(text: string): DataExample {
+  const json = extractJsonObject(text)
+  if (!json) throw new Error('challenger response contained no JSON object')
+  let parsed: Record<string, unknown>
+  try {
+    parsed = JSON.parse(json) as Record<string, unknown>
+  } catch (err) {
+    throw new Error(`challenger JSON did not parse: ${String(err)}`)
+  }
+  const { context, question, reference, rubric } = parsed
+  if (
+    typeof context !== 'string' ||
+    typeof question !== 'string' ||
+    typeof reference !== 'string' ||
+    !Array.isArray(rubric)
+  ) {
+    throw new Error('challenger JSON missing a required field (context/question/reference/rubric)')
+  }
+  return { context, question, reference, rubric: rubric.map((r) => String(r)) }
+}
+
+// ── The roles ─────────────────────────────────────────────────────────────────────────────────
+
+const challengerSystem =
+  'You write ONE hard exam question from a source document. The question must require multi-step ' +
+  'reasoning a small model would get wrong but a strong model would get right — never a verbatim ' +
+  'lookup. Return STRICT JSON and nothing else: ' +
+  '{"context": string, "question": string, "reference": string, "rubric": string[] }. ' +
+  'The context is a short excerpt from the document; the question must NOT be answerable by copying ' +
+  'a sentence; the reference is the correct answer; the rubric is 2-3 scoring criteria. ' +
+  'Do NOT put the reference answer verbatim inside the context.'
+
+const judgeSystem =
+  'You are grading a candidate ANSWER to a question against a RUBRIC and a REFERENCE answer. ' +
+  'Return JSON {"dimensions":{"rubric_coverage":N,"correctness":N},"notes":"..."} with each score ' +
+  'in [0,1]. rubric_coverage = the fraction of rubric criteria the answer satisfies; correctness = ' +
+  'how well the answer agrees with the reference. Be strict: a vague or partial answer scores low.'
+
+export interface RouterRolesConfig {
+  apiKey: string
+  baseUrl?: string
+  challengerModel?: string
+  weakModel?: string
+  strongModel?: string
+  judgeModel?: string
+  /** Judge spend is recorded here directly (the loop captures only challenger + solver spend). */
+  ledger: CostLedger
+  /** Optional sink for every router call's cost provenance. */
+  onCall?: (rec: RouterCallRecord) => void
+}
+
+export interface AutodataRoles {
+  challenger: SandboxClient
+  weakSolver: SandboxClient
+  strongSolver: SandboxClient
+  judge: JudgeConfig<SolverArtifact>
+}
+
+/** Answer worker for one solver tier: sends the prompt to `model` as a single user message. */
+function solverClient(cfg: RouterRolesConfig, model: string): SandboxClient {
+  return inProcessSandboxClient({
+    onPrompt: async (prompt, ctx): Promise<SandboxEvent[]> => {
+      const { apiKey, baseUrl, onCall } = cfg
+      const reply = await routerChat({
+        apiKey,
+        baseUrl,
+        model,
+        messages: [{ role: 'user', content: prompt }],
+        maxTokens: 1024,
+        signal: ctx.signal,
+        onCall,
+      })
+      const usage: SandboxEvent = {
+        type: 'llm_call',
+        data: {
+          model,
+          tokensIn: reply.promptTokens,
+          tokensOut: reply.completionTokens,
+          costUsd: reply.costUsd,
+        },
+      }
+      return [usage, { type: 'result', data: { result: { answer: reply.content } } }]
+    },
+  })
+}
+
+/** Challenger worker: asks the challenger model for ONE JSON example and parses it. */
+function challengerClient(cfg: RouterRolesConfig): SandboxClient {
+  const model = cfg.challengerModel ?? CHALLENGER_MODEL
+  return inProcessSandboxClient({
+    onPrompt: async (prompt, ctx): Promise<SandboxEvent[]> => {
+      const reply = await routerChat({
+        apiKey: cfg.apiKey,
+        baseUrl: cfg.baseUrl,
+        model,
+        messages: [
+          { role: 'system', content: challengerSystem },
+          { role: 'user', content: prompt },
+        ],
+        maxTokens: 1500,
+        jsonMode: true,
+        signal: ctx.signal,
+        onCall: cfg.onCall,
+      })
+      // Parsing runs before any event is built, so a malformed reply throws out of the handler.
+      const result = parseDataExample(reply.content)
+      const usage: SandboxEvent = {
+        type: 'llm_call',
+        data: {
+          model,
+          tokensIn: reply.promptTokens,
+          tokensOut: reply.completionTokens,
+          costUsd: reply.costUsd,
+        },
+      }
+      return [usage, { type: 'result', data: { result } }]
+    },
+  })
+}
+
+/** Live rubric judge: an `llmJudge` over a chat client that calls `routerChat` and logs spend. */
+function rubricJudge(cfg: RouterRolesConfig): JudgeConfig<SolverArtifact> {
+  const judgeModel = cfg.judgeModel ?? JUDGE_MODEL
+  const chat = createChatClient({
+    transport: 'sandbox-sdk',
+    defaultModel: judgeModel,
+    chat: async (req: ChatRequest, opts?: ChatCallOpts): Promise<ChatResponse> => {
+      const model = req.model ?? judgeModel
+      const messages = req.messages.map((m) => ({
+        role: m.role,
+        content:
+          typeof m.content === 'string'
+            ? m.content
+            : m.content.map((p) => (p.type === 'text' ? p.text : '')).join('\n'),
+      }))
+      // Time the round trip so the response reports a real latency instead of a constant 0.
+      const startedAt = Date.now()
+      const r = await routerChat({
+        apiKey: cfg.apiKey,
+        baseUrl: cfg.baseUrl,
+        model,
+        messages,
+        maxTokens: req.maxTokens ?? 1500,
+        temperature: req.temperature,
+        jsonMode: req.jsonMode,
+        signal: opts?.signal,
+        onCall: cfg.onCall,
+      })
+      const { promptTokens, completionTokens } = r
+      cfg.ledger.record({
+        model,
+        channel: 'judge',
+        usage: { inputTokens: promptTokens, outputTokens: completionTokens },
+        actualCostUsd: r.costUsd,
+        tags: { role: 'judge' },
+      })
+      return {
+        content: r.content,
+        usage: { promptTokens, completionTokens, totalTokens: promptTokens + completionTokens },
+        costUsd: r.costUsd,
+        model,
+        durationMs: Date.now() - startedAt,
+        finishReason: r.finishReason,
+        contentEmpty: r.content.trim() === '',
+        raw: r.raw,
+      }
+    },
+  })
+
+  return llmJudge<SolverArtifact>('autodata-rubric-judge', judgeSystem, {
+    chat,
+    maxTokens: 1500,
+    dimensions: [
+      {
+        key: 'rubric_coverage',
+        description: 'fraction of the rubric criteria the answer satisfies',
+      },
+      { key: 'correctness', description: 'agreement with the reference answer' },
+    ],
+    scale: 'unit',
+    renderUser: ({ artifact }) =>
+      `REFERENCE ANSWER:\n${artifact.example.reference}\n\n` +
+      `RUBRIC:\n${artifact.example.rubric.map((r, i) => `${i + 1}. ${r}`).join('\n')}\n\n` +
+      `CANDIDATE ANSWER:\n${artifact.answer}`,
+  })
+}
+
+/** Build the challenger, weak/strong solver, and judge roles from one config. */
+export function buildAutodataRoles(cfg: RouterRolesConfig): AutodataRoles {
+  const challenger = challengerClient(cfg)
+  // Both solver tiers share one worker factory and differ only in the model id passed to it.
+  const weakSolver = solverClient(cfg, cfg.weakModel ?? WEAK_SOLVER_MODEL)
+  const strongSolver = solverClient(cfg, cfg.strongModel ?? STRONG_SOLVER_MODEL)
+  const judge = rubricJudge(cfg)
+  return { challenger, weakSolver, strongSolver, judge }
+}
+
+export interface SmokeResult {
+  model: string
+  ok: boolean
+  contentChars: number
+  finishReason: string | null
+  costUsd: number
+  costSource: 'router' | 'estimated'
+}
+
+/**
+ * Cost gate: one short prompt per listed model, sent in order; each row says whether the reply had
+ * non-whitespace content. A router error rejects the whole call (`routerChat` throws on non-2xx).
+ */
+export async function smokeTestModels(cfg: {
+  apiKey: string
+  baseUrl?: string
+  models?: string[]
+  signal?: AbortSignal
+}): Promise<SmokeResult[]> {
+  const results: SmokeResult[] = []
+  for (const model of cfg.models ?? [CHALLENGER_MODEL, WEAK_SOLVER_MODEL, STRONG_SOLVER_MODEL]) {
+    const { content, finishReason, costUsd, costSource } = await routerChat({
+      apiKey: cfg.apiKey,
+      baseUrl: cfg.baseUrl,
+      model,
+      messages: [{ role: 'user', content: 'Reply with the single word: ready.' }],
+      maxTokens: 32,
+      signal: cfg.signal,
+    })
+    const contentChars = content.trim().length
+    results.push({
+      model,
+      ok: contentChars > 0,
+      contentChars,
+      finishReason,
+      costUsd,
+      costSource,
+    })
+  }
+  return results
+}
diff --git a/src/autodata/run.ts b/src/autodata/run.ts
new file mode 100644
index 0000000..cca767f
--- /dev/null
+++ b/src/autodata/run.ts
@@ -0,0 +1,150 @@
+/**
+ * Autodata — the LIVE runnable: cost-gate the three models, ground on a REAL arXiv document, run the
+ * agentic data-creation loop with the real two-tier solvers, and report the empirical strong/weak
+ * gap (plain first-draft vs loop-accepted), the cost split by role, and the JSONL dataset path.
+ *
+ * Run (key never printed):
+ *   dotenvx run -f /home/drew/company/devops/secrets/agent-state.env -- \
+ *     pnpm tsx src/autodata/run.ts
+ *
+ * Env knobs: AUTODATA_URL, AUTODATA_FOCUS, AUTODATA_TARGET, AUTODATA_SAMPLES, AUTODATA_MAXRETRIES,
+ *            AUTODATA_OUT, TANGLE_API_KEY (or TANGLE_ROUTER_KEY).
+ */
+
+import { buildAutodataDataset } from './build-dataset'
+import { DEFAULT_SOURCE_URL, groundDoc } from './grounding'
+import {
+  CHALLENGER_MODEL,
+  STRONG_SOLVER_MODEL,
+  smokeTestModels,
+  WEAK_SOLVER_MODEL,
+} from './router-roles'
+
+/** Env var as a positive integer; unset/empty → `fallback`. Throws on '0', '3abc', '2.5', etc. */
+function envInt(name: string, fallback: number): number {
+  const raw = process.env[name]
+  if (!raw) return fallback
+  if (/^\s*\d{1,15}\s*$/.test(raw) && Number(raw) > 0) return Number(raw)
+  throw new Error(`${name}='${raw}' is not a positive integer`)
+}
+
+function fmt(x: number | null, digits = 3): string {
+  return x !== null ? x.toFixed(digits) : 'n/a' // null (no value available) prints as n/a
+}
+
+/** CLI entry: cost gate → grounding → loop → report. Empty env vars are treated as unset. */
+async function main(): Promise<void> {
+  const apiKey = process.env.TANGLE_API_KEY || process.env.TANGLE_ROUTER_KEY
+  if (!apiKey) throw new Error('no TANGLE_API_KEY in env — run under dotenvx so the key is set')
+
+  const url = process.env.AUTODATA_URL || DEFAULT_SOURCE_URL
+  const focus = process.env.AUTODATA_FOCUS || 'attention'
+  const target = envInt('AUTODATA_TARGET', 3)
+  const samples = envInt('AUTODATA_SAMPLES', 3)
+  const maxRetries = envInt('AUTODATA_MAXRETRIES', 4)
+  const outPath = process.env.AUTODATA_OUT || 'data/autodata-dataset.jsonl'
+
+  // ── 1. COST GATE: one cheap call per model, all must return non-empty content before the burn ──
+  console.log('Autodata · cost gate (one call per model)\n')
+  const smoke = await smokeTestModels({
+    apiKey,
+    models: [CHALLENGER_MODEL, WEAK_SOLVER_MODEL, STRONG_SOLVER_MODEL],
+  })
+  for (const s of smoke) {
+    console.log(
+      `  ${s.ok ? 'ok ' : 'DEAD'} ${s.model.padEnd(28)} chars=${String(s.contentChars).padStart(4)}  ` +
+        `finish=${s.finishReason ?? '?'}  cost=$${s.costUsd.toFixed(5)} (${s.costSource})`,
+    )
+  }
+  const dead = smoke.filter((s) => !s.ok)
+  if (dead.length > 0) {
+    throw new Error(`cost gate failed — empty content from: ${dead.map((d) => d.model).join(', ')}`)
+  }
+
+  // ── 2. Ground on a REAL document ──
+  const grounded = await groundDoc({ url, focus })
+  console.log(
+    `\nGrounded on ${grounded.url}\n  section='${grounded.headingPath}' chunk=${grounded.chunkIndex}/${grounded.totalChunks} ` +
+      `(${grounded.doc.length} chars, updated ${grounded.sourceUpdatedAt})`,
+  )
+  console.log(`  excerpt: ${grounded.doc.slice(0, 200).replace(/\s+/g, ' ')}...`)
+
+  // ── 3. Run the loop with real two-tier solvers ──
+  console.log(
+    `\nManufacturing up to ${target} discriminating example(s) · samples=${samples} maxRetries=${maxRetries}\n` +
+      `  challenger/judge=${CHALLENGER_MODEL}  weak=${WEAK_SOLVER_MODEL}  strong=${STRONG_SOLVER_MODEL}`,
+  )
+  const result = await buildAutodataDataset({
+    apiKey,
+    source: grounded,
+    outPath,
+    target,
+    samples,
+    maxRetries,
+  })
+
+  // ── 4. The accepted set ──
+  console.log(`\n— Accepted examples (${result.accepted.length}/${target}) —`)
+  for (const [i, ex] of result.accepted.entries()) {
+    console.log(`\n  [${i}] Q: ${ex.example.question}`)
+    console.log(
+      `      weak=${ex.weakScore.toFixed(2)}  strong=${ex.strongScore.toFixed(2)}  gap=${ex.gap.toFixed(2)}`,
+    )
+    console.log(`      ${ex.decision.reason}`)
+  }
+
+  // ── 5. The empirical calibration (paper Table 1) ──
+  console.log('\n— Calibration: plain first-draft gap vs agentic loop-accepted gap —')
+  console.log(
+    `  plain   (first-draft questions, n=${result.plainGaps.length})  mean gap = ${fmt(result.plainGapMean)}`,
+  )
+  console.log(
+    `  agentic (loop-accepted questions, n=${result.agenticGaps.length}) mean gap = ${fmt(result.agenticGapMean)}`,
+  )
+  console.log(
+    `  refined (best gap reached per slot, n=${result.refinedGaps.length}) mean gap = ${fmt(result.refinedGapMean)}`,
+  )
+  // The honest comparison: plain first-draft gap vs the best the refinement reached. Acceptance is
+  // strict (gap >= 0.20); refined-vs-plain shows whether the fold widened the gap at all.
+  if (result.plainGapMean !== null && result.refinedGapMean !== null) {
+    const delta = result.refinedGapMean - result.plainGapMean
+    console.log(
+      `  Δ (refined − plain) = ${delta >= 0 ? '+' : ''}${delta.toFixed(3)}  ` +
+        (delta >= 0.1
+          ? '→ the loop WIDENS the strong/weak gap (empirical Table-1 direction)'
+          : '→ NO meaningful widening on these real models (honest null)'),
+    )
+  } else {
+    console.log('  (insufficient data to compare — see accepted count)')
+  }
+  if (result.accepted.length === 0) {
+    console.log(
+      '  NOTE: 0 examples cleared the discriminative accept bar — the two GLM tiers did not separate.',
+    )
+  }
+
+  // ── 6. Cost split by role ──
+  const summary = result.cost.summary()
+  console.log('\n— Cost (CostLedger, by role) —')
+  console.log(
+    `  total: $${summary.totalCostUsd.toFixed(4)} over ${summary.totalCalls} recorded loops/calls` +
+      (summary.fullyPriced
+        ? ' (fully priced)'
+        : ` (unpriced models: ${summary.unpricedModels.join(', ')})`),
+  )
+  for (const ch of summary.byChannel) {
+    console.log(`    ${ch.channel.padEnd(14)} $${ch.costUsd.toFixed(4)}  (${ch.calls} loops/calls)`)
+  }
+  if (result.costPerExampleUsd !== null) {
+    console.log(`  cost per accepted example: $${result.costPerExampleUsd.toFixed(4)}`)
+  }
+  console.log(
+    `  call provenance: ${result.callProvenance.router} router-priced, ${result.callProvenance.estimated} rate-estimated`,
+  )
+  console.log(`\n— Dataset — ${result.rows.length} row(s) written to ${result.outPath}`)
+}
+
+main().catch((err) => {
+  console.error(err)
+  process.exit(1) // non-zero exit code so a failed run is visible to the calling shell
+})
diff --git a/tsup.config.ts b/tsup.config.ts
index 1621ec2..3aed29f 100644
--- a/tsup.config.ts
+++ b/tsup.config.ts
@@ -8,6 +8,7 @@ export default defineConfig({
     'memory/index': 'src/memory/index.ts',
     'sources/index': 'src/sources/index.ts',
     'profiles/index': 'src/profiles/index.ts',
+    'autodata/index': 'src/autodata/index.ts',
   },
   format: ['esm'],
   dts: true,