diff --git a/.gitignore b/.gitignore index e5344c3..128a042 100644 --- a/.gitignore +++ b/.gitignore @@ -4,3 +4,4 @@ coverage .DS_Store *.tsbuildinfo .agent-knowledge/index.json +data/ diff --git a/package.json b/package.json index 1b6e141..a92e158 100644 --- a/package.json +++ b/package.json @@ -43,6 +43,11 @@ "types": "./dist/profiles/index.d.ts", "import": "./dist/profiles/index.js", "default": "./dist/profiles/index.js" + }, + "./autodata": { + "types": "./dist/autodata/index.d.ts", + "import": "./dist/autodata/index.js", + "default": "./dist/autodata/index.js" } }, "bin": { @@ -65,7 +70,8 @@ "test:watch": "vitest", "typecheck": "tsc --noEmit", "lint": "biome check src tests", - "format": "biome format --write src tests" + "format": "biome format --write src tests", + "autodata": "tsx src/autodata/run.ts" }, "dependencies": { "@tangle-network/agent-eval": "^0.100.0", @@ -78,6 +84,7 @@ "@tangle-network/sandbox": "^0.8.0", "@types/node": "^25.6.0", "tsup": "^8.0.0", + "tsx": "^4.22.4", "typescript": "^5.7.0", "vitest": "^3.0.0" }, diff --git a/pnpm-lock.yaml b/pnpm-lock.yaml index 8442f74..cc04568 100644 --- a/pnpm-lock.yaml +++ b/pnpm-lock.yaml @@ -32,13 +32,16 @@ importers: version: 25.6.0 tsup: specifier: ^8.0.0 - version: 8.5.1(postcss@8.5.13)(typescript@5.9.3)(yaml@2.8.4) + version: 8.5.1(postcss@8.5.13)(tsx@4.22.4)(typescript@5.9.3)(yaml@2.8.4) + tsx: + specifier: ^4.22.4 + version: 4.22.4 typescript: specifier: ^5.7.0 version: 5.9.3 vitest: specifier: ^3.0.0 - version: 3.2.4(@types/node@25.6.0)(yaml@2.8.4) + version: 3.2.4(@types/node@25.6.0)(tsx@4.22.4)(yaml@2.8.4) packages: @@ -118,156 +121,312 @@ packages: cpu: [ppc64] os: [aix] + '@esbuild/aix-ppc64@0.28.1': + resolution: {integrity: sha512-Svl7tq8k/08+p6CXPpRjQ1fKX+1odH/BQbb48fV6fj3CWHhsoIOoY87w1oHXm0qEpkIK3ZfVgp0hed3XBXzXMQ==} + engines: {node: '>=18'} + cpu: [ppc64] + os: [aix] + '@esbuild/android-arm64@0.27.7': resolution: {integrity: 
sha512-62dPZHpIXzvChfvfLJow3q5dDtiNMkwiRzPylSCfriLvZeq0a1bWChrGx/BbUbPwOrsWKMn8idSllklzBy+dgQ==} engines: {node: '>=18'} cpu: [arm64] os: [android] + '@esbuild/android-arm64@0.28.1': + resolution: {integrity: sha512-34EGEbCIAgosYz6goLcopX6Mo7NyGv9tfwEM2/7Ce2VcVRk568iSvniGWcUXIy7wEDR1wzolcxcriFVrWYcwBg==} + engines: {node: '>=18'} + cpu: [arm64] + os: [android] + '@esbuild/android-arm@0.27.7': resolution: {integrity: sha512-jbPXvB4Yj2yBV7HUfE2KHe4GJX51QplCN1pGbYjvsyCZbQmies29EoJbkEc+vYuU5o45AfQn37vZlyXy4YJ8RQ==} engines: {node: '>=18'} cpu: [arm] os: [android] + '@esbuild/android-arm@0.28.1': + resolution: {integrity: sha512-0k2F129Xdio1TdJfzJ8sy1Q47vUD2NnwdhiAf7drUN1EBTfPf4hsFCtmMgu/6m8JSzsBrlmVjudMBQqOfG8usQ==} + engines: {node: '>=18'} + cpu: [arm] + os: [android] + '@esbuild/android-x64@0.27.7': resolution: {integrity: sha512-x5VpMODneVDb70PYV2VQOmIUUiBtY3D3mPBG8NxVk5CogneYhkR7MmM3yR/uMdITLrC1ml/NV1rj4bMJuy9MCg==} engines: {node: '>=18'} cpu: [x64] os: [android] + '@esbuild/android-x64@0.28.1': + resolution: {integrity: sha512-dbwY7ltSMDWsRatcRpCnES4F+im88OCUgGZjy52shC7GqHRE/cYlxNbB4Z4UpJswpcc4Qxd2oE/ufM0p61IKng==} + engines: {node: '>=18'} + cpu: [x64] + os: [android] + '@esbuild/darwin-arm64@0.27.7': resolution: {integrity: sha512-5lckdqeuBPlKUwvoCXIgI2D9/ABmPq3Rdp7IfL70393YgaASt7tbju3Ac+ePVi3KDH6N2RqePfHnXkaDtY9fkw==} engines: {node: '>=18'} cpu: [arm64] os: [darwin] + '@esbuild/darwin-arm64@0.28.1': + resolution: {integrity: sha512-TZbWkQY7kvTAXbXUT7uVACR5cMHsDiSz9z7ZKAX/RTq/WJEk3QyRr0wZpNhBDX+/0CtdqUIJlOiodQcta6tY3Q==} + engines: {node: '>=18'} + cpu: [arm64] + os: [darwin] + '@esbuild/darwin-x64@0.27.7': resolution: {integrity: sha512-rYnXrKcXuT7Z+WL5K980jVFdvVKhCHhUwid+dDYQpH+qu+TefcomiMAJpIiC2EM3Rjtq0sO3StMV/+3w3MyyqQ==} engines: {node: '>=18'} cpu: [x64] os: [darwin] + '@esbuild/darwin-x64@0.28.1': + resolution: {integrity: sha512-zfdzgK9ACBNZLI/CyHTOx81SyNbM6YXn7rxSgX97VjyiPl9W1i4Ka4fgKECEoFCKGpvBj5qArWIGgQjOwkgskQ==} + engines: {node: '>=18'} + cpu: 
[x64] + os: [darwin] + '@esbuild/freebsd-arm64@0.27.7': resolution: {integrity: sha512-B48PqeCsEgOtzME2GbNM2roU29AMTuOIN91dsMO30t+Ydis3z/3Ngoj5hhnsOSSwNzS+6JppqWsuhTp6E82l2w==} engines: {node: '>=18'} cpu: [arm64] os: [freebsd] + '@esbuild/freebsd-arm64@0.28.1': + resolution: {integrity: sha512-wG2EA8ENdEI0qhkSZMjfqrdY+ziCYCPMmtZjjIwOmXFjmyzEHn+UUxk5of+SYsjtfs3VpnlC7QLzSI5hY/rOAw==} + engines: {node: '>=18'} + cpu: [arm64] + os: [freebsd] + '@esbuild/freebsd-x64@0.27.7': resolution: {integrity: sha512-jOBDK5XEjA4m5IJK3bpAQF9/Lelu/Z9ZcdhTRLf4cajlB+8VEhFFRjWgfy3M1O4rO2GQ/b2dLwCUGpiF/eATNQ==} engines: {node: '>=18'} cpu: [x64] os: [freebsd] + '@esbuild/freebsd-x64@0.28.1': + resolution: {integrity: sha512-i7dZ9vQgnvSCzi/rYCXNgtF/U+eKZNJBzu3eTQbRgHnM7tNSizLOkRFAl3qzVc/Op/u5YkHHa4pf/3DOYHthLQ==} + engines: {node: '>=18'} + cpu: [x64] + os: [freebsd] + '@esbuild/linux-arm64@0.27.7': resolution: {integrity: sha512-RZPHBoxXuNnPQO9rvjh5jdkRmVizktkT7TCDkDmQ0W2SwHInKCAV95GRuvdSvA7w4VMwfCjUiPwDi0ZO6Nfe9A==} engines: {node: '>=18'} cpu: [arm64] os: [linux] + '@esbuild/linux-arm64@0.28.1': + resolution: {integrity: sha512-yHs+0uc8+nvEAfAfxrWQKK5peSNzBc4PegcMO0EJ2hT71uA7vB8Ihg2e77R2P7SG5uYjPbHlLLmve4LLLRCf0g==} + engines: {node: '>=18'} + cpu: [arm64] + os: [linux] + '@esbuild/linux-arm@0.27.7': resolution: {integrity: sha512-RkT/YXYBTSULo3+af8Ib0ykH8u2MBh57o7q/DAs3lTJlyVQkgQvlrPTnjIzzRPQyavxtPtfg0EopvDyIt0j1rA==} engines: {node: '>=18'} cpu: [arm] os: [linux] + '@esbuild/linux-arm@0.28.1': + resolution: {integrity: sha512-qVXBOHQS+d5Y722GwJzJUtOLlX7km3CraOaGormF1pDtPd2C/l1SHRPgjLunLGe51Sh5YYWKMFDyV4SxgMQYTQ==} + engines: {node: '>=18'} + cpu: [arm] + os: [linux] + '@esbuild/linux-ia32@0.27.7': resolution: {integrity: sha512-GA48aKNkyQDbd3KtkplYWT102C5sn/EZTY4XROkxONgruHPU72l+gW+FfF8tf2cFjeHaRbWpOYa/uRBz/Xq1Pg==} engines: {node: '>=18'} cpu: [ia32] os: [linux] + '@esbuild/linux-ia32@0.28.1': + resolution: {integrity: 
sha512-d1z4ZuP0ajrfz/FhGT4vv278rX8KnPPJx8i5+AtK7TYbx9Le9F1hyzurZpkEyjkGa9dUGhQow4C1NmeGvqxN2w==} + engines: {node: '>=18'} + cpu: [ia32] + os: [linux] + '@esbuild/linux-loong64@0.27.7': resolution: {integrity: sha512-a4POruNM2oWsD4WKvBSEKGIiWQF8fZOAsycHOt6JBpZ+JN2n2JH9WAv56SOyu9X5IqAjqSIPTaJkqN8F7XOQ5Q==} engines: {node: '>=18'} cpu: [loong64] os: [linux] + '@esbuild/linux-loong64@0.28.1': + resolution: {integrity: sha512-M5sRjUVZrkm1OAPR3dlOYzNmN+loZKGVi1VUQGrwuqLcbR6qeAz+famMhjASeH3YVKvZz+zT1jlh/keC3Rj/lg==} + engines: {node: '>=18'} + cpu: [loong64] + os: [linux] + '@esbuild/linux-mips64el@0.27.7': resolution: {integrity: sha512-KabT5I6StirGfIz0FMgl1I+R1H73Gp0ofL9A3nG3i/cYFJzKHhouBV5VWK1CSgKvVaG4q1RNpCTR2LuTVB3fIw==} engines: {node: '>=18'} cpu: [mips64el] os: [linux] + '@esbuild/linux-mips64el@0.28.1': + resolution: {integrity: sha512-mRObBZeHh2OxcBFPWE/FjylkRgZdYuiTR3vaTozquCGOH14iP9oN4x4Ge81CoIDYQrXmIxpFumJBu5MtZpnQJQ==} + engines: {node: '>=18'} + cpu: [mips64el] + os: [linux] + '@esbuild/linux-ppc64@0.27.7': resolution: {integrity: sha512-gRsL4x6wsGHGRqhtI+ifpN/vpOFTQtnbsupUF5R5YTAg+y/lKelYR1hXbnBdzDjGbMYjVJLJTd2OFmMewAgwlQ==} engines: {node: '>=18'} cpu: [ppc64] os: [linux] + '@esbuild/linux-ppc64@0.28.1': + resolution: {integrity: sha512-slScBsMAb3GFDcdrCgLwZtPYRoH2H/youv10QiZyRjmsP48fznoveWytSgCI/R0ZcUgpc0ZhIUEx6LHts8yrfQ==} + engines: {node: '>=18'} + cpu: [ppc64] + os: [linux] + '@esbuild/linux-riscv64@0.27.7': resolution: {integrity: sha512-hL25LbxO1QOngGzu2U5xeXtxXcW+/GvMN3ejANqXkxZ/opySAZMrc+9LY/WyjAan41unrR3YrmtTsUpwT66InQ==} engines: {node: '>=18'} cpu: [riscv64] os: [linux] + '@esbuild/linux-riscv64@0.28.1': + resolution: {integrity: sha512-kw0owk1o0GFETUJyW0jc0G4Yzs0BHZn0JDZ8JRT088vjJYX777BAs1fDGxAC+q831qOs2DTC96mNsG2opdfyyQ==} + engines: {node: '>=18'} + cpu: [riscv64] + os: [linux] + '@esbuild/linux-s390x@0.27.7': resolution: {integrity: sha512-2k8go8Ycu1Kb46vEelhu1vqEP+UeRVj2zY1pSuPdgvbd5ykAw82Lrro28vXUrRmzEsUV0NzCf54yARIK8r0fdw==} engines: 
{node: '>=18'} cpu: [s390x] os: [linux] + '@esbuild/linux-s390x@0.28.1': + resolution: {integrity: sha512-/lAIjX8aYFRByhh6L5rYtPEDRqa9de/4V/juOXcta5frjvzXO4/sqEtyytse0g3zZFuWu5cDN0MkLz2qRDD2Ag==} + engines: {node: '>=18'} + cpu: [s390x] + os: [linux] + '@esbuild/linux-x64@0.27.7': resolution: {integrity: sha512-hzznmADPt+OmsYzw1EE33ccA+HPdIqiCRq7cQeL1Jlq2gb1+OyWBkMCrYGBJ+sxVzve2ZJEVeePbLM2iEIZSxA==} engines: {node: '>=18'} cpu: [x64] os: [linux] + '@esbuild/linux-x64@0.28.1': + resolution: {integrity: sha512-u/anNYF2mmVOEDwLtnQ1wOr3EZ9sTNGLWrsYGYwHWzGA3Si84IOkHXlbWTD1NB+9/1lcnweYKO54uhxZydNzfA==} + engines: {node: '>=18'} + cpu: [x64] + os: [linux] + '@esbuild/netbsd-arm64@0.27.7': resolution: {integrity: sha512-b6pqtrQdigZBwZxAn1UpazEisvwaIDvdbMbmrly7cDTMFnw/+3lVxxCTGOrkPVnsYIosJJXAsILG9XcQS+Yu6w==} engines: {node: '>=18'} cpu: [arm64] os: [netbsd] + '@esbuild/netbsd-arm64@0.28.1': + resolution: {integrity: sha512-oks0DYbLwWMmaakTsCb+zL4E+aHRVLom9IJZOAthMQEPiQmydXHkziYEsGYRx0uNV/IjEKGAV941JzH02pflqw==} + engines: {node: '>=18'} + cpu: [arm64] + os: [netbsd] + '@esbuild/netbsd-x64@0.27.7': resolution: {integrity: sha512-OfatkLojr6U+WN5EDYuoQhtM+1xco+/6FSzJJnuWiUw5eVcicbyK3dq5EeV/QHT1uy6GoDhGbFpprUiHUYggrw==} engines: {node: '>=18'} cpu: [x64] os: [netbsd] + '@esbuild/netbsd-x64@0.28.1': + resolution: {integrity: sha512-aeL6lAnN89Hz43Mlh1G8ARasbuoYvSITDEx0tHh5b7jJnHcssqgjy9Yx430GDpmCa6OyrKoS0aNRjKundRizGg==} + engines: {node: '>=18'} + cpu: [x64] + os: [netbsd] + '@esbuild/openbsd-arm64@0.27.7': resolution: {integrity: sha512-AFuojMQTxAz75Fo8idVcqoQWEHIXFRbOc1TrVcFSgCZtQfSdc1RXgB3tjOn/krRHENUB4j00bfGjyl2mJrU37A==} engines: {node: '>=18'} cpu: [arm64] os: [openbsd] + '@esbuild/openbsd-arm64@0.28.1': + resolution: {integrity: sha512-MEFJe5C3R8pwXdZ5Y21oo6m7ePiS0d9pWucn99O/wvyJZChoIQKrQDxKrGeW8F5+T0okTHesAmDeiHDTIq0V/Q==} + engines: {node: '>=18'} + cpu: [arm64] + os: [openbsd] + '@esbuild/openbsd-x64@0.27.7': resolution: {integrity: 
sha512-+A1NJmfM8WNDv5CLVQYJ5PshuRm/4cI6WMZRg1by1GwPIQPCTs1GLEUHwiiQGT5zDdyLiRM/l1G0Pv54gvtKIg==} engines: {node: '>=18'} cpu: [x64] os: [openbsd] + '@esbuild/openbsd-x64@0.28.1': + resolution: {integrity: sha512-i/ZLIOafE0Z8cI/XANJAixoJL/uRAoS2xOA3rb0xN+KK0K177cMAsQYkzHtBrtMXAKuAc7HGgcWiZ/sRC1Nxgw==} + engines: {node: '>=18'} + cpu: [x64] + os: [openbsd] + '@esbuild/openharmony-arm64@0.27.7': resolution: {integrity: sha512-+KrvYb/C8zA9CU/g0sR6w2RBw7IGc5J2BPnc3dYc5VJxHCSF1yNMxTV5LQ7GuKteQXZtspjFbiuW5/dOj7H4Yw==} engines: {node: '>=18'} cpu: [arm64] os: [openharmony] + '@esbuild/openharmony-arm64@0.28.1': + resolution: {integrity: sha512-ge+Z7EXFNt2BO1oAMsVpiQ8EwndV9i1xXerAeTIK7AtPs3bKFXQM7nlRxDSIUIMeueR1CNXxqztLzdNeReKBJg==} + engines: {node: '>=18'} + cpu: [arm64] + os: [openharmony] + '@esbuild/sunos-x64@0.27.7': resolution: {integrity: sha512-ikktIhFBzQNt/QDyOL580ti9+5mL/YZeUPKU2ivGtGjdTYoqz6jObj6nOMfhASpS4GU4Q/Clh1QtxWAvcYKamA==} engines: {node: '>=18'} cpu: [x64] os: [sunos] + '@esbuild/sunos-x64@0.28.1': + resolution: {integrity: sha512-BEjgtECkL3vY+SaSQ6nzVfiALUeFxpawyp8Jmf5PtYhf1Ug40N1h/hxlhts+f1FvSvarEigdxS3BlSMI2PJLcQ==} + engines: {node: '>=18'} + cpu: [x64] + os: [sunos] + '@esbuild/win32-arm64@0.27.7': resolution: {integrity: sha512-7yRhbHvPqSpRUV7Q20VuDwbjW5kIMwTHpptuUzV+AA46kiPze5Z7qgt6CLCK3pWFrHeNfDd1VKgyP4O+ng17CA==} engines: {node: '>=18'} cpu: [arm64] os: [win32] + '@esbuild/win32-arm64@0.28.1': + resolution: {integrity: sha512-lCv9eK/H6ZJWbE7bh2nw54CZ9M2nupBxJcTsdk/QQnWkdSjKGuxmmH8/GWrlT1eMmZfn4dGcCjRte397WqfQXA==} + engines: {node: '>=18'} + cpu: [arm64] + os: [win32] + '@esbuild/win32-ia32@0.27.7': resolution: {integrity: sha512-SmwKXe6VHIyZYbBLJrhOoCJRB/Z1tckzmgTLfFYOfpMAx63BJEaL9ExI8x7v0oAO3Zh6D/Oi1gVxEYr5oUCFhw==} engines: {node: '>=18'} cpu: [ia32] os: [win32] + '@esbuild/win32-ia32@0.28.1': + resolution: {integrity: sha512-zvb/mB2bSCoJOpoCBgYKKpX6YM6mJBlBUVUtVj41DlZJVEB6/0CKlRYxP5wWl1C1ILiCoAU5wZZ4q1P3qeS6Eg==} + engines: {node: '>=18'} + 
cpu: [ia32] + os: [win32] + '@esbuild/win32-x64@0.27.7': resolution: {integrity: sha512-56hiAJPhwQ1R4i+21FVF7V8kSD5zZTdHcVuRFMW0hn753vVfQN8xlx4uOPT4xoGH0Z/oVATuR82AiqSTDIpaHg==} engines: {node: '>=18'} cpu: [x64] os: [win32] + '@esbuild/win32-x64@0.28.1': + resolution: {integrity: sha512-bm4Mowrv+GXMlpWX++EcXw/iLyd1o3+bJkC2DkWXYVvgZCqD/bSj9ctZeAMC3cIxgjRVR2Dufaiu4YPxr5gW1A==} + engines: {node: '>=18'} + cpu: [x64] + os: [win32] + '@hono/node-server@2.0.1': resolution: {integrity: sha512-jI9yMDyFpqBeSighf/zlXnQG/nl9AyBc6aAgy4XtxJMyt/CNyJpvPfzDD+bCc2zAOmhhqtF6TnmIaY+xV4mIrw==} engines: {node: '>=20'} @@ -651,6 +810,11 @@ packages: engines: {node: '>=18'} hasBin: true + esbuild@0.28.1: + resolution: {integrity: sha512-HrJrvZv5ayxBzPfwphOoNzkzOIIlifzk0KJrGK2c8R4+LKpMtpYLQeUdjnwjWv/LZlkH2laZk+4w78pi99D4Vw==} + engines: {node: '>=18'} + hasBin: true + estree-walker@3.0.3: resolution: {integrity: sha512-7RUKfXgSMMkzt6ZuXmqapOurLGPPfgj6l9uRZ7lRGolvk0y2yocc35LdcxKC5PQZdn2DMqioAQ2NoWcrTKmm6g==} @@ -876,6 +1040,11 @@ packages: typescript: optional: true + tsx@4.22.4: + resolution: {integrity: sha512-X8EX+XV4QR5xCsrgxaED954zTDfY8KqlDtskKEL0cHhyS/P8b4IFOvGDQpsC9Q1XnLq915wEfwwY/zzskCtmhg==} + engines: {node: '>=18.0.0'} + hasBin: true + typescript@5.9.3: resolution: {integrity: sha512-jl1vZzPDinLr9eUt3J/t7V6FgNEw9QjvBPdysz9KfQDD41fQrC2Y4vKQdiaUpFT4bXlb1RHhLpp8wtm6M5TgSw==} engines: {node: '>=14.17'} @@ -1050,81 +1219,159 @@ snapshots: '@esbuild/aix-ppc64@0.27.7': optional: true + '@esbuild/aix-ppc64@0.28.1': + optional: true + '@esbuild/android-arm64@0.27.7': optional: true + '@esbuild/android-arm64@0.28.1': + optional: true + '@esbuild/android-arm@0.27.7': optional: true + '@esbuild/android-arm@0.28.1': + optional: true + '@esbuild/android-x64@0.27.7': optional: true + '@esbuild/android-x64@0.28.1': + optional: true + '@esbuild/darwin-arm64@0.27.7': optional: true + '@esbuild/darwin-arm64@0.28.1': + optional: true + '@esbuild/darwin-x64@0.27.7': optional: true + 
'@esbuild/darwin-x64@0.28.1': + optional: true + '@esbuild/freebsd-arm64@0.27.7': optional: true + '@esbuild/freebsd-arm64@0.28.1': + optional: true + '@esbuild/freebsd-x64@0.27.7': optional: true + '@esbuild/freebsd-x64@0.28.1': + optional: true + '@esbuild/linux-arm64@0.27.7': optional: true + '@esbuild/linux-arm64@0.28.1': + optional: true + '@esbuild/linux-arm@0.27.7': optional: true + '@esbuild/linux-arm@0.28.1': + optional: true + '@esbuild/linux-ia32@0.27.7': optional: true + '@esbuild/linux-ia32@0.28.1': + optional: true + '@esbuild/linux-loong64@0.27.7': optional: true + '@esbuild/linux-loong64@0.28.1': + optional: true + '@esbuild/linux-mips64el@0.27.7': optional: true + '@esbuild/linux-mips64el@0.28.1': + optional: true + '@esbuild/linux-ppc64@0.27.7': optional: true + '@esbuild/linux-ppc64@0.28.1': + optional: true + '@esbuild/linux-riscv64@0.27.7': optional: true + '@esbuild/linux-riscv64@0.28.1': + optional: true + '@esbuild/linux-s390x@0.27.7': optional: true + '@esbuild/linux-s390x@0.28.1': + optional: true + '@esbuild/linux-x64@0.27.7': optional: true + '@esbuild/linux-x64@0.28.1': + optional: true + '@esbuild/netbsd-arm64@0.27.7': optional: true + '@esbuild/netbsd-arm64@0.28.1': + optional: true + '@esbuild/netbsd-x64@0.27.7': optional: true + '@esbuild/netbsd-x64@0.28.1': + optional: true + '@esbuild/openbsd-arm64@0.27.7': optional: true + '@esbuild/openbsd-arm64@0.28.1': + optional: true + '@esbuild/openbsd-x64@0.27.7': optional: true + '@esbuild/openbsd-x64@0.28.1': + optional: true + '@esbuild/openharmony-arm64@0.27.7': optional: true + '@esbuild/openharmony-arm64@0.28.1': + optional: true + '@esbuild/sunos-x64@0.27.7': optional: true + '@esbuild/sunos-x64@0.28.1': + optional: true + '@esbuild/win32-arm64@0.27.7': optional: true + '@esbuild/win32-arm64@0.28.1': + optional: true + '@esbuild/win32-ia32@0.27.7': optional: true + '@esbuild/win32-ia32@0.28.1': + optional: true + '@esbuild/win32-x64@0.27.7': optional: true + 
'@esbuild/win32-x64@0.28.1': + optional: true + '@hono/node-server@2.0.1(hono@4.12.16)': dependencies: hono: 4.12.16 @@ -1338,13 +1585,13 @@ snapshots: chai: 5.3.3 tinyrainbow: 2.0.0 - '@vitest/mocker@3.2.4(vite@7.3.2(@types/node@25.6.0)(yaml@2.8.4))': + '@vitest/mocker@3.2.4(vite@7.3.2(@types/node@25.6.0)(tsx@4.22.4)(yaml@2.8.4))': dependencies: '@vitest/spy': 3.2.4 estree-walker: 3.0.3 magic-string: 0.30.21 optionalDependencies: - vite: 7.3.2(@types/node@25.6.0)(yaml@2.8.4) + vite: 7.3.2(@types/node@25.6.0)(tsx@4.22.4)(yaml@2.8.4) '@vitest/pretty-format@3.2.4': dependencies: @@ -1451,6 +1698,35 @@ snapshots: '@esbuild/win32-ia32': 0.27.7 '@esbuild/win32-x64': 0.27.7 + esbuild@0.28.1: + optionalDependencies: + '@esbuild/aix-ppc64': 0.28.1 + '@esbuild/android-arm': 0.28.1 + '@esbuild/android-arm64': 0.28.1 + '@esbuild/android-x64': 0.28.1 + '@esbuild/darwin-arm64': 0.28.1 + '@esbuild/darwin-x64': 0.28.1 + '@esbuild/freebsd-arm64': 0.28.1 + '@esbuild/freebsd-x64': 0.28.1 + '@esbuild/linux-arm': 0.28.1 + '@esbuild/linux-arm64': 0.28.1 + '@esbuild/linux-ia32': 0.28.1 + '@esbuild/linux-loong64': 0.28.1 + '@esbuild/linux-mips64el': 0.28.1 + '@esbuild/linux-ppc64': 0.28.1 + '@esbuild/linux-riscv64': 0.28.1 + '@esbuild/linux-s390x': 0.28.1 + '@esbuild/linux-x64': 0.28.1 + '@esbuild/netbsd-arm64': 0.28.1 + '@esbuild/netbsd-x64': 0.28.1 + '@esbuild/openbsd-arm64': 0.28.1 + '@esbuild/openbsd-x64': 0.28.1 + '@esbuild/openharmony-arm64': 0.28.1 + '@esbuild/sunos-x64': 0.28.1 + '@esbuild/win32-arm64': 0.28.1 + '@esbuild/win32-ia32': 0.28.1 + '@esbuild/win32-x64': 0.28.1 + estree-walker@3.0.3: dependencies: '@types/estree': 1.0.8 @@ -1548,11 +1824,12 @@ snapshots: mlly: 1.8.2 pathe: 2.0.3 - postcss-load-config@6.0.1(postcss@8.5.13)(yaml@2.8.4): + postcss-load-config@6.0.1(postcss@8.5.13)(tsx@4.22.4)(yaml@2.8.4): dependencies: lilconfig: 3.1.3 optionalDependencies: postcss: 8.5.13 + tsx: 4.22.4 yaml: 2.8.4 postcss@8.5.13: @@ -1647,7 +1924,7 @@ snapshots: 
ts-interface-checker@0.1.13: {} - tsup@8.5.1(postcss@8.5.13)(typescript@5.9.3)(yaml@2.8.4): + tsup@8.5.1(postcss@8.5.13)(tsx@4.22.4)(typescript@5.9.3)(yaml@2.8.4): dependencies: bundle-require: 5.1.0(esbuild@0.27.7) cac: 6.7.14 @@ -1658,7 +1935,7 @@ snapshots: fix-dts-default-cjs-exports: 1.0.1 joycon: 3.1.1 picocolors: 1.1.1 - postcss-load-config: 6.0.1(postcss@8.5.13)(yaml@2.8.4) + postcss-load-config: 6.0.1(postcss@8.5.13)(tsx@4.22.4)(yaml@2.8.4) resolve-from: 5.0.0 rollup: 4.60.2 source-map: 0.7.6 @@ -1675,6 +1952,12 @@ snapshots: - tsx - yaml + tsx@4.22.4: + dependencies: + esbuild: 0.28.1 + optionalDependencies: + fsevents: 2.3.3 + typescript@5.9.3: {} ufo@1.6.4: {} @@ -1698,13 +1981,13 @@ snapshots: - utf-8-validate - zod - vite-node@3.2.4(@types/node@25.6.0)(yaml@2.8.4): + vite-node@3.2.4(@types/node@25.6.0)(tsx@4.22.4)(yaml@2.8.4): dependencies: cac: 6.7.14 debug: 4.4.3 es-module-lexer: 1.7.0 pathe: 2.0.3 - vite: 7.3.2(@types/node@25.6.0)(yaml@2.8.4) + vite: 7.3.2(@types/node@25.6.0)(tsx@4.22.4)(yaml@2.8.4) transitivePeerDependencies: - '@types/node' - jiti @@ -1719,7 +2002,7 @@ snapshots: - tsx - yaml - vite@7.3.2(@types/node@25.6.0)(yaml@2.8.4): + vite@7.3.2(@types/node@25.6.0)(tsx@4.22.4)(yaml@2.8.4): dependencies: esbuild: 0.27.7 fdir: 6.5.0(picomatch@4.0.4) @@ -1730,13 +2013,14 @@ snapshots: optionalDependencies: '@types/node': 25.6.0 fsevents: 2.3.3 + tsx: 4.22.4 yaml: 2.8.4 - vitest@3.2.4(@types/node@25.6.0)(yaml@2.8.4): + vitest@3.2.4(@types/node@25.6.0)(tsx@4.22.4)(yaml@2.8.4): dependencies: '@types/chai': 5.2.3 '@vitest/expect': 3.2.4 - '@vitest/mocker': 3.2.4(vite@7.3.2(@types/node@25.6.0)(yaml@2.8.4)) + '@vitest/mocker': 3.2.4(vite@7.3.2(@types/node@25.6.0)(tsx@4.22.4)(yaml@2.8.4)) '@vitest/pretty-format': 3.2.4 '@vitest/runner': 3.2.4 '@vitest/snapshot': 3.2.4 @@ -1754,8 +2038,8 @@ snapshots: tinyglobby: 0.2.16 tinypool: 1.1.1 tinyrainbow: 2.0.0 - vite: 7.3.2(@types/node@25.6.0)(yaml@2.8.4) - vite-node: 3.2.4(@types/node@25.6.0)(yaml@2.8.4) + 
vite: 7.3.2(@types/node@25.6.0)(tsx@4.22.4)(yaml@2.8.4) + vite-node: 3.2.4(@types/node@25.6.0)(tsx@4.22.4)(yaml@2.8.4) why-is-node-running: 2.3.0 optionalDependencies: '@types/node': 25.6.0 diff --git a/src/autodata/build-dataset.ts b/src/autodata/build-dataset.ts new file mode 100644 index 0000000..a2e1853 --- /dev/null +++ b/src/autodata/build-dataset.ts @@ -0,0 +1,172 @@ +/** + * Build a discriminative QA dataset from a real source document with REAL two-tier solvers. + * + * Grounds on the document (or takes an already-grounded excerpt), runs `createDataCreationLoop` with + * the live router roles, writes the accepted examples as JSONL, and returns the numbers the + * calibration depends on: the per-example strong/weak gap for the LOOP-ACCEPTED (agentic) examples + * AND for the challenger's FIRST drafts (plain), plus the cost ledger split by role. + */ + +import { mkdir, writeFile } from 'node:fs/promises' +import { dirname } from 'node:path' +import { CostLedger } from '@tangle-network/agent-eval' +import { + createDataCreationLoop, + discriminativeAcceptRule, + type ExampleEvaluation, +} from './data-creation-loop' +import { type GroundedDoc, groundDoc } from './grounding' +import { buildAutodataRoles, type RouterCallRecord } from './router-roles' + +export interface DiscriminativeThresholds { + minStrong?: number + maxWeak?: number + minGap?: number +} + +export interface AutodataDatasetConfig { + apiKey: string + baseUrl?: string + /** A grounded excerpt, or a spec to fetch + chunk one from a real URL. */ + source: GroundedDoc | { url: string; focus?: string; cacheDir?: string } + /** Where to write the JSONL dataset. */ + outPath: string + target?: number + samples?: number + maxRetries?: number + thresholds?: DiscriminativeThresholds + models?: { challenger?: string; weak?: string; strong?: string; judge?: string } + signal?: AbortSignal +} + +/** One accepted, discriminating training example with its real strong/weak scores + provenance. 
*/
+export interface DatasetRow {
+  context: string
+  question: string
+  reference: string
+  rubric: readonly string[]
+  weakScore: number
+  strongScore: number
+  gap: number
+  source: { url: string; headingPath: string; chunkIndex: number }
+}
+
+export interface AutodataDatasetResult {
+  source: GroundedDoc
+  accepted: ExampleEvaluation[]
+  rows: DatasetRow[]
+  /** Mean strong−weak gap on the challenger's first-draft (plain-generation) questions. */
+  plainGapMean: number | null
+  /** Mean strong−weak gap on the loop-accepted (agentic) questions. */
+  agenticGapMean: number | null
+  /** Mean of the BEST gap the refinement reached per slot (accepted or not) — the informative
+   * comparison against `plainGapMean` even when nothing clears the accept bar. */
+  refinedGapMean: number | null
+  plainGaps: number[]
+  agenticGaps: number[]
+  refinedGaps: number[]
+  cost: CostLedger
+  costPerExampleUsd: number | null
+  /** How many router calls were priced by the router vs rate-estimated. */
+  callProvenance: { router: number; estimated: number }
+  outPath: string
+}
+/** Arithmetic mean of `xs`; `null` (not NaN) when the list is empty. */
+function mean(xs: number[]): number | null {
+  return xs.length > 0 ? xs.reduce((sum, x) => sum + x, 0) / xs.length : null
+}
+/** Type guard: the source is already grounded when it carries a string `doc`. */
+function isGrounded(s: AutodataDatasetConfig['source']): s is GroundedDoc {
+  return typeof (s as { doc?: unknown }).doc === 'string'
+}
+/** Builds the challenger prompt: the excerpt first, then the task and the strict-JSON contract. */
+function challengerInstruction(doc: string): string {
+  return [
+    `SOURCE DOCUMENT EXCERPT:\n\n${doc}\n\n`,
+    `Write ONE hard exam question grounded in this excerpt. It must require multi-step reasoning `,
+    `over the excerpt (a small model should get it wrong, a strong model right), never a verbatim `,
+    `lookup. Return STRICT JSON: {"context": string, "question": string, "reference": string, `,
+    `"rubric": string[] }.`,
+  ].join('')
+}
+
+/** Run the full pipeline: ground → loop → JSONL. Returns the calibration numbers + cost. 
*/
+export async function buildAutodataDataset(
+  config: AutodataDatasetConfig,
+): Promise<AutodataDatasetResult> {
+  const source = isGrounded(config.source)
+    ? config.source
+    : await groundDoc({
+        url: config.source.url,
+        focus: config.source.focus,
+        cacheDir: config.source.cacheDir,
+        signal: config.signal,
+      })
+  // Tally how each router call was priced: reported by the router vs rate-estimated.
+  const provenance = { router: 0, estimated: 0 }
+  const onCall = (rec: RouterCallRecord): void => {
+    if (rec.costSource === 'router') provenance.router += 1
+    else provenance.estimated += 1
+  }
+  // One ledger is handed to both the router roles and the loop.
+  const ledger = new CostLedger()
+
+  const roles = buildAutodataRoles({
+    apiKey: config.apiKey,
+    baseUrl: config.baseUrl,
+    challengerModel: config.models?.challenger,
+    weakModel: config.models?.weak,
+    strongModel: config.models?.strong,
+    judgeModel: config.models?.judge,
+    ledger,
+    onCall,
+  })
+
+  const result = await createDataCreationLoop({
+    doc: source.doc,
+    baseInstruction: challengerInstruction,
+    challenger: roles.challenger,
+    weakSolver: roles.weakSolver,
+    strongSolver: roles.strongSolver,
+    judge: roles.judge,
+    accept: (i) => discriminativeAcceptRule({ ...i, ...config.thresholds }),
+    target: config.target ?? 3,
+    samples: config.samples ?? 3,
+    maxRetries: config.maxRetries ?? 4,
+    cost: ledger,
+    signal: config.signal,
+  })
+
+  const rows: DatasetRow[] = result.accepted.map((ex) => ({
+    context: ex.example.context,
+    question: ex.example.question,
+    reference: ex.example.reference,
+    rubric: ex.example.rubric,
+    weakScore: ex.weakScore,
+    strongScore: ex.strongScore,
+    gap: ex.gap,
+    source: { url: source.url, headingPath: source.headingPath, chunkIndex: source.chunkIndex },
+  }))
+  // JSONL: one JSON object per line; the trailing newline is written only when there are rows.
+  await mkdir(dirname(config.outPath), { recursive: true })
+  await writeFile(
+    config.outPath,
+    rows.map((r) => JSON.stringify(r)).join('\n') + (rows.length ? '\n' : ''),
+  )
+
+  return {
+    source,
+    accepted: result.accepted,
+    rows,
+    plainGapMean: mean(result.plainGaps),
+    agenticGapMean: mean(result.agenticGaps),
+    refinedGapMean: mean(result.refinedGaps),
+    plainGaps: result.plainGaps,
+    agenticGaps: result.agenticGaps,
+    refinedGaps: result.refinedGaps,
+    cost: result.cost,
+    costPerExampleUsd: result.cost.costPerCompletedTask(),
+    callProvenance: provenance,
+    outPath: config.outPath,
+  }
+}
diff --git a/src/autodata/data-creation-loop.test.ts b/src/autodata/data-creation-loop.test.ts
new file mode 100644
index 0000000..df917cc
--- /dev/null
+++ b/src/autodata/data-creation-loop.test.ts
@@ -0,0 +1,130 @@
+import { describe, expect, it } from 'vitest'
+import {
+  createDataCreationLoop,
+  discriminativeAcceptRule,
+  qualityCheck,
+} from './data-creation-loop'
+import {
+  baseInstruction,
+  buildRubricJudge,
+  challengerClient,
+  groundingDoc,
+  solverClient,
+} from './offline-fixtures'
+import { parseDataExample } from './router-roles'
+
+describe('discriminativeAcceptRule (the new piece)', () => {
+  it('accepts an example that separates strong from weak', () => {
+    const d = discriminativeAcceptRule({ strongScore: 0.77, weakScore: 0.46 })
+    expect(d.accept).toBe(true)
+    expect(d.reason).toContain('discriminates')
+  })
+
+  it('rejects "too easy" when the weak solver passes', () => {
+    const d = discriminativeAcceptRule({ strongScore: 0.86, weakScore: 0.84 })
+    expect(d.accept).toBe(false)
+    expect(d.reason).toContain('too easy')
+  })
+
+  it('rejects "too hard" when even the strong solver misses', () => {
+    const d = discriminativeAcceptRule({ strongScore: 0.55, weakScore: 0.3 })
+    expect(d.accept).toBe(false)
+    expect(d.reason).toContain('too hard')
+  })
+
+  it('rejects when the gap is below minGap even if both thresholds hold', () => {
+    const d = discriminativeAcceptRule({ strongScore: 0.66, weakScore: 0.48 })
+    expect(d.accept).toBe(false)
+    expect(d.reason).toContain('not discriminative')
+  })
+
+  it('honors 
custom thresholds', () => { + const strict = discriminativeAcceptRule({ strongScore: 0.77, weakScore: 0.46, minGap: 0.4 }) + expect(strict.accept).toBe(false) + }) +}) + +describe('qualityCheck', () => { + it('rejects a reference that leaks verbatim into the context', () => { + const q = qualityCheck({ + context: 'The answer is 42 and nothing else matters.', + question: 'What is the answer?', + reference: 'The answer is 42', + rubric: ['a', 'b'], + }) + expect(q.ok).toBe(false) + expect(q.reason).toContain('leaked') + }) + + it('rejects a thin rubric', () => { + const q = qualityCheck({ context: 'c', question: 'q', reference: 'r', rubric: ['only one'] }) + expect(q.ok).toBe(false) + expect(q.reason).toContain('thin rubric') + }) + + it('passes a clean example', () => { + const q = qualityCheck({ + context: 'Some grounding context that does not contain the answer phrasing.', + question: 'Why does it matter?', + reference: 'Because of a distinct reasoning chain.', + rubric: ['states X', 'explains Y'], + }) + expect(q.ok).toBe(true) + }) +}) + +describe('parseDataExample (challenger JSON parsing)', () => { + it('parses a bare JSON object', () => { + const ex = parseDataExample('{"context":"c","question":"q","reference":"r","rubric":["a","b"]}') + expect(ex.question).toBe('q') + expect(ex.rubric).toHaveLength(2) + }) + + it('parses JSON wrapped in a ```json fence with surrounding prose', () => { + const ex = parseDataExample( + 'Sure, here is the example:\n```json\n{"context":"c","question":"q","reference":"r","rubric":["a","b"]}\n```\nDone.', + ) + expect(ex.reference).toBe('r') + }) + + it('throws loud when no JSON object is present', () => { + expect(() => parseDataExample('no json here')).toThrow() + }) + + it('throws loud when a required field is missing', () => { + expect(() => parseDataExample('{"context":"c","question":"q"}')).toThrow() + }) +}) + +describe('createDataCreationLoop (offline)', () => { + it('manufactures discriminating examples and separates 
plain from agentic gaps', async () => { + const result = await createDataCreationLoop({ + doc: groundingDoc, + baseInstruction, + challenger: challengerClient(), + weakSolver: solverClient('weak'), + strongSolver: solverClient('strong'), + judge: buildRubricJudge(), + target: 2, + samples: 3, + maxRetries: 4, + }) + + expect(result.accepted).toHaveLength(2) + for (const ex of result.accepted) { + expect(ex.decision.accept).toBe(true) + expect(ex.gap).toBeGreaterThanOrEqual(0.2) + } + + const mean = (xs: number[]) => xs.reduce((a, b) => a + b, 0) / xs.length + const plain = mean(result.plainGaps) + const agentic = mean(result.agenticGaps) + expect(plain).toBeLessThan(0.1) + expect(agentic).toBeGreaterThan(0.25) + expect(agentic - plain).toBeGreaterThanOrEqual(0.15) + + const stored = await result.corpus.query({ area: 'training-data' }) + expect(stored).toHaveLength(2) + expect(result.cost.summary().totalCostUsd).toBeGreaterThan(0) + }) +}) diff --git a/src/autodata/data-creation-loop.ts b/src/autodata/data-creation-loop.ts new file mode 100644 index 0000000..9022cbb --- /dev/null +++ b/src/autodata/data-creation-loop.ts @@ -0,0 +1,491 @@ +/** + * The Autodata / Agentic Self-Instruct INNER loop: an agent MANUFACTURES hard training examples + * from a grounding doc and keeps only the ones that DISCRIMINATE a strong solver from a weak one. + * + * PROVENANCE — this loop is vendored verbatim from agent-runtime + * `examples/agentic-data-creation/agentic-data-creation.ts` (branch + * `examples/agentic-data-creation`). It is an EXAMPLE in agent-runtime, not a published runtime + * export — examples are not shipped in the npm dist — so agent-knowledge cannot import it and + * vendors it here (the "copy with a note" path). 
Every primitive it COMPOSES is reused from the + * published packages, nothing is re-implemented: the judge is `llmJudge` (agent-eval), the loop + * kernel is `runLoop` (agent-runtime/loops), the store is `InMemoryCorpus` (agent-runtime/loops), + * the cost accounting is `CostLedger` (agent-eval). The REAL grounding (arXiv ingestion) + the REAL + * two-tier router solvers live in the sibling files; this file stays domain- and transport-agnostic. + * + * The whole method is four roles + one accept rule: + * 1. CHALLENGER writes a candidate {context, question, reference, rubric} from the doc. + * 2. WEAK solver and STRONG solver each attempt it, sampled N× to average out variance. + * 3. JUDGE scores every attempt against the rubric (one `llmJudge` call per attempt). + * 4. ACCEPT keeps the example ONLY IF it discriminates: strong >= hi, weak < lo, gap >= g — + * plus a quality check (no context leakage, a real rubric). + * On reject, the CHALLENGER driver FOLDS the reject reason into its next prompt and retries. + * Accepted examples accrete into a `Corpus`. + * + * The ONE genuinely new piece is `discriminativeAcceptRule` — the paper's reward, written as a + * small Validator-shaped accept/reject. It is a lift candidate for agent-eval (next to + * `blendHeldout` / `HeldOutGate`) if it proves out across real domains; it lives here until then. + */ + +import { CostLedger } from '@tangle-network/agent-eval' +import type { JudgeConfig, Scenario } from '@tangle-network/agent-eval/campaign' +import { + type AgentRunSpec, + type Corpus, + type CorpusRecord, + type Driver, + InMemoryCorpus, + type OutputAdapter, + runLoop, + type SandboxClient, + type Validator, +} from '@tangle-network/agent-runtime/loops' +import type { AgentProfile, SandboxEvent } from '@tangle-network/sandbox' + +// ── The four-role data shapes ───────────────────────────────────────────────────────────── + +/** One manufactured training example, grounded in `context` excerpted from the doc. 
*/
export interface DataExample {
  /** The grounding excerpt the question is answerable from. */
  readonly context: string
  readonly question: string
  /** The reference answer the rubric is graded against. */
  readonly reference: string
  /** Scoring criteria the judge applies (>= 2 for a usable example). */
  readonly rubric: readonly string[]
}

/** What the judge scores: a solver's `answer` to one `example`. */
export interface SolverArtifact {
  readonly example: DataExample
  readonly answer: string
}

/** The accept rule's verdict — keep this example, and why (or why not). */
export interface AcceptDecision {
  readonly accept: boolean
  readonly reason: string
}

// ═══════════════════════════════════════════════════════════════════════════════════════════
// THE ONE NEW PIECE — the paper's discriminative reward, as a small Validator-shaped rule.
// ═══════════════════════════════════════════════════════════════════════════════════════════
//
// Autodata keeps an example ONLY IF it separates a strong solver from a weak one: the strong
// solver should mostly get it (>= minStrong), the weak solver should mostly miss it (< maxWeak),
// and the margin between them (the "gap") must clear minGap. That is the whole objective, so the
// rule is the LITERAL accept criterion, never softened. The three threshold reject reasons
// ("too hard" / "too easy" / "not discriminative") map one-to-one onto the challenger's
// next-prompt fold. A fourth, purely defensive reject covers non-finite scores: every comparison
// against NaN is false, so without it a NaN score would skip all three threshold branches and be
// reported as ACCEPTED.
/**
 * Decide whether one example discriminates the strong solver from the weak one.
 *
 * @param input - the two mean rubric scores plus optional threshold overrides.
 * @returns `{ accept, reason }`; `reason` names the first failed criterion, or restates all
 *   three satisfied criteria on accept. Never throws.
 */
export function discriminativeAcceptRule(input: {
  /** Strong solver's mean rubric score, [0,1]. */
  strongScore: number
  /** Weak solver's mean rubric score, [0,1]. */
  weakScore: number
  /** Strong must reach at least this (else the example is unfair / too hard). Default 0.65. */
  minStrong?: number
  /** Weak must stay strictly below this (else the example is too easy). Default 0.5. */
  maxWeak?: number
  /** strong − weak must be at least this (else it does not discriminate). Default 0.2. */
  minGap?: number
}): AcceptDecision {
  const { strongScore, weakScore } = input
  const minStrong = input.minStrong ?? 0.65
  const maxWeak = input.maxWeak ?? 0.5
  const minGap = input.minGap ?? 0.2

  // Guard first: a NaN / ±Infinity score is never evidence of discrimination. `x < t` and
  // `x >= t` are both false for NaN, so the threshold checks below cannot catch it.
  if (!Number.isFinite(strongScore) || !Number.isFinite(weakScore)) {
    return {
      accept: false,
      reason: `invalid scores: strong ${String(strongScore)}, weak ${String(weakScore)} (both must be finite numbers)`,
    }
  }

  const gap = strongScore - weakScore

  if (strongScore < minStrong) {
    return {
      accept: false,
      reason: `too hard: strong solver reached only ${pct(strongScore)} (< ${pct(minStrong)})`,
    }
  }
  if (weakScore >= maxWeak) {
    return {
      accept: false,
      reason: `too easy: weak solver reached ${pct(weakScore)} (>= ${pct(maxWeak)})`,
    }
  }
  if (gap < minGap) {
    return { accept: false, reason: `not discriminative: gap ${pct(gap)} (< ${pct(minGap)})` }
  }
  return {
    accept: true,
    reason: `discriminates: strong ${pct(strongScore)} >= ${pct(minStrong)}, weak ${pct(weakScore)} < ${pct(maxWeak)}, gap ${pct(gap)} >= ${pct(minGap)}`,
  }
}

/**
 * The quality gate the paper pairs with the gap: reject examples that LEAK the answer into the
 * context (a copy-paste solver would pass), or that ship a thin rubric. Deterministic, no LLM.
 *
 * The leak test is a verbatim substring match on the trimmed reference; an empty reference is
 * never treated as leaked. The rubric test only counts entries (fewer than 2 is "thin").
 */
export function qualityCheck(ex: DataExample): { ok: boolean; reason: string } {
  const ref = ex.reference.trim()
  if (ref.length > 0 && ex.context.includes(ref)) {
    return { ok: false, reason: 'leaked: the reference answer appears verbatim in the context' }
  }
  if (ex.rubric.length < 2) {
    return { ok: false, reason: 'thin rubric: an example needs >= 2 scoring criteria' }
  }
  return { ok: true, reason: 'clean' }
}

// ── Tasks + output adapters (the worker seam) ──────────────────────────────────────────────

/** The challenger's task: ground on `doc`, run the `prompt` the driver authored this round. */
interface ChallengerTask {
  readonly doc: string
  /** The instruction for THIS round — the refine driver rewrites it from the last reject. */
  readonly prompt: string
}

/** One solver attempt over an `example`; `sampleIndex` distinguishes the N parallel samples.
*/ +interface SolverTask { + readonly example: DataExample + readonly sampleIndex: number +} + +const challengerOutput: OutputAdapter = { + parse(events) { + const ex = resultPayload(events) + if (isDataExample(ex)) return ex + // Fail loud: a challenger that produced no parseable example is a real defect, not an empty pass. + throw new Error('challenger produced no parseable DataExample') + }, +} + +const solverOutput: OutputAdapter<{ answer: string }> = { + parse(events) { + const r = resultPayload(events) + if ( + r && + typeof r === 'object' && + 'answer' in r && + typeof (r as { answer: unknown }).answer === 'string' + ) { + return { answer: (r as { answer: string }).answer } + } + throw new Error('solver produced no answer') + }, +} + +// ── N× solver sampling = an inline FANOUT driver over runLoop ──────────────────────────────── +// +// A "round" returns N independent solver tasks (no fold between them) → the kernel runs all N, +// the `llmJudge`-as-validator scores each against the rubric, and we AVERAGE the N scores (the +// variance-reduced estimate the accept rule compares — not argmax). runLoop already aggregated +// the N calls' cost, so we roll its total into the ledger under this solver's channel. +async function sampleSolverScore(args: { + solver: SandboxClient + solverSpec: AgentRunSpec + example: DataExample + judge: JudgeConfig + samples: number + channel: string + ledger: CostLedger + signal?: AbortSignal +}): Promise { + const { solver, solverSpec, example, judge, samples, channel, ledger } = args + + const validator: Validator<{ answer: string }> = { + async validate(out, ctx) { + const score = await judge.score({ + artifact: { example, answer: out.answer }, + scenario: solveScenario, + signal: ctx.signal, + }) + return { valid: !score.failed, score: score.composite, notes: score.notes } + }, + } + + const fanout: Driver = { + name: `${channel}/sample-x${samples}`, + plan: async (task, history) => + history.length === 0 + ? 
Array.from({ length: samples }, (_, i) => ({ ...task, sampleIndex: i })) + : [], + decide: () => 'done', + } + + const result = await runLoop({ + driver: fanout, + agentRun: solverSpec, + output: solverOutput, + validator, + task: { example, sampleIndex: 0 }, + ctx: { sandboxClient: solver, signal: args.signal }, + maxIterations: samples, + maxConcurrency: samples, + }) + + ledger.record({ + model: solverSpec.profile.name ?? channel, + channel, + usage: { inputTokens: result.tokenUsage.input, outputTokens: result.tokenUsage.output }, + actualCostUsd: result.costUsd, + tags: { role: channel }, + }) + + const scored = result.iterations.filter((it) => it.verdict).map((it) => it.verdict?.score ?? 0) + if (scored.length === 0) + throw new Error(`${channel}: every solver sample errored — no score to average`) + return scored.reduce((a, b) => a + b, 0) / scored.length +} + +// ── The challenger refine driver — the FOLD ────────────────────────────────────────────────── + +type ChallengerDecision = 'refine' | 'accept' | 'reject' + +function challengerDriver( + maxRetries: number, + baseInstruction: (doc: string) => string, +): Driver { + return { + name: 'challenger-refine', + async plan(task, history) { + if (history.length === 0) return [task] // shot 0: a first draft straight from the doc + const last = history[history.length - 1] + if (last?.verdict?.valid) return [] // accepted → stop + if (history.length >= maxRetries) return [] // out of budget → stop + // THE FOLD: read WHY the last example was rejected and rewrite the instruction to target it. + // "too easy" → make it harder; "too hard" → ease it; "leaked" → keep the answer out of context. + const why = last?.verdict?.notes ?? 'rejected' + const prompt = `${baseInstruction(task.doc)}\n\nYour previous example was REJECTED: ${why}. 
Write a new example that fixes exactly that.` + return [{ ...task, prompt }] + }, + decide(history) { + if (history.some((it) => it.verdict?.valid)) return 'accept' + return history.length < maxRetries ? 'refine' : 'reject' + }, + } +} + +// ── One example's full evaluation (used for both the accept loop and calibration) ───────────── + +export interface ExampleEvaluation { + readonly example: DataExample + readonly weakScore: number + readonly strongScore: number + readonly gap: number + readonly decision: AcceptDecision +} + +// ── The loop ──────────────────────────────────────────────────────────────────────────────── + +export interface DataCreationConfig { + /** The grounding document the challenger writes examples from. */ + readonly doc: string + /** The challenger worker (prompt → DataExample). The driver authors each round's prompt. */ + readonly challenger: SandboxClient + /** The weak + strong solver workers (rendered example → answer). */ + readonly weakSolver: SandboxClient + readonly strongSolver: SandboxClient + /** The rubric judge — an `llmJudge` `JudgeConfig`. */ + readonly judge: JudgeConfig + /** The challenger's base instruction over the doc (the un-folded prompt). */ + readonly baseInstruction: (doc: string) => string + /** How a solver sees one example. Default: context + question + numbered rubric + sample tag. */ + readonly renderSolverPrompt?: (example: DataExample, sampleIndex: number) => string + /** Profiles materialized for each worker (names surface in traces + the cost ledger). */ + readonly challengerProfile?: AgentProfile + readonly weakSolverProfile?: AgentProfile + readonly strongSolverProfile?: AgentProfile + /** The accept rule. Defaults to `discriminativeAcceptRule` at its paper thresholds. */ + readonly accept?: (input: { strongScore: number; weakScore: number }) => AcceptDecision + /** How many accepted examples to manufacture. Default 3. */ + readonly target?: number + /** Solver samples per example (variance reduction). 
Default 3. */ + readonly samples?: number + /** Refine budget per example. Default 4. */ + readonly maxRetries?: number + /** Where accepted examples accrete. Default a fresh `InMemoryCorpus`. */ + readonly corpus?: Corpus + /** Cost ledger to record into. Default a fresh `CostLedger`. */ + readonly cost?: CostLedger + readonly signal?: AbortSignal +} + +/** Default solver prompt: ground the answer in the context, score against the numbered rubric. */ +function defaultRenderSolverPrompt(example: DataExample, sampleIndex: number): string { + return ( + `Answer the QUESTION using only the CONTEXT.\n\n` + + `CONTEXT:\n${example.context}\n\n` + + `QUESTION:\n${example.question}\n\n` + + `RUBRIC (you are graded on each):\n${example.rubric.map((r, i) => `${i + 1}. ${r}`).join('\n')}\n` + + `[sample ${sampleIndex}]` + ) +} + +export interface DataCreationResult { + /** The accepted, discriminating examples (the manufactured training set). */ + readonly accepted: ExampleEvaluation[] + /** The `gap` of each accepted example — large by construction (the agentic arm). */ + readonly agenticGaps: number[] + /** The `gap` of each FIRST (un-refined) draft — the plain-generation baseline for calibration. */ + readonly plainGaps: number[] + /** Per slot, the BEST gap the refinement reached (max over the budget), accepted or not. Lets the + * plain-vs-refined calibration stay informative even when no example clears the accept bar. */ + readonly refinedGaps: number[] + readonly corpus: Corpus + readonly cost: CostLedger +} + +/** + * Run the Autodata inner loop: manufacture `target` discriminating examples from `doc`, refining + * each via the challenger fold until it is accepted (or its retry budget runs out). Returns the + * accepted set, the per-example gap for the accepted (agentic) AND the first-draft (plain) examples + * for calibration, the corpus they accreted into, and the cost ledger. 
+ */ +export async function createDataCreationLoop( + config: DataCreationConfig, +): Promise { + const corpus = config.corpus ?? new InMemoryCorpus() + const cost = config.cost ?? new CostLedger() + const accept = config.accept ?? ((i) => discriminativeAcceptRule(i)) + const target = config.target ?? 3 + const samples = config.samples ?? 3 + const maxRetries = config.maxRetries ?? 4 + const renderSolverPrompt = config.renderSolverPrompt ?? defaultRenderSolverPrompt + + // Build the three worker specs once (task → prompt + the profile the substrate materializes). + const challengerSpec: AgentRunSpec = { + profile: config.challengerProfile ?? ({ name: 'challenger' } as AgentProfile), + taskToPrompt: (t) => t.prompt, + } + const weakSolverSpec: AgentRunSpec = { + profile: config.weakSolverProfile ?? ({ name: 'weak-solver' } as AgentProfile), + taskToPrompt: (t) => renderSolverPrompt(t.example, t.sampleIndex), + } + const strongSolverSpec: AgentRunSpec = { + profile: config.strongSolverProfile ?? ({ name: 'strong-solver' } as AgentProfile), + taskToPrompt: (t) => renderSolverPrompt(t.example, t.sampleIndex), + } + + const accepted: ExampleEvaluation[] = [] + const agenticGaps: number[] = [] + const plainGaps: number[] = [] + const refinedGaps: number[] = [] + + for (let i = 0; i < target; i++) { + // The challenger validator evaluates a candidate example: sample both solvers, judge each, then + // apply the accept rule. It stashes each iteration's evaluation so the loop can read back the + // ACCEPTED one (the agentic arm) and the FIRST draft (the plain calibration baseline). + const evaluations = new Map() + const validator: Validator = { + async validate(example, ctx) { + const quality = qualityCheck(example) + const weakScore = quality.ok + ? 
await sampleSolverScore({ + solver: config.weakSolver, + solverSpec: weakSolverSpec, + example, + judge: config.judge, + samples, + channel: 'weak-solver', + ledger: cost, + signal: ctx.signal, + }) + : 0 + const strongScore = quality.ok + ? await sampleSolverScore({ + solver: config.strongSolver, + solverSpec: strongSolverSpec, + example, + judge: config.judge, + samples, + channel: 'strong-solver', + ledger: cost, + signal: ctx.signal, + }) + : 0 + const decision = quality.ok + ? accept({ strongScore, weakScore }) + : { accept: false, reason: quality.reason } + const gap = strongScore - weakScore + evaluations.set(ctx.iteration, { example, weakScore, strongScore, gap, decision }) + return { valid: decision.accept, score: gap, notes: decision.reason } + }, + } + + const result = await runLoop({ + driver: challengerDriver(maxRetries, config.baseInstruction), + agentRun: challengerSpec, + output: challengerOutput, + validator, + task: { doc: config.doc, prompt: config.baseInstruction(config.doc) }, + ctx: { sandboxClient: config.challenger, signal: config.signal }, + maxIterations: maxRetries + 1, + }) + + cost.record({ + model: challengerSpec.profile.name ?? 'challenger', + channel: 'challenger', + usage: { inputTokens: result.tokenUsage.input, outputTokens: result.tokenUsage.output }, + actualCostUsd: result.costUsd, + tags: { role: 'challenger' }, + }) + + const plain = evaluations.get(0) + if (plain) plainGaps.push(plain.gap) + + const slotGaps = [...evaluations.values()].map((e) => e.gap) + if (slotGaps.length > 0) refinedGaps.push(Math.max(...slotGaps)) + + // ONLY a genuinely-accepted winner counts. `defaultSelectWinner` falls back to the best-scoring + // iteration when none is valid, so `result.winner` is set even when the accept rule rejected + // every candidate — with real solvers that frequently happens (no question separated the tiers + // inside the budget). 
Gate on `verdict.valid` so the manufactured set never includes a rejected + // example; a target slot that never produced a discriminating example is simply left unfilled. + if (result.winner?.verdict?.valid) { + const winnerEval = evaluations.get(result.winner.iterationIndex) + if (!winnerEval) throw new Error('internal: accepted iteration has no recorded evaluation') + const append = await corpus.append(toCorpusRecord(winnerEval, i)) + if (!append.succeeded) throw new Error(`corpus append failed: ${append.error}`) + accepted.push(winnerEval) + agenticGaps.push(winnerEval.gap) + cost.markCompleted() + } + } + + return { accepted, agenticGaps, plainGaps, refinedGaps, corpus, cost } +} + +// ── Helpers ─────────────────────────────────────────────────────────────────────────────── + +const solveScenario: Scenario = { id: 'agentic-data-creation', kind: 'solve' } + +function toCorpusRecord(evalRec: ExampleEvaluation, index: number): CorpusRecord { + return { + schemaVersion: '1.0.0', + id: `example-${index}`, + runId: 'agentic-data-creation', + producedAt: new Date().toISOString(), + area: 'training-data', + claim: JSON.stringify(evalRec.example), + rationale: evalRec.decision.reason, + tags: ['discriminative', `gap:${evalRec.gap.toFixed(2)}`], + // The gap is the producing run's confidence this example is hard — clamped into [0,1]. 
+ confidence: Math.min(1, Math.max(0, evalRec.gap)), + } +} + +function resultPayload(events: SandboxEvent[]): unknown { + for (const ev of events) { + if (ev.type === 'result') return (ev as { data?: { result?: unknown } }).data?.result + } + return undefined +} + +function isDataExample(value: unknown): value is DataExample { + if (typeof value !== 'object' || value === null) return false + const v = value as Record + return ( + typeof v.context === 'string' && + typeof v.question === 'string' && + typeof v.reference === 'string' && + Array.isArray(v.rubric) + ) +} + +function pct(x: number): string { + return x.toFixed(2) +} diff --git a/src/autodata/grounding.ts b/src/autodata/grounding.ts new file mode 100644 index 0000000..130a177 --- /dev/null +++ b/src/autodata/grounding.ts @@ -0,0 +1,97 @@ +/** + * Ground the Autodata loop on a REAL source document, reusing agent-knowledge's ingestion utils + * (`politeFetch` → `htmlToText` → `chunkMarkdown`). Fetches the page, strips it to text, chunks it, + * and selects ONE content-rich chunk as the grounding excerpt the challenger writes questions from. + * + * The default source is the "Attention Is All You Need" paper via ar5iv (arXiv's LaTeX→HTML service), + * a stable real paper with multi-step-reasoning content that affords genuinely discriminating + * questions. Any arXiv / ar5iv URL works; pass a `focus` term to bias chunk selection toward a section. + */ + +import { chunkMarkdown } from '../chunking' +import { htmlToText } from '../sources/html' +import { politeFetch } from '../sources/http' + +/** A stable real arXiv paper (Transformer / "Attention Is All You Need") rendered to HTML by ar5iv. */ +export const DEFAULT_SOURCE_URL = 'https://ar5iv.labs.arxiv.org/html/1706.03762' + +export interface GroundDocOptions { + url: string + cacheDir?: string + /** Bias chunk selection toward chunks mentioning this term (case-insensitive). */ + focus?: string + /** Chunk size ceiling. 
Default 1800 chars — a paragraph or two of grounding context. */ + maxChars?: number + /** Minimum letters a chunk must have to be eligible (skips nav / citation scraps). Default 400. */ + minLetters?: number + signal?: AbortSignal +} + +export interface GroundedDoc { + url: string + sourceUpdatedAt: string + /** The selected grounding excerpt — the `doc` passed to the loop. */ + doc: string + chunkIndex: number + headingPath: string + totalChunks: number +} + +function letterCount(s: string): number { + return (s.match(/[a-zA-Z]/g) ?? []).length +} + +/** Reference/bibliography chunks are citation soup — never good question material. */ +function looksLikeReferences(headingPath: string, text: string): boolean { + if (/references|bibliography|acknowledg/i.test(headingPath)) return true + // A chunk that is mostly "[n]" / "et al." / years is a reference list. + const refMarkers = (text.match(/\[\d+\]|et al\.|arXiv:|doi:/gi) ?? []).length + return refMarkers >= 5 +} + +/** + * Fetch + chunk + select a grounding excerpt from a real document. Fails loud if the fetch is + * unverifiable or yields no usable prose chunk. + */ +export async function groundDoc(opts: GroundDocOptions): Promise { + const res = await politeFetch(opts.url, { cacheDir: opts.cacheDir, signal: opts.signal }) + if (!res.verifiable) { + throw new Error(`source not verifiable (${opts.url}): ${res.unverifiableReason ?? 'unknown'}`) + } + const text = htmlToText(res.body) + const maxChars = opts.maxChars ?? 1800 + const chunks = chunkMarkdown(text, { maxChars, targetChars: Math.round(maxChars * 0.8) }) + const minLetters = opts.minLetters ?? 
400 + + const eligible = chunks.filter( + (c) => + !c.oversized && + letterCount(c.text) >= minLetters && + !looksLikeReferences(c.headingPath, c.text), + ) + if (eligible.length === 0) { + throw new Error( + `no usable prose chunk from ${opts.url} (${chunks.length} chunks, none eligible)`, + ) + } + + const focus = opts.focus?.toLowerCase() + const score = (text: string): number => { + const letters = letterCount(text) + if (!focus) return letters + const hits = ( + text.toLowerCase().match(new RegExp(focus.replace(/[.*+?^${}()|[\]\\]/g, '\\$&'), 'g')) ?? [] + ).length + return hits * 2000 + letters + } + const selected = eligible.reduce((best, c) => (score(c.text) > score(best.text) ? c : best)) + + return { + url: opts.url, + sourceUpdatedAt: res.sourceUpdatedAt, + doc: selected.text, + chunkIndex: selected.index, + headingPath: selected.headingPath, + totalChunks: chunks.length, + } +} diff --git a/src/autodata/index.ts b/src/autodata/index.ts new file mode 100644 index 0000000..e211f33 --- /dev/null +++ b/src/autodata/index.ts @@ -0,0 +1,52 @@ +/** + * Autodata — the LIVE dataset builder: ground on a real source document, run the agentic + * data-creation loop with REAL two-tier solver models (the paper's Qwen tiers via the Tangle + * router), and emit a discriminative QA dataset + the empirical strong/weak gap. + * + * The inner loop (`createDataCreationLoop` / `discriminativeAcceptRule`) is vendored from the + * agent-runtime example and composes only published substrate primitives. This package adds the + * real grounding (`grounding`), the real router roles (`router-roles`), and the pipeline + + * JSONL writer (`build-dataset`). 
+ */ + +export { + type AutodataDatasetConfig, + type AutodataDatasetResult, + buildAutodataDataset, + type DatasetRow, + type DiscriminativeThresholds, +} from './build-dataset' +export { + type AcceptDecision, + createDataCreationLoop, + type DataCreationConfig, + type DataCreationResult, + type DataExample, + discriminativeAcceptRule, + type ExampleEvaluation, + qualityCheck, + type SolverArtifact, +} from './data-creation-loop' +export { + DEFAULT_SOURCE_URL, + type GroundDocOptions, + type GroundedDoc, + groundDoc, +} from './grounding' +export { + type AutodataRoles, + buildAutodataRoles, + CHALLENGER_MODEL, + DEFAULT_BASE_URL, + JUDGE_MODEL, + parseDataExample, + type RouterCallRecord, + type RouterChatInput, + type RouterChatResult, + type RouterRolesConfig, + routerChat, + type SmokeResult, + STRONG_SOLVER_MODEL, + smokeTestModels, + WEAK_SOLVER_MODEL, +} from './router-roles' diff --git a/src/autodata/offline-fixtures.ts b/src/autodata/offline-fixtures.ts new file mode 100644 index 0000000..77e0c9c --- /dev/null +++ b/src/autodata/offline-fixtures.ts @@ -0,0 +1,200 @@ +/** + * Credentialless offline stand-ins so the Autodata loop runs in CI with ZERO creds and reproducible + * numbers: scripted challenger/solvers + a mock-transport judge. None of this is the lesson — it is + * the minimum that lets the wiring be tested offline. The LIVE roles (real router models) live in + * `router-roles.ts`; the scores here are tuned to reproduce the paper's Table 1 separation (an EASY + * first-draft example barely separates the two solvers, a HARD loop-accepted one separates widely). + * + * Ported from agent-runtime `examples/agentic-data-creation/offline-fixtures.ts`. 
+ */ + +import { createChatClient, llmJudge } from '@tangle-network/agent-eval' +import type { JudgeConfig } from '@tangle-network/agent-eval/campaign' +import { inProcessSandboxClient, type SandboxClient } from '@tangle-network/agent-runtime/loops' +import type { SandboxEvent } from '@tangle-network/sandbox' +import type { DataExample, SolverArtifact } from './data-creation-loop' + +export const groundingDoc = `Idempotency in the Payments API + +Every write to POST /charges may carry an Idempotency-Key header. The server stores the first +response under that key for 24 hours. A retry with the SAME key and the SAME request body replays +the stored response instead of charging again. A retry with the SAME key but a DIFFERENT body is a +conflict: the server rejects it with 422 Unprocessable Entity and creates no second charge. +Idempotency keys are scoped per merchant account.` + +export const baseInstruction = (doc: string): string => + `You are writing ONE training example from the document below. 
Produce a context excerpt, a ` + + `question answerable from it, a reference answer, and a 2-3 item rubric.\n\nDOCUMENT:\n${doc}` + +const easyExample: DataExample = { + context: 'Every write to POST /charges may carry an Idempotency-Key header.', + question: 'Which HTTP header carries the idempotency key on a POST /charges write?', + reference: 'The request uses an Idempotency-Key header.', + rubric: ['Names the Idempotency-Key header', 'Ties it to a POST /charges write'], +} + +const hardExamples: DataExample[] = [ + { + context: + 'A retry with the same key but a different body is a conflict: the server rejects it with 422 ' + + 'Unprocessable Entity and creates no second charge.', + question: + 'Why must the server reject a same-key, different-body retry with 422 instead of replaying the ' + + 'stored response, and what failure does that prevent?', + reference: + 'Replaying the stored response would apply it to a different request; rejecting with 422 surfaces ' + + 'the mismatch and prevents a double or incorrect charge.', + rubric: [ + 'States the server rejects the retry with 422', + 'Explains replaying the stored response would be wrong for a different body', + 'Identifies the prevented failure: a double or incorrect charge', + ], + }, + { + context: + 'The server stores the first response under the idempotency key for 24 hours and replays it on a ' + + 'retry with the same key and the same body.', + question: + 'Why does replaying the stored response on a same-key, same-body retry matter, and what failure ' + + 'does it prevent when a client retries after a dropped connection?', + reference: + 'The original request may already have charged; replaying returns that one result so a network ' + + 'retry does not create a second charge.', + rubric: [ + 'Explains the first request may have already succeeded', + 'States the stored response is replayed instead of re-charging', + 'Identifies the prevented failure: a duplicate charge on retry', + ], + }, + { + 
context: 'Idempotency keys are scoped per merchant account.', + question: + 'Why are idempotency keys scoped per merchant account, and what would break if they were global ' + + 'across all merchants?', + reference: + 'Per-merchant scoping isolates key spaces; a global scope would let one merchant key collide ' + + 'with another and replay the wrong merchant charge.', + rubric: [ + 'States keys are isolated per merchant account', + 'Explains a global scope risks cross-merchant key collisions', + 'Identifies the failure: replaying the wrong merchant charge', + ], + }, +] + +const hardQuestionPattern = /\b(why|explain|under what|what happens if|reason)\b/i + +/** + * Scripted challenger: first draft (no "REJECTED" in the prompt) → the EASY example; once the refine + * driver folds a "too easy" reject into the prompt, it ships the next HARD example — proving the + * loop's behavior changed because of the fold. Stateful so successive targets get DISTINCT examples. + */ +export function challengerClient(): SandboxClient { + let hardServed = 0 + return inProcessSandboxClient({ + onPrompt: (prompt): SandboxEvent[] => { + const wantsHarder = /rejected|too easy/i.test(prompt) + const example = wantsHarder + ? (hardExamples[hardServed++ % hardExamples.length] ?? easyExample) + : easyExample + return [ + { + type: 'llm_call', + data: { model: 'offline-challenger', tokensIn: 320, tokensOut: 90, costUsd: 0.0006 }, + }, + { type: 'result', data: { result: example } }, + ] + }, + }) +} + +/** + * Scripted solver: answers the rendered example and tags the answer with a grade marker the offline + * judge reads. The weak solver produces a thin answer; the strong solver a complete one. + */ +export function solverClient(strength: 'weak' | 'strong'): SandboxClient { + return inProcessSandboxClient({ + onPrompt: (prompt): SandboxEvent[] => { + const hard = hardQuestionPattern.test(prompt) + const sample = Number(/\[sample (\d+)\]/.exec(prompt)?.[1] ?? 
'0') + const body = + strength === 'strong' + ? 'A complete, rubric-covering answer grounded in the context.' + : 'A short, partial answer.' + const answer = `${body} <>` + return [ + { + type: 'llm_call', + data: { + model: `offline-${strength}-solver`, + tokensIn: 140, + tokensOut: 30, + costUsd: 0.0003, + }, + }, + { type: 'result', data: { result: { answer } } }, + ] + }, + }) +} + +/** A REAL `llmJudge` over a MOCK transport: returns a scripted [0,1] score from the grade marker. */ +export function buildRubricJudge(): JudgeConfig { + const chat = createChatClient({ + transport: 'mock', + defaultModel: 'offline-judge', + handler: async (req) => { + const text = req.messages + .map((m) => (typeof m.content === 'string' ? m.content : '')) + .join('\n') + const m = /<>/.exec(text) + const strength = m?.[1] + const difficulty = m?.[2] + const sampleIndex = m?.[3] + if (!strength || !difficulty || sampleIndex === undefined) { + throw new Error('offline judge: answer carried no grade marker') + } + const base = + difficulty === 'hard' + ? strength === 'strong' + ? 0.77 + : 0.46 + : strength === 'strong' + ? 0.86 + : 0.84 + // Per-sample jitter over samples 0,1,2 → −0.02, 0, +0.02, so the N× mean lands back on `base`. + const jitter = (Number(sampleIndex) - 1) * 0.02 + const score = Math.min(1, Math.max(0, base + jitter)) + return { + content: JSON.stringify({ + dimensions: { rubric_coverage: score, correctness: score }, + notes: `offline: ${strength} solver on ${difficulty} example (sample ${sampleIndex})`, + }), + usage: { promptTokens: 130, completionTokens: 25, totalTokens: 155 }, + costUsd: 0.0001, + model: 'offline-judge', + durationMs: 1, + raw: {}, + } + }, + }) + + return llmJudge( + 'rubric-judge', + 'Score the candidate ANSWER against the example RUBRIC. 
Return JSON ' + + '{"dimensions":{"rubric_coverage":N,"correctness":N},"notes":"..."} with each score in [0,1].', + { + chat, + dimensions: [ + { + key: 'rubric_coverage', + description: 'fraction of the rubric criteria the answer satisfies', + }, + { key: 'correctness', description: 'agreement with the reference answer' }, + ], + scale: 'unit', + renderUser: ({ artifact }) => + `RUBRIC:\n${artifact.example.rubric.map((r, i) => `${i + 1}. ${r}`).join('\n')}\n\nANSWER:\n${artifact.answer}`, + }, + ) +} diff --git a/src/autodata/router-roles.ts b/src/autodata/router-roles.ts new file mode 100644 index 0000000..6ba7413 --- /dev/null +++ b/src/autodata/router-roles.ts @@ -0,0 +1,442 @@ +/** + * The REAL two-tier roles for the Autodata loop, over the Tangle router. + * + * One transport seam — `routerChat` — POSTs `/chat/completions` and returns content + exact token + * usage + a per-call USD cost (the router's own cost when it returns one, else a documented + * rate-table estimate over the exact token counts; the source is flagged, never silently faked). + * The four roles are materialized on top of it: + * • challenger (glm-5.2) → an `inProcessSandboxClient` that asks for ONE JSON example and parses it + * • weak solver (qwen-2.5-7b) / strong solver (qwen3-235b) → `inProcessSandboxClient` answer workers + * • judge (glm-5.2) → an `llmJudge` `JudgeConfig` whose transport is a `sandbox-sdk` ChatClient + * wrapping `routerChat`; the judge's own spend is recorded into the same `CostLedger` (the loop + * only aggregates challenger + solver spend, so the judge channel would otherwise be invisible). + * + * glm-5.2 returns empty content unless `max_tokens` is generous, so every glm call is floored and the + * judge is built with an explicit `maxTokens`. 
+ */ + +import { + type ChatCallOpts, + type ChatRequest, + type ChatResponse, + type CostLedger, + createChatClient, + llmJudge, +} from '@tangle-network/agent-eval' +import type { JudgeConfig } from '@tangle-network/agent-eval/campaign' +import { inProcessSandboxClient, type SandboxClient } from '@tangle-network/agent-runtime/loops' +import type { SandboxEvent } from '@tangle-network/sandbox' +import type { DataExample, SolverArtifact } from './data-creation-loop' + +export const DEFAULT_BASE_URL = 'https://router.tangle.tools/v1' + +// A genuine small-vs-large tier in one model family. The brief specified the Qwen tier +// (`qwen/qwen-2.5-7b-instruct` weak, `qwen/qwen3-235b-a22b` strong), but on the live Tangle router +// EVERY Qwen id 401s `No API key configured for model` for this key — the Qwen upstream is not +// provisioned (verified by probing `/v1/chat/completions` across the `/v1/models` catalog). The +// GLM family IS served, so the real tier here is the smallest GLM (`glm-4.5-air`) as the weak solver +// vs the latest (`glm-5.2`) as the strong solver. Same family, a real generational/size gap; swap +// these constants back to the Qwen ids once the router provisions that upstream. +export const WEAK_SOLVER_MODEL = 'glm-4.5-air' +export const STRONG_SOLVER_MODEL = 'glm-5.2' +export const CHALLENGER_MODEL = 'glm-5.2' +export const JUDGE_MODEL = 'glm-5.2' + +interface ModelPrice { + /** USD per 1M input tokens. */ + inputPerM: number + /** USD per 1M output tokens. */ + outputPerM: number +} + +/** + * Rate table for the $ estimate. The TOKEN COUNTS are exact (read from the router's `usage`); these + * rates are the documented basis for converting them to dollars WHEN the router returns no per-call + * cost. They are estimates, not invoices — `routerChat` flags every call's `costSource` so a report + * can say how many calls were router-priced vs rate-estimated. 
/**
 * Minimum `max_tokens` to request for a model.
 *
 * glm spends its budget on hidden reasoning and returns empty content unless max_tokens is high,
 * so any model id containing "glm" (case-insensitive) gets the larger floor.
 */
function maxTokensFloor(model: string): number {
  if (/glm/i.test(model)) {
    return 1500
  }
  return 512
}

/**
 * Read a per-call cost the router may return, across the field names proxies use.
 *
 * Checked in order: `_response_cost`, `cost`, `usage.cost`, `usage.total_cost`. Only a finite,
 * strictly positive number counts as reported.
 *
 * @returns the first reported cost, or `null` when the body carries none
 */
function routerReportedCost(body: Record<string, unknown>): number | null {
  const usage = (body.usage ?? {}) as Record<string, unknown>
  const reported = [body._response_cost, body.cost, usage.cost, usage.total_cost].find(
    (value): value is number => typeof value === 'number' && Number.isFinite(value) && value > 0,
  )
  return reported === undefined ? null : reported
}
/**
 * Estimate a call's USD cost from exact token counts and the `PRICE_TABLE` rates.
 *
 * @param model - model id; must have a `PRICE_TABLE` entry
 * @param promptTokens - input token count
 * @param completionTokens - output token count
 * @returns the estimated cost in USD
 * @throws Error when the model has no price-table entry
 */
function estimateCostUsd(model: string, promptTokens: number, completionTokens: number): number {
  const price = PRICE_TABLE[model]
  if (!price) {
    // Fail loud: a model we route to but cannot price would emit a 0 that masquerades as "free".
    throw new Error(`no price-table entry for model '${model}' — add it before routing live spend`)
  }
  return (promptTokens * price.inputPerM + completionTokens * price.outputPerM) / 1_000_000
}

/**
 * Parse a successful router response as a JSON object.
 *
 * A 2xx status does not guarantee a JSON body (a proxy can answer with HTML), so a parse failure
 * or a non-object payload is re-thrown tagged with the status and model.
 */
async function readRouterBody(
  res: Response,
  input: RouterChatInput,
): Promise<Record<string, unknown>> {
  let parsed: unknown
  try {
    parsed = await res.json()
  } catch (err) {
    // An abort while reading the body must surface as the abort, not as a parse failure.
    if (input.signal?.aborted) throw err
    const why = err instanceof Error ? err.message : String(err)
    throw new Error(`router ${res.status} for ${input.model}: body was not valid JSON (${why})`)
  }
  if (typeof parsed !== 'object' || parsed === null || Array.isArray(parsed)) {
    throw new Error(`router ${res.status} for ${input.model}: expected a JSON object body`)
  }
  return parsed as Record<string, unknown>
}

/**
 * One Tangle-router chat call. Fails loud on a non-2xx status or an unparseable body. Returns the
 * visible content, the exact prompt/completion token counts, and a USD cost (router-reported when
 * present, else rate-estimated over the real token counts) with its source flagged.
 *
 * @param input - credentials, model, messages and sampling options; `maxTokens` is raised to the
 *   model's floor when it is lower
 * @returns content (empty string when the router returned none), usage, cost and the raw body
 * @throws Error on a non-2xx status, a non-JSON body, or an unpriced model with no reported cost
 */
export async function routerChat(input: RouterChatInput): Promise<RouterChatResult> {
  // Strip every trailing slash so `https://host/v1//` still joins to a clean endpoint.
  const baseUrl = (input.baseUrl ?? DEFAULT_BASE_URL).replace(/\/+$/, '')
  const max_tokens = Math.max(input.maxTokens, maxTokensFloor(input.model))
  const res = await fetch(`${baseUrl}/chat/completions`, {
    method: 'POST',
    headers: { 'Content-Type': 'application/json', Authorization: `Bearer ${input.apiKey}` },
    signal: input.signal,
    body: JSON.stringify({
      model: input.model,
      messages: input.messages,
      max_tokens,
      temperature: input.temperature ?? 0.2,
      stream: false,
      ...(input.jsonMode ? { response_format: { type: 'json_object' } } : {}),
    }),
  })
  if (!res.ok) {
    const detail = await res.text().catch(() => res.statusText)
    throw new Error(`router ${res.status} for ${input.model}: ${detail.slice(0, 400)}`)
  }
  const body = await readRouterBody(res, input)
  const choice = (body.choices as { message?: { content?: string }; finish_reason?: string }[])?.[0]
  const usage = (body.usage ?? {}) as { prompt_tokens?: number; completion_tokens?: number }
  const promptTokens = usage.prompt_tokens ?? 0
  const completionTokens = usage.completion_tokens ?? 0
  const reported = routerReportedCost(body)
  const costUsd = reported ?? estimateCostUsd(input.model, promptTokens, completionTokens)
  const costSource: 'router' | 'estimated' = reported !== null ? 'router' : 'estimated'
  const finishReason = choice?.finish_reason ?? null
  input.onCall?.({
    model: input.model,
    promptTokens,
    completionTokens,
    costUsd,
    costSource,
    finishReason,
  })
  return {
    content: choice?.message?.content ?? '',
    promptTokens,
    completionTokens,
    costUsd,
    costSource,
    finishReason,
    raw: body,
  }
}

// ── Parsing the challenger's JSON example ─────────────────────────────────────────────────────

/** Scan `body` from its first `{` and return the first brace-balanced span, or `null`. */
function firstBalancedObject(body: string): string | null {
  const start = body.indexOf('{')
  if (start < 0) return null
  let depth = 0
  let inString = false
  let escaped = false
  for (let i = start; i < body.length; i++) {
    const ch = body[i]
    if (escaped) {
      escaped = false
      continue
    }
    if (ch === '\\') {
      escaped = true
      continue
    }
    if (ch === '"') inString = !inString
    // Braces inside a string literal do not affect nesting depth.
    if (inString) continue
    if (ch === '{') depth++
    else if (ch === '}') {
      depth--
      if (depth === 0) return body.slice(start, i + 1)
    }
  }
  return null
}

/**
 * Extract the first balanced top-level JSON object from a model response (handles ```json fences).
 *
 * The first fenced block is tried first; when it holds no balanced object (for example a fenced
 * note that precedes the JSON), the whole response is scanned instead.
 *
 * @returns the object's source text, or `null` when no balanced object is found
 */
function extractJsonObject(text: string): string | null {
  const fenced = /```(?:json)?\s*([\s\S]*?)```/i.exec(text)
  if (fenced) {
    const fromFence = firstBalancedObject(fenced[1] ?? '')
    if (fromFence !== null) return fromFence
  }
  return firstBalancedObject(text)
}
/**
 * Parse a challenger response into a `DataExample`, or throw loud (the loop refines on the error).
 *
 * @param text - raw challenger output; may wrap the JSON in prose or a code fence
 * @returns the example with every rubric entry coerced to a string
 * @throws Error when no JSON object is found, when the object does not parse, when a required
 *   field is missing or mistyped, or when the rubric is empty
 */
export function parseDataExample(text: string): DataExample {
  const json = extractJsonObject(text)
  if (!json) throw new Error('challenger response contained no JSON object')

  let parsed: Record<string, unknown>
  try {
    parsed = JSON.parse(json) as Record<string, unknown>
  } catch (err) {
    // A bare SyntaxError gives the refinement step no hint that the challenger's JSON is at fault.
    const why = err instanceof Error ? err.message : String(err)
    throw new Error(`challenger JSON did not parse: ${why}`)
  }

  const { context, question, reference, rubric } = parsed
  if (
    typeof context !== 'string' ||
    typeof question !== 'string' ||
    typeof reference !== 'string' ||
    !Array.isArray(rubric)
  ) {
    throw new Error('challenger JSON missing a required field (context/question/reference/rubric)')
  }
  // The judge scores rubric coverage as a fraction of criteria; with none there is nothing to score.
  if (rubric.length === 0) {
    throw new Error('challenger JSON rubric is empty — at least one scoring criterion is required')
  }
  return {
    context,
    question,
    reference,
    rubric: rubric.map((r) => String(r)),
  }
}
/** Build the `llm_call` usage event a worker emits for one router result. */
function llmCallEvent(model: string, r: RouterChatResult): SandboxEvent {
  return {
    type: 'llm_call',
    data: {
      model,
      tokensIn: r.promptTokens,
      tokensOut: r.completionTokens,
      costUsd: r.costUsd,
    },
  }
}

/**
 * An answer worker: forwards the prompt as a single user message to `model` and emits the usage
 * event followed by `{ answer }` as the result.
 */
function solverClient(cfg: RouterRolesConfig, model: string): SandboxClient {
  return inProcessSandboxClient({
    onPrompt: async (prompt, ctx): Promise<SandboxEvent[]> => {
      const r = await routerChat({
        apiKey: cfg.apiKey,
        baseUrl: cfg.baseUrl,
        model,
        messages: [{ role: 'user', content: prompt }],
        maxTokens: 1024,
        signal: ctx.signal,
        onCall: cfg.onCall,
      })
      return [llmCallEvent(model, r), { type: 'result', data: { result: { answer: r.content } } }]
    },
  })
}

/**
 * The challenger worker: asks for ONE JSON example under the challenger system prompt (JSON mode)
 * and emits the parsed `DataExample` as the result. A response that does not parse throws.
 */
function challengerClient(cfg: RouterRolesConfig): SandboxClient {
  const model = cfg.challengerModel ?? CHALLENGER_MODEL
  return inProcessSandboxClient({
    onPrompt: async (prompt, ctx): Promise<SandboxEvent[]> => {
      const r = await routerChat({
        apiKey: cfg.apiKey,
        baseUrl: cfg.baseUrl,
        model,
        messages: [
          { role: 'system', content: challengerSystem },
          { role: 'user', content: prompt },
        ],
        maxTokens: 1500,
        jsonMode: true,
        signal: ctx.signal,
        onCall: cfg.onCall,
      })
      const example = parseDataExample(r.content)
      return [llmCallEvent(model, r), { type: 'result', data: { result: example } }]
    },
  })
}

/**
 * The live judge: an `llmJudge` whose chat transport wraps `routerChat`. Each judge call is
 * recorded into `cfg.ledger` on the 'judge' channel, and the response reports the measured
 * wall-clock duration of the router call.
 *
 * NOTE(review): the generic argument on `JudgeConfig` is reconstructed from the `SolverArtifact`
 * import — confirm it matches the declared signature.
 */
function rubricJudge(cfg: RouterRolesConfig): JudgeConfig<SolverArtifact> {
  const judgeModel = cfg.judgeModel ?? JUDGE_MODEL
  const chat = createChatClient({
    transport: 'sandbox-sdk',
    defaultModel: judgeModel,
    chat: async (req: ChatRequest, opts?: ChatCallOpts): Promise<ChatResponse> => {
      const model = req.model ?? judgeModel
      // The router takes plain-string content: keep only the text parts of multi-part messages.
      const messages = req.messages.map((m) => ({
        role: m.role,
        content:
          typeof m.content === 'string'
            ? m.content
            : m.content.map((p) => (p.type === 'text' ? p.text : '')).join('\n'),
      }))
      const startedAt = Date.now()
      const r = await routerChat({
        apiKey: cfg.apiKey,
        baseUrl: cfg.baseUrl,
        model,
        messages,
        maxTokens: req.maxTokens ?? 1500,
        temperature: req.temperature,
        jsonMode: req.jsonMode,
        signal: opts?.signal,
        onCall: cfg.onCall,
      })
      const durationMs = Date.now() - startedAt
      cfg.ledger.record({
        model,
        channel: 'judge',
        usage: { inputTokens: r.promptTokens, outputTokens: r.completionTokens },
        actualCostUsd: r.costUsd,
        tags: { role: 'judge' },
      })
      return {
        content: r.content,
        usage: {
          promptTokens: r.promptTokens,
          completionTokens: r.completionTokens,
          totalTokens: r.promptTokens + r.completionTokens,
        },
        costUsd: r.costUsd,
        model,
        durationMs,
        finishReason: r.finishReason,
        contentEmpty: r.content.trim() === '',
        raw: r.raw,
      }
    },
  })

  return llmJudge('autodata-rubric-judge', judgeSystem, {
    chat,
    maxTokens: 1500,
    dimensions: [
      {
        key: 'rubric_coverage',
        description: 'fraction of the rubric criteria the answer satisfies',
      },
      { key: 'correctness', description: 'agreement with the reference answer' },
    ],
    scale: 'unit',
    renderUser: ({ artifact }) =>
      `REFERENCE ANSWER:\n${artifact.example.reference}\n\n` +
      `RUBRIC:\n${artifact.example.rubric.map((r, i) => `${i + 1}. ${r}`).join('\n')}\n\n` +
      `CANDIDATE ANSWER:\n${artifact.answer}`,
  })
}
/** Materialize all four live roles over the Tangle router. */
export function buildAutodataRoles(cfg: RouterRolesConfig): AutodataRoles {
  const weakModel = cfg.weakModel ?? WEAK_SOLVER_MODEL
  const strongModel = cfg.strongModel ?? STRONG_SOLVER_MODEL
  return {
    challenger: challengerClient(cfg),
    weakSolver: solverClient(cfg, weakModel),
    strongSolver: solverClient(cfg, strongModel),
    judge: rubricJudge(cfg),
  }
}

export interface SmokeResult {
  model: string
  ok: boolean
  contentChars: number
  finishReason: string | null
  costUsd: number
  costSource: 'router' | 'estimated'
}

/**
 * The cost gate: one cheap call per model, asserting non-empty content, BEFORE the loop burn.
 * Returns a row per distinct model so the caller can fail loud if any tier is dead.
 *
 * Model ids are de-duplicated (first occurrence wins) so a model that fills several roles is
 * probed — and billed — once. The default set covers the challenger, both solvers and the judge.
 *
 * @param cfg - credentials, an optional explicit model list, and an optional abort signal
 * @returns one row per distinct model, in first-seen order
 * @throws whatever `routerChat` throws (for example a non-2xx status for a model)
 */
export async function smokeTestModels(cfg: {
  apiKey: string
  baseUrl?: string
  models?: string[]
  signal?: AbortSignal
}): Promise<SmokeResult[]> {
  const requested = cfg.models ?? [
    CHALLENGER_MODEL,
    WEAK_SOLVER_MODEL,
    STRONG_SOLVER_MODEL,
    JUDGE_MODEL,
  ]
  // A Set keeps insertion order, so rows still follow the caller's ordering.
  const models = [...new Set(requested)]
  const rows: SmokeResult[] = []
  // Sequential on purpose: the gate is a handful of calls and stops at the first hard failure.
  for (const model of models) {
    const r = await routerChat({
      apiKey: cfg.apiKey,
      baseUrl: cfg.baseUrl,
      model,
      messages: [{ role: 'user', content: 'Reply with the single word: ready.' }],
      maxTokens: 32,
      signal: cfg.signal,
    })
    const contentChars = r.content.trim().length
    rows.push({
      model,
      ok: contentChars > 0,
      contentChars,
      finishReason: r.finishReason,
      costUsd: r.costUsd,
      costSource: r.costSource,
    })
  }
  return rows
}
/**
 * Read a positive-integer knob from the environment.
 *
 * An unset or empty variable yields `fallback`. The whole value must be a decimal integer:
 * `parseInt` alone would accept a numeric prefix, turning typos such as '3abc', '2.9' or '1e3'
 * into silently wrong settings.
 *
 * @param name - environment variable name
 * @param fallback - value used when the variable is unset or empty
 * @returns the parsed positive integer
 * @throws Error when the value is set but is not a positive integer
 */
function envInt(name: string, fallback: number): number {
  const raw = process.env[name]
  if (!raw) return fallback
  const trimmed = raw.trim()
  const n = /^\+?\d+$/.test(trimmed) ? Number(trimmed) : Number.NaN
  if (!Number.isSafeInteger(n) || n <= 0) {
    throw new Error(`${name}='${raw}' is not a positive integer`)
  }
  return n
}

/**
 * Format a possibly-missing number for the report.
 *
 * @param x - the value, or `null` when there is no data
 * @param digits - fixed decimal places (default 3)
 * @returns the fixed-point string, or 'n/a' for `null`
 */
function fmt(x: number | null, digits = 3): string {
  if (x === null) return 'n/a'
  return x.toFixed(digits)
}
/**
 * The live run: cost-gate the models, ground on a real document, run the data-creation loop, then
 * print the accepted examples, the gap calibration, the cost split and the dataset path.
 *
 * Environment values are read with `||` so an empty variable (e.g. `TANGLE_API_KEY=` left in an
 * env template) counts as unset and falls through to the alternate key or the default — the same
 * convention `envInt` applies to the numeric knobs.
 *
 * @throws Error when no API key is set or when any gated model returns empty content
 */
async function main(): Promise<void> {
  const apiKey = process.env.TANGLE_API_KEY || process.env.TANGLE_ROUTER_KEY
  if (!apiKey) throw new Error('no TANGLE_API_KEY in env — run under dotenvx so the key is set')

  const url = process.env.AUTODATA_URL || DEFAULT_SOURCE_URL
  const focus = process.env.AUTODATA_FOCUS || 'attention'
  const target = envInt('AUTODATA_TARGET', 3)
  const samples = envInt('AUTODATA_SAMPLES', 3)
  const maxRetries = envInt('AUTODATA_MAXRETRIES', 4)
  const outPath = process.env.AUTODATA_OUT || 'data/autodata-dataset.jsonl'

  // ── 1. COST GATE: one cheap call per model, all must return non-empty content before the burn ──
  console.log('Autodata · cost gate (one call per model)\n')
  const smoke = await smokeTestModels({
    apiKey,
    models: [CHALLENGER_MODEL, WEAK_SOLVER_MODEL, STRONG_SOLVER_MODEL],
  })
  for (const s of smoke) {
    console.log(
      ` ${s.ok ? 'ok ' : 'DEAD'} ${s.model.padEnd(28)} chars=${String(s.contentChars).padStart(4)} ` +
        `finish=${s.finishReason ?? '?'} cost=$${s.costUsd.toFixed(5)} (${s.costSource})`,
    )
  }
  const dead = smoke.filter((s) => !s.ok)
  if (dead.length > 0) {
    throw new Error(`cost gate failed — empty content from: ${dead.map((d) => d.model).join(', ')}`)
  }

  // ── 2. Ground on a REAL document ──
  const grounded = await groundDoc({ url, focus })
  console.log(
    `\nGrounded on ${grounded.url}\n section='${grounded.headingPath}' chunk=${grounded.chunkIndex}/${grounded.totalChunks} ` +
      `(${grounded.doc.length} chars, updated ${grounded.sourceUpdatedAt})`,
  )
  console.log(` excerpt: ${grounded.doc.slice(0, 200).replace(/\s+/g, ' ')}...`)

  // ── 3. Run the loop with real two-tier solvers ──
  console.log(
    `\nManufacturing up to ${target} discriminating example(s) · samples=${samples} maxRetries=${maxRetries}\n` +
      ` challenger/judge=${CHALLENGER_MODEL} weak=${WEAK_SOLVER_MODEL} strong=${STRONG_SOLVER_MODEL}`,
  )
  const result = await buildAutodataDataset({
    apiKey,
    source: grounded,
    outPath,
    target,
    samples,
    maxRetries,
  })

  // ── 4. The accepted set ──
  console.log(`\n— Accepted examples (${result.accepted.length}/${target}) —`)
  for (const [i, ex] of result.accepted.entries()) {
    console.log(`\n [${i}] Q: ${ex.example.question}`)
    console.log(
      ` weak=${ex.weakScore.toFixed(2)} strong=${ex.strongScore.toFixed(2)} gap=${ex.gap.toFixed(2)}`,
    )
    console.log(` ${ex.decision.reason}`)
  }

  // ── 5. The empirical calibration (paper Table 1) ──
  console.log('\n— Calibration: plain first-draft gap vs agentic loop-accepted gap —')
  console.log(
    ` plain (first-draft questions, n=${result.plainGaps.length}) mean gap = ${fmt(result.plainGapMean)}`,
  )
  console.log(
    ` agentic (loop-accepted questions, n=${result.agenticGaps.length}) mean gap = ${fmt(result.agenticGapMean)}`,
  )
  console.log(
    ` refined (best gap reached per slot, n=${result.refinedGaps.length}) mean gap = ${fmt(result.refinedGapMean)}`,
  )
  // The honest comparison: plain first-draft gap vs the best the refinement reached. Acceptance is
  // strict (gap >= 0.20); refined-vs-plain shows whether the fold widened the gap at all.
  if (result.plainGapMean !== null && result.refinedGapMean !== null) {
    const delta = result.refinedGapMean - result.plainGapMean
    console.log(
      ` Δ (refined − plain) = ${delta >= 0 ? '+' : ''}${delta.toFixed(3)} ` +
        (delta >= 0.1
          ? '→ the loop WIDENS the strong/weak gap (empirical Table-1 direction)'
          : '→ NO meaningful widening on these real models (honest null)'),
    )
  } else {
    console.log(' (insufficient data to compare — see accepted count)')
  }
  if (result.accepted.length === 0) {
    console.log(
      ' NOTE: 0 examples cleared the discriminative accept bar — the two GLM tiers did not separate.',
    )
  }

  // ── 6. Cost split by role ──
  const summary = result.cost.summary()
  console.log('\n— Cost (CostLedger, by role) —')
  console.log(
    ` total: $${summary.totalCostUsd.toFixed(4)} over ${summary.totalCalls} recorded loops/calls` +
      (summary.fullyPriced
        ? ' (fully priced)'
        : ` (unpriced models: ${summary.unpricedModels.join(', ')})`),
  )
  for (const ch of summary.byChannel) {
    console.log(` ${ch.channel.padEnd(14)} $${ch.costUsd.toFixed(4)} (${ch.calls} loops/calls)`)
  }
  if (result.costPerExampleUsd !== null) {
    console.log(` cost per accepted example: $${result.costPerExampleUsd.toFixed(4)}`)
  }
  console.log(
    ` call provenance: ${result.callProvenance.router} router-priced, ${result.callProvenance.estimated} rate-estimated`,
  )

  console.log(`\n— Dataset — ${result.rows.length} row(s) written to ${result.outPath}`)
}

// Any failure is printed in full and reported through a non-zero exit status.
main().catch((err) => {
  console.error(err)
  process.exit(1)
})