Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
223 changes: 223 additions & 0 deletions Benchmarks/CacheAccessPersistence.swift
Original file line number Diff line number Diff line change
@@ -0,0 +1,223 @@
// Spike (not part of the package build): persist per-entry cache access time — file `mtime` touch
// (option 1) vs an SQLite index. Run: swift Benchmarks/CacheAccessPersistence.swift [entryCount accessCount]
//
// Both keep the timestamp where the sweep can read it across launches with no separate manifest:
// - mtime: re-`setAttributes` the file's modification date on a hit; sweep `stat`s the directory.
// - sqlite: UPSERT a row keyed by cache key; sweep is an indexed SELECT.
// We measure the hot path (recordAccess) and the bulk sweep, the two operations the cache actually runs.

import Foundation
import SQLite3

let arguments = CommandLine.arguments

/// Reads the positional argument at `index` as a positive count.
///
/// - Parameters:
///   - arguments: The full argument vector (element 0 is the program name).
///   - index: Position of the argument to read.
///   - name: Human-readable name used in the error message.
///   - fallback: Value returned when the argument is absent.
/// - Returns: The parsed count, or `fallback` when no argument was supplied at `index`.
///
/// A value that is present but not a positive integer ends the process with a usage message
/// (status 64) rather than trapping on a force-unwrap. Zero and negative values are rejected
/// because the benchmark later evaluates `k % entryCount` and `0..<entryCount`, both of which trap.
func countArgument(in arguments: [String], at index: Int, name: String, default fallback: Int) -> Int {
    guard arguments.count > index else { return fallback }
    guard let value = Int(arguments[index]), value > 0 else {
        let message = "error: \(name) must be a positive integer, got '\(arguments[index])'\n"
            + "usage: swift Benchmarks/CacheAccessPersistence.swift [entryCount accessCount]\n"
        FileHandle.standardError.write(Data(message.utf8))
        exit(64)
    }
    return value
}

let entryCount = countArgument(in: arguments, at: 1, name: "entryCount", default: 10_000)
let accessCount = countArgument(in: arguments, at: 2, name: "accessCount", default: 100_000)
// The C macro SQLITE_TRANSIENT is `(sqlite3_destructor_type)-1`, which Swift cannot import, so it is
// rebuilt here. Passing it to sqlite3_bind_text asks SQLite to make its own copy of the bound bytes.
let SQLITE_TRANSIENT = unsafeBitCast(-1, to: sqlite3_destructor_type.self)

/// Runs `body` once and returns how long it took, in seconds, as measured by `ContinuousClock`.
func seconds(_ body: () -> Void) -> Double {
    let duration = ContinuousClock().measure { body() }
    let parts = duration.components
    // `components` splits the duration into whole seconds plus attoseconds (1 s = 1e18 as).
    let fraction = Double(parts.attoseconds) / 1e18
    return Double(parts.seconds) + fraction
}

/// Right-aligns `string` in a column `width` characters wide by left-padding with spaces.
/// A string that is already `width` characters or longer is returned unchanged (never truncated).
/// Done by hand to avoid `%s`/`%d`, which corrupt `String(format:)` varargs.
func col(_ string: String, _ width: Int) -> String {
    let length = string.count
    guard length < width else { return string }
    let padding = String(repeating: " ", count: width - length)
    return padding + string
}

/// Counts the entries directly inside `directory` whose content-modification date is earlier than `cutoff`.
///
/// Best-effort: a directory that cannot be listed counts as 0, and an entry whose modification
/// date cannot be read is skipped.
func countExpired(in directory: URL, cutoff: Date) -> Int {
    let wantedKeys: Set<URLResourceKey> = [.contentModificationDateKey]
    guard let entries = try? FileManager.default.contentsOfDirectory(
        at: directory,
        includingPropertiesForKeys: Array(wantedKeys)
    ) else {
        return 0
    }

    var expiredCount = 0
    for entry in entries {
        guard let modified = try? entry.resourceValues(forKeys: wantedKeys).contentModificationDate else {
            continue
        }
        if modified < cutoff {
            expiredCount += 1
        }
    }
    return expiredCount
}

/// Prints one result line for a benchmark: its name, nanoseconds per operation, operations per
/// second, and the total elapsed seconds.
///
/// - Parameters:
///   - name: Label for the benchmark; padded or truncated to 38 characters.
///   - total: Total elapsed time in seconds.
///   - ops: Number of operations performed in `total`.
func row(_ name: String, _ total: Double, ops: Int) {
    let latencyColumn = col(String(format: "%.1f", total / Double(ops) * 1e9), 9)
    let throughputColumn = col(String(format: "%.0f", Double(ops) / total), 12)
    let elapsedText = String(format: "%.3f", total)
    let paddedName = name.padding(toLength: 38, withPad: " ", startingAt: 0)
    print(" " + paddedName + latencyColumn + " ns/op " + throughputColumn + " ops/sec (" + elapsedText + "s)")
}

print("entries: \(entryCount) accesses: \(accessCount)\n")

// MARK: - Option 1: file mtime

let fileManager = FileManager.default
let mtimeDir = fileManager.temporaryDirectory.appendingPathComponent("bench-mtime-\(UUID().uuidString)")
try! fileManager.createDirectory(at: mtimeDir, withIntermediateDirectories: true)

// Seed one tiny file per cache entry and remember its path for the access loops below.
var paths: [String] = []
paths.reserveCapacity(entryCount)
for index in 0..<entryCount {
    let url = mtimeDir.appendingPathComponent("entry-\(index)")
    // createFile reports failure only through its Bool result. A missing file would make the
    // later stat/touch calls fail silently (they use `try?`) and skew the timings, so stop here.
    guard fileManager.createFile(atPath: url.path, contents: Data("x".utf8)) else {
        fatalError("could not create benchmark file at \(url.path)")
    }
    paths.append(url.path)
}
// Half the entries start "old" so the sweep actually collects expired ones.
let oldDate = Date(timeIntervalSinceNow: -10 * 24 * 60 * 60)
for index in stride(from: 0, to: entryCount, by: 2) {
    try! fileManager.setAttributes([.modificationDate: oldDate], ofItemAtPath: paths[index])
}

print("Option 1 — file mtime")
// Sweep first, while half the entries are still seeded "old", so it actually collects expired ones.
// The cutoff (7 days) sits between the seeded "old" date (10 days ago) and the freshly created files.
let mtimeCutoff = Date(timeIntervalSinceNow: -7 * 24 * 60 * 60)
var mtimeExpired = 0
let mtimeSweep = seconds {
    // Same directory scan + per-file mtime comparison the sweep-scaling section times.
    mtimeExpired = countExpired(in: mtimeDir, cutoff: mtimeCutoff)
}
row("sweep (scan + stat \(entryCount))", mtimeSweep, ops: entryCount)
print(" swept \(mtimeExpired) expired")

// Debounce fast path: a read only `stat`s the mtime and decides not to re-touch (the common case).
let mtimeStat = seconds {
    for k in 0..<accessCount {
        // A fresh URL per access, cycling through the entries.
        _ = try? URL(fileURLWithPath: paths[k % entryCount]).resourceValues(forKeys: [.contentModificationDateKey]).contentModificationDate
    }
}
row("recordAccess (debounced: stat only)", mtimeStat, ops: accessCount)

// Worst case: touch the mtime on every access (no debounce).
let mtimeRecord = seconds {
    for k in 0..<accessCount {
        try? fileManager.setAttributes([.modificationDate: Date()], ofItemAtPath: paths[k % entryCount])
    }
}
row("recordAccess (touch mtime, no debounce)", mtimeRecord, ops: accessCount)
print()

// MARK: - Option 2: SQLite index

/// Traps with SQLite's own error message unless `code` is `SQLITE_OK`.
/// Used for setup calls only (open / exec / prepare): if any of them fails, the statement handles
/// stay nil and the timed loops below would measure nothing but error returns.
func sqliteCheck(_ code: Int32, on connection: OpaquePointer?, _ action: String) {
    guard code == SQLITE_OK else {
        fatalError("SQLite \(action) failed (\(code)): \(String(cString: sqlite3_errmsg(connection)))")
    }
}

/// Executes `sql` on `connection` and traps if SQLite reports an error.
func sqliteRun(_ sql: String, on connection: OpaquePointer?) {
    sqliteCheck(sqlite3_exec(connection, sql, nil, nil, nil), on: connection, "exec '\(sql)'")
}

let sqliteURL = fileManager.temporaryDirectory.appendingPathComponent("bench-\(UUID().uuidString).sqlite")
var database: OpaquePointer?
sqliteCheck(sqlite3_open(sqliteURL.path, &database), on: database, "open")
sqliteRun("PRAGMA journal_mode=WAL;", on: database)
sqliteRun("PRAGMA synchronous=NORMAL;", on: database)
sqliteRun("CREATE TABLE access(key TEXT PRIMARY KEY, ts REAL);", on: database)
sqliteRun("CREATE INDEX idx_ts ON access(ts);", on: database)

// Seed one row per entry inside a single transaction; even indices get the "old" timestamp so
// half the rows fall before the sweep cutoff, mirroring the mtime seeding.
sqliteRun("BEGIN;", on: database)
var seed: OpaquePointer?
sqliteCheck(sqlite3_prepare_v2(database, "INSERT INTO access(key, ts) VALUES (?, ?);", -1, &seed, nil), on: database, "prepare seed INSERT")
for index in 0..<entryCount {
    let ts = index % 2 == 0 ? oldDate.timeIntervalSince1970 : Date().timeIntervalSince1970
    sqlite3_bind_text(seed, 1, "entry-\(index)", -1, SQLITE_TRANSIENT)
    sqlite3_bind_double(seed, 2, ts)
    guard sqlite3_step(seed) == SQLITE_DONE else {
        fatalError("SQLite seed INSERT failed: \(String(cString: sqlite3_errmsg(database)))")
    }
    sqlite3_reset(seed)
}
sqlite3_finalize(seed)
sqliteRun("COMMIT;", on: database)

let upsertSQL = "INSERT INTO access(key, ts) VALUES (?, ?) ON CONFLICT(key) DO UPDATE SET ts = excluded.ts;"

print("Option 2 — SQLite index (WAL, synchronous=NORMAL)")
// Sweep first, while half the rows are still seeded "old".
var select: OpaquePointer?
sqliteCheck(sqlite3_prepare_v2(database, "SELECT key FROM access WHERE ts < ?;", -1, &select, nil), on: database, "prepare sweep SELECT")
var sqliteExpired = 0
let sqliteSweep = seconds {
    sqlite3_bind_double(select, 1, mtimeCutoff.timeIntervalSince1970)
    while sqlite3_step(select) == SQLITE_ROW { sqliteExpired += 1 }
    sqlite3_reset(select)
}
sqlite3_finalize(select)
row("sweep (indexed SELECT \(entryCount))", sqliteSweep, ops: entryCount)
print(" swept \(sqliteExpired) expired")

// Hot path, worst case: every UPSERT runs in its own implicit transaction.
var upsert: OpaquePointer?
sqliteCheck(sqlite3_prepare_v2(database, upsertSQL, -1, &upsert, nil), on: database, "prepare UPSERT")
let sqliteRecordAuto = seconds {
    for k in 0..<accessCount {
        sqlite3_bind_text(upsert, 1, "entry-\(k % entryCount)", -1, SQLITE_TRANSIENT)
        sqlite3_bind_double(upsert, 2, Date().timeIntervalSince1970)
        sqlite3_step(upsert)
        sqlite3_reset(upsert)
    }
}
row("recordAccess (UPSERT, autocommit)", sqliteRecordAuto, ops: accessCount)

// Hot path, best case: the same UPSERTs wrapped in one explicit transaction.
let sqliteRecordBatched = seconds {
    sqliteRun("BEGIN;", on: database)
    for k in 0..<accessCount {
        sqlite3_bind_text(upsert, 1, "entry-\(k % entryCount)", -1, SQLITE_TRANSIENT)
        sqlite3_bind_double(upsert, 2, Date().timeIntervalSince1970)
        sqlite3_step(upsert)
        sqlite3_reset(upsert)
    }
    sqliteRun("COMMIT;", on: database)
}
row("recordAccess (UPSERT, 1 transaction)", sqliteRecordBatched, ops: accessCount)
sqlite3_finalize(upsert)
print()

// Every prepared statement has been finalized above, so the close is expected to succeed.
sqliteCheck(sqlite3_close(database), on: database, "close")

// Footprint of the SQLite store (plus a -wal/-shm sidecar in use mid-session).
let sqliteBytes = ((try? fileManager.attributesOfItem(atPath: sqliteURL.path))?[.size] as? Int) ?? 0
print("SQLite db file: \(sqliteBytes / 1024) KB for \(entryCount) entries (+ a -wal/-shm sidecar). mtime adds 0 bytes.")

// MARK: - Sweep scaling: the sweep is the O(n) operation, so grow N to find where mtime spikes.

let shardCount = 16
print("\nSweep scaling — find expired (half the entries). 'sharded' = scan 1 of \(shardCount) shards (one launch's work):\n")
print(" \(col("entries", 10)) \(col("full mtime", 13)) \(col("sharded 1/\(shardCount)", 14)) \(col("sqlite", 10))")
for n in [10_000, 50_000, 100_000, 200_000] {
    // mtime, sharded layout: n files spread across `shardCount` subdirectories, half seeded old.
    let root = fileManager.temporaryDirectory.appendingPathComponent("sweep-\(n)-\(UUID().uuidString)")
    var shards: [URL] = []
    for shard in 0..<shardCount {
        let dir = root.appendingPathComponent("shard-\(shard)")
        try! fileManager.createDirectory(at: dir, withIntermediateDirectories: true)
        shards.append(dir)
    }
    for index in 0..<n {
        let url = shards[index % shardCount].appendingPathComponent("e-\(index)")
        // A file that failed to create would silently shrink the scan, so stop instead.
        guard fileManager.createFile(atPath: url.path, contents: nil) else {
            fatalError("could not create benchmark file at \(url.path)")
        }
        if index % 2 == 0 { try! fileManager.setAttributes([.modificationDate: oldDate], ofItemAtPath: url.path) }
    }
    // Full sweep scans every shard; the sharded per-launch sweep scans just one.
    // The full-sweep count is kept so it can be cross-checked against SQLite below.
    var mtimeFound = 0
    let fullScan = seconds { for shard in shards { mtimeFound += countExpired(in: shard, cutoff: mtimeCutoff) } }
    let shardedScan = seconds { _ = countExpired(in: shards[0], cutoff: mtimeCutoff) }
    try? fileManager.removeItem(at: root)

    // sqlite: n rows, half old, indexed on ts.
    let dbURL = fileManager.temporaryDirectory.appendingPathComponent("sweep-\(n)-\(UUID().uuidString).sqlite")
    var db: OpaquePointer?
    // Setup calls are checked: a failed open/prepare would leave nil handles and a meaningless timing.
    func check(_ code: Int32, _ action: String) {
        guard code == SQLITE_OK else {
            fatalError("SQLite \(action) failed (\(code)): \(String(cString: sqlite3_errmsg(db)))")
        }
    }
    check(sqlite3_open(dbURL.path, &db), "open")
    check(sqlite3_exec(db, "CREATE TABLE access(key TEXT PRIMARY KEY, ts REAL); CREATE INDEX idx_ts ON access(ts);", nil, nil, nil), "create schema")
    check(sqlite3_exec(db, "BEGIN;", nil, nil, nil), "BEGIN")
    var ins: OpaquePointer?
    check(sqlite3_prepare_v2(db, "INSERT INTO access(key, ts) VALUES (?, ?);", -1, &ins, nil), "prepare INSERT")
    for index in 0..<n {
        sqlite3_bind_text(ins, 1, "e-\(index)", -1, SQLITE_TRANSIENT)
        sqlite3_bind_double(ins, 2, index % 2 == 0 ? oldDate.timeIntervalSince1970 : Date().timeIntervalSince1970)
        sqlite3_step(ins); sqlite3_reset(ins)
    }
    sqlite3_finalize(ins)
    check(sqlite3_exec(db, "COMMIT;", nil, nil, nil), "COMMIT")
    var sel: OpaquePointer?
    check(sqlite3_prepare_v2(db, "SELECT key FROM access WHERE ts < ?;", -1, &sel, nil), "prepare SELECT")
    var sqliteFound = 0
    let sqliteScan = seconds {
        sqlite3_bind_double(sel, 1, mtimeCutoff.timeIntervalSince1970)
        while sqlite3_step(sel) == SQLITE_ROW { sqliteFound += 1 }
        sqlite3_reset(sel)
    }
    sqlite3_finalize(sel)
    sqlite3_close(db)
    try? fileManager.removeItem(at: dbURL)

    // Both sweeps are seeded identically, so differing counts mean one of them did not do the
    // work being timed. Reported on stderr so the results table on stdout is unchanged.
    if mtimeFound != sqliteFound {
        let warning = "warning: n=\(n): mtime sweep found \(mtimeFound) expired, sqlite found \(sqliteFound)\n"
        FileHandle.standardError.write(Data(warning.utf8))
    }

    print(" \(col(String(n), 10)) \(col(String(format: "%.1f", fullScan * 1000), 10)) ms \(col(String(format: "%.1f", shardedScan * 1000), 11)) ms \(col(String(format: "%.1f", sqliteScan * 1000), 7)) ms")
}

// Remove the artifacts left by the Option 1 and Option 2 sections.
try? fileManager.removeItem(at: mtimeDir)
try? fileManager.removeItem(at: sqliteURL)
82 changes: 82 additions & 0 deletions docs/benchmarks/cache-access-persistence.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,82 @@
# Benchmark: persisting cache access time — file `mtime` vs SQLite

A spike to decide how the disk cache should persist per-entry *last access* (for the sliding TTL /
warm-cold expiry) across launches, **without a separate manifest** that can drift from the files.

Two co-located approaches were compared:

- **Option 1 — file `mtime`.** Re-`setAttributes` the file's modification date on a hit; the sweep
`stat`s the cache directory. The timestamp lives *on* the file — nothing separate to sync. Also measured
**sharded**: split the directory into K subdirectories and sweep one per launch (O(N/K) per launch).
- **Option 2 — SQLite index.** A row per entry (`key TEXT PRIMARY KEY, ts REAL`, indexed on `ts`);
`UPSERT` on a hit, an indexed `SELECT` for the sweep.

Reproduce: `swift Benchmarks/CacheAccessPersistence.swift [entryCount accessCount]` (defaults 10k / 100k).
SQLite uses WAL + `synchronous=NORMAL` (the realistic durable-but-fast config). Numbers below are from
one run on macOS/APFS (Apple silicon) — indicative; absolute values differ on device, but the relative
picture is what matters.

## Results (10,000 entries, 100,000 accesses)

| Operation | Option 1 — `mtime` | Option 2 — SQLite |
|---|--:|--:|
| **recordAccess** (hot path, per read) | 25 µs touch · **15 µs** debounced (stat only) | **34 µs** autocommit · 2 µs batched (1 txn) |
| **sweep** (once per launch, 10k entries) | **~39 ms** | **~1 ms** |
| **footprint** | **0 bytes** extra | ~650 KB db + `-wal`/`-shm` sidecar |
| **moving parts** | none (timestamp on the file) | schema, C-API plumbing, a second store that can drift |

### Sweep scaling — where `mtime` spikes, and how sharding fixes it

The sweep is the O(n) operation (it must `stat` every file), so it's the one that blows up as the cache
grows. **Sharding** the cache directory into K subdirectories and sweeping just one shard per launch makes
each launch O(N/K); SQLite instead answers with an indexed range query:

| entries | full `mtime` | **sharded 1/16** | SQLite |
|--:|--:|--:|--:|
| 10,000 | 40 ms | 3 ms | 1 ms |
| 50,000 | 198 ms | 13 ms | 5 ms |
| 100,000 | 412 ms | 24 ms | 12 ms |
| 200,000 | **864 ms** | **53 ms** | 33 ms |

The full `mtime` scan spikes to ~0.9 s at 200k. Sharded (scan 1 of 16 per launch) cuts that ~16× to
**53 ms — the same ballpark as SQLite's 33 ms**, with zero dependency. The per-op hot-path cost barely
moves with N, so the sweep is the only place scale hurts, and sharding resolves it.

## Reading it

- **Hot path (the one that matters — runs on every cache read):** `mtime` touch (~25 µs) and SQLite
autocommit (~34 µs) are **comparable** — both are bound by per-op durability, not the data structure.
Debouncing the `mtime` touch (only re-touch when it's already stale) drops the common read to a bare
`stat` (~15 µs here, and that's inflated by `URL.resourceValues`; a raw `stat` is cheaper). SQLite only
wins the hot path if you **batch** writes in a transaction (~2 µs) — but batching means buffering
access updates and flushing later, which adds a crash-loss window and a flush policy. For the cache's
bookkeeping that complexity isn't worth it.
- **Sweep:** the `mtime` scan grows with total entries — fine at 10k (~40 ms) but a **0.4–0.9 s spike at
100k–200k**, since it must `stat` every file. Two ways to tame it: **shard** (scan 1 of K shard
directories per launch → O(N/K)), or **SQLite** (indexed `SELECT`). Sharding at K=16 keeps the
per-launch sweep at **~53 ms even at 200k** — within ~1.5× of SQLite — while staying zero-dependency.
Its cost is *bounded GC latency*: a never-requested dead file lingers up to K launches before its
shard's turn (anything you actually request is expired immediately by the lazy-on-read check, so this
only delays garbage collection, never serves stale data), plus a one-integer "next shard" counter (not a
per-entry index — nothing to drift) and a shard prefix in the key→path mapping.
- **Footprint & complexity:** `mtime` adds zero storage and zero moving parts; the timestamp can't drift
from the file. SQLite adds a ~650 KB DB (+ WAL sidecars), the C-API plumbing, and — crucially — a
*second store to keep in sync with the files*, which is exactly the manifest problem we set out to avoid.

## Recommendation

**Option 1 — `mtime`, debounced, sharded from the start.** `mtime` ties SQLite on the hot path, is
zero-dependency, zero-footprint, and self-syncing (no second store to drift). Shard the cache directory
**unconditionally** (derive the shard from the low bits of the key hash that `destinationURL` already
computes for the 255-char fix) and sweep one shard per launch — the per-launch sweep stays in the tens of
ms at any size, so the spike never happens.

Shard from the start, not on-the-fly: an always-sharded layout is *one* code path with no migration, no
entry-count tracking, and no threshold to tune, whereas switching layouts at runtime needs an O(N) reorg
and two code paths. The small-cache cost is negligible (a few mostly-empty subdirectories; dead files GC
over ~K launches, while lazy-on-read expires anything you actually request immediately). Git's
`objects/` directory and browser caches use the same always-sharded layout from the first entry.

**SQLite** would only win if you wanted the absolute-fastest sweep at very large N *and* preferred SQL
ergonomics; for a blob cache that rarely pays for its complexity (a ~650 KB store that can drift from the
files). Order of preference: `mtime` debounced + sharded → SQLite only as a last resort.