From b0cf1414a4318f239881765b30ee631128b63365 Mon Sep 17 00:00:00 2001 From: Jan Buchar Date: Thu, 18 Jun 2026 16:03:59 +0200 Subject: [PATCH 1/3] refactor!: Rewrite the FilesystemStorageClient to use apify/crawlee-storage --- docs/upgrading/upgrading_v4.md | 17 +- packages/fs-storage/package.json | 7 +- .../src/background-handler/fs-utils.ts | 89 --- .../src/background-handler/index.ts | 51 -- packages/fs-storage/src/cache-helpers.ts | 414 ----------- packages/fs-storage/src/consts.ts | 4 - .../fs-storage/src/file-system-storage.ts | 365 +++------ packages/fs-storage/src/fs/common.ts | 5 - packages/fs-storage/src/fs/dataset/fs.ts | 43 -- packages/fs-storage/src/fs/dataset/index.ts | 16 - .../fs-storage/src/fs/key-value-store/fs.ts | 100 --- .../src/fs/key-value-store/index.ts | 17 - .../fs-storage/src/fs/request-queue/fs.ts | 88 --- .../fs-storage/src/fs/request-queue/index.ts | 10 - .../resource-clients/common/base-client.ts | 7 - .../src/resource-clients/dataset.ts | 230 ++---- .../src/resource-clients/key-value-store.ts | 380 ++++++---- .../src/resource-clients/request-queue.ts | 702 +++--------------- packages/fs-storage/src/utils.ts | 49 -- packages/fs-storage/test/__shared__.ts | 12 - packages/fs-storage/test/fs-fallback.test.ts | 46 +- .../test/key-value-store/special-keys.test.ts | 49 ++ .../key-value-store/with-extension.test.ts | 80 -- .../test/no-crash-on-big-buffers.test.ts | 39 +- .../test/request-queue/adapter.test.ts | 146 ++++ .../dangling-forefront-on-reload.test.ts | 86 --- .../test/request-queue/forefront.test.ts | 266 ------- .../handledRequestCount-should-update.test.ts | 51 -- .../ignore-non-json-files.test.ts | 69 -- .../request-queue/reload-persistence.test.ts | 43 ++ .../fs-storage/test/write-metadata.test.ts | 80 -- pnpm-lock.yaml | 63 +- vitest.config.mts | 8 +- 33 files changed, 876 insertions(+), 2756 deletions(-) delete mode 100644 packages/fs-storage/src/background-handler/fs-utils.ts delete mode 100644 
packages/fs-storage/src/background-handler/index.ts delete mode 100644 packages/fs-storage/src/cache-helpers.ts delete mode 100644 packages/fs-storage/src/consts.ts delete mode 100644 packages/fs-storage/src/fs/common.ts delete mode 100644 packages/fs-storage/src/fs/dataset/fs.ts delete mode 100644 packages/fs-storage/src/fs/dataset/index.ts delete mode 100644 packages/fs-storage/src/fs/key-value-store/fs.ts delete mode 100644 packages/fs-storage/src/fs/key-value-store/index.ts delete mode 100644 packages/fs-storage/src/fs/request-queue/fs.ts delete mode 100644 packages/fs-storage/src/fs/request-queue/index.ts delete mode 100644 packages/fs-storage/src/resource-clients/common/base-client.ts delete mode 100644 packages/fs-storage/test/__shared__.ts create mode 100644 packages/fs-storage/test/key-value-store/special-keys.test.ts delete mode 100644 packages/fs-storage/test/key-value-store/with-extension.test.ts create mode 100644 packages/fs-storage/test/request-queue/adapter.test.ts delete mode 100644 packages/fs-storage/test/request-queue/dangling-forefront-on-reload.test.ts delete mode 100644 packages/fs-storage/test/request-queue/forefront.test.ts delete mode 100644 packages/fs-storage/test/request-queue/handledRequestCount-should-update.test.ts delete mode 100644 packages/fs-storage/test/request-queue/ignore-non-json-files.test.ts create mode 100644 packages/fs-storage/test/request-queue/reload-persistence.test.ts delete mode 100644 packages/fs-storage/test/write-metadata.test.ts diff --git a/docs/upgrading/upgrading_v4.md b/docs/upgrading/upgrading_v4.md index ad0258db4321..596110182665 100644 --- a/docs/upgrading/upgrading_v4.md +++ b/docs/upgrading/upgrading_v4.md @@ -683,12 +683,27 @@ const storageClient = new FileSystemStorageClient(); const inMemory = new MemoryStorageClient(); ``` -The `localDataDirectory`, `persistStorage`, and `writeMetadata` options are still accepted by `MemoryStorageClient` for source compatibility, but they are ignored — in-memory 
storage has nowhere to write. `FileSystemStorageClient` honors `localDataDirectory` and `writeMetadata`; it always persists, so it has no `persistStorage` option. +`MemoryStorageClient` no longer takes the `localDataDirectory`, `persistStorage`, or `writeMetadata` options — in-memory storage has nowhere to write, so they had no meaning. `FileSystemStorageClient` honors `localDataDirectory`; it always persists, so it has no `persistStorage` option, and the `writeMetadata` option has been removed there too (see [`writeMetadata` option removed](#writemetadata-option-removed)). ### No request lock expiry in `MemoryStorageClient` Because the in-memory queue lives entirely within a single process and is never shared with another consumer, `MemoryStorageClient`'s request queue no longer uses an expiring, cross-process lock. A fetched request simply stays *in progress* until it is handled or reclaimed; it never becomes fetchable again on its own after a timeout. `setExpectedRequestProcessingTimeSecs()` is therefore a no-op for in-memory storage. (Disk-backed `FileSystemStorageClient` keeps the lock-with-expiry behavior.) +### `writeMetadata` option removed + +`FileSystemStorageClient` no longer accepts the `writeMetadata` option. The underlying file-system storage now always writes metadata files (`__metadata__.json` for each storage and a `.__metadata__.json` sidecar for each key-value record), so the toggle no longer has any effect. Remove it from your storage client options: + +```diff + import { FileSystemStorageClient } from '@crawlee/fs-storage'; + + const storageClient = new FileSystemStorageClient({ + localDataDirectory: './storage', +- writeMetadata: true, + }); +``` + +`MemoryStorageClient` does not accept `writeMetadata` either (it has no on-disk format to begin with), so remove the option there as well if you were still passing it.
+ ## Multiple crawler instances use separate default request queues In v3, every `BasicCrawler` (or subclass) that didn't receive an explicit `requestQueue` option would open the same default request queue. If you created two crawlers in the same process, they would silently share a queue — leading to request collisions and hard-to-debug deduplication issues. diff --git a/packages/fs-storage/package.json b/packages/fs-storage/package.json index ef6c35e52583..caa5c9d90985 100644 --- a/packages/fs-storage/package.json +++ b/packages/fs-storage/package.json @@ -42,14 +42,11 @@ "access": "public" }, "dependencies": { + "@crawlee/fs-storage-native": "0.1.5-beta.0", "@crawlee/types": "workspace:*", - "@sapphire/async-queue": "^1.5.5", "@sapphire/shapeshift": "^4.0.0", "content-type": "^1.0.5", - "fs-extra": "^11.3.0", "json5": "^2.2.3", - "mime-types": "^3.0.1", - "proper-lockfile": "^4.1.2", - "tslib": "^2.8.1" + "mime-types": "^3.0.1" } } diff --git a/packages/fs-storage/src/background-handler/fs-utils.ts b/packages/fs-storage/src/background-handler/fs-utils.ts deleted file mode 100644 index 2dc1e1ea80cd..000000000000 --- a/packages/fs-storage/src/background-handler/fs-utils.ts +++ /dev/null @@ -1,89 +0,0 @@ -import { writeFile } from 'node:fs'; -import { writeFile as writeFileP } from 'node:fs/promises'; -import { resolve } from 'node:path'; -import { setTimeout } from 'node:timers/promises'; - -import type { CrawleeLogger } from '@crawlee/types'; -import { ensureDir } from 'fs-extra/esm'; -import { lock } from 'proper-lockfile'; - -import type { BackgroundHandlerReceivedMessage, BackgroundHandlerUpdateMetadataMessage } from '../utils.js'; - -export async function handleMessage(message: BackgroundHandlerReceivedMessage, logger?: CrawleeLogger) { - switch (message.action) { - case 'update-metadata': - await updateMetadata(message); - break; - default: - // We're keeping this to make eslint happy + in the event we add a new action without adding checks for it - // we 
should be aware of them - logger?.warning( - `Unknown background handler message action ${(message as BackgroundHandlerReceivedMessage).action}`, - ); - } -} - -async function updateMetadata(message: BackgroundHandlerUpdateMetadataMessage) { - // Skip writing the actual metadata file. This is done after ensuring the directory exists so we have the directory present - if (!message.writeMetadata) { - return; - } - - // Ensure the directory for the entity exists - const dir = message.entityDirectory; - await ensureDir(dir); - - // Write the metadata to the file - const filePath = resolve(dir, '__metadata__.json'); - await writeFileP(filePath, JSON.stringify(message.data, null, '\t')); -} - -export async function lockAndWrite( - filePath: string, - data: unknown, - stringify = true, - retry = 10, - timeout = 10, -): Promise { - await lockAndCallback( - filePath, - async () => { - await new Promise((pResolve, reject) => { - writeFile(filePath, stringify ? JSON.stringify(data, null, '\t') : (data as Buffer), (err) => { - if (err) { - reject(err); - } else { - pResolve(); - } - }); - }); - }, - retry, - timeout, - ); -} - -export async function lockAndCallback Promise>( - filePath: string, - callback: Callback, - retry = 10, - timeout = 10, -): Promise>> { - let release: (() => Promise) | null = null; - try { - release = await lock(filePath, { realpath: false }); - - return await callback(); - } catch (e: any) { - if (e.code === 'ELOCKED' && retry > 0) { - await setTimeout(timeout); - return lockAndCallback(filePath, callback, retry - 1, timeout * 2); - } - - throw e; - } finally { - if (release) { - await release(); - } - } -} diff --git a/packages/fs-storage/src/background-handler/index.ts b/packages/fs-storage/src/background-handler/index.ts deleted file mode 100644 index 007a98e03049..000000000000 --- a/packages/fs-storage/src/background-handler/index.ts +++ /dev/null @@ -1,51 +0,0 @@ -import { randomUUID } from 'node:crypto'; - -import type { CrawleeLogger } from 
'@crawlee/types'; - -import type { BackgroundHandlerReceivedMessage } from '../utils.js'; -import { handleMessage } from './fs-utils.js'; - -/** - * A map of promises that are created when a background task is scheduled. - * This is used in FileSystemStorageClient#teardown to wait for all tasks to finish executing before exiting the process. - * @internal - */ -export const promiseMap: Map< - string, - { - promise: Promise; - resolve: () => void; - } -> = new Map(); - -export function scheduleBackgroundTask(message: BackgroundHandlerReceivedMessage, logger?: CrawleeLogger) { - const id = randomUUID(); - - let promiseResolve: () => void; - const promise = new Promise((res) => { - promiseResolve = res; - }); - - promiseMap.set(id, { - promise, - resolve: promiseResolve!, - }); - - void handleBackgroundMessage( - { - ...message, - messageId: id, - }, - logger, - ); -} - -async function handleBackgroundMessage( - message: BackgroundHandlerReceivedMessage & { messageId: string }, - logger?: CrawleeLogger, -) { - await handleMessage(message, logger); - - promiseMap.get(message.messageId)?.resolve(); - promiseMap.delete(message.messageId); -} diff --git a/packages/fs-storage/src/cache-helpers.ts b/packages/fs-storage/src/cache-helpers.ts deleted file mode 100644 index acb1e7dd6a83..000000000000 --- a/packages/fs-storage/src/cache-helpers.ts +++ /dev/null @@ -1,414 +0,0 @@ -import { access, opendir, readFile } from 'node:fs/promises'; -import { extname, resolve } from 'node:path'; - -import type * as storage from '@crawlee/types'; -import json5 from 'json5'; -import mimeTypes from 'mime-types'; - -import { DatasetFileSystemEntry } from './fs/dataset/fs.js'; -import { KeyValueFileSystemEntry } from './fs/key-value-store/fs.js'; -import { RequestQueueFileSystemEntry } from './fs/request-queue/fs.js'; -import { type FileSystemStorageClient } from './file-system-storage.js'; - -const uuidRegex = /[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}/i; - -export async 
function findOrCacheDatasetByPossibleId(client: FileSystemStorageClient, entryNameOrId: string) { - // First check memory cache — match by id, name, or directoryName (which covers alias lookups) - const found = client.datasetClientCache.find( - (store) => - store.id === entryNameOrId || - store.name?.toLowerCase() === entryNameOrId.toLowerCase() || - store.directoryName.toLowerCase() === entryNameOrId.toLowerCase(), - ); - - if (found) { - return found; - } - - const datasetDir = resolve(client.datasetsDirectory, entryNameOrId); - - try { - // Check if directory exists - await access(datasetDir); - } catch { - return undefined; - } - - // Access the dataset folder - const directoryEntries = await opendir(datasetDir); - - let id: string | undefined; - let name: string | undefined; - let itemCount = 0; - - const entries = new Set(); - - let createdAt = new Date(); - let accessedAt = new Date(); - let modifiedAt = new Date(); - - let hasSeenMetadataFile = false; - - for await (const entry of directoryEntries) { - if (entry.isFile()) { - if (entry.name === '__metadata__.json') { - hasSeenMetadataFile = true; - - // we have found the store metadata file, build out information based on it - const fileContent = await readFile(resolve(datasetDir, entry.name), 'utf8'); - if (!fileContent) continue; - - const metadata = JSON.parse(fileContent) as storage.DatasetInfo; - id = metadata.id; - name = metadata.name; - itemCount = metadata.itemCount; - createdAt = new Date(metadata.createdAt); - accessedAt = new Date(metadata.accessedAt); - modifiedAt = new Date(metadata.modifiedAt); - - continue; - } - - const entryName = entry.name.split('.')[0]; - entries.add(entryName); - - if (!hasSeenMetadataFile) { - itemCount++; - } - } - } - - if (id === undefined && name === undefined) { - const isUuid = uuidRegex.test(entryNameOrId); - - if (isUuid) { - id = entryNameOrId; - } else { - name = entryNameOrId; - } - } - - const newClient = new DatasetClient({ - baseStorageDirectory: 
client.datasetsDirectory, - client, - id, - name, - }); - - // Overwrite properties - newClient.accessedAt = accessedAt; - newClient.createdAt = createdAt; - newClient.modifiedAt = modifiedAt; - newClient.itemCount = itemCount; - - for (const entryId of entries.values()) { - // We create a file system entry instead of possibly making an in-memory one to allow the pre-included data to be used on demand - const entry = new DatasetFileSystemEntry({ - storeDirectory: datasetDir, - entityId: entryId, - }); - - // eslint-disable-next-line dot-notation - newClient['datasetEntries'].set(entryId, entry); - } - - client.datasetClientCache.push(newClient); - - return newClient; -} - -export async function findOrCacheKeyValueStoreByPossibleId(client: FileSystemStorageClient, entryNameOrId: string) { - // First check memory cache — match by id, name, or directoryName (which covers alias lookups) - const found = client.keyValueStoreCache.find( - (store) => - store.id === entryNameOrId || - store.name?.toLowerCase() === entryNameOrId.toLowerCase() || - store.directoryName.toLowerCase() === entryNameOrId.toLowerCase(), - ); - - if (found) { - return found; - } - - const keyValueStoreDir = resolve(client.keyValueStoresDirectory, entryNameOrId); - - try { - // Check if directory exists - await access(keyValueStoreDir); - } catch { - return undefined; - } - - // Access the key value store folder - const directoryEntries = await opendir(keyValueStoreDir); - - let id: string | undefined; - let name: string | undefined; - let createdAt = new Date(); - let accessedAt = new Date(); - let modifiedAt = new Date(); - - type FsRecord = Omit; - const internalRecords = new Map(); - let hasSeenMetadataForEntry = false; - - for await (const entry of directoryEntries) { - if (entry.isFile()) { - if (entry.name === '__metadata__.json') { - // we have found the store metadata file, build out information based on it - const fileContent = await readFile(resolve(keyValueStoreDir, entry.name), 'utf8'); 
- if (!fileContent) continue; - - const metadata = JSON.parse(fileContent) as storage.KeyValueStoreInfo; - id = metadata.id; - name = metadata.name; - createdAt = new Date(metadata.createdAt); - accessedAt = new Date(metadata.accessedAt); - modifiedAt = new Date(metadata.modifiedAt); - - continue; - } - - if (entry.name.includes('.__metadata__.')) { - hasSeenMetadataForEntry = true; - - // This is an entry's metadata file, we can use it to create/extend the record - const fileContent = await readFile(resolve(keyValueStoreDir, entry.name), 'utf8'); - if (!fileContent) continue; - - const metadata = JSON.parse(fileContent) as FsRecord; - - const newRecord = { - ...internalRecords.get(metadata.key), - ...metadata, - } as FsRecord; - - internalRecords.set(metadata.key, newRecord); - - continue; - } - - // This is an entry in the store, we can use it to create/extend the record - const fileContent = await readFile(resolve(keyValueStoreDir, entry.name)); - const fileExtension = extname(entry.name); - const contentType = mimeTypes.contentType(entry.name) || 'text/plain'; - const extension = mimeTypes.extension(contentType) as string; - - // This is kept for backwards compatibility / to ignore invalid JSON files - if (contentType.includes('application/json')) { - const stringifiedJson = fileContent.toString('utf8'); - - try { - json5.parse(stringifiedJson); - } catch { - client.logger?.warning( - `Key-value entry "${entry.name}" for store ${entryNameOrId} has invalid JSON content and will be ignored from the store.`, - ); - continue; - } - } - - const nameSplit = entry.name.split('.'); - - if (fileExtension) { - nameSplit.pop(); - } - - const key = nameSplit.join('.'); - - const newRecord = { - key, - extension, - contentType, - ...internalRecords.get(key), - } satisfies FsRecord; - - internalRecords.set(key, newRecord); - } - } - - if (id === undefined && name === undefined) { - const isUuid = uuidRegex.test(entryNameOrId); - - if (isUuid) { - id = entryNameOrId; - } else 
{ - name = entryNameOrId; - } - } - - const newClient = new KeyValueStoreClient({ - baseStorageDirectory: client.keyValueStoresDirectory, - client, - id, - name, - }); - - // Overwrite properties - newClient.accessedAt = accessedAt; - newClient.createdAt = createdAt; - newClient.modifiedAt = modifiedAt; - - for (const [key, record] of internalRecords) { - // We create a file system entry instead of possibly making an in-memory one to allow the pre-included data to be used on demand - const entry = new KeyValueFileSystemEntry({ - storeDirectory: keyValueStoreDir, - writeMetadata: hasSeenMetadataForEntry, - logger: client.logger, - }); - - // eslint-disable-next-line dot-notation - entry['rawRecord'] = { ...record }; - // eslint-disable-next-line dot-notation - entry['filePath'] = resolve(keyValueStoreDir, `${record.key}.${record.extension}`); - // eslint-disable-next-line dot-notation - entry['fileMetadataPath'] = resolve(keyValueStoreDir, `${record.key}.__metadata__.json`); - - // eslint-disable-next-line dot-notation - newClient['keyValueEntries'].set(key, entry); - } - - client.keyValueStoreCache.push(newClient); - - return newClient; -} - -export async function findRequestQueueByPossibleId(client: FileSystemStorageClient, entryNameOrId: string) { - // First check memory cache — match by id, name, or directoryName (which covers alias lookups) - const found = client.requestQueueCache.find( - (store) => - store.id === entryNameOrId || - store.name?.toLowerCase() === entryNameOrId.toLowerCase() || - store.directoryName.toLowerCase() === entryNameOrId.toLowerCase(), - ); - - if (found) { - return found; - } - - const requestQueueDir = resolve(client.requestQueuesDirectory, entryNameOrId); - - try { - // Check if directory exists - await access(requestQueueDir); - } catch { - return undefined; - } - - // Access the request queue folder - const directoryEntries = await opendir(requestQueueDir); - - let id: string | undefined; - let name: string | undefined; - let 
createdAt = new Date(); - let accessedAt = new Date(); - let modifiedAt = new Date(); - const entries = new Set(); - let forefrontRequestIds: string[] = []; - - // The request counts are derived from the request files actually present on disk rather than read - // from the metadata file: metadata is only persisted when `writeMetadata` is enabled (off by - // default), whereas request files are always persisted. Trusting the metadata counts would reset - // them to 0 on reload whenever `writeMetadata` is off, even though the requests survive on disk. - let pendingRequestCount = 0; - let handledRequestCount = 0; - - for await (const entry of directoryEntries) { - if (entry.isFile()) { - switch (entry.name) { - case '__metadata__.json': { - // we have found the store metadata file, build out information based on it - const fileContent = await readFile(resolve(requestQueueDir, entry.name), 'utf8'); - if (!fileContent) continue; - - const metadata = JSON.parse(fileContent) as storage.RequestQueueInfo; - - id = metadata.id; - name = metadata.name; - createdAt = new Date(metadata.createdAt); - accessedAt = new Date(metadata.accessedAt); - modifiedAt = new Date(metadata.modifiedAt); - forefrontRequestIds = (metadata as any)?.forefrontRequestIds ?? []; - - break; - } - default: { - // Skip non-JSON and files that start with a dot - if (entry.name.startsWith('.') || !entry.name.endsWith('.json')) { - continue; - } - - const entryName = entry.name.split('.')[0]; - - try { - // Try parsing the file to ensure it's even valid to begin with - const fileContent = await readFile(resolve(requestQueueDir, entry.name), 'utf8'); - const parsed = JSON.parse(fileContent) as { orderNo?: number | null }; - - entries.add(entryName); - - // A handled request has `orderNo === null`; anything else is still pending. 
- if (parsed.orderNo === null) { - handledRequestCount += 1; - } else { - pendingRequestCount += 1; - } - } catch { - client.logger?.warning( - `Request queue entry "${entry.name}" for store ${entryNameOrId} has invalid JSON content and will be ignored from the store.`, - ); - } - } - } - } - } - - if (id === undefined && name === undefined) { - const isUuid = uuidRegex.test(entryNameOrId); - - if (isUuid) { - id = entryNameOrId; - } else { - name = entryNameOrId; - } - } - - const newClient = new RequestQueueClient({ - baseStorageDirectory: client.requestQueuesDirectory, - client, - id, - name, - }); - - // Overwrite properties - newClient.accessedAt = accessedAt; - newClient.createdAt = createdAt; - newClient.modifiedAt = modifiedAt; - newClient.pendingRequestCount = pendingRequestCount; - newClient.handledRequestCount = handledRequestCount; - // Drop any persisted forefront ids whose request file is missing or unparseable on disk (and was - // therefore not added to `entries`). Keeping them would leave a dangling id in `forefrontRequestIds` - // that `listPendingHead` would resolve to a missing request and dereference as `undefined`. 
- // @ts-expect-error - Assigning to private property - newClient.forefrontRequestIds = forefrontRequestIds.filter((requestId) => entries.has(requestId)); - - for (const requestId of entries) { - const entry = new RequestQueueFileSystemEntry({ - requestId, - storeDirectory: requestQueueDir, - }); - - // eslint-disable-next-line dot-notation - newClient['requests'].set(requestId, entry); - } - - client.requestQueueCache.push(newClient); - - return newClient; -} - -/* eslint-disable import/first -- Fixing circulars */ -import { DatasetClient } from './resource-clients/dataset.js'; -import type { InternalKeyRecord } from './resource-clients/key-value-store.js'; -import { KeyValueStoreClient } from './resource-clients/key-value-store.js'; -import { RequestQueueClient } from './resource-clients/request-queue.js'; diff --git a/packages/fs-storage/src/consts.ts b/packages/fs-storage/src/consts.ts deleted file mode 100644 index db8b297607a9..000000000000 --- a/packages/fs-storage/src/consts.ts +++ /dev/null @@ -1,4 +0,0 @@ -/** - * Length of id property of a Request instance in characters. 
- */ -export const REQUEST_ID_LENGTH = 15; diff --git a/packages/fs-storage/src/file-system-storage.ts b/packages/fs-storage/src/file-system-storage.ts index ac427c9d343c..57817234e30f 100644 --- a/packages/fs-storage/src/file-system-storage.ts +++ b/packages/fs-storage/src/file-system-storage.ts @@ -1,18 +1,15 @@ -/* eslint-disable import/no-duplicates */ -import { access, readdir, rm } from 'node:fs/promises'; +import { existsSync } from 'node:fs'; import { resolve } from 'node:path'; import type * as storage from '@crawlee/types'; import type { CrawleeLogger } from '@crawlee/types'; import { s } from '@sapphire/shapeshift'; -import { ensureDirSync, move, moveSync, pathExistsSync } from 'fs-extra/esm'; -import { promiseMap, scheduleBackgroundTask } from './background-handler/index.js'; import { - findOrCacheDatasetByPossibleId, - findOrCacheKeyValueStoreByPossibleId, - findRequestQueueByPossibleId, -} from './cache-helpers.js'; + FileSystemDatasetClient as NativeDatasetClient, + FileSystemKeyValueStoreClient as NativeKeyValueStoreClient, + FileSystemRequestQueueClient as NativeRequestQueueClient, +} from '@crawlee/fs-storage-native'; import { DatasetClient } from './resource-clients/dataset.js'; import { KeyValueStoreClient } from './resource-clients/key-value-store.js'; import { RequestQueueClient } from './resource-clients/request-queue.js'; @@ -24,24 +21,25 @@ export interface FileSystemStorageOptions { */ localDataDirectory?: string; - /** - * Whether to also write optional metadata files when storing to disk. - * @default process.env.DEBUG?.includes('*') ?? process.env.DEBUG?.includes('crawlee:memory-storage') ?? false - */ - writeMetadata?: boolean; - /** * Optional logger for FileSystemStorageClient warnings. */ logger?: CrawleeLogger; } +/** + * A file-system storage client backed by the native `@crawlee/fs-storage-native` Rust extension. 
+ * + * The native extension owns the on-disk format, timestamps, item counting, request-queue locking and + * state persistence. This class is responsible for resolving the user-facing `id` / `name` / `alias` + * identifiers to native storages, caching the opened clients (so that `storageExists`, `purge` and + * `teardown` can operate over them), and exposing them through the `@crawlee/types` interfaces. + */ export class FileSystemStorageClient implements storage.StorageClient { readonly localDataDirectory: string; readonly datasetsDirectory: string; readonly keyValueStoresDirectory: string; readonly requestQueuesDirectory: string; - readonly writeMetadata: boolean; readonly logger?: CrawleeLogger; readonly keyValueStoreCache: KeyValueStoreClient[] = []; @@ -51,7 +49,6 @@ export class FileSystemStorageClient implements storage.StorageClient { constructor(options: FileSystemStorageOptions = {}) { s.object({ localDataDirectory: s.string().optional(), - writeMetadata: s.boolean().optional(), }).parse(options); this.logger = options.logger; @@ -60,7 +57,7 @@ export class FileSystemStorageClient implements storage.StorageClient { // this function handles it without making BC breaks - it respects existing `crawlee_storage` // directories, and uses the `storage` only if it's not there. const defaultStorageDir = () => { - if (pathExistsSync(resolve('./crawlee_storage'))) { + if (existsSync(resolve('./crawlee_storage'))) { return './crawlee_storage'; } @@ -71,143 +68,111 @@ export class FileSystemStorageClient implements storage.StorageClient { this.datasetsDirectory = resolve(this.localDataDirectory, 'datasets'); this.keyValueStoresDirectory = resolve(this.localDataDirectory, 'key_value_stores'); this.requestQueuesDirectory = resolve(this.localDataDirectory, 'request_queues'); - this.writeMetadata = - options.writeMetadata ?? - process.env.DEBUG?.includes('*') ?? - process.env.DEBUG?.includes('crawlee:memory-storage') ?? 
- false; } /** - * Return a cache key that includes the resolved storage directory, so that - * two `FileSystemStorageClient` instances pointing at different directories get separate - * cache partitions. Mirrors crawlee-python's `FileSystemStorageClient` which - * includes `configuration.storage_dir` in its cache key. + * Return a cache key that includes the resolved storage directory, so that two + * `FileSystemStorageClient` instances pointing at different directories get separate cache + * partitions. Mirrors crawlee-python's `FileSystemStorageClient`, which includes the storage + * directory in its cache key. */ getStorageClientCacheKey(): string { return `FileSystemStorageClient:${resolve(this.localDataDirectory)}`; } private static resolveStorageKey(options: { id?: string; name?: string; alias?: string }): { - isAlias: boolean; - directoryKey: string | undefined; + id?: string; + name?: string; + alias?: string; + cacheKey: string | undefined; } { const isAlias = 'alias' in options && !!options.alias; const rawKey = isAlias ? options.alias : (options.name ?? options.id); // Normalize the internal __default__ alias to the user-facing 'default' name. - const directoryKey = rawKey === '__default__' ? 'default' : rawKey; - return { isAlias, directoryKey }; + const cacheKey = rawKey === '__default__' ? 
'default' : rawKey; + return { id: options.id, name: options.name, alias: options.alias, cacheKey }; } async createDatasetClient(options: storage.CreateDatasetClientOptions = {}): Promise { - const { isAlias, directoryKey } = FileSystemStorageClient.resolveStorageKey(options); - - if (directoryKey) { - const found = await findOrCacheDatasetByPossibleId(this, directoryKey); + const { id, name, alias, cacheKey } = FileSystemStorageClient.resolveStorageKey(options); + + if (cacheKey) { + const found = this.datasetClientCache.find( + (store) => + store.id === cacheKey || + store.name?.toLowerCase() === cacheKey.toLowerCase() || + store.cacheKey.toLowerCase() === cacheKey.toLowerCase(), + ); if (found) { return found; } } - const newStore = new DatasetClient({ - name: isAlias ? undefined : directoryKey, - directoryName: directoryKey, - baseStorageDirectory: this.datasetsDirectory, - client: this, + const nativeClient = await NativeDatasetClient.open(id, name, alias, this.localDataDirectory); + const newStore = await DatasetClient.create({ + name: alias ? undefined : (name ?? cacheKey), + cacheKey: cacheKey ?? '', + nativeClient, }); this.datasetClientCache.push(newStore); - // Schedule the worker to write to the disk - const datasetInfo = newStore.toDatasetInfo(); - - scheduleBackgroundTask( - { - action: 'update-metadata', - entityType: 'datasets', - entityDirectory: newStore.datasetDirectory, - id: datasetInfo.name ?? 
datasetInfo.id, - data: datasetInfo, - writeMetadata: this.writeMetadata, - }, - this.logger, - ); - return newStore; } async createKeyValueStoreClient( options: storage.CreateKeyValueStoreClientOptions = {}, ): Promise { - const { isAlias, directoryKey } = FileSystemStorageClient.resolveStorageKey(options); - - if (directoryKey) { - const found = await findOrCacheKeyValueStoreByPossibleId(this, directoryKey); + const { id, name, alias, cacheKey } = FileSystemStorageClient.resolveStorageKey(options); + + if (cacheKey) { + const found = this.keyValueStoreCache.find( + (store) => + store.id === cacheKey || + store.name?.toLowerCase() === cacheKey.toLowerCase() || + store.cacheKey.toLowerCase() === cacheKey.toLowerCase(), + ); if (found) { return found; } } - const newStore = new KeyValueStoreClient({ - name: isAlias ? undefined : directoryKey, - directoryName: directoryKey, - baseStorageDirectory: this.keyValueStoresDirectory, - client: this, + const nativeClient = await NativeKeyValueStoreClient.open(id, name, alias, this.localDataDirectory); + const newStore = await KeyValueStoreClient.create({ + name: alias ? undefined : (name ?? cacheKey), + cacheKey: cacheKey ?? '', + nativeClient, + logger: this.logger, }); this.keyValueStoreCache.push(newStore); - // Schedule the worker to write to the disk - const kvStoreInfo = newStore.toKeyValueStoreInfo(); - - scheduleBackgroundTask( - { - action: 'update-metadata', - entityType: 'keyValueStores', - entityDirectory: newStore.keyValueStoreDirectory, - id: kvStoreInfo.name ?? 
kvStoreInfo.id, - data: kvStoreInfo, - writeMetadata: this.writeMetadata, - }, - this.logger, - ); - return newStore; } async createRequestQueueClient( options: storage.CreateRequestQueueClientOptions = {}, ): Promise { - const { isAlias, directoryKey } = FileSystemStorageClient.resolveStorageKey(options); - - if (directoryKey) { - const found = await findRequestQueueByPossibleId(this, directoryKey); + const { id, name, alias, cacheKey } = FileSystemStorageClient.resolveStorageKey(options); + + if (cacheKey) { + const found = this.requestQueueCache.find( + (queue) => + queue.id === cacheKey || + queue.name?.toLowerCase() === cacheKey.toLowerCase() || + queue.cacheKey.toLowerCase() === cacheKey.toLowerCase(), + ); if (found) { return found; } } - const newStore = new RequestQueueClient({ - name: isAlias ? undefined : directoryKey, - directoryName: directoryKey, - baseStorageDirectory: this.requestQueuesDirectory, - client: this, + const nativeClient = await NativeRequestQueueClient.open(id, name, alias, this.localDataDirectory); + const newStore = await RequestQueueClient.create({ + name: alias ? undefined : (name ?? cacheKey), + cacheKey: cacheKey ?? '', + nativeClient, }); this.requestQueueCache.push(newStore); - // Schedule the worker to write to the disk - const queueInfo = newStore.toRequestQueueInfo(); - - scheduleBackgroundTask( - { - action: 'update-metadata', - entityType: 'requestQueues', - entityDirectory: newStore.requestQueueDirectory, - id: queueInfo.name ?? queueInfo.id, - data: queueInfo, - writeMetadata: this.writeMetadata, - }, - this.logger, - ); - return newStore; } @@ -232,26 +197,23 @@ export class FileSystemStorageClient implements storage.StorageClient { return false; } - // Check in-memory cache by actual storage ID + // Check the in-memory cache by actual storage ID first. if (clients.some((store) => store.id === id)) { return true; } - // Check if a directory with that ID exists on disk. 
- // Only consider directories whose name matches the queried ID — this avoids - // false positives for alias-created directories (e.g. a directory named 'asdf' - // created via `{ alias: 'asdf' }` should not make `storageExists('asdf')` return true, - // since the actual storage ID is a UUID, not the alias string). + // Otherwise, check whether a directory named exactly after the queried ID exists on disk. + // Only an exact directory-name match counts — this avoids false positives for alias-created + // directories (e.g. a directory named 'asdf' created via `{ alias: 'asdf' }` should not make + // `storageExists('asdf')` return true, since the actual storage ID is a UUID, not the alias). + const cachedClients = clients as (KeyValueStoreClient | DatasetClient | RequestQueueClient)[]; + if (cachedClients.some((store) => store.cacheKey === id && store.id !== id)) { + return false; + } + + const { access } = await import('node:fs/promises'); try { await access(resolve(baseDir, id)); - - // If the directory exists but a cached client already owns this directory - // under a different ID, this is not a match. - const cachedClients = clients as { id: string; directoryName?: string }[]; - if (cachedClients.some((store) => store.directoryName === id && store.id !== id)) { - return false; - } - return true; } catch { return false; @@ -268,167 +230,30 @@ export class FileSystemStorageClient implements storage.StorageClient { } /** - * Cleans up the default storage directories before the run starts: - * - local directory containing the default dataset; - * - all records from the default key-value store in the local directory, except for the "INPUT" key; - * - local directory containing the default request queue. + * Cleans up the default storages before the run starts: + * - the default dataset; + * - all records from the default key-value store, except for the "INPUT" key; + * - the default request queue. 
*/ async purge(): Promise { - // Key-value stores - const keyValueStores = await readdir(this.keyValueStoresDirectory).catch(() => []); - const keyValueStorePromises: Promise[] = []; - - for (const keyValueStoreFolder of keyValueStores) { - if (keyValueStoreFolder.startsWith('__CRAWLEE_TEMPORARY') || keyValueStoreFolder.startsWith('__OLD')) { - keyValueStorePromises.push( - (await this.batchRemoveFiles(resolve(this.keyValueStoresDirectory, keyValueStoreFolder)))(), - ); - } else if (keyValueStoreFolder === 'default' || keyValueStoreFolder === '__default__') { - keyValueStorePromises.push( - this.handleDefaultKeyValueStore(resolve(this.keyValueStoresDirectory, keyValueStoreFolder))(), - ); - } - } - - void Promise.allSettled(keyValueStorePromises); - - // Datasets - const datasets = await readdir(this.datasetsDirectory).catch(() => []); - const datasetPromises: Promise[] = []; - - for (const datasetFolder of datasets) { - if ( - datasetFolder === 'default' || - datasetFolder === '__default__' || - datasetFolder.startsWith('__CRAWLEE_TEMPORARY') - ) { - datasetPromises.push((await this.batchRemoveFiles(resolve(this.datasetsDirectory, datasetFolder)))()); - } - } - - void Promise.allSettled(datasetPromises); - - // Request queues - const requestQueues = await readdir(this.requestQueuesDirectory).catch(() => []); - const requestQueuePromises: Promise[] = []; - - for (const requestQueueFolder of requestQueues) { - if ( - requestQueueFolder === 'default' || - requestQueueFolder === '__default__' || - requestQueueFolder.startsWith('__CRAWLEE_TEMPORARY') - ) { - requestQueuePromises.push( - (await this.batchRemoveFiles(resolve(this.requestQueuesDirectory, requestQueueFolder)))(), - ); - } - } - - void Promise.allSettled(requestQueuePromises); + const isDefault = (store: { name?: string; cacheKey: string }) => + store.name === 'default' || store.cacheKey === 'default'; + + await Promise.all([ + // Preserve the run input (INPUT) when purging the default key-value store. 
+ ...this.keyValueStoreCache.filter(isDefault).map(async (store) => store.purgeExceptInput()), + ...this.datasetClientCache.filter(isDefault).map(async (store) => store.purge()), + ...this.requestQueueCache.filter(isDefault).map(async (store) => store.purge()), + ]); } /** * This method should be called at the end of the process, to ensure all data is saved. + * + * It persists the state of every opened request queue so that requests fetched but not yet handled + * are not stuck (until their lock expires) for the next consumer of the same on-disk queue. */ async teardown(): Promise { - // Release any request locks this process still holds so that requests fetched but not yet - // handled are not stuck (until their lock expires) for the next consumer of the same on-disk - // queue. Other storage backends don't need this: the Apify platform releases a run's locks - // automatically, and the file-system storage doesn't lock at all. - await Promise.all(this.requestQueueCache.map(async (queue) => queue.releaseOwnLocks())); - - const promises = [...promiseMap.values()].map(async ({ promise }) => promise); - - await Promise.all(promises); - } - - private handleDefaultKeyValueStore(folder: string): () => Promise { - const storagePathExists = pathExistsSync(folder); - const temporaryPath = resolve(folder, '../__CRAWLEE_MIGRATING_KEY_VALUE_STORE__'); - - // For optimization, we want to only attempt to copy a few files from the default key-value store - const possibleInputKeys = ['INPUT', 'INPUT.json', 'INPUT.bin', 'INPUT.txt']; - - if (storagePathExists) { - // Create temporary folder to save important files in - ensureDirSync(temporaryPath); - - // Go through each file and save the ones that are important - for (const entity of possibleInputKeys) { - const originalFilePath = resolve(folder, entity); - const tempFilePath = resolve(temporaryPath, entity); - - try { - moveSync(originalFilePath, tempFilePath); - } catch { - // Ignore - } - } - - // Remove the original folder 
and all its content - let counter = 0; - let tempPathForOldFolder = resolve(folder, `../__OLD_DEFAULT_${counter}__`); - let done = false; - - while (!done) { - try { - moveSync(folder, tempPathForOldFolder); - done = true; - } catch { - tempPathForOldFolder = resolve(folder, `../__OLD_DEFAULT_${++counter}__`); - } - } - - // Replace the temporary folder with the original folder - moveSync(temporaryPath, folder); - - // Remove the old folder - return async () => (await this.batchRemoveFiles(tempPathForOldFolder))(); - } - - return async () => Promise.resolve(); - } - - private async batchRemoveFiles(folder: string, counter = 0): Promise<() => Promise> { - const folderExists = pathExistsSync(folder); - - if (folderExists) { - const temporaryFolder = resolve(folder, `../__CRAWLEE_TEMPORARY_${counter}__`); - - try { - // Rename the old folder to the new one to allow background deletions - await move(folder, temporaryFolder); - } catch { - // Folder exists already, try again with an incremented counter - return this.batchRemoveFiles(folder, ++counter); - } - - return async () => { - // Read all files in the folder - const entries = await readdir(temporaryFolder); - - let processed = 0; - let promises: Promise[] = []; - - for (const entry of entries) { - processed++; - promises.push(rm(resolve(temporaryFolder, entry), { force: true })); - - // Every 2000 files, delete them - if (processed % 2000 === 0) { - await Promise.allSettled(promises); - promises = []; - } - } - - // Ensure last promises are handled - await Promise.allSettled(promises); - - // Delete the folder itself - await rm(temporaryFolder, { force: true, recursive: true }); - }; - } - - return async () => Promise.resolve(); + await Promise.all(this.requestQueueCache.map(async (queue) => queue.persistState())); } } diff --git a/packages/fs-storage/src/fs/common.ts b/packages/fs-storage/src/fs/common.ts deleted file mode 100644 index 1a26019fe571..000000000000 --- a/packages/fs-storage/src/fs/common.ts +++ 
/dev/null @@ -1,5 +0,0 @@ -export interface StorageImplementation { - get(force?: boolean): Promise; - update(data: T): void | Promise; - delete(): void | Promise; -} diff --git a/packages/fs-storage/src/fs/dataset/fs.ts b/packages/fs-storage/src/fs/dataset/fs.ts deleted file mode 100644 index bf66d2d5ed84..000000000000 --- a/packages/fs-storage/src/fs/dataset/fs.ts +++ /dev/null @@ -1,43 +0,0 @@ -import { readFile, rm } from 'node:fs/promises'; -import { dirname, resolve } from 'node:path'; - -import { AsyncQueue } from '@sapphire/async-queue'; -import { ensureDir } from 'fs-extra/esm'; - -import { lockAndWrite } from '../../background-handler/fs-utils.js'; -import type { StorageImplementation } from '../common.js'; -import type { CreateStorageImplementationOptions } from './index.js'; - -export class DatasetFileSystemEntry implements StorageImplementation { - private filePath: string; - private fsQueue = new AsyncQueue(); - - constructor(options: CreateStorageImplementationOptions) { - this.filePath = resolve(options.storeDirectory, `${options.entityId}.json`); - } - - async get() { - await this.fsQueue.wait(); - try { - return JSON.parse(await readFile(this.filePath, 'utf-8')); - } finally { - this.fsQueue.shift(); - } - } - - async update(data: Data) { - await this.fsQueue.wait(); - try { - await ensureDir(dirname(this.filePath)); - await lockAndWrite(this.filePath, data); - } finally { - this.fsQueue.shift(); - } - } - - async delete() { - await this.fsQueue.wait(); - await rm(this.filePath, { force: true }); - this.fsQueue.shift(); - } -} diff --git a/packages/fs-storage/src/fs/dataset/index.ts b/packages/fs-storage/src/fs/dataset/index.ts deleted file mode 100644 index bd1f71cee9b4..000000000000 --- a/packages/fs-storage/src/fs/dataset/index.ts +++ /dev/null @@ -1,16 +0,0 @@ -import type { Dictionary } from '@crawlee/types'; - -import type { StorageImplementation } from '../common.js'; -import { DatasetFileSystemEntry } from './fs.js'; - -export function 
createDatasetStorageImplementation( - options: CreateStorageImplementationOptions, -): StorageImplementation { - return new DatasetFileSystemEntry(options); -} - -export interface CreateStorageImplementationOptions { - storeDirectory: string; - /** The actual id of the file to save */ - entityId: string; -} diff --git a/packages/fs-storage/src/fs/key-value-store/fs.ts b/packages/fs-storage/src/fs/key-value-store/fs.ts deleted file mode 100644 index 2d032d973dcb..000000000000 --- a/packages/fs-storage/src/fs/key-value-store/fs.ts +++ /dev/null @@ -1,100 +0,0 @@ -import { readFile, rm } from 'node:fs/promises'; -import { dirname, resolve } from 'node:path'; -import { basename } from 'node:path/win32'; - -import type { CrawleeLogger } from '@crawlee/types'; -import { AsyncQueue } from '@sapphire/async-queue'; -import { ensureDir } from 'fs-extra/esm'; -import mime from 'mime-types'; - -import { lockAndWrite } from '../../background-handler/fs-utils.js'; -import type { InternalKeyRecord } from '../../resource-clients/key-value-store.js'; -import type { StorageImplementation } from '../common.js'; -import type { CreateStorageImplementationOptions } from './index.js'; - -export class KeyValueFileSystemEntry implements StorageImplementation { - private storeDirectory: string; - private writeMetadata: boolean; - private logger?: CrawleeLogger; - - private filePath!: string; - private fileMetadataPath!: string; - private rawRecord!: Omit; - private fsQueue = new AsyncQueue(); - - constructor(options: CreateStorageImplementationOptions) { - this.storeDirectory = options.storeDirectory; - this.writeMetadata = options.writeMetadata; - this.logger = options.logger; - } - - async get(): Promise { - await this.fsQueue.wait(); - let file: Buffer | string; - - try { - file = await readFile(this.filePath); - } catch { - try { - const noExtFilePath = resolve(this.storeDirectory, this.rawRecord.key); - // Try without extension - file = await readFile(noExtFilePath); - 
this.logger?.warning( - [ - `Key-value entry "${this.rawRecord.key}" for store ${basename( - this.storeDirectory, - )} does not have a file extension, assuming it as text.`, - 'If you want to have correct interpretation of the file, you should add a file extension to the entry.', - ].join('\n'), - ); - file = file.toString('utf-8'); - this.filePath = noExtFilePath; - } catch { - // This is impossible to happen, but just in case - throw new Error(`Could not find file at ${this.filePath}`); - } - } finally { - this.fsQueue.shift(); - } - - return { - ...this.rawRecord, - value: file, - filePath: this.filePath, - }; - } - - async update(data: InternalKeyRecord) { - await this.fsQueue.wait(); - const contentType = mime.contentType(data.key); - const fileName = - // the content type might include charset, e.g. `text/html; charset=utf-8`, so we check via `startsWith` instead of `===` - contentType && data.contentType && contentType.startsWith(data.contentType) - ? data.key - : `${data.key}.${data.extension}`; - - this.filePath ??= resolve(this.storeDirectory, fileName); - this.fileMetadataPath ??= resolve(this.storeDirectory, `${data.key}.__metadata__.json`); - - const { value, ...rest } = data; - this.rawRecord = rest; - - try { - await ensureDir(dirname(this.filePath)); - await lockAndWrite(this.filePath, value, false); - - if (this.writeMetadata) { - await lockAndWrite(this.fileMetadataPath, JSON.stringify(rest), true); - } - } finally { - this.fsQueue.shift(); - } - } - - async delete() { - await this.fsQueue.wait(); - await rm(this.filePath, { force: true }); - await rm(this.fileMetadataPath, { force: true }); - this.fsQueue.shift(); - } -} diff --git a/packages/fs-storage/src/fs/key-value-store/index.ts b/packages/fs-storage/src/fs/key-value-store/index.ts deleted file mode 100644 index 789d15d2f9be..000000000000 --- a/packages/fs-storage/src/fs/key-value-store/index.ts +++ /dev/null @@ -1,17 +0,0 @@ -import type { CrawleeLogger } from '@crawlee/types'; - -import 
type { InternalKeyRecord } from '../../resource-clients/key-value-store.js'; -import type { StorageImplementation } from '../common.js'; -import { KeyValueFileSystemEntry } from './fs.js'; - -export function createKeyValueStorageImplementation( - options: CreateStorageImplementationOptions, -): StorageImplementation { - return new KeyValueFileSystemEntry(options); -} - -export interface CreateStorageImplementationOptions { - storeDirectory: string; - writeMetadata: boolean; - logger?: CrawleeLogger; -} diff --git a/packages/fs-storage/src/fs/request-queue/fs.ts b/packages/fs-storage/src/fs/request-queue/fs.ts deleted file mode 100644 index 87dde6a2163b..000000000000 --- a/packages/fs-storage/src/fs/request-queue/fs.ts +++ /dev/null @@ -1,88 +0,0 @@ -import { readFile, rm } from 'node:fs/promises'; -import { dirname, resolve } from 'node:path'; - -import { AsyncQueue } from '@sapphire/async-queue'; -import { ensureDir } from 'fs-extra/esm'; - -import { lockAndCallback, lockAndWrite } from '../../background-handler/fs-utils.js'; -import type { InternalRequest } from '../../resource-clients/request-queue.js'; -import type { StorageImplementation } from '../common.js'; -import type { CreateStorageImplementationOptions } from './index.js'; - -export class RequestQueueFileSystemEntry implements StorageImplementation { - private filePath: string; - private fsQueue = new AsyncQueue(); - private data?: InternalRequest; - private directoryExists = false; - - /** - * A "sweep" timeout that is created/refreshed whenever this entry is accessed/updated. 
- * It exists to ensure that the entry is not kept in memory indefinitely, by sweeping it after 60 seconds of inactivity (in order to keep memory usage low) - */ - private sweepTimeout?: NodeJS.Timeout; - - public orderNo?: number | null; - - constructor(options: CreateStorageImplementationOptions) { - this.filePath = resolve(options.storeDirectory, `${options.requestId}.json`); - } - - async get(force = false) { - await this.fsQueue.wait(); - this.setOrRefreshSweepTimeout(); - - if (this.data && !force) { - this.fsQueue.shift(); - return this.data; - } - - try { - return await lockAndCallback(this.filePath, async () => { - const req = JSON.parse(await readFile(this.filePath, 'utf-8')); - this.data = req; - - this.orderNo = req.orderNo; - - return req; - }); - } finally { - this.fsQueue.shift(); - } - } - - async update(data: InternalRequest) { - await this.fsQueue.wait(); - this.data = data; - - try { - if (!this.directoryExists) { - await ensureDir(dirname(this.filePath)); - this.directoryExists = true; - } - - await lockAndWrite(this.filePath, data); - - this.orderNo = data.orderNo; - } finally { - this.setOrRefreshSweepTimeout(); - this.fsQueue.shift(); - } - } - - async delete() { - await this.fsQueue.wait(); - await rm(this.filePath, { force: true }); - this.fsQueue.shift(); - } - - private setOrRefreshSweepTimeout() { - if (this.sweepTimeout) { - this.sweepTimeout.refresh(); - } else { - this.sweepTimeout = setTimeout(() => { - this.sweepTimeout = undefined; - this.data = undefined; - }, 60_000).unref(); - } - } -} diff --git a/packages/fs-storage/src/fs/request-queue/index.ts b/packages/fs-storage/src/fs/request-queue/index.ts deleted file mode 100644 index 995c399ad65c..000000000000 --- a/packages/fs-storage/src/fs/request-queue/index.ts +++ /dev/null @@ -1,10 +0,0 @@ -import { RequestQueueFileSystemEntry } from './fs.js'; - -export function createRequestQueueStorageImplementation(options: CreateStorageImplementationOptions) { - return new 
RequestQueueFileSystemEntry(options); -} - -export interface CreateStorageImplementationOptions { - storeDirectory: string; - requestId: string; -} diff --git a/packages/fs-storage/src/resource-clients/common/base-client.ts b/packages/fs-storage/src/resource-clients/common/base-client.ts deleted file mode 100644 index 72a994c2dd08..000000000000 --- a/packages/fs-storage/src/resource-clients/common/base-client.ts +++ /dev/null @@ -1,7 +0,0 @@ -export class BaseClient { - id: string; - - constructor(id: string) { - this.id = id; - } -} diff --git a/packages/fs-storage/src/resource-clients/dataset.ts b/packages/fs-storage/src/resource-clients/dataset.ts index 2da14e63a80f..a48844a63410 100644 --- a/packages/fs-storage/src/resource-clients/dataset.ts +++ b/packages/fs-storage/src/resource-clients/dataset.ts @@ -1,104 +1,95 @@ -/* eslint-disable import/no-duplicates */ -import { randomUUID } from 'node:crypto'; -import { rm } from 'node:fs/promises'; -import { resolve } from 'node:path'; - import type * as storage from '@crawlee/types'; import type { Dictionary } from '@crawlee/types'; import { s } from '@sapphire/shapeshift'; -import { scheduleBackgroundTask } from '../background-handler/index.js'; -import type { StorageImplementation } from '../fs/common.js'; -import { createDatasetStorageImplementation } from '../fs/dataset/index.js'; -import type { FileSystemStorageClient } from '../index.js'; -import { BaseClient } from './common/base-client.js'; +import type { FileSystemDatasetClient as NativeFileSystemDatasetClient } from '@crawlee/fs-storage-native'; /** - * This is what API returns in the x-apify-pagination-limit - * header when no limit query parameter is used. + * This is what the API returns in the `x-apify-pagination-limit` header when no limit query + * parameter is used. The native client expects an explicit upper bound, so we forward this value + * when the caller does not specify a `limit`. 
*/ const LIST_ITEMS_LIMIT = 999_999_999_999; -/** - * Number of characters of the dataset item file names. - * E.g.: 000000019.json - 9 digits - */ -const LOCAL_ENTRY_NAME_DIGITS = 9; - export interface DatasetClientOptions { - id?: string; + /** The user-facing storage name, or `undefined` for unnamed (alias / default) storages. */ name?: string; /** - * The directory name to use on disk. When provided, takes precedence over `name` and `id` - * for the directory path. This allows alias-opened storages to have a directory name - * that differs from their metadata `name` (which is `undefined` for unnamed storages). + * The key used for cache lookup in {@link FileSystemStorageClient}. For named storages this equals + * the name; for alias (unnamed) storages it is the alias string. Falls back to the storage id. */ - directoryName?: string; - baseStorageDirectory: string; - client: FileSystemStorageClient; + cacheKey: string; + nativeClient: NativeFileSystemDatasetClient; } -export class DatasetClient - extends BaseClient - implements storage.DatasetClient -{ - name?: string; - /** - * The key used for directory naming and cache lookup. For named storages, this equals - * the name. For alias (unnamed) storages, this is the alias string. Falls back to id. - */ - directoryName: string; - createdAt = new Date(); - accessedAt = new Date(); - modifiedAt = new Date(); - itemCount = 0; - datasetDirectory: string; +/** + * A file-system dataset client backed by the native `@crawlee/fs-storage-native` Rust extension. + * + * This class is a thin adapter: it forwards each operation to the native client (which owns the + * on-disk format, timestamps and item counting) and converts results into the shapes expected by + * the `@crawlee/types` interfaces. 
+ */ +export class DatasetClient implements storage.DatasetClient { + readonly name?: string; + readonly cacheKey: string; - private readonly datasetEntries = new Map>(); - private readonly client: FileSystemStorageClient; + private readonly nativeClient: NativeFileSystemDatasetClient; constructor(options: DatasetClientOptions) { - super(options.id ?? randomUUID()); this.name = options.name; - this.directoryName = options.directoryName ?? this.name ?? this.id; - this.datasetDirectory = resolve(options.baseStorageDirectory, this.directoryName); - this.client = options.client; + this.cacheKey = options.cacheKey; + this.nativeClient = options.nativeClient; } - async getMetadata(): Promise { - this.updateTimestamps(false); - return this.toDatasetInfo(); + /** The storage id assigned by the native client. */ + get id(): string { + return this._cachedId; } - async drop(): Promise { - const storeIndex = this.client.datasetClientCache.findIndex((store) => store.id === this.id); + /** + * The id is read once from the native metadata at construction time (see + * {@link DatasetClient.create}) and cached, so that the synchronous `id` getter — required by + * {@link FileSystemStorageClient.storageExists} and the cache lookups — does not have to await. + */ + private _cachedId!: string; + + get datasetDirectory(): string { + return this.nativeClient.pathToDataset; + } - if (storeIndex !== -1) { - const [oldClient] = this.client.datasetClientCache.splice(storeIndex, 1); - oldClient.itemCount = 0; - oldClient.datasetEntries.clear(); + static async create( + options: DatasetClientOptions, + ): Promise> { + const client = new DatasetClient(options); + client._cachedId = (await options.nativeClient.getMetadata()).id; + return client; + } + + async getMetadata(): Promise { + const metadata = await this.nativeClient.getMetadata(); + return { + id: metadata.id, + name: metadata.name ?? 
undefined, + accessedAt: new Date(metadata.accessedAt), + createdAt: new Date(metadata.createdAt), + modifiedAt: new Date(metadata.modifiedAt), + itemCount: metadata.itemCount, + }; + } - await rm(oldClient.datasetDirectory, { recursive: true, force: true }); - } + async drop(): Promise { + await this.nativeClient.dropStorage(); } async purge(): Promise { - this.itemCount = 0; - this.datasetEntries.clear(); - - // Remove item files from disk but keep the directory - const { readdir } = await import('node:fs/promises'); - const entries = await readdir(this.datasetDirectory).catch(() => []); - for (const entry of entries) { - if (entry !== '__metadata__.json') { - await rm(resolve(this.datasetDirectory, entry), { force: true }); - } - } - - this.updateTimestamps(true); + await this.nativeClient.purge(); + } + + async pushData(items: Data[]): Promise { + await this.nativeClient.pushData(items); } - getData(options: storage.DatasetClientListOptions = {}): Promise> { + async getData(options: storage.DatasetClientListOptions = {}): Promise> { const { desc, limit, offset } = s .object({ desc: s.boolean().optional(), @@ -107,95 +98,20 @@ export class DatasetClient }) .parse(options); - return this.getDataPage({ - desc, - offset: offset ?? 0, - limit: Math.min(limit ?? LIST_ITEMS_LIMIT, LIST_ITEMS_LIMIT), - }); - } - - private async getDataPage(options: storage.DatasetClientListOptions = {}): Promise> { - const { limit = LIST_ITEMS_LIMIT, offset = 0, desc } = options; - - const [start, end] = this.getStartAndEndIndexes( - desc ? Math.max(this.itemCount - offset - limit, 0) : offset, - limit, + const page = await this.nativeClient.getData( + offset ?? 0, + Math.min(limit ?? LIST_ITEMS_LIMIT, LIST_ITEMS_LIMIT), + desc ?? 
false, + false, ); - const items: Data[] = []; - - for (let idx = start; idx < end; idx++) { - const entryNumber = this.generateLocalEntryName(idx); - items.push(await this.datasetEntries.get(entryNumber)!.get()); - } - - this.updateTimestamps(false); - - return { - count: items.length, - desc: desc ?? false, - items: desc ? items.reverse() : items, - limit, - offset, - total: this.itemCount, - }; - } - - async pushData(items: Data[]): Promise { - for (const entry of items) { - const idx = this.generateLocalEntryName(++this.itemCount); - const storageEntry = createDatasetStorageImplementation({ - entityId: idx, - storeDirectory: this.datasetDirectory, - }); - - await storageEntry.update(entry); - - this.datasetEntries.set(idx, storageEntry); - } - - this.updateTimestamps(true); - } - - toDatasetInfo(): storage.DatasetInfo { return { - id: this.id, - accessedAt: this.accessedAt, - createdAt: this.createdAt, - itemCount: this.itemCount, - modifiedAt: this.modifiedAt, - name: this.name, + count: page.count, + desc: page.desc, + items: page.items as Data[], + limit: page.limit, + offset: page.offset, + total: page.total, }; } - - private generateLocalEntryName(idx: number): string { - return idx.toString().padStart(LOCAL_ENTRY_NAME_DIGITS, '0'); - } - - private getStartAndEndIndexes(offset: number, limit = this.itemCount) { - const start = offset + 1; - const end = Math.min(offset + limit, this.itemCount) + 1; - return [start, end] as const; - } - - private updateTimestamps(hasBeenModified: boolean) { - this.accessedAt = new Date(); - - if (hasBeenModified) { - this.modifiedAt = new Date(); - } - - const data = this.toDatasetInfo(); - scheduleBackgroundTask( - { - action: 'update-metadata', - data, - entityType: 'datasets', - entityDirectory: this.datasetDirectory, - id: this.name ?? 
this.id, - writeMetadata: this.client.writeMetadata, - }, - this.client.logger, - ); - } } diff --git a/packages/fs-storage/src/resource-clients/key-value-store.ts b/packages/fs-storage/src/resource-clients/key-value-store.ts index 66d64ee9a0a2..31060a47cee9 100644 --- a/packages/fs-storage/src/resource-clients/key-value-store.ts +++ b/packages/fs-storage/src/resource-clients/key-value-store.ts @@ -1,91 +1,118 @@ -import { randomUUID } from 'node:crypto'; -import { rm } from 'node:fs/promises'; +import { readdir, readFile } from 'node:fs/promises'; import { resolve } from 'node:path'; import { pathToFileURL } from 'node:url'; import type * as storage from '@crawlee/types'; +import type { CrawleeLogger } from '@crawlee/types'; import { s } from '@sapphire/shapeshift'; -import { scheduleBackgroundTask } from '../background-handler/index.js'; import { maybeParseBody } from '../body-parser.js'; -import type { StorageImplementation } from '../fs/common.js'; -import { createKeyValueStorageImplementation } from '../fs/key-value-store/index.js'; -import type { FileSystemStorageClient } from '../index.js'; +import type { FileSystemKeyValueStoreClient as NativeFileSystemKeyValueStoreClient } from '@crawlee/fs-storage-native'; import { isBuffer, isStream } from '../utils.js'; -import { BaseClient } from './common/base-client.js'; import mime from 'mime-types'; -const DEFAULT_LOCAL_FILE_EXTENSION = 'bin'; +const STORE_METADATA_FILENAME = '__metadata__.json'; +const RECORD_METADATA_SUFFIX = '.__metadata__.json'; -export interface KeyValueStoreClientOptions { - name?: string; - id?: string; - /** - * The directory name to use on disk. When provided, takes precedence over `name` and `id` - * for the directory path. This allows alias-opened storages to have a directory name - * that differs from their metadata `name` (which is `undefined` for unnamed storages). 
- */ - directoryName?: string; - baseStorageDirectory: string; - client: FileSystemStorageClient; -} - -export interface InternalKeyRecord { +/** A value file present on disk that the native client does not track (no metadata sidecar). */ +interface BareFile { + /** The decoded record key. */ key: string; - value: Buffer | string; - contentType?: string; - extension: string; - filePath?: string; + /** Absolute path to the value file on disk. */ + filePath: string; + /** Content type inferred from the file extension (defaults to `text/plain`). */ + contentType: string; } -export class KeyValueStoreClient extends BaseClient implements storage.KeyValueStoreClient { +export interface KeyValueStoreClientOptions { + /** The user-facing storage name, or `undefined` for unnamed (alias / default) storages. */ name?: string; /** - * The key used for directory naming and cache lookup. For named storages, this equals - * the name. For alias (unnamed) storages, this is the alias string. Falls back to id. + * The key used for cache lookup in {@link FileSystemStorageClient}. For named storages this equals + * the name; for alias (unnamed) storages it is the alias string. Falls back to the storage id. */ - directoryName: string; - createdAt = new Date(); - accessedAt = new Date(); - modifiedAt = new Date(); - keyValueStoreDirectory: string; + cacheKey: string; + nativeClient: NativeFileSystemKeyValueStoreClient; + logger?: CrawleeLogger; +} - private readonly keyValueEntries = new Map>(); - private readonly client: FileSystemStorageClient; +/** + * A file-system key-value store client backed by the native `@crawlee/fs-storage-native` Rust + * extension. 
+ * + * The native client stores and returns raw bytes; this adapter is responsible for serializing + * arbitrary values into a `Buffer` on the way in ({@link KeyValueStoreClient.setValue}) and parsing + * them back into JS values on the way out ({@link KeyValueStoreClient.getValue}), preserving the + * historical content-type handling. + */ +export class KeyValueStoreClient implements storage.KeyValueStoreClient { + readonly name?: string; + readonly cacheKey: string; + + private readonly nativeClient: NativeFileSystemKeyValueStoreClient; + private readonly logger?: CrawleeLogger; + private _cachedId!: string; constructor(options: KeyValueStoreClientOptions) { - super(options.id ?? randomUUID()); this.name = options.name; - this.directoryName = options.directoryName ?? this.name ?? this.id; - this.keyValueStoreDirectory = resolve(options.baseStorageDirectory, this.directoryName); - this.client = options.client; + this.cacheKey = options.cacheKey; + this.nativeClient = options.nativeClient; + this.logger = options.logger; } - async getMetadata(): Promise { - this.updateTimestamps(false); - return this.toKeyValueStoreInfo(); + /** The storage id assigned by the native client. 
*/ + get id(): string { + return this._cachedId; } - async drop(): Promise { - const storeIndex = this.client.keyValueStoreCache.findIndex((store) => store.id === this.id); + get keyValueStoreDirectory(): string { + return this.nativeClient.pathToKvs; + } - if (storeIndex !== -1) { - const [oldClient] = this.client.keyValueStoreCache.splice(storeIndex, 1); - oldClient.keyValueEntries.clear(); + static async create(options: KeyValueStoreClientOptions): Promise { + const client = new KeyValueStoreClient(options); + client._cachedId = (await options.nativeClient.getMetadata()).id; + return client; + } - await rm(oldClient.keyValueStoreDirectory, { recursive: true, force: true }); - } + async getMetadata(): Promise { + const metadata = await this.nativeClient.getMetadata(); + return { + id: metadata.id, + name: metadata.name ?? undefined, + accessedAt: new Date(metadata.accessedAt), + createdAt: new Date(metadata.createdAt), + modifiedAt: new Date(metadata.modifiedAt), + userId: '1', + }; + } + + async drop(): Promise { + await this.nativeClient.dropStorage(); } async purge(): Promise { - // Delete all entries - const entriesToDelete = [...this.keyValueEntries.entries()]; - for (const [key, entry] of entriesToDelete) { - this.keyValueEntries.delete(key); - await entry.delete(); + await this.nativeClient.purge(); + } + + /** + * Remove every record from the store except the run input (the `INPUT` key). Used by + * {@link FileSystemStorageClient.purge} to clean the default key-value store at the start of a run + * while preserving the run's input, matching the historical file-system storage behavior. + * + * The native client's `purge()` clears everything unconditionally, so we instead delete keys + * individually here. 
+ */ + async purgeExceptInput(): Promise { + const keysToDelete: string[] = []; + const iterator = await this.nativeClient.iterateKeys(); + for await (const record of iterator) { + if (record.key !== 'INPUT') { + keysToDelete.push(record.key); + } } - this.updateTimestamps(true); + await Promise.all(keysToDelete.map(async (key) => this.nativeClient.deleteValue(key))); } async listKeys(options: storage.KeyValueStoreListKeysOptions = {}): Promise { @@ -97,20 +124,27 @@ export class KeyValueStoreClient extends BaseClient implements storage.KeyValueS }) .parse(options); - const items: storage.KeyValueStoreItemData[] = []; - - for (const storageEntry of this.keyValueEntries.values()) { - const record = await storageEntry.get(); + // The native iterator yields keys in lexical order and natively supports `exclusiveStartKey` + // and `limit`, but not `prefix`, and it does not throw for an unknown `exclusiveStartKey`. + // To preserve the historical semantics (prefix filtering and a hard error for a missing + // `exclusiveStartKey`), we collect the full key list and slice it here. We also merge in any + // untracked value files present on disk (native keys take precedence on collisions). + const itemsByKey = new Map(); + + for (const bareFile of await this.listBareFiles()) { + const size = await readFile(bareFile.filePath) + .then((buffer) => buffer.byteLength) + .catch(() => 0); + itemsByKey.set(bareFile.key, { key: bareFile.key, size }); + } - const size = Buffer.byteLength(record.value); - items.push({ - key: record.key, - size, - }); + const iterator = await this.nativeClient.iterateKeys(); + for await (const record of iterator) { + itemsByKey.set(record.key, { key: record.key, size: record.size ?? 0 }); } - // Lexically sort to emulate API. - items.sort((a, b) => a.key.localeCompare(b.key)); + // Emulate the API: keys are returned in lexical order. 
+ const items = [...itemsByKey.values()].sort((a, b) => a.key.localeCompare(b.key)); let filteredItems = items.filter((item) => !prefix || item.key.startsWith(prefix)); @@ -129,60 +163,61 @@ export class KeyValueStoreClient extends BaseClient implements storage.KeyValueS filteredItems = filteredItems.slice(0, limit); } - this.updateTimestamps(false); - return filteredItems; } /** * Generates a public `file://` URL for accessing a specific record in the key-value store. * - * Returns `undefined` if the record does not exist or has no associated file path (i.e., it is not stored as a file). + * Returns `undefined` if the record does not exist. * @param key The key of the record to generate the public URL for. */ async getPublicUrl(key: string): Promise { s.string().parse(key); - const storageEntry = await this.keyValueEntries.get(key)?.get(); + // The native client builds the URL purely from the path and does not check existence, so we + // guard here to keep the historical `undefined`-for-missing contract. + if (await this.nativeClient.recordExists(key)) { + return this.nativeClient.getPublicUrl(key); + } - return storageEntry?.filePath ? pathToFileURL(storageEntry.filePath).href : undefined; + // Fall back to an untracked value file on disk. + const bareFile = await this.findBareFile(key); + return bareFile ? pathToFileURL(bareFile.filePath).href : undefined; } /** - * Tests whether a record with the given key exists in the key-value store without retrieving its value. + * Tests whether a record with the given key exists without retrieving its value. * * @param key The queried record key. - * @returns `true` if the record exists, `false` if it does not. + * @returns `true` if the record exists, `false` otherwise. 
*/ async recordExists(key: string): Promise { s.string().parse(key); - - return this.keyValueEntries.has(key); + if (await this.nativeClient.recordExists(key)) { + return true; + } + return (await this.findBareFile(key)) !== undefined; } async getValue(key: string): Promise { s.string().parse(key); - const storageEntry = this.keyValueEntries.get(key); + const record = await this.nativeClient.getValue(key); - if (!storageEntry) { - return undefined; + if (record) { + // The native client always returns the raw bytes; parse them back into a JS value according + // to the stored content type (JSON → object, text → string, everything else stays a Buffer). + return { + key: record.key, + value: maybeParseBody(record.value, record.contentType), + contentType: record.contentType, + }; } - const entry = await storageEntry.get(); - - const record: storage.KeyValueStoreRecord = { - key: entry.key, - value: entry.value, - contentType: entry.contentType ?? (mime.contentType(entry.extension) as string), - }; - - // Auto-parse the body (JSON → object, text → string, etc.) - record.value = maybeParseBody(record.value, record.contentType!); - - this.updateTimestamps(false); - - return record; + // Fall back to a value file placed on disk out-of-band (e.g. a hand-written or + // platform-provided `INPUT.json`) that the native client does not track. 
+ return this.readBareFile(key); } async setValue(record: storage.KeyValueStoreRecord): Promise { @@ -195,7 +230,7 @@ export class KeyValueStoreClient extends BaseClient implements storage.KeyValueS s.instance(Buffer), s.instance(ArrayBuffer), s.typedArray(), - // disabling validation will make shapeshift only check the object given is an actual object, not null, nor array + // disabling validation makes shapeshift only check the value is an actual object, not null nor array s.object({}).setValidationEnabled(false), ]), contentType: s.string().lengthGreaterThan(0).optional(), @@ -205,25 +240,26 @@ export class KeyValueStoreClient extends BaseClient implements storage.KeyValueS let { value, contentType } = record; const valueIsStream = isStream(value); - const isValueStreamOrBuffer = valueIsStream || isBuffer(value); - // To allow saving Objects to JSON without providing content type + + // To allow saving objects to JSON without providing a content type. if (!contentType) { if (isValueStreamOrBuffer) contentType = 'application/octet-stream'; else if (typeof value === 'string') contentType = 'text/plain; charset=utf-8'; else contentType = 'application/json; charset=utf-8'; } - const extension = mime.extension(contentType) || DEFAULT_LOCAL_FILE_EXTENSION; - + const extension = mime.extension(contentType) || 'bin'; const isContentTypeJson = extension === 'json'; if (isContentTypeJson && !isValueStreamOrBuffer && typeof value !== 'string') { try { value = JSON.stringify(value, null, 2); - } catch (err: any) { - const msg = `The record value cannot be stringified to JSON. Please provide other content type.\nCause: ${err.message}`; - throw new Error(msg); + } catch (error: any) { + throw new Error( + `The record value cannot be stringified to JSON. 
Please provide other content type.\n` + + `Cause: ${error.message}`, + ); } } @@ -235,67 +271,117 @@ export class KeyValueStoreClient extends BaseClient implements storage.KeyValueS value = Buffer.concat(chunks); } - const _record = { - extension, - key, - value, - contentType, - } satisfies InternalKeyRecord; - - const entry = createKeyValueStorageImplementation({ - storeDirectory: this.keyValueStoreDirectory, - writeMetadata: this.client.writeMetadata, - logger: this.client.logger, - }); - - await entry.update(_record); + // Normalize whatever is left into a Buffer for the native client. + const buffer = Buffer.isBuffer(value) + ? value + : value instanceof ArrayBuffer + ? Buffer.from(value) + : ArrayBuffer.isView(value) + ? Buffer.from(value.buffer, value.byteOffset, value.byteLength) + : Buffer.from(String(value)); - this.keyValueEntries.set(key, entry); - - this.updateTimestamps(true); + await this.nativeClient.setValue(key, buffer, contentType); } async deleteValue(key: string): Promise { s.string().parse(key); + await this.nativeClient.deleteValue(key); + } + + /** + * Read a value file that exists on disk but is not tracked by the native client (it has no + * metadata sidecar) — e.g. an `INPUT.json` placed by the user or the Apify platform. Returns the + * parsed record, or `undefined` if no such file exists or its content cannot be parsed. 
+     */
+    private async readBareFile(key: string): Promise<storage.KeyValueStoreRecord | undefined> {
+        const bareFile = await this.findBareFile(key);
+        if (!bareFile) {
+            return undefined;
+        }
 
-        const entry = this.keyValueEntries.get(key);
+        // `bareFile.contentType` is always a resolvable MIME type (it defaults to `text/plain`), so
+        // testing it with `mime.extension()` can never detect the fallback. Inspect the extension of
+        // the file name itself instead: `mime.lookup()` returns `false` when it is missing or unknown.
+        if (!mime.lookup(bareFile.filePath)) {
+            this.logger?.warning?.(
+                `Key-value store record "${key}" was loaded from a file without a known extension; ` +
+                    `assuming "${bareFile.contentType}".`,
+            );
+        }
 
-        if (entry) {
-            this.keyValueEntries.delete(key);
-            this.updateTimestamps(true);
-            await entry.delete();
+        // Best effort: a file that cannot be read (e.g. it vanished between the directory listing
+        // and this read) is reported as a missing record.
+        let buffer: Buffer;
+        try {
+            buffer = await readFile(bareFile.filePath);
+        } catch {
+            return undefined;
+        }
+
+        // Parse the body according to the inferred content type (JSON → object, text → string, ...).
+        // An unparseable value (e.g. malformed JSON) is treated as a missing record, matching the
+        // historical fallback behavior.
+        let value: unknown;
+        try {
+            value = maybeParseBody(buffer, bareFile.contentType);
+        } catch {
+            this.logger?.warning?.(`Failed to parse key-value store record "${key}" read from disk; ignoring it.`);
+            return undefined;
         }
-    }
 
-    toKeyValueStoreInfo(): storage.KeyValueStoreInfo {
         return {
-            id: this.id,
-            name: this.name,
-            accessedAt: this.accessedAt,
-            createdAt: this.createdAt,
-            modifiedAt: this.modifiedAt,
-            userId: '1',
+            key,
+            value,
+            contentType: bareFile.contentType,
         };
     }
 
-    private updateTimestamps(hasBeenModified: boolean) {
-        this.accessedAt = new Date();
+    /**
+     * Find an untracked value file on disk matching `key`, if any.
+     *
+     * @param key The (decoded) record key to look for.
+     * @returns The first bare file whose decoded key equals `key`, or `undefined` when none matches.
+     */
+    private async findBareFile(key: string): Promise<BareFile | undefined> {
+        // Compare the decoded keys directly. Round-tripping both sides through `encodeURIComponent`
+        // is equivalent for well-formed strings but throws a `URIError` on lone surrogates.
+        for (const bareFile of await this.listBareFiles()) {
+            if (bareFile.key === key) {
+                return bareFile;
+            }
+        }
+        return undefined;
+    }
+
+    /**
+     * List value files present in the store directory that the native client does not track.
Each
+     * such file is reported once, keyed by its (URL-decoded) name with the extension stripped, with a
+     * content type inferred from its extension. A name that is not valid percent-encoding (e.g. a
+     * hand-placed `100%.txt`) is used verbatim as the key instead of aborting the whole listing.
+     *
+     * @returns The untracked value files, or an empty array when the store directory cannot be read.
+     */
+    private async listBareFiles(): Promise<BareFile[]> {
+        let entries: string[];
+        try {
+            entries = await readdir(this.keyValueStoreDirectory);
+        } catch {
+            // The store directory may not exist yet.
+            return [];
+        }
+
+        // A Set keeps the sidecar check O(1) per entry instead of rescanning the whole listing.
+        const entryNames = new Set(entries);
+        const result: BareFile[] = [];
+
+        for (const entry of entries) {
+            // Skip the store metadata file and per-record metadata sidecars.
+            if (entry === STORE_METADATA_FILENAME || entry.endsWith(RECORD_METADATA_SUFFIX)) {
+                continue;
+            }
+
+            // A native-tracked record has a `.__metadata__.json` sidecar; if one exists for this
+            // entry, the native client already owns it and it is not a "bare" file.
+            if (entryNames.has(`${entry}${RECORD_METADATA_SUFFIX}`)) {
+                continue;
+            }
 
-        if (hasBeenModified) {
-            this.modifiedAt = new Date();
+            const dotIndex = entry.lastIndexOf('.');
+            const extension = dotIndex > 0 ? entry.slice(dotIndex + 1) : '';
+            const encodedName = dotIndex > 0 ? entry.slice(0, dotIndex) : entry;
+            const contentType = (extension && (mime.contentType(extension) || undefined)) || 'text/plain';
+
+            // `decodeURIComponent` throws a `URIError` on a malformed escape sequence. One oddly
+            // named file must not make every key lookup in the store fail, so fall back to the raw
+            // name in that case.
+            let decodedName: string;
+            try {
+                decodedName = decodeURIComponent(encodedName);
+            } catch {
+                decodedName = encodedName;
+            }
+
+            result.push({
+                key: decodedName,
+                filePath: resolve(this.keyValueStoreDirectory, entry),
+                contentType,
+            });
         }
 
-        const data = this.toKeyValueStoreInfo();
-        scheduleBackgroundTask(
-            {
-                action: 'update-metadata',
-                data,
-                entityType: 'keyValueStores',
-                entityDirectory: this.keyValueStoreDirectory,
-                id: this.name ??
this.id, - writeMetadata: this.client.writeMetadata, - }, - this.client.logger, - ); + return result; } } diff --git a/packages/fs-storage/src/resource-clients/request-queue.ts b/packages/fs-storage/src/resource-clients/request-queue.ts index ae3b70766386..6075cc2169d2 100644 --- a/packages/fs-storage/src/resource-clients/request-queue.ts +++ b/packages/fs-storage/src/resource-clients/request-queue.ts @@ -1,17 +1,10 @@ -import { randomUUID } from 'node:crypto'; -import { rm } from 'node:fs/promises'; -import { resolve } from 'node:path'; - import type * as storage from '@crawlee/types'; -import { AsyncQueue } from '@sapphire/async-queue'; import { s } from '@sapphire/shapeshift'; -import type { RequestQueueFileSystemEntry } from '../fs/request-queue/fs.js'; -import { scheduleBackgroundTask } from '../background-handler/index.js'; -import { createRequestQueueStorageImplementation } from '../fs/request-queue/index.js'; -import type { FileSystemStorageClient } from '../index.js'; -import { purgeNullsFromObject, uniqueKeyToRequestId } from '../utils.js'; -import { BaseClient } from './common/base-client.js'; +import type { + FileSystemRequestQueueClient as NativeFileSystemRequestQueueClient, + ProcessedRequest as NativeProcessedRequest, +} from '@crawlee/fs-storage-native'; const requestShape = s .object({ @@ -25,7 +18,6 @@ const requestShape = s .passthrough(); const requestShapeWithoutId = requestShape.omit(['id']); - const batchRequestShapeWithoutId = requestShapeWithoutId.array(); const requestOptionsShape = s.object({ @@ -33,294 +25,105 @@ const requestOptionsShape = s.object({ }); export interface RequestQueueClientOptions { + /** The user-facing storage name, or `undefined` for unnamed (alias / default) storages. */ name?: string; - id?: string; /** - * The directory name to use on disk. When provided, takes precedence over `name` and `id` - * for the directory path. 
This allows alias-opened storages to have a directory name - * that differs from their metadata `name` (which is `undefined` for unnamed storages). + * The key used for cache lookup in {@link FileSystemStorageClient}. For named storages this equals + * the name; for alias (unnamed) storages it is the alias string. Falls back to the storage id. */ - directoryName?: string; - baseStorageDirectory: string; - client: FileSystemStorageClient; -} - -export interface InternalRequest { - id: string; - orderNo: number | null; - url: string; - uniqueKey: string; - method: Exclude; - retryCount: number; - json: string; + cacheKey: string; + nativeClient: NativeFileSystemRequestQueueClient; } /** - * Default time (in seconds) for which a request fetched via {@link RequestQueueClient.fetchNextRequest} - * stays locked (in progress) before it becomes available again. Aligns with the historical request queue - * locking default. A consumer (e.g. a crawler) can raise this per queue via - * {@link RequestQueueClient.setExpectedRequestProcessingTimeSecs}. + * The native client tags each request it returns with an internal `orderNo` (its lock / ordering + * timestamp). Strip it before handing the request back so consumers see a plain `RequestOptions` as + * promised by the `@crawlee/types` contract, rather than a native implementation detail. Returns a + * fresh object — the native client's value is left untouched. */ -const DEFAULT_REQUEST_LOCK_SECS = 3 * 60; - -/** - * A request is "locked" (in progress) when its `orderNo` is pushed beyond the current time. The sign of - * `orderNo` is preserved so the original forefront / normal ordering is restored once the lock expires - * or the request is reclaimed. 
- */ -function isRequestLocked(orderNo: number | null, now: number): boolean { - return orderNo !== null && Math.abs(orderNo) > now; +function stripNativeInternals(request: Record | null): storage.RequestOptions | undefined { + if (!request) return undefined; + const { orderNo, ...rest } = request; + return rest as storage.RequestOptions; } -export class RequestQueueClient extends BaseClient implements storage.RequestQueueClient { - name?: string; - /** - * The key used for directory naming and cache lookup. For named storages, this equals - * the name. For alias (unnamed) storages, this is the alias string. Falls back to id. - */ - directoryName: string; - createdAt = new Date(); - accessedAt = new Date(); - modifiedAt = new Date(); - handledRequestCount = 0; - pendingRequestCount = 0; - requestQueueDirectory: string; - /** - * Serializes every operation that reads-then-writes this client's shared queue state — the - * `requests` map, the `forefrontRequestIds` array, the `inProgressRequestIds` set and the request - * counts. Those mutations span `await` points (disk/storage I/O), so without this lock a concurrent - * operation could interleave and corrupt them (e.g. a head scan pruning `forefrontRequestIds` while - * `addBatchOfRequests` pushes to it). Held by every mutating method as well as by `isEmpty`/ - * `isFinished`, whose head scan also prunes `forefrontRequestIds`. - */ - private readonly queueStateMutex = new AsyncQueue(); - private forefrontRequestIds: string[] = []; +function toQueueOperationInfo(processed: NativeProcessedRequest | null): storage.QueueOperationInfo | null { + if (!processed) return null; + return { + requestId: processed.requestId, + wasAlreadyHandled: processed.wasAlreadyHandled, + wasAlreadyPresent: processed.wasAlreadyPresent, + }; +} - /** - * IDs of requests this client has locked (fetched but not yet handled or reclaimed). 
Used by - * {@link releaseOwnLocks} to free our own in-progress requests on process termination, so that - * a crashed/migrated consumer does not block its requests for the full lock duration. - */ - private readonly inProgressRequestIds = new Set(); +/** + * A file-system request queue client backed by the native `@crawlee/fs-storage-native` Rust + * extension. + * + * Request ordering, in-progress locking and state persistence are all owned by the native client. + * This adapter forwards each operation, converts result shapes to the `@crawlee/types` interfaces, + * and strips the internal bookkeeping fields the native client adds to returned requests. + */ +export class RequestQueueClient implements storage.RequestQueueClient { + readonly name?: string; + readonly cacheKey: string; - private readonly requests = new Map(); - private readonly client: FileSystemStorageClient; - - /** - * How long (in seconds) a request fetched from this client stays locked (in progress). Defaults to - * {@link DEFAULT_REQUEST_LOCK_SECS} and is overridable via {@link setExpectedRequestProcessingTimeSecs}. - */ - private lockSecs = DEFAULT_REQUEST_LOCK_SECS; + private readonly nativeClient: NativeFileSystemRequestQueueClient; + private _cachedId!: string; constructor(options: RequestQueueClientOptions) { - super(options.id ?? randomUUID()); this.name = options.name; - this.directoryName = options.directoryName ?? this.name ?? this.id; - this.requestQueueDirectory = resolve(options.baseStorageDirectory, this.directoryName); - this.client = options.client; + this.cacheKey = options.cacheKey; + this.nativeClient = options.nativeClient; } - /** - * Applies how long {@link fetchNextRequest} locks a request before it becomes available again. The - * caller (the `RequestQueue` frontend) owns the policy of what this value should be — this method - * just applies it. 
- */ - setExpectedRequestProcessingTimeSecs(secs: number): void { - this.lockSecs = secs; + /** The storage id assigned by the native client. */ + get id(): string { + return this._cachedId; } - async getMetadata(): Promise { - this.updateTimestamps(false); - return this.toRequestQueueInfo(); - } - - async drop(): Promise { - // Serialize against other mutators (and the head scans in `isEmpty`/`isFinished`) so a concurrent - // operation cannot observe half-cleared state — e.g. a forefront id whose request has already been - // removed, which `listPendingHead` would then dereference as `undefined`. - await this.queueStateMutex.wait(); - - try { - const storeIndex = this.client.requestQueueCache.findIndex((queue) => queue.id === this.id); - - if (storeIndex !== -1) { - const [oldClient] = this.client.requestQueueCache.splice(storeIndex, 1); - oldClient.pendingRequestCount = 0; - // Clear all in-memory state, consistent with `purge`. Clearing `requests` alone would - // leave dangling ids in `forefrontRequestIds`/`inProgressRequestIds`, which a later head - // scan would resolve to a missing request and dereference. - oldClient.requests.clear(); - oldClient.forefrontRequestIds = []; - oldClient.inProgressRequestIds.clear(); - - await rm(oldClient.requestQueueDirectory, { recursive: true, force: true }); - } - } finally { - this.queueStateMutex.shift(); - } - } - - async purge(): Promise { - // Serialize against other mutators (and the head scans in `isEmpty`/`isFinished`) so a concurrent - // operation cannot observe or repopulate half-cleared state across the `await` below. 
- await this.queueStateMutex.wait(); - - try { - // Clear all in-memory state - this.requests.clear(); - this.forefrontRequestIds = []; - this.inProgressRequestIds.clear(); - this.handledRequestCount = 0; - this.pendingRequestCount = 0; - - // Reset the lock duration back to the default so a value raised via - // `setExpectedRequestProcessingTimeSecs` in an earlier run does not leak into a later one - this.lockSecs = DEFAULT_REQUEST_LOCK_SECS; - - // Remove request files from disk but keep the directory - const { readdir } = await import('node:fs/promises'); - const entries = await readdir(this.requestQueueDirectory).catch(() => []); - for (const entry of entries) { - if (entry !== '__metadata__.json') { - await rm(resolve(this.requestQueueDirectory, entry), { force: true }); - } - } - - this.updateTimestamps(true); - } finally { - this.queueStateMutex.shift(); - } + get requestQueueDirectory(): string { + return this.nativeClient.pathToRq; } - private *requestKeyIterator(): IterableIterator { - for (let i = this.forefrontRequestIds.length - 1; i >= 0; i--) { - yield this.forefrontRequestIds[i]; - } - - for (const key of this.requests.keys()) { - yield key; - } + static async create(options: RequestQueueClientOptions): Promise { + const client = new RequestQueueClient(options); + client._cachedId = (await options.nativeClient.getMetadata()).id; + return client; } /** - * Scans the queue and returns the pending head — requests that are neither handled nor currently - * locked (in progress) — ordered by `orderNo`, deduplicated. - * - * When `detectLockedRequests` is set, the result also carries a `hasLockedRequests` flag telling - * whether any unhandled-but-locked request was skipped along the way. This mirrors the Apify - * platform shared client's `queueHasLockedRequests` signal: it lets {@link isFinished} distinguish - * "no work left at all" from "work remains, but it is currently locked by some consumer (possibly - * another process)". 
Without it, a consumer would consider the queue finished and let the crawler - * shut down while another consumer still holds the last requests. - * - * Computing the flag is expensive: because a lock may sit anywhere in the queue, it forces a scan - * of every pending entry even when only `limit` items are wanted. Callers that only need the head - * (e.g. {@link fetchNextRequest}, {@link isEmpty}) leave it off so the scan can stop as soon as the - * page is filled, keeping those calls O(head) instead of O(N). - * - * Lock state lives in the persisted `orderNo` (see {@link isRequestLocked}), so that processes - * sharing the same on-disk queue observe each other's locks. We therefore re-read entries from - * storage to obtain fresh lock state, except for entries we can cheaply rule out as permanently - * handled via their cached `orderNo === null`. + * Tells the native client how long (in seconds) a fetched request stays locked before it becomes + * available again. The `@crawlee/types` interface declares this as synchronous (fire-and-forget), + * while the native call is asynchronous; we kick it off and let it settle in the background. */ - private async listPendingHead( - limit: number, - detectLockedRequests = false, - ): Promise<{ items: InternalRequest[]; hasLockedRequests?: boolean }> { - const now = Date.now(); - const items: InternalRequest[] = []; - let hasLockedRequests = false; - - // Tracks processed request IDs to avoid duplicates (request in both `forefrontRequestIds` and `requests`). - const seenRequestIds = new Set(); - // Tracks handled request IDs from `forefrontRequestIds` to be removed. - const handledForefrontIds = new Set(); - - for (const requestId of this.requestKeyIterator()) { - // Once the requested page is filled we can stop — unless the caller asked us to detect locked - // requests and we have not yet seen one, in which case we must keep scanning to find them. 
-            if (items.length >= limit && (!detectLockedRequests || hasLockedRequests)) {
-                break;
-            }
-
-            if (seenRequestIds.has(requestId)) {
-                continue;
-            }
-
-            seenRequestIds.add(requestId);
-
-            const storageEntry = this.requests.get(requestId)!;
-
-            // Cheap rejection of permanently-handled requests using the cached `orderNo` (handled is a
-            // terminal state, so the cached value can be trusted without re-reading from storage).
-            if (storageEntry.orderNo === null) {
-                if (this.forefrontRequestIds.includes(requestId)) {
-                    handledForefrontIds.add(requestId);
-                }
-                continue;
-            }
-
-            // Re-read from storage to get fresh lock state — another process may have locked (or handled)
-            // this request since we last cached it.
-            const request = await storageEntry.get(true);
-
-            // Handled in the meantime.
-            if (request.orderNo === null) {
-                if (this.forefrontRequestIds.includes(requestId)) {
-                    handledForefrontIds.add(requestId);
-                }
-                continue;
-            }
-
-            // Locked (in progress) by us or another process — skip until the lock expires, but remember
-            // that the queue is not truly empty.
-            if (isRequestLocked(request.orderNo, now)) {
-                hasLockedRequests = true;
-                continue;
-            }
-
-            if (items.length < limit) {
-                items.push(request);
-            }
-        }
-
-        this.forefrontRequestIds = this.forefrontRequestIds.filter((id) => !handledForefrontIds.has(id));
+    setExpectedRequestProcessingTimeSecs(secs: number): void {
+        // Fire-and-forget, but never leave the promise without a rejection handler: an unhandled
+        // rejection terminates the process under Node.js' default `--unhandled-rejections=throw`.
+        // A failure here only means the lock duration was not applied, so surface it as a warning.
+        this.nativeClient.setExpectedRequestProcessingTime(secs).catch((error: unknown) => {
+            process.emitWarning(
+                `Failed to set the expected request processing time (${secs}s) on the native request queue client: ${String(error)}`,
+            );
+        });
+    }
 
+    async getMetadata(): Promise<storage.RequestQueueInfo> {
+        const metadata = await this.nativeClient.getMetadata();
         return {
-            items: items.sort((a, b) => a.orderNo! - b.orderNo!),
-            hasLockedRequests: detectLockedRequests ? hasLockedRequests : undefined,
+            id: metadata.id,
+            name: metadata.name ??
undefined, + accessedAt: new Date(metadata.accessedAt), + createdAt: new Date(metadata.createdAt), + modifiedAt: new Date(metadata.modifiedAt), + hadMultipleClients: metadata.hadMultipleClients, + handledRequestCount: metadata.handledRequestCount, + pendingRequestCount: metadata.pendingRequestCount, + totalRequestCount: metadata.totalRequestCount, + stats: {}, + userId: '1', }; } - async fetchNextRequest(): Promise { - this.updateTimestamps(false); - - await this.queueStateMutex.wait(); - - try { - const { - items: [head], - } = await this.listPendingHead(1); - - if (!head) { - return null; - } - - // Lock the request by pushing its `orderNo` beyond the lock expiry, preserving the sign so - // its original (forefront / normal) position is restored once the lock expires. The lock is - // persisted so other processes sharing this queue will not fetch the same request. - const lockExpiresAt = Date.now() + this.lockSecs * 1000; - head.orderNo = lockExpiresAt * (head.orderNo! > 0 ? 1 : -1); - await this.requests.get(head.id)!.update(head); - - // Remember that this client owns the lock, so we can release it on process termination - // (see `releaseOwnLocks`) instead of leaving the request stuck until the lock expires. - this.inProgressRequestIds.add(head.id); + async drop(): Promise { + await this.nativeClient.dropStorage(); + } - return this._jsonToRequest(head.json) ?? null; - } finally { - this.queueStateMutex.shift(); - } + async purge(): Promise { + await this.nativeClient.purge(); } async addBatchOfRequests( @@ -330,134 +133,40 @@ export class RequestQueueClient extends BaseClient implements storage.RequestQue batchRequestShapeWithoutId.parse(requests); requestOptionsShape.parse(options); - // Serialize against other mutators (and the head scans in `isEmpty`/`isFinished`) so that the - // shared `requests` map, `forefrontRequestIds` array and request counts are not corrupted by a - // concurrent operation interleaving at one of the `await` points below. 
- await this.queueStateMutex.wait(); - - try { - const result: storage.BatchAddRequestsResult = { - processedRequests: [], - unprocessedRequests: [], - }; - - for (const model of requests) { - const requestModel = this._createInternalRequest(model, options.forefront); - - const existingRequestWithIdEntry = this.requests.get(requestModel.id); - - if (existingRequestWithIdEntry) { - const existingRequestWithId = await existingRequestWithIdEntry.get(); - - result.processedRequests.push({ - requestId: existingRequestWithId.id, - uniqueKey: existingRequestWithId.uniqueKey, - wasAlreadyHandled: existingRequestWithId.orderNo === null, - wasAlreadyPresent: true, - }); - - continue; - } - - const newEntry = createRequestQueueStorageImplementation({ - requestId: requestModel.id, - storeDirectory: this.requestQueueDirectory, - }); - - await newEntry.update(requestModel); - - this.requests.set(requestModel.id, newEntry); - - if (requestModel.orderNo) { - this.pendingRequestCount += 1; - } else { - this.handledRequestCount += 1; - } - - if (options.forefront) { - this.forefrontRequestIds.push(requestModel.id); - } - - result.processedRequests.push({ - requestId: requestModel.id, - uniqueKey: requestModel.uniqueKey, - // We return wasAlreadyHandled: false even though the request may - // have been added as handled, because that's how API behaves. - wasAlreadyHandled: false, - wasAlreadyPresent: false, - }); - } - - this.updateTimestamps(true); + const response = await this.nativeClient.addBatchOfRequests( + requests as unknown as Record[], + options.forefront ?? 
false, + ); - return result; - } finally { - this.queueStateMutex.shift(); - } + return { + processedRequests: response.processedRequests.map((processed) => ({ + requestId: processed.requestId, + uniqueKey: processed.uniqueKey, + wasAlreadyHandled: processed.wasAlreadyHandled, + wasAlreadyPresent: processed.wasAlreadyPresent, + })), + unprocessedRequests: response.unprocessedRequests.map((unprocessed) => ({ + uniqueKey: unprocessed.uniqueKey, + url: unprocessed.url, + method: unprocessed.method ?? undefined, + })) as storage.BatchAddRequestsResult['unprocessedRequests'], + }; } async getRequest(uniqueKey: string): Promise { s.string().parse(uniqueKey); - this.updateTimestamps(false); - const id = uniqueKeyToRequestId(uniqueKey); - const json = (await this.requests.get(id)?.get())?.json; - return this._jsonToRequest(json); + return stripNativeInternals(await this.nativeClient.getRequest(uniqueKey)); + } + + async fetchNextRequest(): Promise { + return stripNativeInternals(await this.nativeClient.fetchNextRequest()) ?? null; } async markRequestAsHandled(request: storage.UpdateRequestSchema): Promise { requestShape.parse(request); - this.updateTimestamps(false); - - // Serialize against other mutators (and the head scans in `isEmpty`/`isFinished`) so the shared - // `requests` map, `inProgressRequestIds` set and request counts stay consistent across the - // `await` points below. - await this.queueStateMutex.wait(); - - try { - const id = uniqueKeyToRequestId(request.uniqueKey); - - const existingEntry = this.requests.get(id); - const existingRequest = await existingEntry?.get(); - - // The request must exist to be marked as handled. We intentionally do NOT require it to still - // be locked: a consumer whose processing outlived the lock (slow handler, GC/event-loop pause, - // a low `setExpectedRequestProcessingTimeSecs`) must still be able to mark its request handled, - // otherwise the request would be handed out again forever and the queue would never finish. 
- if (!existingRequest) { - return null; - } - - // A handled request has `orderNo === null`. Marking it again is an idempotent no-op. - const wasAlreadyHandled = existingRequest.orderNo === null; - - const handledAt = request.handledAt ?? new Date().toISOString(); - const requestModel = this._createInternalRequest({ ...request, handledAt }, false); - - const newEntry = createRequestQueueStorageImplementation({ - requestId: id, - storeDirectory: this.requestQueueDirectory, - }); - await newEntry.update(requestModel); - this.requests.set(id, newEntry); - - // The request is no longer in progress for this client. - this.inProgressRequestIds.delete(id); - - if (!wasAlreadyHandled) { - this.pendingRequestCount -= 1; - this.handledRequestCount += 1; - } - - this.updateTimestamps(true); - - return { - requestId: id, - wasAlreadyHandled, - wasAlreadyPresent: true, - }; - } finally { - this.queueStateMutex.shift(); - } + return toQueueOperationInfo( + await this.nativeClient.markRequestAsHandled(request as unknown as Record), + ); } async reclaimRequest( @@ -466,217 +175,28 @@ export class RequestQueueClient extends BaseClient implements storage.RequestQue ): Promise { requestShape.parse(request); requestOptionsShape.parse(options); - this.updateTimestamps(false); - - // Serialize against other mutators (and the head scans in `isEmpty`/`isFinished`) so the shared - // `requests` map, `forefrontRequestIds` array and `inProgressRequestIds` set stay consistent - // across the `await` points below. - await this.queueStateMutex.wait(); - - try { - const id = uniqueKeyToRequestId(request.uniqueKey); - - const existingEntry = this.requests.get(id); - const existingRequest = await existingEntry?.get(); - - // The request must exist and not already be handled to be reclaimed. As with - // `markRequestAsHandled`, we do NOT require it to still be locked — a consumer that failed - // after its lock expired must still be able to return the request to the queue (e.g. 
to honor - // a `forefront` reorder), rather than have the reclaim silently dropped. - if (!existingRequest || existingRequest.orderNo === null) { - return null; - } - - // Reclaiming resets the `orderNo` to a fresh timestamp, releasing the lock and restoring the - // request to the queue (at the front if `forefront`). - const requestModel = this._createInternalRequest(request, options.forefront); - - const newEntry = createRequestQueueStorageImplementation({ - requestId: id, - storeDirectory: this.requestQueueDirectory, - }); - await newEntry.update(requestModel); - this.requests.set(id, newEntry); - - // The request is no longer in progress for this client. - this.inProgressRequestIds.delete(id); - - if (options.forefront) { - this.forefrontRequestIds.push(id); - } - - this.updateTimestamps(true); - - return { - requestId: id, - wasAlreadyHandled: false, - wasAlreadyPresent: true, - }; - } finally { - this.queueStateMutex.shift(); - } + return toQueueOperationInfo( + await this.nativeClient.reclaimRequest( + request as unknown as Record, + options.forefront ?? false, + ), + ); } async isEmpty(): Promise { - this.updateTimestamps(false); - - // "Empty" means there is nothing left to fetch right now — i.e. the next `fetchNextRequest` - // would return `null`. Requests that are currently locked (in progress) are intentionally NOT - // counted here: they are not fetchable, so the queue is empty from a consumer's point of view. - // Whether those in-progress requests mean crawling is not yet done is a separate question, - // answered by `isFinished`. - // - // `listPendingHead` prunes `forefrontRequestIds` as it scans, so we must hold the queue-state mutex to avoid - // racing a concurrent mutator (e.g. `addBatchOfRequests`) at its `await` points. 
- await this.queueStateMutex.wait(); - - try { - const { items } = await this.listPendingHead(1); - return items.length === 0; - } finally { - this.queueStateMutex.shift(); - } + return this.nativeClient.isEmpty(); } async isFinished(): Promise { - this.updateTimestamps(false); - - // The queue is finished only when there is nothing left to fetch AND nothing currently locked - // (in progress) by any consumer. Counting locked requests is what allows a crawler to keep - // waiting while another consumer (possibly another process sharing this on-disk queue) still - // holds the last requests, instead of finishing prematurely. This mirrors the Apify platform - // shared client's `queueHasLockedRequests` signal. - // - // Detecting locked requests requires a full scan, hence the `detectLockedRequests` flag — unlike - // `fetchNextRequest`/`isEmpty`, which only need the head and can stop early. - // - // `listPendingHead` prunes `forefrontRequestIds` as it scans, so we must hold the queue-state mutex to avoid - // racing a concurrent mutator (e.g. `addBatchOfRequests`) at its `await` points. - await this.queueStateMutex.wait(); - - try { - const { items, hasLockedRequests } = await this.listPendingHead(1, true); - return items.length === 0 && !hasLockedRequests; - } finally { - this.queueStateMutex.shift(); - } + return this.nativeClient.isFinished(); } /** - * Release the locks of all requests this client currently has in progress, returning them to the - * queue so they can be fetched again immediately. - * - * On the Apify platform, a run's locks are released automatically when it migrates or aborts. This - * client, however, persists fake locks via `orderNo`, so it needs to clean up after itself. - * `FileSystemStorageClient.teardown()` calls this for every cached queue at the end of the process so that - * a fetched-but-unhandled request is not stuck (waiting for its lock to expire) for the next consumer - * of the same on-disk queue. 
+ * Persist the native client's in-memory state to disk. Called by + * {@link FileSystemStorageClient.teardown} so that fetched-but-unhandled requests are not stuck + * for the next consumer of the same on-disk queue. */ - async releaseOwnLocks(): Promise { - if (this.inProgressRequestIds.size === 0) { - return; - } - - await this.queueStateMutex.wait(); - - try { - const now = Date.now(); - - for (const id of this.inProgressRequestIds) { - const entry = this.requests.get(id); - const request = await entry?.get(true); - - // Skip requests that were handled or whose lock already expired/changed — we only undo - // locks we still hold. - if (!request || !isRequestLocked(request.orderNo, now)) { - continue; - } - - // Reset the lock to a fresh timestamp, preserving the sign so the request keeps its - // original (forefront / normal) ordering once unlocked. - request.orderNo = now * (request.orderNo! > 0 ? 1 : -1); - await entry!.update(request); - } - - this.inProgressRequestIds.clear(); - this.updateTimestamps(true); - } finally { - this.queueStateMutex.shift(); - } - } - - toRequestQueueInfo(): storage.RequestQueueInfo { - return { - accessedAt: this.accessedAt, - createdAt: this.createdAt, - hadMultipleClients: false, - handledRequestCount: this.handledRequestCount, - id: this.id, - modifiedAt: this.modifiedAt, - name: this.name, - pendingRequestCount: this.pendingRequestCount, - stats: {}, - totalRequestCount: this.requests.size, - userId: '1', - }; - } - - private updateTimestamps(hasBeenModified: boolean) { - this.accessedAt = new Date(); - - if (hasBeenModified) { - this.modifiedAt = new Date(); - } - - const data = { - ...this.toRequestQueueInfo(), - forefrontRequestIds: this.forefrontRequestIds, - }; - - scheduleBackgroundTask( - { - action: 'update-metadata', - data, - entityType: 'requestQueues', - entityDirectory: this.requestQueueDirectory, - id: this.name ?? 
this.id, - writeMetadata: this.client.writeMetadata, - }, - this.client.logger, - ); - } - - private _jsonToRequest(requestJson?: string): T | undefined { - if (!requestJson) return undefined; - const request = JSON.parse(requestJson); - return purgeNullsFromObject(request); - } - - private _createInternalRequest(request: storage.RequestSchema, forefront?: boolean): InternalRequest { - const orderNo = this._calculateOrderNo(request, forefront); - const id = uniqueKeyToRequestId(request.uniqueKey); - - if (request.id && request.id !== id) { - throw new Error('Request ID does not match its uniqueKey.'); - } - - const json = JSON.stringify({ ...request, id }); - return { - id, - json, - method: request.method, - orderNo, - retryCount: request.retryCount ?? 0, - uniqueKey: request.uniqueKey, - url: request.url, - }; - } - - private _calculateOrderNo(request: storage.RequestSchema, forefront?: boolean) { - if (request.handledAt) return null; - - const timestamp = Date.now(); - - return forefront ? -timestamp : timestamp; + async persistState(): Promise { + await this.nativeClient.persistState(); } } diff --git a/packages/fs-storage/src/utils.ts b/packages/fs-storage/src/utils.ts index 34a823e293b7..28c6959b326e 100644 --- a/packages/fs-storage/src/utils.ts +++ b/packages/fs-storage/src/utils.ts @@ -1,36 +1,5 @@ -import { createHash } from 'node:crypto'; - -import type * as storage from '@crawlee/types'; import { s } from '@sapphire/shapeshift'; -import { REQUEST_ID_LENGTH } from './consts.js'; - -/** - * Removes all properties with a null value - * from the provided object. - */ -export function purgeNullsFromObject(object: T): T { - if (object && typeof object === 'object' && !Array.isArray(object)) { - for (const [key, value] of Object.entries(object)) { - if (value === null) Reflect.deleteProperty(object as Record, key); - } - } - - return object; -} - -/** - * Creates a standard request ID (same as Platform). 
- */ -export function uniqueKeyToRequestId(uniqueKey: string): string { - const str = createHash('sha256') - .update(uniqueKey) - .digest('base64') - .replace(/(\+|\/|=)/g, ''); - - return str.length > REQUEST_ID_LENGTH ? str.slice(0, REQUEST_ID_LENGTH) : str; -} - export function isBuffer(value: unknown): boolean { try { s.union([s.instance(Buffer), s.instance(ArrayBuffer), s.typedArray()]).parse(value); @@ -48,21 +17,3 @@ export function isStream(value: any): boolean { ['on', 'pipe'].every((key) => key in value && typeof value[key] === 'function') ); } - -export type BackgroundHandlerReceivedMessage = BackgroundHandlerUpdateMetadataMessage; - -export type BackgroundHandlerUpdateMetadataMessage = - | MetadataUpdate<'datasets', storage.DatasetInfo> - | MetadataUpdate<'keyValueStores', storage.KeyValueStoreInfo> - | MetadataUpdate<'requestQueues', storage.RequestQueueInfo>; - -type EntityType = 'datasets' | 'keyValueStores' | 'requestQueues'; - -interface MetadataUpdate { - entityType: Type; - id: string; - action: 'update-metadata'; - entityDirectory: string; - data: DataType; - writeMetadata: boolean; -} diff --git a/packages/fs-storage/test/__shared__.ts b/packages/fs-storage/test/__shared__.ts deleted file mode 100644 index 5a2769b8707f..000000000000 --- a/packages/fs-storage/test/__shared__.ts +++ /dev/null @@ -1,12 +0,0 @@ -import { access } from 'node:fs/promises'; -import { setTimeout } from 'node:timers/promises'; - -export async function waitTillWrittenToDisk(path: string): Promise { - try { - await access(path); - return undefined; - } catch { - await setTimeout(50); - return waitTillWrittenToDisk(path); - } -} diff --git a/packages/fs-storage/test/fs-fallback.test.ts b/packages/fs-storage/test/fs-fallback.test.ts index 908dab207389..704e03fca529 100644 --- a/packages/fs-storage/test/fs-fallback.test.ts +++ b/packages/fs-storage/test/fs-fallback.test.ts @@ -1,11 +1,15 @@ import { randomUUID } from 'node:crypto'; -import { rm, writeFile } from 
'node:fs/promises'; +import { mkdir, rm, writeFile } from 'node:fs/promises'; import { resolve } from 'node:path'; import { FileSystemStorageClient } from '@crawlee/fs-storage'; import type { KeyValueStoreRecord } from '@crawlee/types'; -import { ensureDir } from 'fs-extra/esm'; +// The storage is backed by the native `@crawlee/fs-storage-native` extension, which only serves +// key-value records it has written itself (tracked via per-record metadata sidecars). The +// `KeyValueStoreClient` adapter layers a fallback on top so that value files placed into the store +// directory out-of-band — e.g. a hand-written or platform-provided `INPUT.json` — are still readable. +// These tests pin both the store-identity metadata fallback and that bare-file fallback. describe('fallback to fs for reading', () => { const tmpLocation = resolve(import.meta.dirname, './tmp/fs-fallback'); const storage = new FileSystemStorageClient({ @@ -15,8 +19,8 @@ describe('fallback to fs for reading', () => { const expectedFsDate = new Date(2022, 0, 1); beforeAll(async () => { - // Create "default" key-value store and give it an entry - await ensureDir(resolve(storage.keyValueStoresDirectory, 'default')); + // "default" store: metadata file + a bare INPUT.json (no per-record metadata sidecar). + await mkdir(resolve(storage.keyValueStoresDirectory, 'default'), { recursive: true }); await writeFile( resolve(storage.keyValueStoresDirectory, 'default/__metadata__.json'), JSON.stringify({ @@ -32,19 +36,22 @@ describe('fallback to fs for reading', () => { JSON.stringify({ foo: 'bar but from fs' }), ); - await ensureDir(resolve(storage.keyValueStoresDirectory, 'other')); + // "other" store: a bare INPUT.json with no store metadata file at all. 
+ await mkdir(resolve(storage.keyValueStoresDirectory, 'other'), { recursive: true }); await writeFile( resolve(storage.keyValueStoresDirectory, 'other/INPUT.json'), JSON.stringify({ foo: 'bar but from fs' }), ); - await ensureDir(resolve(storage.keyValueStoresDirectory, 'no-ext')); + // "no-ext" store: a value file with no extension — loaded as raw text. + await mkdir(resolve(storage.keyValueStoresDirectory, 'no-ext'), { recursive: true }); await writeFile( resolve(storage.keyValueStoresDirectory, 'no-ext/INPUT'), JSON.stringify({ foo: 'bar but from fs' }), ); - await ensureDir(resolve(storage.keyValueStoresDirectory, 'invalid-json')); + // "invalid-json" store: a malformed INPUT.json — ignored. + await mkdir(resolve(storage.keyValueStoresDirectory, 'invalid-json'), { recursive: true }); await writeFile(resolve(storage.keyValueStoresDirectory, 'invalid-json/INPUT.json'), '{'); }); @@ -52,9 +59,7 @@ describe('fallback to fs for reading', () => { await rm(tmpLocation, { force: true, recursive: true }); }); - // POST INIT // - - test('attempting to read "default" key value store with "__metadata__" present should read from fs', async () => { + test('reads store identity from the on-disk metadata, and a bare INPUT.json value', async () => { const defaultStore = await storage.createKeyValueStoreClient({ name: 'default' }); const defaultStoreInfo = await defaultStore.getMetadata(); @@ -69,7 +74,7 @@ describe('fallback to fs for reading', () => { }); }); - test('attempting to read "other" key value store with no "__metadata__" present should read from fs, even if accessed without generating id first', async () => { + test('reads a bare INPUT.json even with no store metadata present', async () => { const otherStore = await storage.createKeyValueStoreClient({ name: 'other' }); const input = await otherStore.getValue('INPUT'); @@ -80,13 +85,13 @@ describe('fallback to fs for reading', () => { }); }); - test('attempting to read "default_2" key value store that has no data on 
disk should still be accessible after creation', async () => { + test('a store with no data on disk is still accessible after creation', async () => { const default2Store = await storage.createKeyValueStoreClient({ name: 'default_2' }); const info = await default2Store.getMetadata(); expect(info.name).toEqual('default_2'); }); - test('attempting to read "no-ext" key value store should load the missing extension file correctly', async () => { + test('loads a value file with no extension as raw text', async () => { const noExtStore = await storage.createKeyValueStoreClient({ name: 'no-ext' }); const input = await noExtStore.getValue('INPUT'); @@ -97,10 +102,23 @@ describe('fallback to fs for reading', () => { }); }); - test('attempting to read "invalid-json" key value store should ignore the invalid "INPUT" json file', async () => { + test('ignores an invalid-JSON bare value file', async () => { const invalidJsonStore = await storage.createKeyValueStoreClient({ name: 'invalid-json' }); const input = await invalidJsonStore.getValue('INPUT'); expect(input).toBeUndefined(); }); + + test('bare files are visible to recordExists, getPublicUrl and listKeys', async () => { + const otherStore = await storage.createKeyValueStoreClient({ name: 'other' }); + + expect(await otherStore.recordExists('INPUT')).toBe(true); + expect(await otherStore.recordExists('does-not-exist')).toBe(false); + + const url = await otherStore.getPublicUrl('INPUT'); + expect(url).toMatch(/^file:\/\/.*INPUT\.json$/); + + const keys = await otherStore.listKeys(); + expect(keys.map((item) => item.key)).toContain('INPUT'); + }); }); diff --git a/packages/fs-storage/test/key-value-store/special-keys.test.ts b/packages/fs-storage/test/key-value-store/special-keys.test.ts new file mode 100644 index 000000000000..3117cc9f3d8e --- /dev/null +++ b/packages/fs-storage/test/key-value-store/special-keys.test.ts @@ -0,0 +1,49 @@ +import { rm } from 'node:fs/promises'; +import { resolve } from 'node:path'; + +import 
{ FileSystemStorageClient } from '@crawlee/fs-storage'; + +// Keys may contain characters that are unsafe in a file name (e.g. `.` or `/`). The adapter must +// round-trip such keys correctly through `setValue` / `getValue` / `listKeys` regardless of how the +// underlying native client encodes them on disk. These tests exercise that via the public API only — +// the concrete on-disk filenames are the native client's concern and are not asserted here. +describe('KeyValueStore handles keys with file-name-unsafe characters', () => { + const tmpLocation = resolve(import.meta.dirname, '../tmp/special-keys'); + + afterAll(async () => { + await rm(tmpLocation, { force: true, recursive: true }); + }); + + test('round-trips a key containing a dot', async () => { + const storage = new FileSystemStorageClient({ localDataDirectory: tmpLocation }); + const store = await storage.createKeyValueStoreClient({ name: 'dotted' }); + + await store.setValue({ + key: 'jibberish2.html', + value: 'Hi there!', + contentType: 'text/html', + }); + + const record = await store.getValue('jibberish2.html'); + expect(record?.value).toBe('Hi there!'); + expect(record?.contentType).toBe('text/html'); + + expect(await store.recordExists('jibberish2.html')).toBe(true); + const keys = await store.listKeys(); + expect(keys.map((item) => item.key)).toContain('jibberish2.html'); + }); + + test('round-trips a key containing a slash', async () => { + const storage = new FileSystemStorageClient({ localDataDirectory: tmpLocation }); + const store = await storage.createKeyValueStoreClient({ name: 'slashed' }); + + await store.setValue({ key: 'nested/key', value: { ok: true } }); + + const record = await store.getValue('nested/key'); + expect(record?.value).toStrictEqual({ ok: true }); + + expect(await store.recordExists('nested/key')).toBe(true); + const keys = await store.listKeys(); + expect(keys.map((item) => item.key)).toContain('nested/key'); + }); +}); diff --git 
a/packages/fs-storage/test/key-value-store/with-extension.test.ts b/packages/fs-storage/test/key-value-store/with-extension.test.ts deleted file mode 100644 index 30f0061fc71f..000000000000 --- a/packages/fs-storage/test/key-value-store/with-extension.test.ts +++ /dev/null @@ -1,80 +0,0 @@ -import { existsSync } from 'node:fs'; -import { resolve } from 'node:path'; - -import { emptyDirSync } from 'fs-extra/esm'; - -import { createKeyValueStorageImplementation } from '../../src/fs/key-value-store/index.js'; - -describe('KeyValueStore should append extension only when needed', () => { - const mockImageBuffer = Buffer.from('This is a test image', 'utf8'); - - afterAll(() => emptyDirSync('tmp')); - - test('should append extension when needed (jpg)', async () => { - const testDir = resolve('tmp', 'test_no_extension'); - const storage = createKeyValueStorageImplementation({ - storeDirectory: testDir, - writeMetadata: true, - }); - await storage.update({ - key: 'jibberish', - value: mockImageBuffer, - contentType: 'image/jpeg', - extension: 'jpeg', - }); - - expect(existsSync(resolve(testDir, 'jibberish.jpeg'))).toBeTruthy(); - expect(existsSync(resolve(testDir, 'jibberish'))).toBeFalsy(); - }); - - test('should append extension when needed (html)', async () => { - const testDir = resolve('tmp', 'test_no_extension'); - const storage = createKeyValueStorageImplementation({ - storeDirectory: testDir, - writeMetadata: true, - }); - await storage.update({ - key: 'jibberish2', - value: 'Hi there!', - contentType: 'text/html', - extension: 'html', - }); - - expect(existsSync(resolve(testDir, 'jibberish2.html'))).toBeTruthy(); - expect(existsSync(resolve(testDir, 'jibberish2'))).toBeFalsy(); - }); - - test('should not append extension when already available', async () => { - const testDir = resolve('tmp', 'test_extension'); - const storage = createKeyValueStorageImplementation({ - storeDirectory: testDir, - writeMetadata: true, - }); - await storage.update({ - key: 
'jibberish.jpg', - value: mockImageBuffer, - contentType: 'image/jpeg', - extension: 'jpeg', - }); - - expect(existsSync(resolve(testDir, 'jibberish.jpg'))).toBeTruthy(); - expect(existsSync(resolve(testDir, 'jibberish.jpg.jpeg'))).toBeFalsy(); - }); - - test('should not append extension when already available', async () => { - const testDir = resolve('tmp', 'test_extension'); - const storage = createKeyValueStorageImplementation({ - storeDirectory: testDir, - writeMetadata: true, - }); - await storage.update({ - key: 'jibberish2.html', - value: 'Hi there!', - contentType: 'text/html', - extension: 'html', - }); - - expect(existsSync(resolve(testDir, 'jibberish2.html'))).toBeTruthy(); - expect(existsSync(resolve(testDir, 'jibberish2.html.html'))).toBeFalsy(); - }); -}); diff --git a/packages/fs-storage/test/no-crash-on-big-buffers.test.ts b/packages/fs-storage/test/no-crash-on-big-buffers.test.ts index 0ecdd4bdf951..aa1ad7d5a2e0 100644 --- a/packages/fs-storage/test/no-crash-on-big-buffers.test.ts +++ b/packages/fs-storage/test/no-crash-on-big-buffers.test.ts @@ -1,5 +1,8 @@ -// https://github.com/apify/crawlee/issues/1732 -// https://github.com/apify/crawlee/issues/1710 +// Regression guard for https://github.com/apify/crawlee/issues/1732 and +// https://github.com/apify/crawlee/issues/1710 — storing a large binary value must not crash (the old +// pure-TS implementation overflowed the stack on big buffers). The native client does the actual +// write; the adapter passes a `Buffer` straight through. Here we verify a large buffer round-trips +// through `setValue` / `getValue` via the public API. 
import { rm } from 'node:fs/promises'; import { resolve } from 'node:path'; @@ -7,11 +10,9 @@ import { resolve } from 'node:path'; import { FileSystemStorageClient } from '@crawlee/fs-storage'; import type { KeyValueStoreClient } from '@crawlee/types'; -describe('FileSystemStorageClient should not crash when saving a big buffer', () => { +describe('KeyValueStore round-trips a large binary value', () => { const tmpLocation = resolve(import.meta.dirname, './tmp/no-buffer-crash'); - const storage = new FileSystemStorageClient({ - localDataDirectory: tmpLocation, - }); + const storage = new FileSystemStorageClient({ localDataDirectory: tmpLocation }); let store: KeyValueStoreClient; @@ -23,21 +24,19 @@ describe('FileSystemStorageClient should not crash when saving a big buffer', () await rm(tmpLocation, { force: true, recursive: true }); }); - test('should not crash when saving a big buffer', async () => { - let zip: Buffer; - - if (process.env.CRAWLEE_DIFFICULT_TESTS) { - const numbers = Array.from([...Array(18_100_000).keys()].map((i) => i * 3_000_000)); - - zip = Buffer.from([...numbers]); - } else { - zip = Buffer.from([...Array(100_000)].map((i) => i * 8)); + test('stores and reads back a large buffer without crashing', async () => { + const size = process.env.CRAWLEE_DIFFICULT_TESTS ? 50_000_000 : 1_000_000; + const zip = Buffer.alloc(size); + // Fill with a non-trivial, verifiable pattern. 
+ for (let i = 0; i < size; i += 1) { + zip[i] = i % 256; } - try { - await store.setValue({ key: 'owo.zip', value: zip }); - } catch (err) { - expect(err).not.toBeDefined(); - } + await store.setValue({ key: 'owo.zip', value: zip, contentType: 'application/zip' }); + + const record = await store.getValue('owo.zip'); + expect(Buffer.isBuffer(record?.value)).toBe(true); + expect((record!.value as Buffer).length).toBe(size); + expect(record!.value.equals(zip)).toBe(true); }); }); diff --git a/packages/fs-storage/test/request-queue/adapter.test.ts b/packages/fs-storage/test/request-queue/adapter.test.ts new file mode 100644 index 000000000000..15a61a1cbb95 --- /dev/null +++ b/packages/fs-storage/test/request-queue/adapter.test.ts @@ -0,0 +1,146 @@ +import { rm } from 'node:fs/promises'; +import { resolve } from 'node:path'; + +import { FileSystemStorageClient } from '@crawlee/fs-storage'; +import type { RequestQueueClient } from '@crawlee/types'; + +// The request-queue ordering, locking and finished-ness semantics are owned (and exhaustively tested) +// by the native `@crawlee/fs-storage-native` extension. These tests cover what the *adapter* adds on +// top: mapping requests and operation results between the native shapes and the `@crawlee/types` +// interfaces, and a thin lifecycle smoke test to catch wiring regressions. +describe('RequestQueueClient adapter', () => { + const tmpLocation = resolve(import.meta.dirname, './tmp/adapter'); + + let requestQueue: RequestQueueClient; + let testIndex = 0; + + beforeEach(async () => { + // Isolate each test with its own storage directory and queue so persisted counts/requests from + // one test cannot leak into the next. 
+ const storage = new FileSystemStorageClient({ localDataDirectory: resolve(tmpLocation, `${testIndex++}`) }); + requestQueue = await storage.createRequestQueueClient({ name: 'adapter' }); + }); + + afterAll(async () => { + await rm(tmpLocation, { force: true, recursive: true }); + }); + + test('fetchNextRequest returns a clean request: internal `orderNo` stripped, fields preserved', async () => { + await requestQueue.addBatchOfRequests([ + { url: 'http://example.com/1', uniqueKey: '1', userData: { foo: 'bar' } }, + ]); + + const request = await requestQueue.fetchNextRequest(); + + expect(request).not.toBeNull(); + expect(request!.url).toBe('http://example.com/1'); + expect(request!.uniqueKey).toBe('1'); + expect(request!.userData).toStrictEqual({ foo: 'bar' }); + // `id` is a real request field and is surfaced. + expect(typeof request!.id).toBe('string'); + // `orderNo` is the native client's internal bookkeeping and must not leak through the adapter. + expect(request).not.toHaveProperty('orderNo'); + }); + + test('getRequest looks up by uniqueKey and strips internal fields', async () => { + await requestQueue.addBatchOfRequests([{ url: 'http://example.com/1', uniqueKey: '1' }]); + + const request = await requestQueue.getRequest('1'); + + expect(request?.url).toBe('http://example.com/1'); + expect(request).not.toHaveProperty('orderNo'); + expect(await requestQueue.getRequest('does-not-exist')).toBeUndefined(); + }); + + test('addBatchOfRequests maps the native response into BatchAddRequestsResult', async () => { + const result = await requestQueue.addBatchOfRequests([ + { url: 'http://example.com/1', uniqueKey: '1' }, + { url: 'http://example.com/1', uniqueKey: '1' }, // duplicate uniqueKey + ]); + + expect(result.unprocessedRequests).toStrictEqual([]); + expect(result.processedRequests).toHaveLength(2); + + const [first, second] = result.processedRequests; + expect(first).toMatchObject({ uniqueKey: '1', wasAlreadyPresent: false, wasAlreadyHandled: false }); + 
expect(typeof first.requestId).toBe('string'); + // The second one is deduplicated by uniqueKey. + expect(second).toMatchObject({ uniqueKey: '1', wasAlreadyPresent: true }); + }); + + test('markRequestAsHandled maps the native result into QueueOperationInfo', async () => { + await requestQueue.addBatchOfRequests([{ url: 'http://example.com/1', uniqueKey: '1' }]); + const request = await requestQueue.fetchNextRequest(); + + const info = await requestQueue.markRequestAsHandled({ ...request!, id: request!.id! }); + + expect(info).toMatchObject({ requestId: request!.id, wasAlreadyHandled: true, wasAlreadyPresent: true }); + }); + + test('getMetadata maps native metadata into RequestQueueInfo (Date timestamps, counts)', async () => { + await requestQueue.addBatchOfRequests([{ url: 'http://example.com/1', uniqueKey: '1' }]); + const request = await requestQueue.fetchNextRequest(); + await requestQueue.markRequestAsHandled({ ...request!, id: request!.id! }); + + const metadata = await requestQueue.getMetadata(); + + // Native count fields are surfaced... + expect(metadata.handledRequestCount).toBe(1); + expect(metadata.pendingRequestCount).toBe(0); + expect(metadata.totalRequestCount).toBe(1); + // ...ISO-string timestamps are converted to `Date`... + expect(metadata.createdAt).toBeInstanceOf(Date); + expect(metadata.modifiedAt).toBeInstanceOf(Date); + expect(metadata.accessedAt).toBeInstanceOf(Date); + // ...and the adapter synthesizes the framework-shape fields. + expect(metadata.id).toEqual(expect.any(String)); + expect(metadata.hadMultipleClients).toBe(false); + }); + + test('a request added as already-handled counts toward handledRequestCount', async () => { + // Regression guard: re-inserting an already-handled request must not be counted as pending. 
+ await requestQueue.addBatchOfRequests([ + { url: 'http://example.com/1', uniqueKey: '1', handledAt: new Date().toISOString() }, + ]); + + const metadata = await requestQueue.getMetadata(); + expect(metadata.handledRequestCount).toBe(1); + expect(metadata.pendingRequestCount).toBe(0); + }); + + test('forwards `forefront` so a later request can be served first', async () => { + await requestQueue.addBatchOfRequests([{ url: 'http://example.com/1', uniqueKey: '1' }]); + await requestQueue.addBatchOfRequests([{ url: 'http://example.com/2', uniqueKey: '2' }], { forefront: true }); + + // We only assert that the `forefront` flag reaches the native client (the forefront request is + // served before the regular one); the exact ordering algorithm is the native client's concern. + const first = await requestQueue.fetchNextRequest(); + expect(first!.uniqueKey).toBe('2'); + }); + + test('lifecycle: fetch marks in-progress, handle empties and finishes the queue', async () => { + await requestQueue.addBatchOfRequests([{ url: 'http://example.com/1', uniqueKey: '1' }]); + expect(await requestQueue.isEmpty()).toBe(false); + + const request = await requestQueue.fetchNextRequest(); + // Fetched (in-progress): nothing left to fetch, but not finished until handled. + expect(await requestQueue.isEmpty()).toBe(true); + expect(await requestQueue.isFinished()).toBe(false); + // While in progress it is not handed out again. + expect(await requestQueue.fetchNextRequest()).toBeNull(); + + await requestQueue.markRequestAsHandled({ ...request!, id: request!.id! }); + expect(await requestQueue.isFinished()).toBe(true); + }); + + test('reclaimRequest returns an in-progress request to the queue', async () => { + await requestQueue.addBatchOfRequests([{ url: 'http://example.com/1', uniqueKey: '1' }]); + + const first = await requestQueue.fetchNextRequest(); + const info = await requestQueue.reclaimRequest({ ...first!, id: first!.id! 
}); + expect(info).toMatchObject({ requestId: first!.id, wasAlreadyHandled: false }); + + const again = await requestQueue.fetchNextRequest(); + expect(again!.uniqueKey).toBe('1'); + }); +}); diff --git a/packages/fs-storage/test/request-queue/dangling-forefront-on-reload.test.ts b/packages/fs-storage/test/request-queue/dangling-forefront-on-reload.test.ts deleted file mode 100644 index 0dec65342d10..000000000000 --- a/packages/fs-storage/test/request-queue/dangling-forefront-on-reload.test.ts +++ /dev/null @@ -1,86 +0,0 @@ -import { randomUUID } from 'node:crypto'; -import { rm, writeFile } from 'node:fs/promises'; -import { resolve } from 'node:path'; - -import { FileSystemStorageClient } from '@crawlee/fs-storage'; -import type { InternalRequest } from '@crawlee/fs-storage/src/resource-clients/request-queue'; -import type { RequestSchema } from '@crawlee/types'; -import { ensureDir } from 'fs-extra/esm'; - -/** - * On reload, `forefrontRequestIds` is restored verbatim from the persisted metadata, while the - * `requests` map is rebuilt only from request files actually found and parseable on disk. If a - * persisted forefront id has no backing request file (deleted, never written, or corrupt JSON), it - * would dangle: a later head scan resolves it to a missing request and dereferences `undefined`. - * The reload must drop such ids so head scans stay safe. 
- */ -describe('Request queue reload drops dangling forefront ids without a backing request file', () => { - const tmpLocation = resolve(import.meta.dirname, './tmp/req-queue-dangling-forefront'); - - const writeMetadata = async (storage: FileSystemStorageClient, forefrontRequestIds: string[]) => { - await ensureDir(resolve(storage.requestQueuesDirectory, 'default')); - await writeFile( - resolve(storage.requestQueuesDirectory, 'default/__metadata__.json'), - JSON.stringify({ - id: randomUUID(), - name: 'default', - createdAt: new Date(2022, 0, 1), - accessedAt: new Date(2022, 0, 1), - modifiedAt: new Date(2022, 0, 1), - pendingRequestCount: 1, - handledRequestCount: 0, - forefrontRequestIds, - }), - ); - }; - - const writeValidRequest = async (storage: FileSystemStorageClient, id: string, url: string, uniqueKey: string) => { - await writeFile( - resolve(storage.requestQueuesDirectory, `default/${id}.json`), - JSON.stringify({ - id, - orderNo: -1, - url, - uniqueKey, - method: 'GET', - retryCount: 0, - json: JSON.stringify({ id, url, uniqueKey } satisfies RequestSchema), - } satisfies InternalRequest), - ); - }; - - afterEach(async () => { - await rm(tmpLocation, { force: true, recursive: true }); - }); - - test('a forefront id with no request file on disk does not crash a later head scan', async () => { - const storage = new FileSystemStorageClient({ localDataDirectory: tmpLocation }); - - // The metadata references a forefront id 'missing' for which no request file exists on disk. - await writeMetadata(storage, ['missing']); - - const queue = await storage.createRequestQueueClient({ name: 'default' }); - - // None of these head scans must throw, and an empty/finished queue must be reported. 
- await expect(queue.isEmpty()).resolves.toBe(true); - await expect(queue.isFinished()).resolves.toBe(true); - await expect(queue.fetchNextRequest()).resolves.toBeNull(); - }); - - test('a valid forefront request is still served while a dangling sibling id is dropped', async () => { - const storage = new FileSystemStorageClient({ localDataDirectory: tmpLocation }); - - // Two forefront ids in metadata: one backed by a real file, one dangling. - await writeMetadata(storage, ['missing', '123']); - await writeValidRequest(storage, '123', 'http://example.com', 'owo'); - - const queue = await storage.createRequestQueueClient({ name: 'default' }); - - // The dangling id is dropped; the valid forefront request is still fetchable. - const first = await queue.fetchNextRequest(); - expect(first).not.toBeNull(); - expect(first!.url).toEqual('http://example.com'); - - expect(await queue.fetchNextRequest()).toBeNull(); - }); -}); diff --git a/packages/fs-storage/test/request-queue/forefront.test.ts b/packages/fs-storage/test/request-queue/forefront.test.ts deleted file mode 100644 index b95ed944db77..000000000000 --- a/packages/fs-storage/test/request-queue/forefront.test.ts +++ /dev/null @@ -1,266 +0,0 @@ -import { rm } from 'node:fs/promises'; -import { resolve } from 'node:path'; -import { setTimeout as sleep } from 'node:timers/promises'; - -import { FileSystemStorageClient } from '@crawlee/fs-storage'; -import type { RequestQueueClient } from '@crawlee/types'; - -/** - * Drains the queue via `fetchNextRequest`, marking each request as handled, and returns the - * pathnames in the order they were served. - */ -async function fetchOrder(client: RequestQueueClient): Promise { - const order: string[] = []; - - for (let request = await client.fetchNextRequest(); request !== null; request = await client.fetchNextRequest()) { - order.push(new URL(request.url).pathname); - await client.markRequestAsHandled({ ...request, id: request.id! 
}); - } - - return order; -} - -describe('RequestQueue respects `forefront` when fetching requests', () => { - const storage = new FileSystemStorageClient({}); - - let requestQueue: RequestQueueClient; - - beforeEach(async () => { - requestQueue = await storage.createRequestQueueClient({ name: 'forefront' }); - }); - - afterEach(async () => { - await requestQueue.drop(); - }); - - test('requests without `forefront` respect sequential order', async () => { - await requestQueue.addBatchOfRequests([{ url: 'http://example.com/1', uniqueKey: '1' }]); - // Waiting a few ms is required since we use Date.now() to compute orderNo - await sleep(2); - await requestQueue.addBatchOfRequests([{ url: 'http://example.com/2', uniqueKey: '2' }]); - - expect(await fetchOrder(requestQueue)).toEqual(['/1', '/2']); - }); - - test('`forefront` requests are prioritized', async () => { - await requestQueue.addBatchOfRequests([{ url: 'http://example.com/1', uniqueKey: '1' }]); - // Waiting a few ms is required since we use Date.now() to compute orderNo - await sleep(2); - await requestQueue.addBatchOfRequests([{ url: 'http://example.com/2', uniqueKey: '2' }], { forefront: true }); - - expect(await fetchOrder(requestQueue)).toEqual(['/2', '/1']); - }); - - test('global `forefront` ordering is preserved across several inserts', async () => { - await requestQueue.addBatchOfRequests([{ url: 'http://example.com/1', uniqueKey: '1' }]); - await sleep(2); - await requestQueue.addBatchOfRequests([{ url: 'http://example.com/2', uniqueKey: '2' }], { forefront: true }); - await sleep(2); - await requestQueue.addBatchOfRequests([{ url: 'http://example.com/3', uniqueKey: '3' }], { forefront: true }); - - expect(await fetchOrder(requestQueue)).toEqual(['/3', '/2', '/1']); - }); - - test('`addBatchOfRequests` respects `forefront`', async () => { - await requestQueue.addBatchOfRequests([{ url: 'http://example.com/3', uniqueKey: '3' }]); - - await sleep(2); - - await requestQueue.addBatchOfRequests( - [ - { 
url: 'http://example.com/1', uniqueKey: '1' }, - { url: 'http://example.com/2', uniqueKey: '2' }, - ], - { forefront: true }, - ); - - const order = await fetchOrder(requestQueue); - expect(order).toHaveLength(3); - // Both forefront requests come before the original; their relative order is arbitrary. - expect(order[2]).toEqual('/3'); - expect([ - ['/2', '/1', '/3'], - ['/1', '/2', '/3'], - ]).toContainEqual(order); - }); - - test('a reclaimed request is served again', async () => { - await requestQueue.addBatchOfRequests([{ url: 'http://example.com/1', uniqueKey: '1' }]); - - const first = await requestQueue.fetchNextRequest(); - expect(first!.url).toEqual('http://example.com/1'); - - // Reclaiming a fetched (in-progress) request returns it to the queue. - await requestQueue.reclaimRequest({ ...first!, id: first!.id! }); - - const second = await requestQueue.fetchNextRequest(); - expect(second!.url).toEqual('http://example.com/1'); - }); - - test('a reclaimed `forefront` request jumps to the front', async () => { - await requestQueue.addBatchOfRequests([ - { url: 'http://example.com/1', uniqueKey: '1' }, - { url: 'http://example.com/2', uniqueKey: '2' }, - ]); - - const first = await requestQueue.fetchNextRequest(); - expect(first!.url).toEqual('http://example.com/1'); - - await requestQueue.reclaimRequest({ ...first!, id: first!.id! 
}, { forefront: true }); - - const next = await requestQueue.fetchNextRequest(); - expect(next!.url).toEqual('http://example.com/1'); - }); - - test('handling all requests empties the queue', async () => { - await requestQueue.addBatchOfRequests([ - { url: 'http://example.com/1', uniqueKey: '1' }, - { url: 'http://example.com/2', uniqueKey: '2' }, - { url: 'http://example.com/3', uniqueKey: '3' }, - ]); - - expect(await requestQueue.isEmpty()).toBe(false); - - await fetchOrder(requestQueue); - - expect(await requestQueue.isEmpty()).toBe(true); - expect(await requestQueue.fetchNextRequest()).toBeNull(); - }); - - test('a fetched (locked) request leaves the queue empty but unfinished until it is handled', async () => { - await requestQueue.addBatchOfRequests([{ url: 'http://example.com/1', uniqueKey: '1' }]); - - const request = await requestQueue.fetchNextRequest(); - expect(request).not.toBeNull(); - - // The request is locked (in progress), not handled. There is nothing left to fetch, so the - // queue is empty — but it is not finished. The "not finished" signal is what stops a crawler - // from shutting down while a request is still being processed by some consumer. - expect(await requestQueue.isEmpty()).toBe(true); - expect(await requestQueue.isFinished()).toBe(false); - - await requestQueue.markRequestAsHandled({ ...request!, id: request!.id! 
}); - expect(await requestQueue.isEmpty()).toBe(true); - expect(await requestQueue.isFinished()).toBe(true); - }); -}); - -describe('RequestQueue locks fetched requests', () => { - const storage = new FileSystemStorageClient({}); - - let requestQueue: RequestQueueClient; - - beforeEach(async () => { - requestQueue = await storage.createRequestQueueClient({ name: 'locking' }); - }); - - afterEach(async () => { - await requestQueue.drop(); - }); - - test('a fetched request becomes available again after its lock expires', async () => { - vitest.useFakeTimers(); - - try { - await requestQueue.addBatchOfRequests([{ url: 'http://example.com/1', uniqueKey: '1' }]); - - const first = await requestQueue.fetchNextRequest(); - expect(first!.uniqueKey).toBe('1'); - - // While locked, the request is not handed out again. - expect(await requestQueue.fetchNextRequest()).toBeNull(); - - // After the lock expires (default 3 minutes), the request is fetchable again — this is what - // prevents a crashed consumer from blocking its requests forever. - vitest.advanceTimersByTime(3 * 60 * 1000 + 1000); - - const retried = await requestQueue.fetchNextRequest(); - expect(retried!.uniqueKey).toBe('1'); - } finally { - vitest.useRealTimers(); - } - }); -}); - -describe('RequestQueue locking is visible across clients sharing on-disk storage', () => { - const tmpLocation = resolve(import.meta.dirname, './tmp/req-queue-cross-process'); - // Two independent storage instances over the same directory emulate two separate processes. 
- const storageA = new FileSystemStorageClient({ localDataDirectory: tmpLocation }); - const storageB = new FileSystemStorageClient({ localDataDirectory: tmpLocation }); - - afterAll(async () => { - await rm(tmpLocation, { force: true, recursive: true }); - }); - - test('two clients on the same queue never fetch the same request', async () => { - const clientA = await storageA.createRequestQueueClient({ name: 'shared' }); - await clientA.addBatchOfRequests([ - { url: 'http://example.com/1', uniqueKey: '1' }, - { url: 'http://example.com/2', uniqueKey: '2' }, - ]); - - const clientB = await storageB.createRequestQueueClient({ name: 'shared' }); - - const fromA = await clientA.fetchNextRequest(); - const fromB = await clientB.fetchNextRequest(); - - expect(fromA).not.toBeNull(); - expect(fromB).not.toBeNull(); - // The lock written by one client is observed by the other, so they get distinct requests. - expect(fromA!.uniqueKey).not.toBe(fromB!.uniqueKey); - - // Both requests are now locked, so neither client can fetch anything more. - expect(await clientA.fetchNextRequest()).toBeNull(); - expect(await clientB.fetchNextRequest()).toBeNull(); - - await clientA.drop(); - }); - - test('a client does not report the queue finished while another client holds the last request', async () => { - const clientA = await storageA.createRequestQueueClient({ name: 'shared-is-empty' }); - await clientA.addBatchOfRequests([{ url: 'http://example.com/1', uniqueKey: '1' }]); - - const clientB = await storageB.createRequestQueueClient({ name: 'shared-is-empty' }); - - // Client A fetches (and thus locks) the only request. - const fromA = await clientA.fetchNextRequest(); - expect(fromA).not.toBeNull(); - - // Client B has nothing it can fetch right now, so from its point of view the queue is empty... 
- expect(await clientB.fetchNextRequest()).toBeNull(); - expect(await clientB.isEmpty()).toBe(true); - // ...but the request still exists and is merely locked by A, so B must NOT consider the queue - // finished — otherwise the crawler driving B could shut down while A is still processing. - expect(await clientB.isFinished()).toBe(false); - - // Once A handles the request, it is gone for good and B sees a finished queue. - await clientA.markRequestAsHandled({ ...fromA!, id: fromA!.id! }); - expect(await clientB.isEmpty()).toBe(true); - expect(await clientB.isFinished()).toBe(true); - - await clientA.drop(); - }); - - test('teardown releases this client locks so another client can fetch immediately', async () => { - const clientA = await storageA.createRequestQueueClient({ name: 'shared-teardown' }); - await clientA.addBatchOfRequests([{ url: 'http://example.com/1', uniqueKey: '1' }]); - - // Client A fetches (locks) the request, then the process tears down without handling it. - const fromA = await clientA.fetchNextRequest(); - expect(fromA).not.toBeNull(); - - const clientB = await storageB.createRequestQueueClient({ name: 'shared-teardown' }); - // While A holds the lock, B cannot fetch the request. - expect(await clientB.fetchNextRequest()).toBeNull(); - - // Tearing down A's storage releases its locks (instead of leaving them until the 3-minute - // expiry), so B can pick the request up right away. 
- await storageA.teardown(); - - const fromB = await clientB.fetchNextRequest(); - expect(fromB).not.toBeNull(); - expect(fromB!.uniqueKey).toBe('1'); - - await clientB.drop(); - }); -}); diff --git a/packages/fs-storage/test/request-queue/handledRequestCount-should-update.test.ts b/packages/fs-storage/test/request-queue/handledRequestCount-should-update.test.ts deleted file mode 100644 index eff5934d0fc0..000000000000 --- a/packages/fs-storage/test/request-queue/handledRequestCount-should-update.test.ts +++ /dev/null @@ -1,51 +0,0 @@ -import { rm } from 'node:fs/promises'; -import { resolve } from 'node:path'; - -import { FileSystemStorageClient } from '@crawlee/fs-storage'; -import type { RequestQueueClient } from '@crawlee/types'; - -describe('RequestQueue handledRequestCount should update', () => { - // Use an isolated storage directory so persisted request files from a previous run cannot leak - // into this one (a handled request surviving on disk would be deduplicated on the next add). 
- const localDataDirectory = resolve(import.meta.dirname, './tmp/handled-request-count'); - const storage = new FileSystemStorageClient({ localDataDirectory }); - - let requestQueue: RequestQueueClient; - - beforeAll(async () => { - requestQueue = await storage.createRequestQueueClient({ name: 'handledRequestCount' }); - }); - - afterAll(async () => { - await rm(localDataDirectory, { force: true, recursive: true }); - }); - - test('after marking a request as handled, it should increment the handledRequestCount', async () => { - await requestQueue.addBatchOfRequests([{ url: 'http://example.com/1', uniqueKey: '1' }]); - - const request = await requestQueue.fetchNextRequest(); - expect(request).not.toBeNull(); - - await requestQueue.markRequestAsHandled({ - url: 'http://example.com/1', - uniqueKey: '1', - id: request!.id!, - }); - - const updatedStatistics = await requestQueue.getMetadata(); - expect(updatedStatistics.handledRequestCount).toEqual(1); - }); - - test('adding an already handled request should increment the handledRequestCount', async () => { - await requestQueue.addBatchOfRequests([ - { - url: 'http://example.com/2', - uniqueKey: '2', - handledAt: new Date().toISOString(), - }, - ]); - - const updatedStatistics = await requestQueue.getMetadata(); - expect(updatedStatistics.handledRequestCount).toEqual(2); - }); -}); diff --git a/packages/fs-storage/test/request-queue/ignore-non-json-files.test.ts b/packages/fs-storage/test/request-queue/ignore-non-json-files.test.ts deleted file mode 100644 index 2525b025a4b7..000000000000 --- a/packages/fs-storage/test/request-queue/ignore-non-json-files.test.ts +++ /dev/null @@ -1,69 +0,0 @@ -import { randomUUID } from 'node:crypto'; -import { rm, writeFile } from 'node:fs/promises'; -import { resolve } from 'node:path'; - -import { FileSystemStorageClient } from '@crawlee/fs-storage'; -import type { InternalRequest } from '@crawlee/fs-storage/src/resource-clients/request-queue'; -import type { RequestSchema } from 
'@crawlee/types'; -import { ensureDir } from 'fs-extra/esm'; - -describe('when falling back to fs, Request queue should ignore non-JSON files', () => { - const tmpLocation = resolve(import.meta.dirname, './tmp/req-queue-ignore-non-json'); - const storage = new FileSystemStorageClient({ - localDataDirectory: tmpLocation, - }); - - beforeAll(async () => { - // Create "default" request queue and give it faulty entries - await ensureDir(resolve(storage.requestQueuesDirectory, 'default')); - await writeFile( - resolve(storage.requestQueuesDirectory, 'default/__metadata__.json'), - JSON.stringify({ - id: randomUUID(), - name: 'default', - createdAt: new Date(2022, 0, 1), - accessedAt: new Date(2022, 0, 1), - modifiedAt: new Date(2022, 0, 1), - }), - ); - - await writeFile( - resolve(storage.requestQueuesDirectory, 'default/123.json'), - JSON.stringify({ - id: '123', - orderNo: 1, - url: 'http://example.com', - uniqueKey: 'owo', - method: 'GET', - retryCount: 0, - json: JSON.stringify({ - uniqueKey: 'owo', - url: 'http://example.com', - id: '123', - } satisfies RequestSchema), - } satisfies InternalRequest), - ); - - await writeFile(resolve(storage.requestQueuesDirectory, 'default/.DS_Store'), 'owo'); - await writeFile(resolve(storage.requestQueuesDirectory, 'default/invalid.txt'), 'owo'); - }); - - afterAll(async () => { - await rm(tmpLocation, { force: true, recursive: true }); - }); - - test('attempting to list "default" request queue should ignore non-JSON files', async () => { - const defaultQueue = await storage.createRequestQueueClient({ name: 'default' }); - const defaultQueueInfo = await defaultQueue.getMetadata(); - - expect(defaultQueueInfo.name).toEqual('default'); - - // Only the single valid JSON-backed request should be fetchable; the non-JSON files are ignored. 
- const first = await defaultQueue.fetchNextRequest(); - expect(first).not.toBeNull(); - expect(first!.url).toEqual('http://example.com'); - - const second = await defaultQueue.fetchNextRequest(); - expect(second).toBeNull(); - }); -}); diff --git a/packages/fs-storage/test/request-queue/reload-persistence.test.ts b/packages/fs-storage/test/request-queue/reload-persistence.test.ts new file mode 100644 index 000000000000..8b7bf0e209a9 --- /dev/null +++ b/packages/fs-storage/test/request-queue/reload-persistence.test.ts @@ -0,0 +1,43 @@ +import { rm } from 'node:fs/promises'; +import { resolve } from 'node:path'; + +import { FileSystemStorageClient } from '@crawlee/fs-storage'; + +// The native client owns request-queue persistence; what this test exercises is the *adapter* wiring +// that drives it: `FileSystemStorageClient.teardown()` must flush every opened queue's state via +// `persistState()`, and reopening through a fresh `FileSystemStorageClient` over the same directory +// must restore the pending requests (with the adapter's request mapping intact). +describe('Request queue persists across reopen via teardown', () => { + const tmpLocation = resolve(import.meta.dirname, './tmp/req-queue-reload'); + + afterEach(async () => { + await rm(tmpLocation, { force: true, recursive: true }); + }); + + test('requests added and persisted are restored when the queue is reopened', async () => { + const storage = new FileSystemStorageClient({ localDataDirectory: tmpLocation }); + const queue = await storage.createRequestQueueClient({ name: 'default' }); + + await queue.addBatchOfRequests([ + { url: 'http://example.com/1', uniqueKey: '1' }, + { url: 'http://example.com/2', uniqueKey: '2' }, + ]); + + // `teardown` flushes the native client state to disk. + await storage.teardown(); + + // Reopen over the same directory, emulating a fresh process. 
+ const reopenedStorage = new FileSystemStorageClient({ localDataDirectory: tmpLocation }); + const reopenedQueue = await reopenedStorage.createRequestQueueClient({ name: 'default' }); + + const metadata = await reopenedQueue.getMetadata(); + expect(metadata.pendingRequestCount).toBe(2); + expect(metadata.totalRequestCount).toBe(2); + + const first = await reopenedQueue.fetchNextRequest(); + const second = await reopenedQueue.fetchNextRequest(); + + expect([first?.url, second?.url].sort()).toStrictEqual(['http://example.com/1', 'http://example.com/2']); + expect(await reopenedQueue.fetchNextRequest()).toBeNull(); + }); +}); diff --git a/packages/fs-storage/test/write-metadata.test.ts b/packages/fs-storage/test/write-metadata.test.ts deleted file mode 100644 index d2a357f4ff60..000000000000 --- a/packages/fs-storage/test/write-metadata.test.ts +++ /dev/null @@ -1,80 +0,0 @@ -import { readdir, rm } from 'node:fs/promises'; -import { resolve } from 'node:path'; - -import { FileSystemStorageClient } from '@crawlee/fs-storage'; - -import { waitTillWrittenToDisk } from './__shared__.js'; - -describe('writeMetadata option', () => { - const tmpLocation = resolve(import.meta.dirname, './tmp/write-metadata-tests'); - - afterAll(async () => { - await rm(tmpLocation, { force: true, recursive: true }); - }); - - describe('when false', () => { - const localDataDirectory = resolve(tmpLocation, './no-metadata'); - const storage = new FileSystemStorageClient({ - localDataDirectory, - writeMetadata: false, - }); - - test('creating a data store should not write __metadata__.json file', async () => { - const keyValueStore = await storage.createKeyValueStoreClient(); - const info = await keyValueStore.getMetadata(); - const expectedPath = resolve(storage.keyValueStoresDirectory, info.id); - - // We check that reading the directory for the store throws an error, which means it wasn't created on disk - await expect(async () => readdir(expectedPath)).rejects.toThrow(); - }); - - 
test('creating a key-value pair in a key-value store should not write __metadata__.json file for the value', async () => { - const keyValueStore = await storage.createKeyValueStoreClient(); - await keyValueStore.setValue({ key: 'foo', value: 'test' }); - - const keyValueStoreInfo = await keyValueStore.getMetadata(); - const expectedFilePath = resolve(storage.keyValueStoresDirectory, `${keyValueStoreInfo.id}/foo.txt`); - await waitTillWrittenToDisk(expectedFilePath); - - const directoryFiles = await readdir(resolve(storage.keyValueStoresDirectory, keyValueStoreInfo.id)); - - expect(directoryFiles).toHaveLength(1); - }); - }); - - describe('when true', () => { - const localDataDirectory = resolve(tmpLocation, './metadata'); - const storage = new FileSystemStorageClient({ - localDataDirectory, - writeMetadata: true, - }); - - test('creating a data store should write __metadata__.json file', async () => { - const keyValueStore = await storage.createKeyValueStoreClient(); - const info = await keyValueStore.getMetadata(); - const expectedPath = resolve(storage.keyValueStoresDirectory, info.id); - await waitTillWrittenToDisk(expectedPath); - - const directoryFiles = await readdir(expectedPath); - - expect(directoryFiles).toHaveLength(1); - }); - - test('creating a key-value pair in a key-value store should write __metadata__.json file for the value', async () => { - const keyValueStore = await storage.createKeyValueStoreClient(); - await keyValueStore.setValue({ key: 'foo', value: 'test' }); - - const keyValueStoreInfo = await keyValueStore.getMetadata(); - const expectedFilePath = resolve(storage.keyValueStoresDirectory, `${keyValueStoreInfo.id}/foo.txt`); - const expectedMetadataPath = resolve( - storage.keyValueStoresDirectory, - `${keyValueStoreInfo.id}/foo.__metadata__.json`, - ); - await Promise.all([waitTillWrittenToDisk(expectedFilePath), waitTillWrittenToDisk(expectedMetadataPath)]); - - const directoryFiles = await 
readdir(resolve(storage.keyValueStoresDirectory, keyValueStoreInfo.id)); - - expect(directoryFiles).toHaveLength(3); - }); - }); -}); diff --git a/pnpm-lock.yaml b/pnpm-lock.yaml index d7dcaa39f913..7838a6cf6b48 100644 --- a/pnpm-lock.yaml +++ b/pnpm-lock.yaml @@ -538,33 +538,24 @@ importers: packages/fs-storage: dependencies: + '@crawlee/fs-storage-native': + specifier: 0.1.5-beta.0 + version: 0.1.5-beta.0 '@crawlee/types': specifier: workspace:* version: link:../types - '@sapphire/async-queue': - specifier: ^1.5.5 - version: 1.5.5 '@sapphire/shapeshift': specifier: ^4.0.0 version: 4.0.0 content-type: specifier: ^1.0.5 version: 1.0.5 - fs-extra: - specifier: ^11.3.0 - version: 11.3.4 json5: specifier: ^2.2.3 version: 2.2.3 mime-types: specifier: ^3.0.1 version: 3.0.2 - proper-lockfile: - specifier: ^4.1.2 - version: 4.1.2 - tslib: - specifier: ^2.8.1 - version: 2.8.1 packages/got-scraping-client: dependencies: @@ -2010,6 +2001,35 @@ packages: resolution: {integrity: sha512-Yn32E5IdmENLITg36XN1ty4OLPMcqzDjkEvSdZ0dRV5jcJR89sKi47FOs2eXpW+n7IGhbzPDkGKUirPPRrRkjg==} engines: {node: '>=16.0.0'} + '@crawlee/fs-storage-native-darwin-arm64@0.1.5-beta.0': + resolution: {integrity: sha512-vD8OL8KEGh2A1Xx3h7UjSe/+lEOXzucpW9dQY5ZjCdV0iULjTQmJ1m536/QyvQ97i+H2zb2v/PMqbng4/SLZdg==} + engines: {node: '>= 20'} + cpu: [arm64] + os: [darwin] + + '@crawlee/fs-storage-native-darwin-x64@0.1.5-beta.0': + resolution: {integrity: sha512-BOuooGyxTXHi4iVVYrfmD6ZPMEBkOW8wS2GwhiuoEMvay+fGl9rHwOJaYeY+N6JVWrd/BScvlUw4MvUPR7u1Lw==} + engines: {node: '>= 20'} + cpu: [x64] + os: [darwin] + + '@crawlee/fs-storage-native-linux-x64-gnu@0.1.5-beta.0': + resolution: {integrity: sha512-QNuDxOJ2wwDtQhUJR1hw6M/XGfgA9mAgGksDBVYrjJyHfMCOzsfRIaOB2bDACdWo73moCIqj1iy3TAPwEKUPeA==} + engines: {node: '>= 20'} + cpu: [x64] + os: [linux] + libc: [glibc] + + '@crawlee/fs-storage-native-win32-x64-msvc@0.1.5-beta.0': + resolution: {integrity: 
sha512-I2lsJhimkLb5U/rdxLvaww4bD0fXMfEA+JgcWvkoMGIiMyKIkkQgj6Ht2TF6Z5yq6OumaDnR/dBh9mzB4gFqLg==} + engines: {node: '>= 20'} + cpu: [x64] + os: [win32] + + '@crawlee/fs-storage-native@0.1.5-beta.0': + resolution: {integrity: sha512-HCP2s+6sRBn7927Y4gTt3ni/iiqHY2emjHQi30trtYcfL4bveQtzCsE673PhDBfa4abDlnVw0EKyWgLRvaYQzw==} + engines: {node: '>= 20'} + '@crawlee/memory-storage@3.16.0': resolution: {integrity: sha512-ol1PSWj5LL1ALjEZ+zJdLaZx4bGPIP6vXly4AmbtyFg2iq+m1BudtXL+dWFdv/qN8f+N8ljPF5VwKAVxg2uy3Q==} engines: {node: '>= 16'} @@ -14093,6 +14113,25 @@ snapshots: transitivePeerDependencies: - supports-color + '@crawlee/fs-storage-native-darwin-arm64@0.1.5-beta.0': + optional: true + + '@crawlee/fs-storage-native-darwin-x64@0.1.5-beta.0': + optional: true + + '@crawlee/fs-storage-native-linux-x64-gnu@0.1.5-beta.0': + optional: true + + '@crawlee/fs-storage-native-win32-x64-msvc@0.1.5-beta.0': + optional: true + + '@crawlee/fs-storage-native@0.1.5-beta.0': + optionalDependencies: + '@crawlee/fs-storage-native-darwin-arm64': 0.1.5-beta.0 + '@crawlee/fs-storage-native-darwin-x64': 0.1.5-beta.0 + '@crawlee/fs-storage-native-linux-x64-gnu': 0.1.5-beta.0 + '@crawlee/fs-storage-native-win32-x64-msvc': 0.1.5-beta.0 + '@crawlee/memory-storage@3.16.0': dependencies: '@apify/log': 2.5.35 diff --git a/vitest.config.mts b/vitest.config.mts index b31ed8f6fef8..4915f65630a4 100644 --- a/vitest.config.mts +++ b/vitest.config.mts @@ -39,8 +39,12 @@ const baseConfig = defineConfig({ { find: '@crawlee/playwright', replacement: resolve(__dirname, './packages/playwright-crawler/src') }, { find: '@crawlee/puppeteer', replacement: resolve(__dirname, './packages/puppeteer-crawler/src') }, { find: '@crawlee/stagehand', replacement: resolve(__dirname, './packages/stagehand-crawler/src') }, - { find: /^@crawlee\/(.*)\/(.*)$/, replacement: resolve(__dirname, './packages/$1/$2') }, - { find: /^@crawlee\/(.*)$/, replacement: resolve(__dirname, './packages/$1/src') }, + // The generic `@crawlee/*` 
aliases below map specifiers to workspace package sources. They + // exclude `@crawlee/fs-storage-native` via a negative lookahead, since it is a real external + // (npm) dependency with no `packages/fs-storage-native` source — letting it resolve normally + // through node_modules. + { find: /^@crawlee\/(?!fs-storage-native)(.*)\/(.*)$/, replacement: resolve(__dirname, './packages/$1/$2') }, + { find: /^@crawlee\/(?!fs-storage-native)(.*)$/, replacement: resolve(__dirname, './packages/$1/src') }, { find: /^test\/(.*)$/, replacement: resolve(__dirname, './test/$1') }, ], retry: process.env.RETRY_TESTS ? 3 : 0, From c442f5bc7b1a1a6e58b32d9f28fd9701cee22699 Mon Sep 17 00:00:00 2001 From: Jan Buchar Date: Fri, 19 Jun 2026 12:26:26 +0200 Subject: [PATCH 2/3] Fix tests --- test/core/enqueue_links/click_elements.test.ts | 4 ++++ test/core/enqueue_links/enqueue_links.test.ts | 10 +++++++++- 2 files changed, 13 insertions(+), 1 deletion(-) diff --git a/test/core/enqueue_links/click_elements.test.ts b/test/core/enqueue_links/click_elements.test.ts index 8476ed98a168..d2fce7d14cf3 100644 --- a/test/core/enqueue_links/click_elements.test.ts +++ b/test/core/enqueue_links/click_elements.test.ts @@ -13,6 +13,7 @@ import { import type { Browser as PWBrowser, Page as PWPage } from 'playwright'; import type { Browser as PPBrowser, Target } from 'puppeteer'; import { runExampleComServer } from '../../shared/_helper.js'; +import { MemoryStorageEmulator } from '../../shared/MemoryStorageEmulator.js'; function isPuppeteerBrowser(browser: PPBrowser | PWBrowser): browser is PPBrowser { return (browser as PPBrowser).targets !== undefined; @@ -56,6 +57,7 @@ const testCases = [ testCases.forEach(({ caseName, launchBrowser, clickElements, utils }) => { describe(`${caseName}: enqueueLinksByClickingElements()`, () => { + const localStorageEmulator = new MemoryStorageEmulator(); let browser: PPBrowser | PWBrowser; let server: Server; @@ -72,9 +74,11 @@ testCases.forEach(({ caseName, 
launchBrowser, clickElements, utils }) => { afterAll(async () => { await browser.close(); server.close(); + await localStorageEmulator.destroy(); }); beforeEach(async () => { + await localStorageEmulator.init(); page = await browser.newPage(); }); diff --git a/test/core/enqueue_links/enqueue_links.test.ts b/test/core/enqueue_links/enqueue_links.test.ts index b098727564bc..1b7c4907dfb0 100644 --- a/test/core/enqueue_links/enqueue_links.test.ts +++ b/test/core/enqueue_links/enqueue_links.test.ts @@ -15,6 +15,8 @@ import type { Browser as PuppeteerBrowser, Page as PuppeteerPage } from 'puppete import log from '@apify/log'; +import { MemoryStorageEmulator } from '../../shared/MemoryStorageEmulator.js'; + const HTML = ` @@ -60,14 +62,20 @@ async function createRequestQueueMock() { } describe('enqueueLinks()', () => { + const localStorageEmulator = new MemoryStorageEmulator(); let ll: number; beforeAll(() => { ll = log.getLevel(); log.setLevel(log.LEVELS.ERROR); }); - afterAll(() => { + beforeEach(async () => { + await localStorageEmulator.init(); + }); + + afterAll(async () => { log.setLevel(ll); + await localStorageEmulator.destroy(); }); describe.each([[launchPuppeteer], [launchPlaywright]] as const)('using %s', (method) => { From c8bb798b39f287dc34131855c23e9641df62da81 Mon Sep 17 00:00:00 2001 From: Jan Buchar Date: Fri, 19 Jun 2026 13:42:04 +0200 Subject: [PATCH 3/3] Correctly utilize the assumeSoleOwner flag --- .../parallel-scraping/parallel-scraper.mjs | 21 +++-- .../parallel-scraping/parallel-scraping.mdx | 46 +++++++--- docs/guides/parallel-scraping/shared.mjs | 12 ++- .../fs-storage/src/file-system-storage.ts | 29 ++++++- .../request-queue/assume-sole-owner.test.ts | 85 +++++++++++++++++++ 5 files changed, 172 insertions(+), 21 deletions(-) create mode 100644 packages/fs-storage/test/request-queue/assume-sole-owner.test.ts diff --git a/docs/guides/parallel-scraping/parallel-scraper.mjs b/docs/guides/parallel-scraping/parallel-scraper.mjs index 
78162002cdba..9d84f6881116 100644 --- a/docs/guides/parallel-scraping/parallel-scraper.mjs +++ b/docs/guides/parallel-scraping/parallel-scraper.mjs @@ -1,5 +1,6 @@ import { fork } from 'node:child_process'; +import { FileSystemStorageClient } from '@crawlee/fs-storage'; import { Configuration, Dataset, PlaywrightCrawler, log } from 'crawlee'; import { router } from './routes.mjs'; @@ -76,13 +77,19 @@ if (!process.env.IN_WORKER_THREAD) { // Get the request queue const requestQueue = await getOrInitQueue(false); - // Disable the automatic purge on start and configure crawlee to store the worker-specific data in a separate directory - // (needs to be done AFTER the queue is initialized when running locally) + // Disable the automatic purge on start, so we don't lose the queue we prepared const config = new Configuration({ purgeOnStart: false, - storageClientOptions: { - localDataDirectory: `./storage/worker-${process.env.WORKER_INDEX}`, - }, + }); + + // Store the worker's own internal state (its default dataset, key-value store, etc.) in a separate + // directory so the workers don't collide with each other (needs to be done AFTER the queue is + // initialized when running locally). This directory is private to a single worker, so we set + // `assumeSoleOwner: true` — the concurrency-safe locking only matters for the shared `shop-urls` + // queue, which gets its own storage client in `requestQueue.mjs`. 
+ const storageClient = new FileSystemStorageClient({ + localDataDirectory: `./storage/worker-${process.env.WORKER_INDEX}`, + assumeSoleOwner: true, }); workerLogger.debug('Setting up crawler.'); @@ -98,6 +105,10 @@ if (!process.env.IN_WORKER_THREAD) { // highlight-end // Let's also limit the crawler's concurrency, we don't want to overload a single process 🐌 maxConcurrency: 5, + // Use the worker-specific storage client we created above (private to this worker, `assumeSoleOwner: true`) + // highlight-start + storageClient, + // highlight-end }, config, ); diff --git a/docs/guides/parallel-scraping/parallel-scraping.mdx b/docs/guides/parallel-scraping/parallel-scraping.mdx index a22284a6fe04..016206b89b75 100644 --- a/docs/guides/parallel-scraping/parallel-scraping.mdx +++ b/docs/guides/parallel-scraping/parallel-scraping.mdx @@ -60,6 +60,16 @@ The first step in our conversion process will be creating a common file (let's c The exported function, `getOrInitQueue`, might seem like it does a lot. In essence, it just ensures the request queue is initialized, and if requested, ensures it starts off with an empty state. +:::caution Make the shared queue concurrency-safe with `assumeSoleOwner: false` + +Because every worker process opens this same `shop-urls` queue at the same time, it **must** use the concurrency-safe locking behavior of `FileSystemStorageClient`. That's why `getOrInitQueue` opens the queue with a storage client constructed with `assumeSoleOwner: false`. + +By default, `FileSystemStorageClient` assumes it is the *sole* consumer of a queue (`assumeSoleOwner: true`). On open it immediately reclaims any requests left *in progress* — great for a single-process crawl recovering after a crash, but disastrous when workers run side by side: each worker would happily grab requests another worker is still processing, so the same URL gets scraped multiple times.
+ +Setting `assumeSoleOwner: false` tells the client to treat an in-progress request as a potential live peer's lock and only reclaim it once the lock expires on the wall clock, so two workers never process the same request at once. + +::: + ### Adapting our previous scraper to enqueue the product URLs to the new queue In the `src/routes.mjs` file of the scraper we previously built, we have a handler for the `CATEGORY` label. Let's adapt that handler to enqueue the product URLs to the new queue we created. @@ -122,34 +132,44 @@ This will check how the script is executed as. If this value has _any_ value, it We use this to ensure the parent process stays alive until all the worker processes exit. Otherwise, the worker processes would just get spawned, and lose the ability to communicate with the parent. You might not need this depending on your use case (maybe you just need to spawn workers and let them process). -#### What's with all those `Configuration` calls? +#### What's with all the `Configuration` and storage client setup? 
-There are three steps we want to do for the worker processes: +There are two things we want to do for the worker processes: -- get the queue that supports locking from the same location as the parent process -- ensure the default storages do **not** get purged on start, as otherwise we'd lose the queue we prepared, and initialize a special storage for worker processes so they do not collide with each other +- get the shared queue from the same location as the parent process (it already comes with the concurrency-safe storage client we set up in `requestQueue.mjs`) +- ensure the default storages do **not** get purged on start, as otherwise we'd lose the queue we prepared, and give each worker its own private storage directory for its internal state so the workers don't collide with each other In order, that's what these lines do: ```javascript title="src/parallel-scraper.mjs" -// Get the request queue from the parent process (step 1) +import { FileSystemStorageClient } from '@crawlee/fs-storage'; + +// Get the shared request queue from the parent process (step 1) const requestQueue = await getOrInitQueue(false); -// Disable the automatic purge on start and configure crawlee to store the worker-specific data -// in a separate directory (needs to be done AFTER the queue is initialized when running locally) (step 2) -const config = new Configuration({ - purgeOnStart: false, - storageClientOptions: { - localDataDirectory: `./storage/worker-${process.env.WORKER_INDEX}`, - }, +// Disable the automatic purge on start, so we don't lose the queue we prepared (step 2) +const config = new Configuration({ purgeOnStart: false }); + +// Store the worker's own internal state in a separate directory so workers don't collide (step 2, +// cont.). Needs to be done AFTER the queue is initialized when running locally. This directory is +// private to a single worker, so we explicitly set `assumeSoleOwner: true`. 
+const storageClient = new FileSystemStorageClient({ + localDataDirectory: `./storage/worker-${process.env.WORKER_INDEX}`, + assumeSoleOwner: true, }); ``` +:::note Why no `assumeSoleOwner: false` here? + +Each worker's `./storage/worker-N` directory is private to that single worker — nothing else opens it — so the default `assumeSoleOwner: true` is exactly right. The concurrency-safe locking only matters for storage that is genuinely shared across processes, which is the `shop-urls` queue in `requestQueue.mjs`, not this per-worker internal state. + +::: + #### Telling the crawler to use the worker configuration You might have noticed several lines highlighted in the code above. Those show how you provide the shared request queue to the crawler. -You might have also noticed we passed in a second parameter to the constructor of the crawler, the `config` variable we created earlier. This is needed to ensure the crawler uses the worker-specific storages for internal states, and that they do not collide with each other. +You might have also noticed we passed in the `config` and `storageClient` we created earlier to the crawler. These ensure the crawler uses the worker-specific storages for its own internal state (so the workers do not collide with each other), while still consuming the shared, concurrency-safe `shop-urls` queue we provided explicitly. #### Why do we use `process.send` instead of `context.pushData`? diff --git a/docs/guides/parallel-scraping/shared.mjs b/docs/guides/parallel-scraping/shared.mjs index 0233e37fa5a9..fe1e758e6136 100644 --- a/docs/guides/parallel-scraping/shared.mjs +++ b/docs/guides/parallel-scraping/shared.mjs @@ -1,8 +1,16 @@ +import { FileSystemStorageClient } from '@crawlee/fs-storage'; import { RequestQueue } from 'crawlee'; // The request queue shared by all the parallel workers let queue; +// The `shop-urls` queue is opened concurrently by every worker process, so it must use the +// concurrency-safe locking behavior. 
With `assumeSoleOwner: false`, a request another worker is +// still processing is treated as a live peer's lock and is not handed out again until that lock +// expires — so two workers never scrape the same URL at once. (We point at the default `./storage` +// location, which is where this shared queue lives.) +const sharedStorageClient = new FileSystemStorageClient({ assumeSoleOwner: false }); + /** * @param {boolean} makeFresh Whether the queue should be cleared before returning it * @returns The queue @@ -12,11 +20,11 @@ export async function getOrInitQueue(makeFresh = false) { return queue; } - queue = await RequestQueue.open('shop-urls'); + queue = await RequestQueue.open('shop-urls', { storageClient: sharedStorageClient }); if (makeFresh) { await queue.drop(); - queue = await RequestQueue.open('shop-urls'); + queue = await RequestQueue.open('shop-urls', { storageClient: sharedStorageClient }); } return queue; diff --git a/packages/fs-storage/src/file-system-storage.ts b/packages/fs-storage/src/file-system-storage.ts index 57817234e30f..a845a1f13a0f 100644 --- a/packages/fs-storage/src/file-system-storage.ts +++ b/packages/fs-storage/src/file-system-storage.ts @@ -25,6 +25,22 @@ export interface FileSystemStorageOptions { * Optional logger for FileSystemStorageClient warnings. */ logger?: CrawleeLogger; + + /** + * Assert that this process is the *sole* consumer of every request queue it opens. + * + * When `true` (the default), opening a queue immediately reclaims any requests that a previous + * run left *in progress* (e.g. after a crash), so they become fetchable again right away. This is + * the right behavior for the common single-process crawl. + * + * Set this to `false` if multiple processes share the same on-disk request queue concurrently + * (for example, the {@apilink parallel scraping setup | "Parallel Scraping Guide"}). 
In that mode + * an in-progress request is treated as a potential live peer's lock and is only reclaimed once + * that lock expires on the wall clock, so two workers won't process the same request at once. + * + * @default true + */ + assumeSoleOwner?: boolean; } /** @@ -41,6 +57,7 @@ export class FileSystemStorageClient implements storage.StorageClient { readonly keyValueStoresDirectory: string; readonly requestQueuesDirectory: string; readonly logger?: CrawleeLogger; + readonly assumeSoleOwner: boolean; readonly keyValueStoreCache: KeyValueStoreClient[] = []; readonly datasetClientCache: DatasetClient[] = []; @@ -49,9 +66,11 @@ export class FileSystemStorageClient implements storage.StorageClient { constructor(options: FileSystemStorageOptions = {}) { s.object({ localDataDirectory: s.string().optional(), + assumeSoleOwner: s.boolean().optional(), }).parse(options); this.logger = options.logger; + this.assumeSoleOwner = options.assumeSoleOwner ?? true; // v3.0.0 used `crawlee_storage` as the default, we changed this in v3.0.1 to just `storage`, // this function handles it without making BC breaks - it respects existing `crawlee_storage` @@ -165,7 +184,15 @@ export class FileSystemStorageClient implements storage.StorageClient { } } - const nativeClient = await NativeRequestQueueClient.open(id, name, alias, this.localDataDirectory); + const nativeClient = await NativeRequestQueueClient.open( + id, + name, + alias, + this.localDataDirectory, + // useTestClock — always real wall-clock outside of native tests. + undefined, + this.assumeSoleOwner, + ); const newStore = await RequestQueueClient.create({ name: alias ? undefined : (name ?? cacheKey), cacheKey: cacheKey ?? 
'', diff --git a/packages/fs-storage/test/request-queue/assume-sole-owner.test.ts b/packages/fs-storage/test/request-queue/assume-sole-owner.test.ts new file mode 100644 index 000000000000..682899db1da1 --- /dev/null +++ b/packages/fs-storage/test/request-queue/assume-sole-owner.test.ts @@ -0,0 +1,85 @@ +import { rm } from 'node:fs/promises'; +import { resolve } from 'node:path'; + +import { FileSystemStorageClient } from '@crawlee/fs-storage'; + +// `assumeSoleOwner` controls how the native `@crawlee/fs-storage-native` extension treats requests +// left *in progress* by a previous run (a dangling `orderNo` lock on disk) when a queue is reopened. +// The reclaim/respect-peer-lock semantics are owned by the native extension; these tests verify the +// adapter's contract on top of it: the option defaults to `true`, is honored when set, and that the +// resulting behavior reaches all the way down to the native queue. +describe('FileSystemStorageClient assumeSoleOwner', () => { + const tmpLocation = resolve(import.meta.dirname, './tmp/assume-sole-owner'); + + afterEach(async () => { + await rm(tmpLocation, { force: true, recursive: true }); + }); + + test('defaults to true', () => { + const storage = new FileSystemStorageClient({ localDataDirectory: tmpLocation }); + expect(storage.assumeSoleOwner).toBe(true); + }); + + test('respects an explicit false', () => { + const storage = new FileSystemStorageClient({ localDataDirectory: tmpLocation, assumeSoleOwner: false }); + expect(storage.assumeSoleOwner).toBe(false); + }); + + // Seed a queue with two requests, fetch (lock) one without handling it or tearing down — leaving a + // dangling in-progress lock on disk, exactly the "process died mid-flight" situation. 
+ async function seedQueueWithDanglingLock(dir: string) { + const storage = new FileSystemStorageClient({ localDataDirectory: dir }); + const queue = await storage.createRequestQueueClient({ name: 'default' }); + await queue.addBatchOfRequests([ + { url: 'http://example.com/1', uniqueKey: '1' }, + { url: 'http://example.com/2', uniqueKey: '2' }, + ]); + const locked = await queue.fetchNextRequest(); + expect(locked).not.toBeNull(); + // Intentionally NO markRequestAsHandled and NO teardown/persistState — the lock is left dangling. + return locked!; + } + + test('true (default): reopening preserves contents but relinquishes the dangling lock', async () => { + const dir = resolve(tmpLocation, 'sole-owner-true'); + const locked = await seedQueueWithDanglingLock(dir); + + // Reopen the same directory as sole owner, without purging. + const reopened = new FileSystemStorageClient({ localDataDirectory: dir, assumeSoleOwner: true }); + const queue = await reopened.createRequestQueueClient({ name: 'default' }); + + // Contents preserved: both requests still present, none handled. + const metadata = await queue.getMetadata(); + expect(metadata.totalRequestCount).toBe(2); + expect(metadata.handledRequestCount).toBe(0); + expect(metadata.pendingRequestCount).toBe(2); + + // Lock relinquished: BOTH requests are fetchable again, including the one locked before. + const a = await queue.fetchNextRequest(); + const b = await queue.fetchNextRequest(); + expect([a?.uniqueKey, b?.uniqueKey].sort()).toStrictEqual(['1', '2']); + // The previously-locked request survived with its data intact. 
+ const reFetched = await queue.getRequest(locked.uniqueKey); + expect(reFetched?.url).toBe(locked.url); + }); + + test('false: reopening keeps the dangling lock (concurrency-safe mode)', async () => { + const dir = resolve(tmpLocation, 'sole-owner-false'); + await seedQueueWithDanglingLock(dir); + + // Reopen in concurrency-safe mode: an in-progress request is treated as a potential live peer's + // lock and is NOT reclaimed until it expires. + const reopened = new FileSystemStorageClient({ localDataDirectory: dir, assumeSoleOwner: false }); + const queue = await reopened.createRequestQueueClient({ name: 'default' }); + + // Contents are still preserved... + const metadata = await queue.getMetadata(); + expect(metadata.totalRequestCount).toBe(2); + expect(metadata.pendingRequestCount).toBe(2); + + // ...but only the un-locked request is handed out; the locked one stays in progress. + const a = await queue.fetchNextRequest(); + expect(a?.uniqueKey).toBe('2'); + expect(await queue.fetchNextRequest()).toBeNull(); + }); +});