From 1eb455ab70c88eb086b2dcd5a98cab5d798b7357 Mon Sep 17 00:00:00 2001 From: Victor Fusco <1221933+vfusco@users.noreply.github.com> Date: Thu, 11 Jun 2026 19:38:37 -0300 Subject: [PATCH 01/16] test(integration): shard suite into parallel compose projects --- .github/workflows/build.yml | 47 ++++++++++++-- .gitignore | 1 + Dockerfile | 4 +- Makefile | 94 +++++++++++++++++++++++++-- scripts/check-integration-shards.sh | 88 +++++++++++++++++++++++++ scripts/compose-integration-run.sh | 55 ++++++++++++++++ scripts/run-integration-tests.sh | 78 +++++++++++++++++++--- test/compose/compose.integration.yaml | 6 ++ test/integration/main_test.go | 13 +++- 9 files changed, 363 insertions(+), 23 deletions(-) create mode 100755 scripts/check-integration-shards.sh create mode 100755 scripts/compose-integration-run.sh diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index 8132b7ead..d0bae4548 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -218,10 +218,39 @@ jobs: - name: Run unit tests run: make unit-test-with-compose + # Runs the shard coverage guard and derives the integration matrix from the + # Makefile's INTEGRATION_SHARDS, so adding a shard never requires a workflow + # change and an unassigned test fails fast here. 
+ integration-test-setup: + runs-on: ubuntu-24.04 + outputs: + shards: ${{ steps.shards.outputs.shards }} + steps: + - name: Checkout source code + uses: actions/checkout@v4 + + - name: Setup Go + uses: actions/setup-go@v5 + with: + go-version-file: go.mod + + - name: Check shard coverage + run: make integration-test-shard-check + + - name: List shards + id: shards + run: echo "shards=$(make -s list-integration-shards)" >> "$GITHUB_OUTPUT" + integration-test: runs-on: ubuntu-24.04 - needs: [build] + needs: [build, integration-test-setup] timeout-minutes: 60 + strategy: + fail-fast: false + matrix: + shard: ${{ fromJson(needs.integration-test-setup.outputs.shards) }} + env: + COMPOSE_PROJECT: rollups-it-${{ github.run_id }}-${{ github.run_attempt }}-${{ matrix.shard }} permissions: contents: read packages: read @@ -257,17 +286,25 @@ jobs: - name: Download test dependencies run: make download-test-dependencies - - name: Run integration tests - run: make integration-test-with-compose + - name: Run integration shard ${{ matrix.shard }} + run: | + make integration-test-shard SHARD=${{ matrix.shard }} \ + COMPOSE_PROJECT="$COMPOSE_PROJECT" - name: Upload integration test logs if: always() uses: actions/upload-artifact@v4 with: - name: integration-test-logs - path: integration-logs.txt + name: integration-test-logs-${{ matrix.shard }} + path: integration-logs-${{ matrix.shard }}.txt retention-days: 3 + - name: Clean up compose project + if: always() + run: | + docker compose -p "$COMPOSE_PROJECT" \ + -f test/compose/compose.integration.yaml down -v --remove-orphans || true + publish_artifacts: name: Publish artifacts needs: [basic-checks, build, unit-test, integration-test] diff --git a/.gitignore b/.gitignore index fa3b0e0ae..ac2440d0c 100644 --- a/.gitignore +++ b/.gitignore @@ -21,3 +21,4 @@ machine-snapshot/** /applications /test/downloads /snapshots +/integration-logs*.txt diff --git a/Dockerfile b/Dockerfile index 6445e3f59..cfb172621 100644 --- a/Dockerfile +++ 
b/Dockerfile @@ -20,8 +20,10 @@ ARG GO_BUILD_PATH RUN </dev/null || true ; \ + done + @docker compose -p rollups-node-integration -f test/compose/compose.integration.yaml down -v --remove-orphans 2>/dev/null || true + # ============================================================================= # Tests # ============================================================================= @@ -497,9 +512,78 @@ unit-test-with-compose: $(CARTESI_TEST_MACHINE_IMAGES) ## Run unit tests using d lint-with-docker: ## Run linting inside Docker (no host Go needed) @docker run --rm cartesi/rollups-node:tester sh -c 'make lint && make vet && make fmt-check' +# ============================================================================= +# Integration test sharding +# ============================================================================= +# Each shard runs as an isolated Docker Compose project (own Anvil, Postgres, +# and test-managed node); tests within a shard stay sequential. The anchors +# (^...$$) are load-bearing: go test -run matches unanchored, and several test +# names are prefixes of others (Foreclose / ForecloseReplay / ForeclosePrt). +# Every top-level test must match exactly one shard; this is enforced by +# `make integration-test-shard-check`. +# +# Shards are grouped by semantic family, not balanced by runtime: `withdrawal` +# is a single test while `restart` and `replay` are the heaviest. Each shard +# gets its own CI runner and the full per-job `go test -timeout 55m` +# (run-integration-tests.sh) budget; `restart` (multi-suite, ~25-min setup +# contexts) is the first to watch if a shard ever approaches that ceiling. +# Discovery (integration-test-shard-check) lists tests with a plain Go +# toolchain, so the integration package must stay free of the Cartesi CGo +# dependency for the check to build on the CI setup runner. 
+INTEGRATION_SHARDS := basic quorum prt replay restart withdrawal + +INTEGRATION_SHARD_basic := ^Test(EchoAuthority|RejectException|MultiApp|EchoAuthorityStaging)$$ +INTEGRATION_SHARD_quorum := ^Test(EchoQuorum|SameBlockInputs)$$ +INTEGRATION_SHARD_prt := ^Test(EchoPrt|RejectExceptionPrt|ForeclosePrt)$$ +INTEGRATION_SHARD_replay := ^Test(Foreclose|ForecloseReplay|DivergentClaim)$$ +INTEGRATION_SHARD_restart := ^Test(Restart|SnapshotPolicy)$$ +INTEGRATION_SHARD_withdrawal := ^TestWithdrawalLifecycle$$ + +COMPOSE_PROJECT ?= rollups-node-integration +INTEGRATION_LOGS ?= integration-logs.txt +INTEGRATION_TEST_JOBS ?= 3 + integration-test-with-compose: $(CARTESI_TEST_MACHINE_IMAGES) ## Run integration tests using docker compose with auto-shutdown - @trap 'docker compose -f test/compose/compose.integration.yaml logs --no-color > integration-logs.txt 2>&1 || true; docker compose -f test/compose/compose.integration.yaml down -v || true' EXIT && \ - docker compose -f test/compose/compose.integration.yaml run --rm --remove-orphans integration-test + @COMPOSE_PROJECT='$(COMPOSE_PROJECT)' INTEGRATION_LOGS='$(INTEGRATION_LOGS)' \ + TEST_PATTERN='$(TEST_PATTERN)' SHARD_NAME='$(SHARD_NAME)' \ + GOTESTSUM_FORMAT='$(GOTESTSUM_FORMAT)' \ + scripts/compose-integration-run.sh + +# Validate SHARD at parse time so a bad invocation fails before any +# prerequisite work (e.g. downloading test machine images). +ifneq ($(filter integration-test-shard,$(MAKECMDGOALS)),) +ifeq ($(strip $(SHARD)),) +$(error SHARD is required. Known shards: $(INTEGRATION_SHARDS)) +endif +ifeq ($(strip $(INTEGRATION_SHARD_$(SHARD))),) +$(error unknown shard '$(SHARD)'. 
Known shards: $(INTEGRATION_SHARDS)) +endif +endif + +integration-test-shard: $(CARTESI_TEST_MACHINE_IMAGES) ## Run one integration shard in an isolated compose project (requires SHARD=<name>) + @COMPOSE_PROJECT='$(if $(filter rollups-node-integration,$(COMPOSE_PROJECT)),rollups-node-integration-$(SHARD),$(COMPOSE_PROJECT))' \ + INTEGRATION_LOGS='integration-logs-$(SHARD).txt' \ + TEST_PATTERN='$(INTEGRATION_SHARD_$(SHARD))' \ + SHARD_NAME='$(SHARD)' \ + GOTESTSUM_FORMAT='$(GOTESTSUM_FORMAT)' \ + scripts/compose-integration-run.sh + +integration-test-sharded-local: $(CARTESI_TEST_MACHINE_IMAGES) integration-test-shard-check ## Run all integration shards with bounded concurrency + @$(MAKE) -k -j $(INTEGRATION_TEST_JOBS) $(addprefix run-integration-shard-,$(INTEGRATION_SHARDS)) + +run-integration-shard-%: + @$(MAKE) integration-test-shard SHARD=$* + +integration-test-shard-check: ## Verify every integration test belongs to exactly one shard + @scripts/check-integration-shards.sh \ + $(foreach s,$(INTEGRATION_SHARDS),'$(s)=$(INTEGRATION_SHARD_$(s))') + +# Used by CI to build the integration matrix from the single source of truth. 
+comma := , +empty := +space := $(empty) $(empty) +list-integration-shards: ## Print integration shard names as a JSON array (for the CI matrix) + @echo '[$(subst $(space),$(comma),$(patsubst %,"%",$(INTEGRATION_SHARDS)))]' test-with-compose: ## Run all tests using docker compose with auto-shutdown @$(MAKE) unit-test-with-compose @@ -534,6 +618,7 @@ load-test: deploy-load-test-apps ## Deploy 3 apps and run advancer starvation lo ci-test: ## Run the full CI test pipeline locally (lint + unit + integration) # @$(MAKE) lint-with-docker + @$(MAKE) integration-test-shard-check @$(MAKE) unit-test-with-compose @$(MAKE) integration-test-with-compose @@ -574,8 +659,9 @@ build-debian-package: install .PHONY: \ build build-go $(GO_ARTIFACTS) cartesi-rollups-machine-tool \ - clean clean-go clean-contracts clean-docs clean-devnet-files clean-dapps clean-test-dependencies clean-debian-packages \ + clean clean-go clean-contracts clean-docs clean-devnet-files clean-dapps clean-test-dependencies clean-test-logs clean-integration-compose clean-debian-packages \ test unit-test unit-test-with-compose integration-test integration-test-with-compose integration-test-local test-with-compose ci-test coverage-report \ + integration-test-shard integration-test-sharded-local integration-test-shard-check list-integration-shards \ generate generate-contracts generate-config generate-inspect check-generate generate-db \ docs generate-cli-docs generate-config-docs \ lint fmt fmt-check vet escape \ diff --git a/scripts/check-integration-shards.sh b/scripts/check-integration-shards.sh new file mode 100755 index 000000000..ffffe4a7c --- /dev/null +++ b/scripts/check-integration-shards.sh @@ -0,0 +1,88 @@ +#!/bin/bash +# (c) Cartesi and individual authors (see AUTHORS) +# SPDX-License-Identifier: Apache-2.0 (see LICENSE) +# +# Shard coverage guard: verifies that every top-level integration test +# belongs to exactly one shard and that every shard matches at least one +# test. 
Prevents false greens when a new integration test is added but not +# assigned to a shard. +# +# Usage: check-integration-shards.sh <name>=<regex> [<name>=<regex> ...] +# +# Test discovery uses `go test -list`, which builds the package and runs +# TestMain (TestMain skips node management when listing). The package has no +# CGo dependency on the Cartesi C library, so a plain Go toolchain suffices. + +set -euo pipefail + +cd "$(dirname "$0")/.." + +if [ "$#" -lt 1 ]; then + echo "usage: $0 <name>=<regex> [<name>=<regex> ...]" >&2 + exit 1 +fi + +SHARD_NAMES=() +SHARD_REGEXES=() +for arg in "$@"; do + name="${arg%%=*}" + regex="${arg#*=}" + if [ -z "$name" ] || [ -z "$regex" ] || [ "$name" = "$arg" ]; then + echo "ERROR: malformed shard spec '$arg' (expected <name>=<regex>)" >&2 + exit 1 + fi + SHARD_NAMES+=("$name") + SHARD_REGEXES+=("$regex") +done + +# Discover top-level tests. Keep the build/list step separate from the grep so +# a build failure surfaces as a build failure — otherwise the grep swallows the +# empty output and the script misreports it as "no tests discovered". +if ! list_output=$(go test -list '^Test' -tags=endtoendtests ./test/integration/... 2>&1); then + echo "ERROR: failed to list integration tests (build error?):" >&2 + echo "$list_output" >&2 + exit 1 +fi + +# Filter out the trailing "ok <package>" summary line of -list. +TESTS=$(printf '%s\n' "$list_output" | grep -E '^Test[A-Za-z0-9_]*$' || true) + +if [ -z "$TESTS" ]; then + echo "ERROR: no top-level integration tests discovered" >&2 + exit 1 +fi + +fail=0 + +while IFS= read -r t; do + [ -n "$t" ] || continue + count=0 + matched="" + for i in "${!SHARD_NAMES[@]}"; do + if printf '%s\n' "$t" | grep -Eq -- "${SHARD_REGEXES[$i]}"; then + count=$((count + 1)) + matched="$matched ${SHARD_NAMES[$i]}" + fi + done + if [ "$count" -eq 0 ]; then + echo "ERROR: test $t matches no shard" >&2 + fail=1 + elif [ "$count" -gt 1 ]; then + echo "ERROR: test $t matches multiple shards:$matched" >&2 + fail=1 + fi +done <<<"$TESTS" + +for i in "${!SHARD_NAMES[@]}"; do + if ! 
printf '%s\n' "$TESTS" | grep -Eq -- "${SHARD_REGEXES[$i]}"; then + echo "ERROR: shard ${SHARD_NAMES[$i]} (${SHARD_REGEXES[$i]}) matches no tests" >&2 + fail=1 + fi +done + +if [ "$fail" -ne 0 ]; then + echo "FAIL: shard coverage check failed" >&2 + exit 1 +fi + +echo "OK: $(printf '%s\n' "$TESTS" | wc -l | tr -d ' ') tests covered by ${#SHARD_NAMES[@]} shards" diff --git a/scripts/compose-integration-run.sh b/scripts/compose-integration-run.sh new file mode 100755 index 000000000..344c47eea --- /dev/null +++ b/scripts/compose-integration-run.sh @@ -0,0 +1,55 @@ +#!/bin/bash +# (c) Cartesi and individual authors (see AUTHORS) +# SPDX-License-Identifier: Apache-2.0 (see LICENSE) +# +# Runs the integration-test Compose service inside an isolated Compose +# project and captures all logs (run output, node log, service logs) into a +# single log file. Cleanup is always project-scoped so concurrent shards do +# not interfere with each other. +# +# Usage: compose-integration-run.sh +# +# Environment: +# COMPOSE_PROJECT Compose project name (required) +# INTEGRATION_LOGS Log file to write (required; truncated at start) +# TEST_PATTERN Optional anchored regex selecting a shard of top-level +# tests (forwarded to the test container; empty = full suite) +# SHARD_NAME Optional shard label (log readability only) + +set -euo pipefail + +COMPOSE_FILE="test/compose/compose.integration.yaml" +NODE_LOG_PATH="/var/lib/cartesi-rollups-node/logs/node.log" + +: "${COMPOSE_PROJECT:?COMPOSE_PROJECT is required}" +: "${INTEGRATION_LOGS:?INTEGRATION_LOGS is required}" +export TEST_PATTERN="${TEST_PATTERN:-}" +export SHARD_NAME="${SHARD_NAME:-full}" + +compose() { + docker compose -p "$COMPOSE_PROJECT" -f "$COMPOSE_FILE" "$@" +} + +cleanup() { + # The in-container trap already prints the node log into the run output; + # this volume copy covers abnormal exits (e.g. an OOM-killed container). 
+ { + echo + echo "=== NODE LOG (from volume) ===" + } >>"$INTEGRATION_LOGS" + compose run --rm --no-deps --entrypoint cat integration-test \ + "$NODE_LOG_PATH" >>"$INTEGRATION_LOGS" 2>/dev/null || true + { + echo + echo "=== COMPOSE SERVICE LOGS ===" + } >>"$INTEGRATION_LOGS" + compose logs --no-color >>"$INTEGRATION_LOGS" 2>&1 || true + compose down -v --remove-orphans || true +} +trap cleanup EXIT + +: >"$INTEGRATION_LOGS" +echo "Running integration tests (project=$COMPOSE_PROJECT shard=$SHARD_NAME logs=$INTEGRATION_LOGS)" + +# pipefail keeps the test exit code authoritative despite the tee. +compose run --rm --remove-orphans integration-test 2>&1 | tee -a "$INTEGRATION_LOGS" diff --git a/scripts/run-integration-tests.sh b/scripts/run-integration-tests.sh index 8b00fdbdf..e0a6dab13 100755 --- a/scripts/run-integration-tests.sh +++ b/scripts/run-integration-tests.sh @@ -7,8 +7,13 @@ # so this script only needs to set up PATH and run the tests. # # Usage: run-integration-tests.sh +# +# Environment: +# TEST_PATTERN Optional anchored regex passed to go test -run to select a +# shard of top-level tests. Empty means run the full suite. +# SHARD_NAME Optional shard label, used only for log readability. -set -eu +set -euo pipefail export PATH="/opt/go/bin:/build/cartesi/go/rollups-node:$PATH" @@ -20,20 +25,73 @@ if ! command -v cartesi-rollups-machine-tool >/dev/null 2>&1; then fi which cartesi-rollups-machine-tool || { echo "ERROR: cartesi-rollups-machine-tool not found on PATH"; exit 1; } -# Print the node log on exit so it appears in docker compose logs. NODE_LOG="${CARTESI_TEST_NODE_LOG_FILE:-}" -if [ -n "$NODE_LOG" ]; then - trap 'echo "=== NODE LOG ==="; cat "$NODE_LOG" 2>/dev/null || true' EXIT +REPORT="$(mktemp)" + +cleanup() { + # Print the node log on exit so it appears in docker compose logs. 
+ if [ -n "$NODE_LOG" ]; then + echo "=== NODE LOG ===" + cat "$NODE_LOG" 2>/dev/null || true + fi + rm -f "$REPORT" +} +trap cleanup EXIT + +# A skipped top-level test is not a pass. In the compose/CI topology the node +# is always test-managed, so an entire top-level test skipping (e.g. TestRestart +# deciding the node looks externally managed) means the shard reported success +# without exercising what it exists to cover — a false green. Suite/subtest +# skips are allowed; only whole top-level Test* functions are checked. +report_skips() { + if [ -n "$1" ]; then + echo "ERROR: top-level test(s) skipped in shard '${SHARD_NAME:-full}' (a skip is not a pass):" >&2 + echo "$1" | sed 's/^/ - /' >&2 + return 1 + fi + return 0 +} + +# Parse skipped top-level tests from a go test -json event stream. The +# "Test":"Test..." match deliberately excludes names containing '/', so +# subtest skips (e.g. TestEchoQuorum/Foo) are ignored. +toplevel_skips_json() { + grep '"Action":"skip"' "$1" 2>/dev/null \ + | grep -oE '"Test":"Test[A-Za-z0-9_]*"' \ + | sed -E 's/.*"Test":"([^"]*)".*/\1/' \ + | sort -u || true +} + +# Parse skipped top-level tests from captured `go test -v` output. Top-level +# SKIP lines start at column 0; subtest SKIP lines are indented. +toplevel_skips_verbose() { + grep -E '^--- SKIP: Test' "$1" 2>/dev/null \ + | sed -E 's/^--- SKIP: (Test[A-Za-z0-9_]*).*/\1/' \ + | sort -u || true +} + +# Shard selection: a non-empty TEST_PATTERN narrows the run to the matching +# top-level tests. Built as a bash array so the pattern is never re-expanded +# by the shell. +GO_TEST_ARGS=(-count=1 -v -timeout 55m -ldflags "-r /opt/cartesi/lib" -tags=endtoendtests) +if [ -n "${TEST_PATTERN:-}" ]; then + echo "Running integration shard '${SHARD_NAME:-unnamed}' with -run '${TEST_PATTERN}'" + GO_TEST_ARGS+=(-run "${TEST_PATTERN}") fi +GO_TEST_ARGS+=(./test/integration/...) 
# Timeout must be less than the CI job timeout-minutes (60) to produce # a useful go test panic instead of an abrupt CI kill. +status=0 if command -v gotestsum >/dev/null 2>&1; then - gotestsum --format testdox -- -count=1 -v -timeout 55m \ - -ldflags "-r /opt/cartesi/lib" \ - -tags=endtoendtests ./test/integration/... + # --jsonfile captures the machine-readable event stream alongside the + # human-readable --format output, so we can post-check for skipped tests. + gotestsum --jsonfile "$REPORT" --format "${GOTESTSUM_FORMAT:-testdox}" \ + -- "${GO_TEST_ARGS[@]}" || status=$? + report_skips "$(toplevel_skips_json "$REPORT")" || status=1 else - go test -count=1 -v -timeout 55m \ - -ldflags "-r /opt/cartesi/lib" \ - -tags=endtoendtests ./test/integration/... + go test "${GO_TEST_ARGS[@]}" | tee "$REPORT" || status=$? + report_skips "$(toplevel_skips_verbose "$REPORT")" || status=1 fi + +exit "$status" diff --git a/test/compose/compose.integration.yaml b/test/compose/compose.integration.yaml index 7e8ef715d..49a86f336 100644 --- a/test/compose/compose.integration.yaml +++ b/test/compose/compose.integration.yaml @@ -109,6 +109,12 @@ services: restart: "no" environment: <<: *env + # Shard selection (empty = full suite); see scripts/run-integration-tests.sh. + TEST_PATTERN: ${TEST_PATTERN:-} + SHARD_NAME: ${SHARD_NAME:-full} + # testdox prints one line per completed test; standard-verbose streams + # go test -v output live (VERBOSE=true in the Makefile selects it). 
+ GOTESTSUM_FORMAT: ${GOTESTSUM_FORMAT:-testdox} CARTESI_BLOCKCHAIN_DEFAULT_BLOCK: latest CARTESI_TEST_DAPP_PATH: /var/lib/cartesi-rollups-node/dapps/echo-dapp CARTESI_TEST_REJECT_DAPP_PATH: /var/lib/cartesi-rollups-node/dapps/reject-loop-dapp diff --git a/test/integration/main_test.go b/test/integration/main_test.go index 8595d6756..6b8912447 100644 --- a/test/integration/main_test.go +++ b/test/integration/main_test.go @@ -33,6 +33,11 @@ func TestMain(m *testing.M) { os.Exit(0) } + // -list only builds and lists tests; skip node management entirely. + if l := flag.Lookup("test.list"); l != nil && l.Value.String() != "" { + os.Exit(m.Run()) + } + // Enforce sequential execution — tests share blockchain state. p := flag.Lookup("test.parallel") if p != nil && p.Value.String() != "1" { @@ -46,9 +51,11 @@ func TestMain(m *testing.M) { } } - // Start the node if none is running (local execution). - // In Docker Compose, the node is a separate container and is already - // running — we detect this by checking if port 10000 is in use. + // In both local and Compose runs the node is started here by TestMain + // (the Compose integration-test service runs this same test binary). The + // port check only guards against a node already running on 10000 — e.g. one + // a developer started by hand — in which case we attach to it and skip the + // restart tests rather than fighting over the port. 
if nodePortAvailable() { artifactsDir, err := integrationArtifactsDir() if err != nil { From be389462a29410806f9fdef517e03ca773bcda85 Mon Sep 17 00:00:00 2001 From: Victor Fusco <1221933+vfusco@users.noreply.github.com> Date: Thu, 11 Jun 2026 20:07:19 -0300 Subject: [PATCH 02/16] ci: bump actions to node 24 and harden workflows --- .github/license-check/config.json | 13 ------ .github/workflows/build.yml | 63 ++++++++++++++------------- .github/workflows/clean-up-images.yml | 34 +++++++-------- Makefile | 5 ++- README.md | 2 +- scripts/check-license-header.sh | 59 +++++++++++++++++++++++++ 6 files changed, 113 insertions(+), 63 deletions(-) delete mode 100644 .github/license-check/config.json create mode 100755 scripts/check-license-header.sh diff --git a/.github/license-check/config.json b/.github/license-check/config.json deleted file mode 100644 index 15b235e41..000000000 --- a/.github/license-check/config.json +++ /dev/null @@ -1,13 +0,0 @@ -[ - { - "include": [ - "**/*.go" - ], - "exclude": [ - "internal/repository/postgres/db/**", - "pkg/contracts/**", - "pkg/inspectclient/generated.go" - ], - "license": ".github/license-check/header.txt" - } -] diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index d0bae4548..a78b5d579 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -15,25 +15,22 @@ jobs: basic-checks: runs-on: ubuntu-24.04 steps: - - uses: actions/checkout@v4 + - uses: actions/checkout@v6 with: submodules: recursive fetch-depth: 0 - name: Check conventional commit - uses: cocogitto/cocogitto-action@v3 + uses: cocogitto/cocogitto-action@c7a74f5406bab86da17da0f0e460a69f8219a68c # v3 id: conventional_commit_check with: check-latest-tag-only: true - name: Check license header - uses: viperproject/check-license-header@v2 - with: - path: ./ - config: .github/license-check/config.json + run: make check-license - name: Lint Markdown docs - uses: DavidAnson/markdownlint-cli2-action@v16 + uses: 
DavidAnson/markdownlint-cli2-action@ded1f9488f68a970bc66ea5619e13e9b52e601cd # v23 with: globs: | *.md @@ -47,7 +44,7 @@ jobs: packages: write steps: - name: Checkout source code - uses: actions/checkout@v4 + uses: actions/checkout@v6 with: submodules: recursive @@ -55,17 +52,17 @@ jobs: run: echo ROLLUPS_NODE_VERSION=`make version` >> $GITHUB_ENV - name: Login to GHCR - uses: docker/login-action@v3 + uses: docker/login-action@650006c6eb7dba73a995cc03b0b2d7f5ca915bee # v4 with: registry: ${{ env.GHCR_REGISTRY }} username: ${{ github.actor }} password: ${{ secrets.GITHUB_TOKEN }} - name: Set up Depot CLI - uses: depot/setup-action@v1 + uses: depot/setup-action@15c09a5f77a0840ad4bce955686522a257853461 # v1 - name: Build rollups-node image (amd64) - uses: depot/build-push-action@v1 + uses: depot/build-push-action@5f3b3c2e5a00f0093de47f657aeaefcedff27d18 # v1 with: file: Dockerfile context: . @@ -76,7 +73,7 @@ jobs: token: ${{ secrets.DEPOT_TOKEN }} - name: Build rollups-node image (arm64) - uses: depot/build-push-action@v1 + uses: depot/build-push-action@5f3b3c2e5a00f0093de47f657aeaefcedff27d18 # v1 with: file: Dockerfile context: . @@ -87,7 +84,7 @@ jobs: token: ${{ secrets.DEPOT_TOKEN }} - name: Build tester image - uses: depot/build-push-action@v1 + uses: depot/build-push-action@5f3b3c2e5a00f0093de47f657aeaefcedff27d18 # v1 with: file: Dockerfile context: . @@ -99,7 +96,7 @@ jobs: token: ${{ secrets.DEPOT_TOKEN }} - name: Build devnet image - uses: depot/build-push-action@v1 + uses: depot/build-push-action@5f3b3c2e5a00f0093de47f657aeaefcedff27d18 # v1 with: file: test/devnet/Dockerfile context: . @@ -110,7 +107,7 @@ jobs: token: ${{ secrets.DEPOT_TOKEN }} - name: Build debian package (amd64) - uses: depot/build-push-action@v1 + uses: depot/build-push-action@5f3b3c2e5a00f0093de47f657aeaefcedff27d18 # v1 with: file: Dockerfile context: . 
@@ -126,7 +123,7 @@ jobs: run: make copy-debian-package BUILD_PLATFORM=linux/amd64 DEB_ARCH=amd64 DEB_PACKAGER_IMG=${{ github.repository_owner }}/rollups-node:debian-packager-amd64 - name: Build debian package (arm64) - uses: depot/build-push-action@v1 + uses: depot/build-push-action@5f3b3c2e5a00f0093de47f657aeaefcedff27d18 # v1 with: file: Dockerfile context: . @@ -142,7 +139,7 @@ jobs: run: make copy-debian-package BUILD_PLATFORM=linux/arm64 DEB_ARCH=arm64 DEB_PACKAGER_IMG=${{ github.repository_owner }}/rollups-node:debian-packager-arm64 - name: Upload deb artifacts - uses: actions/upload-artifact@v4 + uses: actions/upload-artifact@v7 with: name: artifacts path: | @@ -159,10 +156,10 @@ jobs: packages: read steps: - name: Checkout source code - uses: actions/checkout@v4 + uses: actions/checkout@v6 - name: Login to GHCR - uses: docker/login-action@v3 + uses: docker/login-action@650006c6eb7dba73a995cc03b0b2d7f5ca915bee # v4 with: registry: ${{ env.GHCR_REGISTRY }} username: ${{ github.actor }} @@ -185,10 +182,10 @@ jobs: packages: read steps: - name: Checkout source code - uses: actions/checkout@v4 + uses: actions/checkout@v6 - name: Login to GHCR - uses: docker/login-action@v3 + uses: docker/login-action@650006c6eb7dba73a995cc03b0b2d7f5ca915bee # v4 with: registry: ${{ env.GHCR_REGISTRY }} username: ${{ github.actor }} @@ -207,7 +204,7 @@ jobs: cartesi/rollups-node-devnet:devel - name: Cache test machine images - uses: actions/cache@v4 + uses: actions/cache@v5 with: path: test/downloads key: test-deps-${{ hashFiles('test/dependencies.sha256') }} @@ -227,10 +224,10 @@ jobs: shards: ${{ steps.shards.outputs.shards }} steps: - name: Checkout source code - uses: actions/checkout@v4 + uses: actions/checkout@v6 - name: Setup Go - uses: actions/setup-go@v5 + uses: actions/setup-go@v6 with: go-version-file: go.mod @@ -256,10 +253,10 @@ jobs: packages: read steps: - name: Checkout source code - uses: actions/checkout@v4 + uses: actions/checkout@v6 - name: Login to GHCR - 
uses: docker/login-action@v3 + uses: docker/login-action@650006c6eb7dba73a995cc03b0b2d7f5ca915bee # v4 with: registry: ${{ env.GHCR_REGISTRY }} username: ${{ github.actor }} @@ -278,7 +275,7 @@ jobs: cartesi/rollups-node-devnet:devel - name: Cache test machine images - uses: actions/cache@v4 + uses: actions/cache@v5 with: path: test/downloads key: test-deps-${{ hashFiles('test/dependencies.sha256') }} @@ -293,12 +290,16 @@ jobs: - name: Upload integration test logs if: always() - uses: actions/upload-artifact@v4 + uses: actions/upload-artifact@v7 with: name: integration-test-logs-${{ matrix.shard }} path: integration-logs-${{ matrix.shard }}.txt retention-days: 3 + # Redundant with the trap in compose-integration-run.sh; this is the + # safety net for when concurrency cancel-in-progress SIGKILLs the make + # process before its trap can fire, which would otherwise leak the + # project's containers and volumes. - name: Clean up compose project if: always() run: | @@ -314,13 +315,13 @@ jobs: contents: write steps: - name: Checkout emulator source code - uses: actions/checkout@v4 + uses: actions/checkout@v6 - name: Download artifacts - uses: actions/download-artifact@v4 + uses: actions/download-artifact@v8 - name: Upload products to GitHub Release - uses: softprops/action-gh-release@v2 + uses: softprops/action-gh-release@718ea10b132b3b2eba29c1007bb80653f286566b # v3 with: draft: true files: | diff --git a/.github/workflows/clean-up-images.yml b/.github/workflows/clean-up-images.yml index 2ca11d67b..d806eb60e 100644 --- a/.github/workflows/clean-up-images.yml +++ b/.github/workflows/clean-up-images.yml @@ -1,11 +1,16 @@ # yaml-language-server: $schema=https://json.schemastore.org/github-workflow.json name: Clean-up Docker images +# build.yml pushes ci-tagged images to GHCR on every push; this periodic sweep +# is what keeps them from accumulating without bound. on: - pull_request: - branches: - - main - types: [closed] + schedule: + # Approximately every 5 days. 
cron has no true "every N days", so this + # fires on days-of-month 1,6,11,16,21,26,31 and resets at each month + # start (one short gap at the boundary) — fine for a cleanup with a + # 7-day age floor. + - cron: "0 3 */5 * *" + workflow_dispatch: jobs: cleanup: @@ -19,22 +24,14 @@ jobs: - rollups-node - rollups-node-devnet steps: - # Remove PR-scoped tags immediately. - - name: Prune PR tags - uses: vlaurin/action-ghcr-prune@v0.6.0 - with: - organization: cartesi - container: ${{ matrix.image }} - token: ${{ secrets.GITHUB_TOKEN }} - prune-untagged: false - keep-last: 0 - prune-tags-regexes: | - ^pr-${{ github.event.number }}$ - # Prune stale CI images older than 7 days to avoid deleting # images needed by concurrently running workflows. - name: Prune stale CI tags - uses: vlaurin/action-ghcr-prune@v0.6.0 + # Pinned to a commit: third-party action holding a packages:write + # token. Still node20: no node24 release exists upstream (checked + # 2026-06); works until the runner removes node20 (announced + # 2026-09-16). 
+ uses: vlaurin/action-ghcr-prune@0cf7d39f88546edd31965acba78cdcb0be14d641 # v0.6.0 with: organization: cartesi container: ${{ matrix.image }} @@ -44,3 +41,6 @@ jobs: older-than: 7 days prune-tags-regexes: | ^ci- + ^devel-amd64-ci- + ^devel-arm64-ci- + ^tester-ci- diff --git a/Makefile b/Makefile index bba085193..0cb1fc79c 100644 --- a/Makefile +++ b/Makefile @@ -512,6 +512,9 @@ unit-test-with-compose: $(CARTESI_TEST_MACHINE_IMAGES) ## Run unit tests using d lint-with-docker: ## Run linting inside Docker (no host Go needed) @docker run --rm cartesi/rollups-node:tester sh -c 'make lint && make vet && make fmt-check' +check-license: ## Verify license headers on Go source files + @scripts/check-license-header.sh + # ============================================================================= # Integration test sharding # ============================================================================= @@ -664,7 +667,7 @@ build-debian-package: install integration-test-shard integration-test-sharded-local integration-test-shard-check list-integration-shards \ generate generate-contracts generate-config generate-inspect check-generate generate-db \ docs generate-cli-docs generate-config-docs \ - lint fmt fmt-check vet escape \ + lint fmt fmt-check vet escape check-license \ devnet image tester-image debian-packager run-with-compose shutdown-compose \ start start-devnet start-postgres stop stop-devnet stop-postgres restart restart-devnet restart-postgres \ install copy-debian-package build-debian-package \ diff --git a/README.md b/README.md index 8d6500da5..891bd548e 100644 --- a/README.md +++ b/README.md @@ -20,7 +20,7 @@ We provide packages for debian (.deb) in **amd64** and **arm64** variants on the - GNU Make >= 3.81 - Go >= 1.24.1 -Follow the Cartesi Machine installation instructions [here](https://github.com/cartesi/machine-emulator?tab=readme-ov-file#installation). 
+Follow the [Cartesi Machine installation instructions](https://github.com/cartesi/machine-emulator?tab=readme-ov-file#installation). ##### Build diff --git a/scripts/check-license-header.sh b/scripts/check-license-header.sh new file mode 100755 index 000000000..e587e84fe --- /dev/null +++ b/scripts/check-license-header.sh @@ -0,0 +1,59 @@ +#!/bin/bash +# (c) Cartesi and individual authors (see AUTHORS) +# SPDX-License-Identifier: Apache-2.0 (see LICENSE) +# +# Verifies that every tracked Go source file carries the license header +# (.github/license-check/header.txt) as a contiguous, in-order block within +# its first lines. The block may be preceded by a "Code generated ... DO NOT +# EDIT." preamble. Generated code that lacks the header entirely is excluded. +# +# Usage: check-license-header.sh + +set -euo pipefail + +cd "$(dirname "$0")/.." + +HEADER_FILE=".github/license-check/header.txt" +TOP_LINES=10 + +# has_header <file>: succeeds iff the exact header block appears, in order and +# contiguous, somewhere within the first TOP_LINES lines of <file>. +# +# The previous implementation counted how many of the first lines matched any +# header line and compared the count to the header length. That was order- and +# uniqueness-blind: a reversed header passed, and so did a file with the +# copyright line duplicated but the SPDX line missing. We compare positionally +# instead so the header must appear verbatim and in the right order. +has_header() { + head -n "$TOP_LINES" "$1" | awk -v hdr="$HEADER_FILE" ' + BEGIN { n = 0; while ((getline line < hdr) > 0) h[++n] = line } + { buf[NR] = $0 } + END { + for (i = 1; i + n - 1 <= NR; i++) { + ok = 1 + for (j = 1; j <= n; j++) + if (buf[i + j - 1] != h[j]) { ok = 0; break } + if (ok) exit 0 + } + exit 1 + }' +} + +fail=0 +while IFS= read -r f; do + if ! 
has_header "$f"; then + echo "ERROR: missing or wrong license header: $f" >&2 + fail=1 + fi +done < <(git ls-files '*.go' \ + ':!internal/repository/postgres/db' \ + ':!pkg/contracts' \ + ':!pkg/inspectclient/generated.go') + +if [ "$fail" -ne 0 ]; then + echo "FAIL: license header check failed (expected header below)" >&2 + cat "$HEADER_FILE" >&2 + exit 1 +fi + +echo "OK: license headers present" From e3a3ce4f965bd917975a13a19dc00db2bd68cfdd Mon Sep 17 00:00:00 2001 From: Victor Fusco <1221933+vfusco@users.noreply.github.com> Date: Sat, 20 Jun 2026 01:04:36 -0300 Subject: [PATCH 03/16] test(integration): add multiprocess node topology axis --- .github/workflows/build.yml | 33 ++- Makefile | 209 +++++++++++---- scripts/compose-integration-run.sh | 31 ++- test/compose/compose.integration.yaml | 3 + test/integration/main_test.go | 143 +++++++---- test/integration/multinode_helpers_test.go | 280 +++++++++++++++++++++ test/integration/node_helpers_test.go | 64 +++-- 7 files changed, 629 insertions(+), 134 deletions(-) create mode 100644 test/integration/multinode_helpers_test.go diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index a78b5d579..4e270c391 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -216,12 +216,12 @@ jobs: run: make unit-test-with-compose # Runs the shard coverage guard and derives the integration matrix from the - # Makefile's INTEGRATION_SHARDS, so adding a shard never requires a workflow - # change and an unassigned test fails fast here. + # Makefile's shard × topology cells, so adding a shard or topology never + # requires a workflow change and an unassigned test fails fast here. 
integration-test-setup: runs-on: ubuntu-24.04 outputs: - shards: ${{ steps.shards.outputs.shards }} + cells: ${{ steps.matrix.outputs.cells }} steps: - name: Checkout source code uses: actions/checkout@v6 @@ -234,9 +234,9 @@ jobs: - name: Check shard coverage run: make integration-test-shard-check - - name: List shards - id: shards - run: echo "shards=$(make -s list-integration-shards)" >> "$GITHUB_OUTPUT" + - name: List shard x topology cells + id: matrix + run: echo "cells=$(make -s list-integration-cells)" >> "$GITHUB_OUTPUT" integration-test: runs-on: ubuntu-24.04 @@ -245,9 +245,9 @@ jobs: strategy: fail-fast: false matrix: - shard: ${{ fromJson(needs.integration-test-setup.outputs.shards) }} + include: ${{ fromJson(needs.integration-test-setup.outputs.cells) }} env: - COMPOSE_PROJECT: rollups-it-${{ github.run_id }}-${{ github.run_attempt }}-${{ matrix.shard }} + COMPOSE_PROJECT: rollups-it-${{ github.run_id }}-${{ github.run_attempt }}-${{ matrix.shard }}-${{ matrix.topology }} permissions: contents: read packages: read @@ -283,17 +283,18 @@ jobs: - name: Download test dependencies run: make download-test-dependencies - - name: Run integration shard ${{ matrix.shard }} + - name: Run integration shard ${{ matrix.shard }} (${{ matrix.topology }}) run: | - make integration-test-shard SHARD=${{ matrix.shard }} \ + make integration-test-with-compose SHARD=${{ matrix.shard }} \ + NODE_TOPOLOGY=${{ matrix.topology }} \ COMPOSE_PROJECT="$COMPOSE_PROJECT" - name: Upload integration test logs if: always() uses: actions/upload-artifact@v7 with: - name: integration-test-logs-${{ matrix.shard }} - path: integration-logs-${{ matrix.shard }}.txt + name: integration-test-logs-${{ matrix.shard }}-${{ matrix.topology }} + path: integration-logs-${{ matrix.shard }}-${{ matrix.topology }}.txt retention-days: 3 # Redundant with the trap in compose-integration-run.sh; this is the @@ -303,6 +304,14 @@ jobs: - name: Clean up compose project if: always() run: | + docker rm -f 
"$COMPOSE_PROJECT-integration-test-run" >/dev/null 2>&1 || true + ids="$(docker ps -aq \ + --filter "label=com.docker.compose.project=$COMPOSE_PROJECT" \ + --filter "label=com.docker.compose.service=integration-test" \ + 2>/dev/null || true)" + if [ -n "$ids" ]; then + docker rm -f $ids >/dev/null 2>&1 || true + fi docker compose -p "$COMPOSE_PROJECT" \ -f test/compose/compose.integration.yaml down -v --remove-orphans || true diff --git a/Makefile b/Makefile index 0cb1fc79c..ef5157b11 100644 --- a/Makefile +++ b/Makefile @@ -86,7 +86,11 @@ endif TEST_PATTERN ?= ifneq ($(TEST_PATTERN),) - GO_TEST_FLAGS += -run $(TEST_PATTERN) + # Single-quote the pattern: a shard union like ^Test(A|B)$ contains shell + # metacharacters that would otherwise break the recipe's `go test` line. + # $(value ...) keeps the raw text so the trailing `$` anchor is not eaten by + # make's re-expansion when TEST_PATTERN arrives via a sub-make. + GO_TEST_FLAGS += -run '$(value TEST_PATTERN)' endif TEST_PACKAGES ?= @@ -248,12 +252,22 @@ clean-test-logs: ## Clean integration test log files # single-project name). A SIGKILL/OOM during a local sharded run skips the # cleanup trap and leaks each project's anonymous Postgres volume; this is the # manual recovery. CI uses ephemeral runners and does not need it. 
-clean-integration-compose: ## Tear down leftover integration shard compose projects and volumes +clean-integration-compose: ## Tear down leftover integration shard×topology compose projects and volumes @echo "Cleaning integration compose projects" - @for s in $(INTEGRATION_SHARDS) ; do \ - docker compose -p rollups-node-integration-$$s -f test/compose/compose.integration.yaml down -v --remove-orphans 2>/dev/null || true ; \ - done - @docker compose -p rollups-node-integration -f test/compose/compose.integration.yaml down -v --remove-orphans 2>/dev/null || true + @cleanup_project() { \ + project="$$1"; \ + docker rm -f "$$project-integration-test-run" >/dev/null 2>&1 || true; \ + ids="$$(docker ps -aq --filter "label=com.docker.compose.project=$$project" --filter "label=com.docker.compose.service=integration-test" 2>/dev/null || true)"; \ + if [ -n "$$ids" ]; then docker rm -f $$ids >/dev/null 2>&1 || true; fi; \ + docker compose -p "$$project" -f test/compose/compose.integration.yaml down -v --remove-orphans 2>/dev/null || true; \ + }; \ + for t in $(INTEGRATION_TOPOLOGIES) ; do \ + cleanup_project rollups-node-integration-all-$$t ; \ + for s in $(INTEGRATION_SHARDS) ; do \ + cleanup_project rollups-node-integration-$$s-$$t ; \ + done ; \ + done; \ + cleanup_project rollups-node-integration # ============================================================================= # Tests @@ -278,6 +292,12 @@ GOTESTSUM_FORMAT ?= testdox ifeq ($(VERBOSE),true) GOTESTSUM_FORMAT = standard-verbose endif +COMPOSE_TOPOLOGY_GOTESTSUM_FORMAT = $(GOTESTSUM_FORMAT) +ifeq ($(strip $(SHARD)),) +ifeq ($(origin GOTESTSUM_FORMAT),file) + COMPOSE_TOPOLOGY_GOTESTSUM_FORMAT = standard-verbose +endif +endif integration-test: ## Execute e2e tests @echo "Running end-to-end tests" @@ -542,68 +562,159 @@ INTEGRATION_SHARD_replay := ^Test(Foreclose|ForecloseReplay|DivergentClaim)$ INTEGRATION_SHARD_restart := ^Test(Restart|SnapshotPolicy)$$ INTEGRATION_SHARD_withdrawal := ^TestWithdrawalLifecycle$$ 
+# ----------------------------------------------------------------------------- +# Node topology axis — orthogonal to shards. +# ----------------------------------------------------------------------------- +# A shard selects WHICH tests run; a topology selects HOW the node is deployed. +# CI runs the (shard, topology) cells in parallel. Both topologies are +# test-managed (TestMain starts and can restart them), so both run every shard. +# standalone — the all-in-one cartesi-rollups-node process. +# multiprocess — one OS process per service (evm-reader, advancer, validator, +# claimer, prt, jsonrpc-api) sharing Postgres, started as +# subprocesses by TestMain (on the host, or inside the test +# container under compose). See test/integration/multinode_helpers_test.go. +# +# Applicability is per-topology data (INTEGRATION_SHARDS_<topology>). multiprocess +# runs the SAME shards as standalone — the node-lifecycle tests stop/start the +# whole service set via the topology-aware harness. Killing a single service +# (partial failure) is separate future fault-injection work. +INTEGRATION_TOPOLOGIES := standalone multiprocess +NODE_TOPOLOGY ?= standalone + +INTEGRATION_SHARDS_standalone := $(INTEGRATION_SHARDS) +INTEGRATION_SHARDS_multiprocess := $(INTEGRATION_SHARDS) + +# The CI matrix is the set of (shard, topology) cells, encoded "shard:topology". 
+INTEGRATION_CELLS := $(foreach t,$(INTEGRATION_TOPOLOGIES),$(foreach s,$(INTEGRATION_SHARDS_$(t)),$(s):$(t))) + COMPOSE_PROJECT ?= rollups-node-integration INTEGRATION_LOGS ?= integration-logs.txt INTEGRATION_TEST_JOBS ?= 3 +CLEAN_STALE_LOCAL_NODE ?= false -integration-test-with-compose: $(CARTESI_TEST_MACHINE_IMAGES) ## Run integration tests using docker compose with auto-shutdown - @COMPOSE_PROJECT='$(COMPOSE_PROJECT)' INTEGRATION_LOGS='$(INTEGRATION_LOGS)' \ - TEST_PATTERN='$(TEST_PATTERN)' SHARD_NAME='$(SHARD_NAME)' \ - GOTESTSUM_FORMAT='$(GOTESTSUM_FORMAT)' \ - scripts/compose-integration-run.sh +# String helpers for building run patterns / matrices. +comma := , +empty := +space := $(empty) $(empty) -# Validate SHARD at parse time so a bad invocation fails before any -# prerequisite work (e.g. downloading test machine images). -ifneq ($(filter integration-test-shard,$(MAKECMDGOALS)),) -ifeq ($(strip $(SHARD)),) -$(error SHARD is required. Known shards: $(INTEGRATION_SHARDS)) +# --- Selection driven by NODE_TOPOLOGY and SHARD ---------------------------- +# Selected topologies: NODE_TOPOLOGY (default standalone), a space-separated +# list, or the sugar value `all`. +TOPOLOGIES_SELECTED = $(if $(filter all,$(NODE_TOPOLOGY)),$(INTEGRATION_TOPOLOGIES),$(NODE_TOPOLOGY)) +# shards_for(topology): the SHARD filter (or all) intersected with what the +# topology supports (INTEGRATION_SHARDS_<topology>). +shards_for = $(filter $(if $(strip $(SHARD)),$(SHARD),$(INTEGRATION_SHARDS_$(1))),$(INTEGRATION_SHARDS_$(1))) +# run_pattern(topology): the selected shards' -run regexes as one alternation. +run_pattern = $(subst $(space),|,$(strip $(foreach s,$(call shards_for,$(1)),$(INTEGRATION_SHARD_$(s))))) +# Selected (shard:topology) cells, for PARALLEL fan-out. +SELECTED_CELLS = $(foreach t,$(TOPOLOGIES_SELECTED),$(foreach s,$(call shards_for,$(t)),$(s):$(t))) +# Label for project/log names: the SHARD filter joined by '-', or "all". 
+SUITE_LABEL = $(if $(strip $(SHARD)),$(subst $(space),-,$(strip $(SHARD))),all) + +# Validate NODE_TOPOLOGY / SHARD at parse time for the two entry points so a +# bad invocation fails before any prerequisite work (downloading images, etc). +ifneq ($(filter integration-test-with-compose integration-test-local,$(MAKECMDGOALS)),) +$(if $(strip $(TOPOLOGIES_SELECTED)),,$(error NODE_TOPOLOGY must select at least one topology. Known topologies: $(INTEGRATION_TOPOLOGIES))) +$(foreach t,$(TOPOLOGIES_SELECTED),$(if $(filter $(t),$(INTEGRATION_TOPOLOGIES)),,$(error unknown topology '$(t)'. Known topologies: $(INTEGRATION_TOPOLOGIES)))) +$(foreach s,$(SHARD),$(if $(filter $(s),$(INTEGRATION_SHARDS)),,$(error unknown shard '$(s)'. Known shards: $(INTEGRATION_SHARDS)))) endif -ifeq ($(strip $(INTEGRATION_SHARD_$(SHARD))),) -$(error unknown shard '$(SHARD)'. Known shards: $(INTEGRATION_SHARDS)) + +# PARALLEL is compose-only: the host node binds fixed ports. Fail at parse time, +# before the (slow) build prerequisites of integration-test-local. +ifneq ($(filter integration-test-local,$(MAKECMDGOALS)),) +ifeq ($(PARALLEL),true) +$(error PARALLEL is not supported for integration-test-local (the host node binds fixed ports 10000/10011/10012). Use 'integration-test-with-compose PARALLEL=true') endif endif -integration-test-shard: $(CARTESI_TEST_MACHINE_IMAGES) ## Run one integration shard in an isolated compose project (requires SHARD=) - @COMPOSE_PROJECT='$(if $(filter rollups-node-integration,$(COMPOSE_PROJECT)),rollups-node-integration-$(SHARD),$(COMPOSE_PROJECT))' \ - INTEGRATION_LOGS='integration-logs-$(SHARD).txt' \ - TEST_PATTERN='$(INTEGRATION_SHARD_$(SHARD))' \ - SHARD_NAME='$(SHARD)' \ - GOTESTSUM_FORMAT='$(GOTESTSUM_FORMAT)' \ - scripts/compose-integration-run.sh +# ============================================================================= +# Two entry points. 
Both honor: +# NODE_TOPOLOGY one topology (default standalone), a list, or `all` +# SHARD restrict to a subset of shards (default: all applicable) +# PARALLEL =true runs cells concurrently (compose only) +# A requested-but-inapplicable (shard, topology) is skipped with a message. +# ============================================================================= -integration-test-sharded-local: $(CARTESI_TEST_MACHINE_IMAGES) integration-test-shard-check ## Run all integration shards with bounded concurrency - @$(MAKE) -k -j $(INTEGRATION_TEST_JOBS) $(addprefix run-integration-shard-,$(INTEGRATION_SHARDS)) +integration-test-with-compose: $(CARTESI_TEST_MACHINE_IMAGES) ## Run integration tests via docker compose (NODE_TOPOLOGY=, SHARD=, PARALLEL=true) +ifeq ($(PARALLEL),true) +ifeq ($(strip $(SELECTED_CELLS)),) + @echo "skip: no applicable integration cells selected" +else + @$(MAKE) -k -j $(INTEGRATION_TEST_JOBS) $(addprefix _compose-cell-,$(SELECTED_CELLS)) +endif +else + @set -e; for t in $(TOPOLOGIES_SELECTED); do $(MAKE) _compose-topology-$$t; done +endif -run-integration-shard-%: - @$(MAKE) integration-test-shard SHARD=$* +# One topology: the union of selected shards in one compose project, sequential. +_compose-topology-%: + @pattern='$(call run_pattern,$*)'; \ + if [ -z "$$pattern" ]; then echo "skip: no applicable shards for topology '$*' (SHARD filter excludes all)"; exit 0; fi; \ + COMPOSE_PROJECT='$(if $(filter rollups-node-integration,$(COMPOSE_PROJECT)),rollups-node-integration-$(SUITE_LABEL)-$*,$(COMPOSE_PROJECT))' \ + INTEGRATION_LOGS='integration-logs-$(SUITE_LABEL)-$*.txt' \ + TEST_PATTERN="$$pattern" SHARD_NAME='$(SUITE_LABEL)-$*' NODE_TOPOLOGY='$*' \ + GOTESTSUM_FORMAT='$(COMPOSE_TOPOLOGY_GOTESTSUM_FORMAT)' \ + scripts/compose-integration-run.sh + +# One (shard:topology) cell in its own project — PARALLEL fan-out target. 
+_compose-cell-%: + @COMPOSE_PROJECT='$(COMPOSE_PROJECT)-$(firstword $(subst :, ,$*))-$(lastword $(subst :, ,$*))' \ + INTEGRATION_LOGS='integration-logs-$(firstword $(subst :, ,$*))-$(lastword $(subst :, ,$*)).txt' \ + TEST_PATTERN='$(INTEGRATION_SHARD_$(firstword $(subst :, ,$*)))' \ + SHARD_NAME='$(firstword $(subst :, ,$*))' \ + NODE_TOPOLOGY='$(lastword $(subst :, ,$*))' \ + GOTESTSUM_FORMAT='$(GOTESTSUM_FORMAT)' \ + scripts/compose-integration-run.sh integration-test-shard-check: ## Verify every integration test belongs to exactly one shard @scripts/check-integration-shards.sh \ $(foreach s,$(INTEGRATION_SHARDS),'$(s)=$(INTEGRATION_SHARD_$(s))') -# Used by CI to build the integration matrix from the single source of truth. -comma := , -empty := -space := $(empty) $(empty) -list-integration-shards: ## Print integration shard names as a JSON array (for the CI matrix) +list-integration-shards: ## Print integration shard names as a JSON array @echo '[$(subst $(space),$(comma),$(patsubst %,"%",$(INTEGRATION_SHARDS)))]' +list-integration-cells: ## Print shard×topology cells as a JSON array of {shard,topology} (for the CI matrix) + @printf '['; sep=''; \ + for cell in $(INTEGRATION_CELLS); do \ + printf '%s{"shard":"%s","topology":"%s"}' "$$sep" "$${cell%%:*}" "$${cell##*:}"; \ + sep=','; \ + done; \ + printf ']\n' + test-with-compose: ## Run all tests using docker compose with auto-shutdown @$(MAKE) unit-test-with-compose @$(MAKE) integration-test-with-compose -integration-test-local: build cartesi-rollups-machine-tool echo-dapp reject-loop-dapp exception-loop-dapp erc20-withdrawal-dapp ## Run integration tests locally (requires: make start && eval $$(make env)) - @cartesi-rollups-cli db init - @if lsof -ti:10000 >/dev/null 2>&1; then \ - echo "Killing stale node on port 10000..."; \ - kill $$(lsof -ti:10000) 2>/dev/null || true; \ - sleep 2; \ - fi - @export CARTESI_TEST_DAPP_PATH=$(CURDIR)/applications/echo-dapp; \ - export 
CARTESI_TEST_REJECT_DAPP_PATH=$(CURDIR)/applications/reject-loop-dapp; \ - export CARTESI_TEST_EXCEPTION_DAPP_PATH=$(CURDIR)/applications/exception-loop-dapp; \ - export CARTESI_TEST_ERC20_WITHDRAWAL_DAPP_PATH=$(CURDIR)/applications/erc20-withdrawal-dapp; \ - $(MAKE) integration-test +integration-test-local: build cartesi-rollups-machine-tool echo-dapp reject-loop-dapp exception-loop-dapp erc20-withdrawal-dapp ## Run integration tests on the host (NODE_TOPOLOGY=, SHARD=; requires: make start && eval $$(make env); CLEAN_STALE_LOCAL_NODE=true to stop test-port listeners) + @set -e; first=1; for t in $(TOPOLOGIES_SELECTED); do \ + if [ "$$first" = 1 ]; then first=0; else echo "=== resetting dev DB + devnet between topologies ==="; $(MAKE) restart; fi; \ + $(MAKE) _local-topology-$$t; \ + done + +# One topology on the host: in-process node (standalone) or service subprocesses +# (multiprocess, via TestMain reading NODE_TOPOLOGY). +_local-topology-%: + @pattern='$(call run_pattern,$*)'; \ + if [ -z "$$pattern" ]; then echo "skip: no applicable shards for topology '$*' (SHARD filter excludes all)"; exit 0; fi; \ + cartesi-rollups-cli db init; \ + test_ports="10000 10001 10002 10003 10004 10005 10006 10011 10012"; \ + busy_pids="$$(for p in $$test_ports; do lsof -tiTCP:$$p -sTCP:LISTEN 2>/dev/null || true; done | sort -u | tr '\n' ' ')"; \ + if [ -n "$$busy_pids" ]; then \ + if [ "$(CLEAN_STALE_LOCAL_NODE)" = "true" ]; then \ + echo "Stopping process(es) listening on integration test ports: $$busy_pids"; \ + kill $$busy_pids 2>/dev/null || true; \ + sleep 2; \ + else \ + echo "ERROR: integration test ports are already in use by PID(s): $$busy_pids" >&2; \ + echo "Stop those processes, or rerun with CLEAN_STALE_LOCAL_NODE=true to stop test-port listeners." 
>&2; \ + exit 1; \ + fi; \ + fi; \ + export CARTESI_TEST_DAPP_PATH=$(CURDIR)/applications/echo-dapp; \ + export CARTESI_TEST_REJECT_DAPP_PATH=$(CURDIR)/applications/reject-loop-dapp; \ + export CARTESI_TEST_EXCEPTION_DAPP_PATH=$(CURDIR)/applications/exception-loop-dapp; \ + export CARTESI_TEST_ERC20_WITHDRAWAL_DAPP_PATH=$(CURDIR)/applications/erc20-withdrawal-dapp; \ + NODE_TOPOLOGY='$*' TEST_PATTERN="$$pattern" $(MAKE) integration-test deploy-load-test-apps: applications/echo-dapp ## Deploy 3 echo-dapp instances for load testing @echo "Deploying load-test apps (3 echo-dapps with different salts)..." @@ -619,11 +730,11 @@ load-test: deploy-load-test-apps ## Deploy 3 apps and run advancer starvation lo @echo "NOTE: Start the node (separate terminal) with: CARTESI_ADVANCER_INPUT_BATCH_SIZE=10 cartesi-rollups-node" @scripts/load-test.sh -ci-test: ## Run the full CI test pipeline locally (lint + unit + integration) +ci-test: ## Run the CI test pipeline locally (unit + integration across all topologies) # @$(MAKE) lint-with-docker @$(MAKE) integration-test-shard-check @$(MAKE) unit-test-with-compose - @$(MAKE) integration-test-with-compose + @$(MAKE) integration-test-with-compose NODE_TOPOLOGY=all clean-test-compose-resources: ## Clean up compose resources after some unexpected test failure @echo "Cleaning up Docker Compose resources..." 
@@ -664,7 +775,7 @@ build-debian-package: install build build-go $(GO_ARTIFACTS) cartesi-rollups-machine-tool \ clean clean-go clean-contracts clean-docs clean-devnet-files clean-dapps clean-test-dependencies clean-test-logs clean-integration-compose clean-debian-packages \ test unit-test unit-test-with-compose integration-test integration-test-with-compose integration-test-local test-with-compose ci-test coverage-report \ - integration-test-shard integration-test-sharded-local integration-test-shard-check list-integration-shards \ + integration-test-shard-check list-integration-shards list-integration-cells \ generate generate-contracts generate-config generate-inspect check-generate generate-db \ docs generate-cli-docs generate-config-docs \ lint fmt fmt-check vet escape check-license \ diff --git a/scripts/compose-integration-run.sh b/scripts/compose-integration-run.sh index 344c47eea..bdd81289f 100755 --- a/scripts/compose-integration-run.sh +++ b/scripts/compose-integration-run.sh @@ -15,6 +15,9 @@ # TEST_PATTERN Optional anchored regex selecting a shard of top-level # tests (forwarded to the test container; empty = full suite) # SHARD_NAME Optional shard label (log readability only) +# NODE_TOPOLOGY Node deployment topology (standalone | multiprocess); +# forwarded to the container, where TestMain starts and +# manages the matching node. 
set -euo pipefail @@ -25,12 +28,31 @@ NODE_LOG_PATH="/var/lib/cartesi-rollups-node/logs/node.log" : "${INTEGRATION_LOGS:?INTEGRATION_LOGS is required}" export TEST_PATTERN="${TEST_PATTERN:-}" export SHARD_NAME="${SHARD_NAME:-full}" +export NODE_TOPOLOGY="${NODE_TOPOLOGY:-standalone}" compose() { docker compose -p "$COMPOSE_PROJECT" -f "$COMPOSE_FILE" "$@" } +remove_integration_run_containers() { + local ids + docker rm -f "$COMPOSE_PROJECT-integration-test-run" >/dev/null 2>&1 || true + ids=$(docker ps -aq \ + --filter "label=com.docker.compose.project=$COMPOSE_PROJECT" \ + --filter "label=com.docker.compose.service=integration-test" \ + 2>/dev/null || true) + if [ -n "$ids" ]; then + docker rm -f $ids >/dev/null 2>&1 || true + fi +} + cleanup() { + # `docker compose run` creates a one-off integration-test container. On + # Ctrl+C, the compose client can exit before that container is removed; if + # it still holds the project network/volumes, `down -v` cannot clean them. + # Remove it before starting the log-copy helper or tearing the project down. + remove_integration_run_containers + # The in-container trap already prints the node log into the run output; # this volume copy covers abnormal exits (e.g. an OOM-killed container). { @@ -39,6 +61,7 @@ cleanup() { } >>"$INTEGRATION_LOGS" compose run --rm --no-deps --entrypoint cat integration-test \ "$NODE_LOG_PATH" >>"$INTEGRATION_LOGS" 2>/dev/null || true + remove_integration_run_containers { echo echo "=== COMPOSE SERVICE LOGS ===" @@ -49,7 +72,11 @@ cleanup() { trap cleanup EXIT : >"$INTEGRATION_LOGS" -echo "Running integration tests (project=$COMPOSE_PROJECT shard=$SHARD_NAME logs=$INTEGRATION_LOGS)" +echo "Running integration tests (project=$COMPOSE_PROJECT shard=$SHARD_NAME topology=$NODE_TOPOLOGY logs=$INTEGRATION_LOGS)" + +# Clear a stale one-off from a previously interrupted run of the same project. +remove_integration_run_containers # pipefail keeps the test exit code authoritative despite the tee. 
-compose run --rm --remove-orphans integration-test 2>&1 | tee -a "$INTEGRATION_LOGS" +compose run --name "$COMPOSE_PROJECT-integration-test-run" --rm --remove-orphans integration-test \ + 2>&1 | tee -a "$INTEGRATION_LOGS" diff --git a/test/compose/compose.integration.yaml b/test/compose/compose.integration.yaml index 49a86f336..3d305f338 100644 --- a/test/compose/compose.integration.yaml +++ b/test/compose/compose.integration.yaml @@ -112,6 +112,9 @@ services: # Shard selection (empty = full suite); see scripts/run-integration-tests.sh. TEST_PATTERN: ${TEST_PATTERN:-} SHARD_NAME: ${SHARD_NAME:-full} + # Node topology: standalone (all-in-one) or multiprocess (per-service + # subprocesses). TestMain starts and manages either in this container. + NODE_TOPOLOGY: ${NODE_TOPOLOGY:-standalone} # testdox prints one line per completed test; standard-verbose streams # go test -v output live (VERBOSE=true in the Makefile selects it). GOTESTSUM_FORMAT: ${GOTESTSUM_FORMAT:-testdox} diff --git a/test/integration/main_test.go b/test/integration/main_test.go index 6b8912447..2388eb0b5 100644 --- a/test/integration/main_test.go +++ b/test/integration/main_test.go @@ -15,17 +15,17 @@ import ( "time" ) -// TestMain manages the node process and enforces sequential test execution. +// TestMain manages the node and enforces sequential test execution. // -// If no node is already running on port 10000 (e.g., in Docker Compose), -// TestMain starts the node binary as a subprocess, waits for health, and -// stops it after all tests complete. This makes the node lifecycle -// transparent to individual test suites — they don't need to know whether -// the node was started by the test or by an external process. +// Unless a node is already running on port 10000, TestMain starts the +// test-managed node — the all-in-one process (standalone) or the service +// subprocesses (multiprocess, NODE_TOPOLOGY=multiprocess) — waits for health, +// and stops it after all tests complete. 
This keeps the node lifecycle +// transparent to the suites. // -// Restart/snapshot tests call stopSharedNode/startSharedNode to exercise -// the node's synchronization path. When the node is externally managed -// (Compose), those tests are skipped. +// Restart/snapshot tests call stopSharedNode/startSharedNode to exercise the +// node's synchronization path; this works under either topology. They are +// skipped only when an external node is already running (not test-managed). func TestMain(m *testing.M) { flag.Parse() if testing.Short() { @@ -51,70 +51,71 @@ func TestMain(m *testing.M) { } } - // In both local and Compose runs the node is started here by TestMain - // (the Compose integration-test service runs this same test binary). The - // port check only guards against a node already running on 10000 — e.g. one - // a developer started by hand — in which case we attach to it and skip the - // restart tests rather than fighting over the port. - if nodePortAvailable() { - artifactsDir, err := integrationArtifactsDir() + // The node is started here by TestMain (the Compose integration service runs + // this same test binary) unless a developer already has one running on + // :10000, in which case we attach and the restart tests are skipped. The + // multiprocess topology starts the services as subprocesses — host or inside + // the test container — so the lifecycle tests can restart them too. 
+ nodeTopology = envOrDefault("NODE_TOPOLOGY", "standalone") + healthTimeout := 2 * time.Minute + + mustPrepareRuntime := func() string { + logPath, err := prepareNodeRuntime() if err != nil { - fmt.Fprintf(os.Stderr, "failed to prepare integration artifacts dir: %v\n", err) + fmt.Fprintf(os.Stderr, "failed to prepare node runtime: %v\n", err) os.Exit(1) } - os.Setenv("CARTESI_TEST_ARTIFACTS_DIR", artifactsDir) - os.Setenv("CARTESI_TEST_NODE_WORKDIR", artifactsDir) - fmt.Fprintf(os.Stderr, "Integration artifacts dir: %s\n", artifactsDir) - - // `make env` exports CARTESI_SNAPSHOTS_DIR=snapshots, which used to - // resolve under test/integration because the node inherited go test's - // package cwd. Keep user-provided custom paths, but route the default - // snapshot path into the integration artifacts directory. - if snapshotsDir := os.Getenv("CARTESI_SNAPSHOTS_DIR"); snapshotsDir == "" || snapshotsDir == "snapshots" { - os.Setenv("CARTESI_SNAPSHOTS_DIR", filepath.Join(artifactsDir, "snapshots")) - } - - logPath := os.Getenv("CARTESI_TEST_NODE_LOG_FILE") - if logPath == "" { - f, err := os.CreateTemp("", "rollups-node-integration-*.log") - if err != nil { - fmt.Fprintf(os.Stderr, - "failed to create node log file: %v\n", err) - os.Exit(1) - } - logPath = f.Name() - f.Close() - os.Setenv("CARTESI_TEST_NODE_LOG_FILE", logPath) - } - - fmt.Fprintf(os.Stderr, "Starting node (log: %s)...\n", logPath) + return logPath + } - sharedNode, err = startNodeWithLog(logPath) + bringUp := func(h nodeHandle, err error) { if err != nil { fmt.Fprintf(os.Stderr, "failed to start node: %v\n", err) os.Exit(1) } - - ctx, cancel := context.WithTimeout( - context.Background(), 2*time.Minute) - if err := sharedNode.waitForHealth(ctx, nil); err != nil { + ctx, cancel := context.WithTimeout(context.Background(), healthTimeout) + if err := h.waitForHealth(ctx, nil); err != nil { cancel() - sharedNode.stop(nil) - fmt.Fprintf(os.Stderr, - "node failed to become healthy: %v\n", err) + 
h.stop(nil) + fmt.Fprintf(os.Stderr, "node failed to become healthy: %v\n", err) os.Exit(1) } cancel() + sharedNode = h fmt.Fprintln(os.Stderr, "Node is healthy. Running integration tests...") - } else { - fmt.Fprintln(os.Stderr, - "Node already running on port 10000 (external). "+ - "Restart tests will be skipped.") + } + + switch nodeTopology { + case "multiprocess": + logPath := mustPrepareRuntime() + fmt.Fprintf(os.Stderr, "Starting multiprocess node (log: %s)...\n", logPath) + bringUp(startMultiNode(logPath)) + case "standalone": + if nodePortAvailable() { + logPath := mustPrepareRuntime() + fmt.Fprintf(os.Stderr, "Starting node (log: %s)...\n", logPath) + bringUp(startNodeWithLog(logPath)) + } else { + fmt.Fprintln(os.Stderr, + "Node already running on :10000 (external). Restart tests will be skipped.") + } + default: + fmt.Fprintf(os.Stderr, "unknown NODE_TOPOLOGY %q (expected standalone or multiprocess)\n", + nodeTopology) + os.Exit(1) } code := m.Run() if sharedNode != nil { + if checker, ok := sharedNode.(nodeExitChecker); ok { + if err := checker.exitedProcessError(); err != nil { + fmt.Fprintf(os.Stderr, "node subprocess exited unexpectedly: %v\n", err) + if code == 0 { + code = 1 + } + } + } fmt.Fprintln(os.Stderr, "Stopping node...") sharedNode.stop(nil) } @@ -122,6 +123,38 @@ func TestMain(m *testing.M) { os.Exit(code) } +// prepareNodeRuntime sets up the artifacts dir, snapshot dir, and node log +// file shared by the standalone and host-multiprocess paths, returning the +// log path. 
+func prepareNodeRuntime() (string, error) { + artifactsDir, err := integrationArtifactsDir() + if err != nil { + return "", fmt.Errorf("prepare integration artifacts dir: %w", err) + } + os.Setenv("CARTESI_TEST_ARTIFACTS_DIR", artifactsDir) + os.Setenv("CARTESI_TEST_NODE_WORKDIR", artifactsDir) + fmt.Fprintf(os.Stderr, "Integration artifacts dir: %s\n", artifactsDir) + + // `make env` exports CARTESI_SNAPSHOTS_DIR=snapshots, which used to resolve + // under test/integration because the node inherited go test's package cwd. + // Keep user-provided custom paths, but route the default into the artifacts dir. + if snapshotsDir := os.Getenv("CARTESI_SNAPSHOTS_DIR"); snapshotsDir == "" || snapshotsDir == "snapshots" { + os.Setenv("CARTESI_SNAPSHOTS_DIR", filepath.Join(artifactsDir, "snapshots")) + } + + logPath := os.Getenv("CARTESI_TEST_NODE_LOG_FILE") + if logPath == "" { + f, err := os.CreateTemp("", "rollups-node-integration-*.log") + if err != nil { + return "", fmt.Errorf("create node log file: %w", err) + } + logPath = f.Name() + f.Close() + os.Setenv("CARTESI_TEST_NODE_LOG_FILE", logPath) + } + return logPath, nil +} + func integrationArtifactsDir() (string, error) { if dir := os.Getenv("CARTESI_TEST_ARTIFACTS_DIR"); dir != "" { absDir, err := filepath.Abs(dir) diff --git a/test/integration/multinode_helpers_test.go b/test/integration/multinode_helpers_test.go new file mode 100644 index 000000000..063875265 --- /dev/null +++ b/test/integration/multinode_helpers_test.go @@ -0,0 +1,280 @@ +// (c) Cartesi and individual authors (see AUTHORS) +// SPDX-License-Identifier: Apache-2.0 (see LICENSE) + +//go:build endtoendtests + +package integration + +import ( + "context" + "fmt" + "net" + "net/http" + "os" + "os/exec" + "sync" + "testing" + "time" +) + +// multiService describes one service subprocess and the telemetry port +// whose /readyz endpoint reports its health. 
Each service keeps the +// read-API ports the tests already expect (jsonrpc :10011, inspect :10012), so +// only the telemetry ports are remapped to avoid collisions on localhost. +type multiService struct { + name string + telemetryAddr string + extraEnv []string +} + +// multiNodeServices is the full deployment the multiprocess topology runs on +// the host — every service the standalone node embeds. prt is included so the +// prt shard's Dave-consensus tests run; it idles for authority/quorum apps. +var multiNodeServices = []multiService{ + {name: "cartesi-rollups-evm-reader", telemetryAddr: ":10001", + extraEnv: []string{"CARTESI_EVM_READER_TELEMETRY_ADDRESS=:10001"}}, + {name: "cartesi-rollups-advancer", telemetryAddr: ":10002", + extraEnv: []string{ + "CARTESI_ADVANCER_TELEMETRY_ADDRESS=:10002", + "CARTESI_INSPECT_ADDRESS=:10012", + "CARTESI_FEATURE_INSPECT_ENABLED=true", + }}, + {name: "cartesi-rollups-validator", telemetryAddr: ":10003", + extraEnv: []string{"CARTESI_VALIDATOR_TELEMETRY_ADDRESS=:10003"}}, + {name: "cartesi-rollups-claimer", telemetryAddr: ":10004", + extraEnv: []string{"CARTESI_CLAIMER_TELEMETRY_ADDRESS=:10004"}}, + {name: "cartesi-rollups-jsonrpc-api", telemetryAddr: ":10005", + extraEnv: []string{ + "CARTESI_JSONRPC_TELEMETRY_ADDRESS=:10005", + "CARTESI_JSONRPC_API_ADDRESS=:10011", + }}, + {name: "cartesi-rollups-prt", telemetryAddr: ":10006", + extraEnv: []string{"CARTESI_PRT_TELEMETRY_ADDRESS=:10006"}}, +} + +var multiNodeListenAddrs = []string{ + ":10000", // stale standalone node telemetry + ":10001", ":10002", ":10003", ":10004", ":10005", ":10006", + ":10011", // jsonrpc API + ":10012", // inspect API +} + +type multiNodeProcess struct { + name string + cmd *exec.Cmd + done chan struct{} + + mu sync.Mutex + waitErr error +} + +// multiNode is a running host multiprocess deployment. 
+type multiNode struct { + procs []*multiNodeProcess + addrs []string + logFile *os.File + tail *exec.Cmd // tail -f process streaming the log to the terminal + tty *os.File // /dev/tty FD used by tail; closed in stop() +} + +// startMultiNode starts each service binary as a host subprocess sharing the +// inherited environment (DB connection, blockchain endpoint, contracts, +// mnemonic, snapshot dir) plus per-service telemetry/address overrides and the +// fast polling intervals used for test responsiveness. All output is appended +// to logPath and streamed to the terminal, the same way the standalone node is. +// On any failure the already-started processes are stopped. +func startMultiNode(logPath string, extraEnv ...string) (*multiNode, error) { + if err := preflightMultiNodePorts(); err != nil { + return nil, err + } + + logFile, err := os.OpenFile( //nolint:gosec + logPath, os.O_CREATE|os.O_WRONLY|os.O_APPEND, 0644) //nolint:mnd + if err != nil { + return nil, fmt.Errorf("open log file %s: %w", logPath, err) + } + + mn := &multiNode{logFile: logFile} + + // Stream the combined service log to the terminal via a separate tail + // process writing to /dev/tty, bypassing go test / gotestsum output + // capture (same approach as the standalone node). Started before the + // services so their startup output is visible. Falls back silently when + // /dev/tty is unavailable (CI, compose). 
+ if tty, ttyErr := os.OpenFile("/dev/tty", os.O_WRONLY, 0); ttyErr == nil { + tail := exec.Command("tail", "-f", logPath) //nolint:gosec + tail.Stdout = tty + tail.Stderr = tty + if err := tail.Start(); err != nil { + tty.Close() + } else { + mn.tail = tail + mn.tty = tty + } + } + + base := append(os.Environ(), + "CARTESI_ADVANCER_POLLING_INTERVAL=1", + "CARTESI_VALIDATOR_POLLING_INTERVAL=1", + "CARTESI_CLAIMER_POLLING_INTERVAL=1", + "CARTESI_EVM_READER_POLLING_INTERVAL=1", + "CARTESI_PRT_POLLING_INTERVAL=1", + ) + + for _, svc := range multiNodeServices { + if _, err := exec.LookPath(svc.name); err != nil { + mn.stop(nil) + return nil, fmt.Errorf("%s not found on PATH: %w", svc.name, err) + } + cmd := exec.Command(svc.name) //nolint:gosec + cmd.Stdout = logFile + cmd.Stderr = logFile + cmd.Env = append(append(append([]string{}, base...), svc.extraEnv...), extraEnv...) + if workDir := os.Getenv("CARTESI_TEST_NODE_WORKDIR"); workDir != "" { + if err := os.MkdirAll(workDir, 0755); err != nil { //nolint:mnd + mn.stop(nil) + return nil, fmt.Errorf("create node workdir %s: %w", workDir, err) + } + cmd.Dir = workDir + } + if err := cmd.Start(); err != nil { + mn.stop(nil) + return nil, fmt.Errorf("start %s: %w", svc.name, err) + } + proc := &multiNodeProcess{name: svc.name, cmd: cmd, done: make(chan struct{})} + go func() { + err := cmd.Wait() + proc.mu.Lock() + proc.waitErr = err + proc.mu.Unlock() + close(proc.done) + }() + fmt.Fprintf(os.Stderr, " started %s (telemetry %s, pid %d)\n", + svc.name, svc.telemetryAddr, cmd.Process.Pid) + mn.procs = append(mn.procs, proc) + mn.addrs = append(mn.addrs, svc.telemetryAddr) + } + return mn, nil +} + +func preflightMultiNodePorts() error { + for _, addr := range multiNodeListenAddrs { + conn, err := net.DialTimeout("tcp", "localhost"+addr, time.Second) + if err == nil { + conn.Close() + return fmt.Errorf("cannot start multiprocess node: localhost%s is already in use", addr) + } + } + return nil +} + +// waitForHealth polls 
every service's /readyz until all respond 200 OK or the +// context is cancelled. +func (mn *multiNode) waitForHealth(ctx context.Context, _ testing.TB) error { + client := &http.Client{Timeout: 2 * time.Second} + for i, addr := range mn.addrs { + svc := multiNodeServices[i].name + url := "http://localhost" + addr + "/readyz" + err := pollUntil(ctx, 2*time.Second, func() (bool, error) { + if err := mn.exitedProcessError(); err != nil { + return false, err + } + req, err := http.NewRequestWithContext(ctx, "GET", url, nil) + if err != nil { + return false, nil + } + resp, err := client.Do(req) //nolint:gosec // url is a fixed localhost telemetry port + if err != nil { + return false, nil + } + defer resp.Body.Close() + return resp.StatusCode == http.StatusOK, nil + }) + if err != nil { + return fmt.Errorf("%s did not become healthy at %s: %w", svc, url, err) + } + fmt.Fprintf(os.Stderr, " %s healthy\n", svc) + } + return nil +} + +func (mn *multiNode) exitedProcessError() error { + for _, proc := range mn.procs { + if exited, err := proc.exitStatus(); exited { + if err != nil { + return fmt.Errorf("%s exited unexpectedly: %w", proc.name, err) + } + return fmt.Errorf("%s exited unexpectedly", proc.name) + } + } + return nil +} + +func (p *multiNodeProcess) exitStatus() (bool, error) { + select { + case <-p.done: + p.mu.Lock() + defer p.mu.Unlock() + return true, p.waitErr + default: + return false, nil + } +} + +// stop interrupts every service subprocess (in reverse start order) and waits +// for it to exit, then stops the log tail and closes the log file. +func (mn *multiNode) stop(t testing.TB) { + // Stop the tail first so it does not print shutdown noise. 
+ if mn.tail != nil && mn.tail.Process != nil { + _ = mn.tail.Process.Kill() + _ = mn.tail.Wait() + } + if mn.tty != nil { + mn.tty.Close() + } + for i := len(mn.procs) - 1; i >= 0; i-- { + proc := mn.procs[i] + if proc.cmd.Process == nil || proc.isDone() { + continue + } + if err := proc.cmd.Process.Signal(os.Interrupt); err != nil { + if t != nil { + t.Logf(" signal %s failed, killing: %v", proc.name, err) + } + _ = proc.cmd.Process.Kill() + } + } + for _, proc := range mn.procs { + if proc.wait(30 * time.Second) { //nolint:mnd + continue + } + if t != nil { + t.Logf(" %s did not exit within 30s, sending SIGKILL", proc.name) + } + if proc.cmd.Process != nil { + _ = proc.cmd.Process.Kill() + } + <-proc.done + } + if mn.logFile != nil { + mn.logFile.Close() + } +} + +func (p *multiNodeProcess) isDone() bool { + select { + case <-p.done: + return true + default: + return false + } +} + +func (p *multiNodeProcess) wait(timeout time.Duration) bool { + select { + case <-p.done: + return true + case <-time.After(timeout): + return false + } +} diff --git a/test/integration/node_helpers_test.go b/test/integration/node_helpers_test.go index 1ef047447..10f0164ad 100644 --- a/test/integration/node_helpers_test.go +++ b/test/integration/node_helpers_test.go @@ -18,15 +18,30 @@ import ( const nodeBinary = "cartesi-rollups-node" -// sharedNode is the test-managed node process, started by TestMain when no -// external node is running. All test suites share this instance. Restart -// tests stop and restart it via stopSharedNode/startSharedNode. -// When nil, the node is externally managed (e.g., Docker Compose) and -// restart tests are skipped. -var sharedNode *nodeProcess - -// isNodeSelfManaged returns true if TestMain started the node process. -// When false, the node is externally managed and cannot be restarted. +// nodeHandle is a running test-managed node: either the standalone all-in-one +// process or the multiprocess set of service subprocesses. 
Both can be stopped +// and restarted, so the node-lifecycle tests run under either topology. +type nodeHandle interface { + stop(t testing.TB) + waitForHealth(ctx context.Context, t testing.TB) error +} + +type nodeExitChecker interface { + exitedProcessError() error +} + +// sharedNode is the test-managed node, started by TestMain unless an external +// node is already running on :10000. All suites share it; lifecycle tests stop +// and restart it via stopSharedNode/startSharedNode. nil when externally +// managed. Its concrete type follows nodeTopology. +var sharedNode nodeHandle + +// nodeTopology is the deployment TestMain manages ("standalone" or +// "multiprocess"); it selects what startSharedNode(WithEnv) restarts. +var nodeTopology string + +// isNodeSelfManaged returns true if TestMain started the node (so it can be +// restarted). False when an external node is already running. func isNodeSelfManaged() bool { return sharedNode != nil } @@ -34,11 +49,20 @@ func isNodeSelfManaged() bool { // stopSharedNode stops the test-managed node. Panics if the node is // externally managed. func stopSharedNode(t testing.TB) { + t.Helper() if sharedNode == nil { t.Fatal("cannot stop node: not managed by tests (running in compose?)") } - sharedNode.stop(t) + h := sharedNode + var exitErr error + if checker, ok := sharedNode.(nodeExitChecker); ok { + exitErr = checker.exitedProcessError() + } + h.stop(t) sharedNode = nil + if exitErr != nil { + t.Fatalf("cannot stop node: managed node already exited unexpectedly: %v", exitErr) + } } // startSharedNode starts a new test-managed node, reusing the existing log @@ -51,26 +75,34 @@ func startSharedNode(t testing.TB) { // inject extra environment variables (e.g., // CARTESI_FEATURE_CLAIM_SUBMISSION_ENABLED=false to bring the node up in // reader mode for a single test phase). Restore default mode on test -// teardown by stopping the node and calling startSharedNode again. 
+// teardown by stopping the node and calling startSharedNode again. Under the +// multiprocess topology this starts/stops the whole service set. func startSharedNodeWithEnv(t testing.TB, extraEnv ...string) { if sharedNode != nil { t.Fatal("cannot start node: already running") } logPath := os.Getenv("CARTESI_TEST_NODE_LOG_FILE") - var err error - sharedNode, err = startNodeWithLog(logPath, extraEnv...) + var ( + h nodeHandle + err error + ) + if nodeTopology == "multiprocess" { + h, err = startMultiNode(logPath, extraEnv...) + } else { + h, err = startNodeWithLog(logPath, extraEnv...) + } if err != nil { t.Fatalf("failed to start node: %v", err) } ctx, cancel := context.WithTimeout(context.Background(), 2*time.Minute) defer cancel() - if err := sharedNode.waitForHealth(ctx, t); err != nil { - sharedNode.stop(t) - sharedNode = nil + if err := h.waitForHealth(ctx, t); err != nil { + h.stop(t) t.Fatalf("node failed to become healthy: %v", err) } + sharedNode = h } // nodePortAvailable returns true if the node's telemetry port (10000) is free. 
From 6667e49bb429fe458f9c3a02811f9b8998bac490 Mon Sep 17 00:00:00 2001 From: Victor Fusco <1221933+vfusco@users.noreply.github.com> Date: Sat, 20 Jun 2026 21:42:40 -0300 Subject: [PATCH 04/16] fix(claimer): drain inputs before terminalizing foreclosed epochs --- internal/claimer/foreclosed_apps_test.go | 46 +++++++++++++++--------- internal/claimer/foreclosure.go | 31 ++++++++++------ 2 files changed, 51 insertions(+), 26 deletions(-) diff --git a/internal/claimer/foreclosed_apps_test.go b/internal/claimer/foreclosed_apps_test.go index 6a3ab099e..0a88324bd 100644 --- a/internal/claimer/foreclosed_apps_test.go +++ b/internal/claimer/foreclosed_apps_test.go @@ -149,15 +149,14 @@ func TestProcessForeclosedApps_DrainCheckErrorsAppendAndContinue(t *testing.T) { app2 := foreclosedAppHelper(2, 100, model.Consensus_Authority) for _, app := range []*model.Application{app1, app2} { - r.On("ForecloseUnacceptedEpochsAtOrAfterBlock", - mock.Anything, app.ID, app.ForecloseBlock, - ).Return(0, nil).Once() r.On("HasUndrainedEpochsBeforeBlock", mock.Anything, app.ID, app.ForecloseBlock, ).Return(false, errors.New("db unavailable")).Once() } - // HasUnreconciledClaimsBeforeBlock must not be reached for either app — the - // undrained check errored and the per-app branch `continue`d. No expectation. + // The drain check runs first; its error makes the per-app branch `continue` + // before terminalizing or reconciling. Neither + // ForecloseUnacceptedEpochsAtOrAfterBlock nor HasUnreconciledClaimsBeforeBlock + // is reached — no expectation registered for either. 
errs := s.processForeclosedApps(map[int64]*model.Application{app1.ID: app1, app2.ID: app2}) assert.Len(t, errs, 2, "each app's drain error is appended; the pass does not abort early") @@ -196,6 +195,15 @@ func TestProcessForeclosedApps_NoTransitionWhenDrained(t *testing.T) { assert.Empty(t, errs) } +// TestProcessForeclosedApps_DefersWhenInputsUndrained verifies the drain-gate +// ordering that protects an input landing in the foreclose block. While any +// pre-foreclosure input is still undrained, the pass defers WITHOUT +// terminalizing. Terminalizing the straddling epoch first would flip it to +// CLAIM_FORECLOSED and strand its unprocessed same-block input (it would vanish +// from both this drain check and the manager's machine-drain gate, and the +// machine would be torn down before advancing it). The absent +// ForecloseUnacceptedEpochsAtOrAfterBlock expectation is the regression guard: +// testify/mock fails on the unexpected call if terminalization runs too early. func TestProcessForeclosedApps_DefersWhenInputsUndrained(t *testing.T) { s, r, _ := newServiceMock() defer r.AssertExpectations(t) @@ -203,35 +211,41 @@ func TestProcessForeclosedApps_DefersWhenInputsUndrained(t *testing.T) { app := foreclosedAppHelper(1, 100, model.Consensus_Authority) s.Context = context.Background() - r.On("ForecloseUnacceptedEpochsAtOrAfterBlock", - mock.Anything, app.ID, app.ForecloseBlock, - ).Return(0, nil).Once() r.On("HasUndrainedEpochsBeforeBlock", mock.Anything, app.ID, app.ForecloseBlock, ).Return(true, nil).Once() - // No HasUnreconciledClaimsBeforeBlock expectation — unresolved inputs - // must stop the drain check before claim-state reconciliation. + // No ForecloseUnacceptedEpochsAtOrAfterBlock and no + // HasUnreconciledClaimsBeforeBlock: an undrained input defers the whole pass + // before terminalization and before claim reconciliation. 
errs := s.processForeclosedApps(map[int64]*model.Application{app.ID: app}) assert.Empty(t, errs, "input-drain deferral is not an error") } -func TestProcessForeclosedApps_TerminalizesUnacceptedOverlapBeforeDrain(t *testing.T) { +// TestProcessForeclosedApps_TerminalizesUnacceptedOverlapAfterDrain verifies the +// other side of the gate: once the drain check clears (no undrained inputs), the +// straddling/after epochs that can never be accepted are terminalized to +// CLAIM_FORECLOSED, then reconciliation completes. +func TestProcessForeclosedApps_TerminalizesUnacceptedOverlapAfterDrain(t *testing.T) { s, r, _ := newServiceMock() defer r.AssertExpectations(t) app := foreclosedAppHelper(1, 100, model.Consensus_Authority) s.Context = context.Background() - r.On("ForecloseUnacceptedEpochsAtOrAfterBlock", - mock.Anything, app.ID, app.ForecloseBlock, - ).Return(2, nil).Once() - r.On("HasUndrainedEpochsBeforeBlock", + // Pin the sequence: the drain check MUST run before terminalization (else a + // straddling-epoch input is stranded — the bug this ordering prevents), and + // terminalization before the claim-reconciliation check. 
+ drain := r.On("HasUndrainedEpochsBeforeBlock", mock.Anything, app.ID, app.ForecloseBlock, ).Return(false, nil).Once() - r.On("HasUnreconciledClaimsBeforeBlock", + terminalize := r.On("ForecloseUnacceptedEpochsAtOrAfterBlock", + mock.Anything, app.ID, app.ForecloseBlock, + ).Return(2, nil).Once() + reconcile := r.On("HasUnreconciledClaimsBeforeBlock", mock.Anything, app.ID, app.ForecloseBlock, ).Return(false, nil).Once() + mock.InOrder(drain, terminalize, reconcile) errs := s.processForeclosedApps(map[int64]*model.Application{app.ID: app}) assert.Empty(t, errs) diff --git a/internal/claimer/foreclosure.go b/internal/claimer/foreclosure.go index b7b4010b8..e8bccd81e 100644 --- a/internal/claimer/foreclosure.go +++ b/internal/claimer/foreclosure.go @@ -80,41 +80,52 @@ func (s *Service) processForeclosedApps( ) continue } - terminalized, err := s.repository.ForecloseUnacceptedEpochsAtOrAfterBlock( + // Drain gate FIRST, terminalize second. An input can land in the + // foreclose block itself (before the foreclose tx, so it is valid and is + // indexed up to and including foreclose_block). Terminalizing the + // straddling epoch before that input is advanced would flip the epoch to + // CLAIM_FORECLOSED, which hides its still-unprocessed input from this + // drain check AND from the manager's machine-drain gate + // (HasUndrainedEpochsBeforeBlock excludes terminal epochs) — so the + // machine is torn down and the input is never processed, leaving the + // final machine state one input behind the chain. Wait for the drain, + // then terminalize. PRT gates terminalization the same way + // (internal/prt/service.go handleForeclosedApp). 
+ undrained, err := s.repository.HasUndrainedEpochsBeforeBlock( s.Context, app.ID, app.ForecloseBlock, ) if err != nil { errs = append(errs, fmt.Errorf( - "terminalizing unaccepted epochs for foreclosed app %s: %w", + "checking input drain progress for foreclosed app %s: %w", app.IApplicationAddress, err)) continue } - if terminalized > 0 { + if undrained { s.Logger.Info( - "Foreclosed application terminalized epochs that cannot be accepted", + "Foreclosed application still advancing pre-foreclosure inputs", "application", app.Name, "address", app.IApplicationAddress, "foreclose_block", app.ForecloseBlock, - "epochs", terminalized, ) + continue } - undrained, err := s.repository.HasUndrainedEpochsBeforeBlock( + terminalized, err := s.repository.ForecloseUnacceptedEpochsAtOrAfterBlock( s.Context, app.ID, app.ForecloseBlock, ) if err != nil { errs = append(errs, fmt.Errorf( - "checking input drain progress for foreclosed app %s: %w", + "terminalizing unaccepted epochs for foreclosed app %s: %w", app.IApplicationAddress, err)) continue } - if undrained { + if terminalized > 0 { s.Logger.Info( - "Foreclosed application still advancing pre-foreclosure inputs", + "Foreclosed application terminalized epochs that cannot be accepted", "application", app.Name, "address", app.IApplicationAddress, "foreclose_block", app.ForecloseBlock, + "epochs", terminalized, ) - continue } unreconciled, err := s.repository.HasUnreconciledClaimsBeforeBlock( s.Context, app.ID, app.ForecloseBlock, From 184ec8ad474e399c524238416866f36707cff1c2 Mon Sep 17 00:00:00 2001 From: Renato Maia <1887792+renatomaia@users.noreply.github.com> Date: Mon, 11 May 2026 17:46:12 -0300 Subject: [PATCH 05/16] docs(services): remove outdated comment --- pkg/service/service.go | 2 -- 1 file changed, 2 deletions(-) diff --git a/pkg/service/service.go b/pkg/service/service.go index 4ed56f653..d1a30b169 100644 --- a/pkg/service/service.go +++ b/pkg/service/service.go @@ -10,8 +10,6 @@ // - embed a [Service] struct 
into a new Service struct. // - embed a [Create] call into a new Create function. // -// Check DummyService, SlowService and ListService source code for examples of how to do it. -// // To use a service, call its corresponding Create function with a matching CreateInfo and Service, // then fill in the appropriate CreateInfo fields. // Here are a few of the available options: From 7b4b634d74a72376b2ff0bafe5089b0f6bba4849 Mon Sep 17 00:00:00 2001 From: Renato Maia <1887792+renatomaia@users.noreply.github.com> Date: Thu, 16 Apr 2026 15:31:49 -0300 Subject: [PATCH 06/16] refactor(services): support services that process both with ticks or continuously --- cmd/cartesi-rollups-advancer/root/root.go | 23 +- cmd/cartesi-rollups-claimer/root/root.go | 23 +- cmd/cartesi-rollups-evm-reader/root/root.go | 21 +- cmd/cartesi-rollups-jsonrpc-api/root/root.go | 7 +- cmd/cartesi-rollups-node/root/root.go | 7 +- cmd/cartesi-rollups-prt/root/root.go | 23 +- cmd/cartesi-rollups-validator/root/root.go | 21 +- internal/advancer/advancer_test.go | 25 +- internal/advancer/service.go | 36 +- internal/claimer/claimer_test.go | 19 +- internal/claimer/fixtures_test.go | 6 +- internal/claimer/service.go | 32 +- internal/claimer/service_test.go | 21 +- .../evmreader/accounts_drive_proved_test.go | 14 +- internal/evmreader/evmreader_test.go | 27 +- internal/evmreader/foreclosure_test.go | 14 +- internal/evmreader/input_scan_units_test.go | 12 +- internal/evmreader/output_test.go | 16 +- internal/evmreader/sealedepochs_test.go | 4 +- internal/evmreader/service.go | 28 +- internal/evmreader/service_config_test.go | 18 +- internal/jsonrpc/service.go | 53 +-- internal/jsonrpc/util_test.go | 4 +- internal/node/node.go | 137 +++--- internal/prt/handle_foreclosed_test.go | 6 +- internal/prt/service.go | 24 +- internal/validator/validator.go | 24 +- internal/validator/validator_test.go | 4 +- pkg/service/service.go | 428 ++++++++++-------- pkg/service/service_test.go | 97 ++-- 
pkg/service/telemetry_test.go | 35 +- test/validator/validator_test.go | 11 +- 32 files changed, 625 insertions(+), 595 deletions(-) diff --git a/cmd/cartesi-rollups-advancer/root/root.go b/cmd/cartesi-rollups-advancer/root/root.go index d8680f501..65a37964a 100644 --- a/cmd/cartesi-rollups-advancer/root/root.go +++ b/cmd/cartesi-rollups-advancer/root/root.go @@ -80,19 +80,21 @@ func run(cmd *cobra.Command, args []string) { defer cancel() createInfo := advancer.CreateInfo{ - CreateInfo: service.CreateInfo{ - Name: config.ServiceAdvancer, - LogLevel: config.ResolveServiceLogLevel(config.ServiceAdvancer, cfg.LogLevel), - LogColor: cfg.LogColor, - EnableSignalHandling: true, - TelemetryCreate: true, - TelemetryAddress: cfg.AdvancerTelemetryAddress, - PollInterval: cfg.AdvancerPollingInterval, + TickServiceConfigs: service.TickServiceConfigs{ + PollInterval: cfg.AdvancerPollingInterval, + ServiceConfigs: service.ServiceConfigs{ + Name: config.ServiceAdvancer, + LogLevel: config.ResolveServiceLogLevel(config.ServiceAdvancer, cfg.LogLevel), + LogColor: cfg.LogColor, + EnableSignalHandling: true, + TelemetryCreate: true, + TelemetryAddress: cfg.AdvancerTelemetryAddress, + }, }, Config: *cfg, } - logger := service.NewServiceLogger(&createInfo.CreateInfo) - createInfo.CreateInfo.Logger = logger + logger := service.NewServiceLogger(&createInfo.ServiceConfigs) + createInfo.ServiceConfigs.Logger = logger var err error createInfo.Repository, err = factory.NewRepositoryFromConnectionString(ctx, cfg.DatabaseConnection.Raw()) @@ -101,7 +103,6 @@ func run(cmd *cobra.Command, args []string) { advancerService, err := advancer.Create(ctx, &createInfo) cli.CheckErr(logger, err) - advancerService.LogConfig(createInfo.Config) cli.CheckErr(logger, advancerService.Serve()) } diff --git a/cmd/cartesi-rollups-claimer/root/root.go b/cmd/cartesi-rollups-claimer/root/root.go index 0aafe79f0..96eb4cdc8 100644 --- a/cmd/cartesi-rollups-claimer/root/root.go +++ 
b/cmd/cartesi-rollups-claimer/root/root.go @@ -81,19 +81,21 @@ func run(cmd *cobra.Command, args []string) { defer cancel() createInfo := claimer.CreateInfo{ - CreateInfo: service.CreateInfo{ - Name: config.ServiceClaimer, - LogLevel: config.ResolveServiceLogLevel(config.ServiceClaimer, cfg.LogLevel), - LogColor: cfg.LogColor, - EnableSignalHandling: true, - TelemetryCreate: true, - TelemetryAddress: cfg.ClaimerTelemetryAddress, - PollInterval: cfg.ClaimerPollingInterval, + TickServiceConfigs: service.TickServiceConfigs{ + PollInterval: cfg.ClaimerPollingInterval, + ServiceConfigs: service.ServiceConfigs{ + Name: config.ServiceClaimer, + LogLevel: config.ResolveServiceLogLevel(config.ServiceClaimer, cfg.LogLevel), + LogColor: cfg.LogColor, + EnableSignalHandling: true, + TelemetryCreate: true, + TelemetryAddress: cfg.ClaimerTelemetryAddress, + }, }, Config: *cfg, } - logger := service.NewServiceLogger(&createInfo.CreateInfo) - createInfo.CreateInfo.Logger = logger + logger := service.NewServiceLogger(&createInfo.ServiceConfigs) + createInfo.ServiceConfigs.Logger = logger authOpt, err := config.HTTPAuthorizationOption() cli.CheckErr(logger, err) @@ -113,7 +115,6 @@ func run(cmd *cobra.Command, args []string) { claimerService, err := claimer.Create(ctx, &createInfo) cli.CheckErr(logger, err) - claimerService.LogConfig(createInfo.Config) err = claimerService.Serve() cli.CheckErr(logger, err) diff --git a/cmd/cartesi-rollups-evm-reader/root/root.go b/cmd/cartesi-rollups-evm-reader/root/root.go index 08faec1ed..bb40f0d8a 100644 --- a/cmd/cartesi-rollups-evm-reader/root/root.go +++ b/cmd/cartesi-rollups-evm-reader/root/root.go @@ -81,19 +81,21 @@ func run(cmd *cobra.Command, args []string) { defer cancel() createInfo := evmreader.CreateInfo{ - CreateInfo: service.CreateInfo{ - Name: config.ServiceEvmReader, - LogLevel: config.ResolveServiceLogLevel(config.ServiceEvmReader, cfg.LogLevel), - LogColor: cfg.LogColor, - EnableSignalHandling: true, - TelemetryCreate: true, - 
TelemetryAddress: cfg.EvmReaderTelemetryAddress, + TickServiceConfigs: service.TickServiceConfigs{ + ServiceConfigs: service.ServiceConfigs{ + Name: config.ServiceEvmReader, + LogLevel: config.ResolveServiceLogLevel(config.ServiceEvmReader, cfg.LogLevel), + LogColor: cfg.LogColor, + EnableSignalHandling: true, + TelemetryCreate: true, + TelemetryAddress: cfg.EvmReaderTelemetryAddress, + }, PollInterval: cfg.EvmReaderPollingInterval, }, Config: *cfg, } - logger := service.NewServiceLogger(&createInfo.CreateInfo) - createInfo.CreateInfo.Logger = logger + logger := service.NewServiceLogger(&createInfo.ServiceConfigs) + createInfo.ServiceConfigs.Logger = logger var err error authOpt, err := config.HTTPAuthorizationOption() @@ -115,7 +117,6 @@ func run(cmd *cobra.Command, args []string) { readerService, err := evmreader.Create(ctx, &createInfo) cli.CheckErr(logger, err) - readerService.LogConfig(createInfo.Config) cli.CheckErr(logger, readerService.Serve()) } diff --git a/cmd/cartesi-rollups-jsonrpc-api/root/root.go b/cmd/cartesi-rollups-jsonrpc-api/root/root.go index 1da348eff..a18b984bf 100644 --- a/cmd/cartesi-rollups-jsonrpc-api/root/root.go +++ b/cmd/cartesi-rollups-jsonrpc-api/root/root.go @@ -68,7 +68,7 @@ func run(cmd *cobra.Command, args []string) { defer cancel() createInfo := jsonrpc.CreateInfo{ - CreateInfo: service.CreateInfo{ + ServiceConfigs: service.ServiceConfigs{ Name: config.ServiceJsonrpc, LogLevel: config.ResolveServiceLogLevel(config.ServiceJsonrpc, cfg.LogLevel), LogColor: cfg.LogColor, @@ -78,8 +78,8 @@ func run(cmd *cobra.Command, args []string) { }, Config: *cfg, } - logger := service.NewServiceLogger(&createInfo.CreateInfo) - createInfo.CreateInfo.Logger = logger + logger := service.NewServiceLogger(&createInfo.ServiceConfigs) + createInfo.ServiceConfigs.Logger = logger var err error createInfo.Repository, err = factory.NewRepositoryFromConnectionString(ctx, cfg.DatabaseConnection.Raw()) @@ -88,7 +88,6 @@ func run(cmd *cobra.Command, args 
[]string) { jsonrpcService, err := jsonrpc.Create(ctx, &createInfo) cli.CheckErr(logger, err) - jsonrpcService.LogConfig(createInfo.Config) cli.CheckErr(logger, jsonrpcService.Serve()) } diff --git a/cmd/cartesi-rollups-node/root/root.go b/cmd/cartesi-rollups-node/root/root.go index d6422501d..32ce499ca 100644 --- a/cmd/cartesi-rollups-node/root/root.go +++ b/cmd/cartesi-rollups-node/root/root.go @@ -152,7 +152,7 @@ func run(cmd *cobra.Command, args []string) { defer cancel() createInfo := node.CreateInfo{ - CreateInfo: service.CreateInfo{ + ServiceConfigs: service.ServiceConfigs{ Name: config.ServiceNode, LogLevel: cfg.LogLevel, LogColor: cfg.LogColor, @@ -162,8 +162,8 @@ func run(cmd *cobra.Command, args []string) { }, Config: *cfg, } - logger := service.NewServiceLogger(&createInfo.CreateInfo) - createInfo.CreateInfo.Logger = logger + logger := service.NewServiceLogger(&createInfo.ServiceConfigs) + createInfo.ServiceConfigs.Logger = logger var err error createInfo.ReaderClient, err = newEthClient(ctx, config.ServiceEvmReader, cfg.BlockchainHttpRequestTimeout) @@ -181,7 +181,6 @@ func run(cmd *cobra.Command, args []string) { nodeService, err := node.Create(ctx, &createInfo) cli.CheckErr(logger, err) - nodeService.LogConfig(createInfo.Config) cli.CheckErr(logger, nodeService.Serve()) } diff --git a/cmd/cartesi-rollups-prt/root/root.go b/cmd/cartesi-rollups-prt/root/root.go index 85a2a78e6..fb50771c4 100644 --- a/cmd/cartesi-rollups-prt/root/root.go +++ b/cmd/cartesi-rollups-prt/root/root.go @@ -69,19 +69,21 @@ func run(cmd *cobra.Command, args []string) { defer cancel() createInfo := prt.CreateInfo{ - CreateInfo: service.CreateInfo{ - Name: config.ServicePrt, - LogLevel: config.ResolveServiceLogLevel(config.ServicePrt, cfg.LogLevel), - LogColor: cfg.LogColor, - EnableSignalHandling: true, - TelemetryCreate: true, - TelemetryAddress: cfg.PrtTelemetryAddress, - PollInterval: cfg.PrtPollingInterval, + TickServiceConfigs: service.TickServiceConfigs{ + PollInterval: 
cfg.PrtPollingInterval, + ServiceConfigs: service.ServiceConfigs{ + Name: config.ServicePrt, + LogLevel: config.ResolveServiceLogLevel(config.ServicePrt, cfg.LogLevel), + LogColor: cfg.LogColor, + EnableSignalHandling: true, + TelemetryCreate: true, + TelemetryAddress: cfg.PrtTelemetryAddress, + }, }, Config: *cfg, } - logger := service.NewServiceLogger(&createInfo.CreateInfo) - createInfo.CreateInfo.Logger = logger + logger := service.NewServiceLogger(&createInfo.ServiceConfigs) + createInfo.ServiceConfigs.Logger = logger var err error authOpt, err := config.HTTPAuthorizationOption() @@ -102,7 +104,6 @@ func run(cmd *cobra.Command, args []string) { prtService, err := prt.Create(ctx, &createInfo) cli.CheckErr(logger, err) - prtService.LogConfig(createInfo.Config) cli.CheckErr(logger, prtService.Serve()) } diff --git a/cmd/cartesi-rollups-validator/root/root.go b/cmd/cartesi-rollups-validator/root/root.go index 1fa08a336..7c0347529 100644 --- a/cmd/cartesi-rollups-validator/root/root.go +++ b/cmd/cartesi-rollups-validator/root/root.go @@ -68,19 +68,21 @@ func run(cmd *cobra.Command, args []string) { defer cancel() createInfo := validator.CreateInfo{ - CreateInfo: service.CreateInfo{ - Name: config.ServiceValidator, - LogLevel: config.ResolveServiceLogLevel(config.ServiceValidator, cfg.LogLevel), - LogColor: cfg.LogColor, - EnableSignalHandling: true, - TelemetryCreate: true, - TelemetryAddress: cfg.ValidatorTelemetryAddress, + TickServiceConfigs: service.TickServiceConfigs{ PollInterval: cfg.ValidatorPollingInterval, + ServiceConfigs: service.ServiceConfigs{ + Name: config.ServiceValidator, + LogLevel: config.ResolveServiceLogLevel(config.ServiceValidator, cfg.LogLevel), + LogColor: cfg.LogColor, + EnableSignalHandling: true, + TelemetryCreate: true, + TelemetryAddress: cfg.ValidatorTelemetryAddress, + }, }, Config: *cfg, } - logger := service.NewServiceLogger(&createInfo.CreateInfo) - createInfo.CreateInfo.Logger = logger + logger := 
service.NewServiceLogger(&createInfo.ServiceConfigs) + createInfo.ServiceConfigs.Logger = logger var err error createInfo.Repository, err = factory.NewRepositoryFromConnectionString(ctx, cfg.DatabaseConnection.Raw()) @@ -89,7 +91,6 @@ func run(cmd *cobra.Command, args []string) { validatorService, err := validator.Create(ctx, &createInfo) cli.CheckErr(logger, err) - validatorService.LogConfig(createInfo.Config) cli.CheckErr(logger, validatorService.Serve()) } diff --git a/internal/advancer/advancer_test.go b/internal/advancer/advancer_test.go index 0f8c9aa7c..da59ca2aa 100644 --- a/internal/advancer/advancer_test.go +++ b/internal/advancer/advancer_test.go @@ -48,8 +48,13 @@ func newMockAdvancerServiceWithBatchSize( machineManager: machineManager, repository: repo, } - serviceArgs := &service.CreateInfo{Name: "advancer", Impl: s, EnableReschedule: true} - err := service.Create(context.Background(), serviceArgs, &s.Service) + serviceArgs := &service.TickServiceConfigs{ + ServiceConfigs: service.ServiceConfigs{ + Name: "advancer", + }, + EnableReschedule: true, + } + err := service.InitTickServiceTemplate(serviceArgs, &s.TickServiceTemplate, s, s) if err != nil { return nil, err } @@ -1025,8 +1030,8 @@ func (s *AdvancerSuite) TestRemoveSnapshot() { tmpDir := s.T().TempDir() advancer := &Service{snapshotsDir: tmpDir} - serviceArgs := &service.CreateInfo{Name: "advancer", Impl: advancer} - require.Nil(service.Create(context.Background(), serviceArgs, &advancer.Service)) + serviceArgs := &service.TickServiceConfigs{ServiceConfigs: service.ServiceConfigs{Name: "advancer"}} + require.Nil(service.InitTickServiceTemplate(serviceArgs, &advancer.TickServiceTemplate, advancer, advancer)) // Create a snapshot directory snapshotPath := filepath.Join(tmpDir, "myapp_epoch0_input0") @@ -1044,8 +1049,8 @@ func (s *AdvancerSuite) TestRemoveSnapshot() { tmpDir := s.T().TempDir() advancer := &Service{snapshotsDir: tmpDir} - serviceArgs := &service.CreateInfo{Name: "advancer", Impl: 
advancer} - require.Nil(service.Create(context.Background(), serviceArgs, &advancer.Service)) + serviceArgs := &service.TickServiceConfigs{ServiceConfigs: service.ServiceConfigs{Name: "advancer"}} + require.Nil(service.InitTickServiceTemplate(serviceArgs, &advancer.TickServiceTemplate, advancer, advancer)) snapshotPath := filepath.Join(tmpDir, "myapp_epoch0_input0") err := advancer.removeSnapshot(snapshotPath, "myapp") @@ -1057,8 +1062,8 @@ func (s *AdvancerSuite) TestRemoveSnapshot() { tmpDir := s.T().TempDir() advancer := &Service{snapshotsDir: tmpDir} - serviceArgs := &service.CreateInfo{Name: "advancer", Impl: advancer} - require.Nil(service.Create(context.Background(), serviceArgs, &advancer.Service)) + serviceArgs := &service.TickServiceConfigs{ServiceConfigs: service.ServiceConfigs{Name: "advancer"}} + require.Nil(service.InitTickServiceTemplate(serviceArgs, &advancer.TickServiceTemplate, advancer, advancer)) // Try to traverse outside snapshotsDir maliciousPath := filepath.Join(tmpDir, "..", "outside", "myapp_evil") @@ -1072,8 +1077,8 @@ func (s *AdvancerSuite) TestRemoveSnapshot() { tmpDir := s.T().TempDir() advancer := &Service{snapshotsDir: tmpDir} - serviceArgs := &service.CreateInfo{Name: "advancer", Impl: advancer} - require.Nil(service.Create(context.Background(), serviceArgs, &advancer.Service)) + serviceArgs := &service.TickServiceConfigs{ServiceConfigs: service.ServiceConfigs{Name: "advancer"}} + require.Nil(service.InitTickServiceTemplate(serviceArgs, &advancer.TickServiceTemplate, advancer, advancer)) snapshotPath := filepath.Join(tmpDir, "otherapp_epoch0_input0") err := advancer.removeSnapshot(snapshotPath, "myapp") diff --git a/internal/advancer/service.go b/internal/advancer/service.go index 3e657d5eb..a009cf6bd 100644 --- a/internal/advancer/service.go +++ b/internal/advancer/service.go @@ -24,7 +24,7 @@ const httpShutdownTimeout = 10 * time.Second // Service is the main advancer service that processes inputs through Cartesi machines type 
Service struct { - service.Service + service.TickServiceTemplate inputBatchSize uint64 snapshotsDir string repository AdvancerRepository @@ -39,23 +39,22 @@ type Service struct { // CreateInfo contains the configuration for creating an advancer service type CreateInfo struct { - service.CreateInfo + service.TickServiceConfigs Config config.AdvancerConfig Repository repository.Repository } // Create initializes a new advancer service -func Create(ctx context.Context, c *CreateInfo) (*Service, error) { +func Create(ctx context.Context, c *CreateInfo) (service.IService, error) { var err error if err = ctx.Err(); err != nil { return nil, err // This returns context.Canceled or context.DeadlineExceeded. } s := &Service{} - c.Impl = s c.EnableReschedule = true - err = service.Create(ctx, &c.CreateInfo, &s.Service) + err = service.InitTickServiceTemplate(&c.TickServiceConfigs, &s.TickServiceTemplate, s, s) if err != nil { return nil, err } @@ -102,13 +101,12 @@ func Create(ctx context.Context, c *CreateInfo) (*Service, error) { s.snapshotsDir = c.Config.SnapshotsDir + s.LogConfig(c.Config) + return s, nil } // Service interface implementation -func (s *Service) Alive() bool { return true } -func (s *Service) Ready() bool { return true } -func (s *Service) Reload() []error { return nil } func (s *Service) Tick() []error { hadWork, err := s.Step(s.Context) @@ -141,18 +139,7 @@ func (s *Service) Tick() []error { return []error{err} } -func (s *Service) Stop(b bool) []error { - // CAS achieves once-semantics: the second caller returns immediately - // (fire-and-forget) rather than blocking like sync.Once. This is safe - // because the orchestrator calls Cancel() after Stop() and waits for - // the Serve goroutine to exit. - if !s.cleanedUp.CompareAndSwap(false, true) { - return nil // already stopped - } - // This method shadows service.Service.Stop(), so set the stopping flag - // explicitly. 
Without this, a concurrent Tick that observes closed - // resources would not see IsStopping() == true. - s.SetStopping() +func (s *Service) OnStop(b bool) []error { var errs []error if s.inspector != nil { s.Logger.Info("Shutting down inspect HTTP server") @@ -168,9 +155,9 @@ func (s *Service) Stop(b bool) []error { errs = append(errs, fmt.Errorf("failed to close machine manager: %w", err)) } } - return errs + return append(errs, s.TickServiceTemplate.OnStop(b)...) } -func (s *Service) Serve() error { +func (s *Service) OnServe() error { if s.inspector != nil { go func() { if err := s.inspector.Serve(); err != nil && !errors.Is(err, http.ErrServerClosed) { @@ -179,8 +166,5 @@ func (s *Service) Serve() error { } }() } - return s.Service.Serve() -} -func (s *Service) String() string { - return s.Name + return s.TickServiceTemplate.OnServe() } diff --git a/internal/claimer/claimer_test.go b/internal/claimer/claimer_test.go index 4835832a1..18867f723 100644 --- a/internal/claimer/claimer_test.go +++ b/internal/claimer/claimer_test.go @@ -4,7 +4,6 @@ package claimer import ( - "context" "math/big" "testing" "time" @@ -36,23 +35,13 @@ func TestTickInterleavesStagesWithPinnedBlockAndReschedulesOnProgress(t *testing defer r.AssertExpectations(t) defer b.AssertExpectations(t) - ctx := context.Background() - err := service.Create(ctx, &service.CreateInfo{ - Name: "claimer-test", - Context: ctx, - Impl: m, + err := service.InitTickServiceTemplate(&service.TickServiceConfigs{ + ServiceConfigs: service.ServiceConfigs{Name: "claimer-test"}, PollInterval: time.Hour, EnableReschedule: true, - }, &m.Service) + }, &m.TickServiceTemplate, m, m) require.NoError(t, err) - t.Cleanup(func() { - if m.Ticker != nil { - m.Ticker.Stop() - } - if m.Cancel != nil { - m.Cancel() - } - }) + t.Cleanup(func() { m.Stop(false) }) tickBlock := big.NewInt(100) app := makeApplication() diff --git a/internal/claimer/fixtures_test.go b/internal/claimer/fixtures_test.go index 0e5d814a9..ee995fec1 100644 
--- a/internal/claimer/fixtures_test.go +++ b/internal/claimer/fixtures_test.go @@ -67,8 +67,10 @@ func newServiceMock() (*Service, *claimerRepositoryMock, *claimerBlockchainMock) } claimer := &Service{ - Service: service.Service{ - Logger: slog.New(handler), + TickServiceTemplate: service.TickServiceTemplate{ + ServiceTemplate: service.ServiceTemplate{ + Logger: slog.New(handler), + }, }, submissionEnabled: true, claimsInFlight: map[int64]inFlightTx{}, diff --git a/internal/claimer/service.go b/internal/claimer/service.go index 8495f4b15..2a0c87460 100644 --- a/internal/claimer/service.go +++ b/internal/claimer/service.go @@ -20,7 +20,7 @@ import ( ) type CreateInfo struct { - service.CreateInfo + service.TickServiceConfigs Config config.ClaimerConfig @@ -29,7 +29,7 @@ type CreateInfo struct { } type Service struct { - service.Service + service.TickServiceTemplate repository iclaimerRepository blockchain iclaimerBlockchain @@ -75,7 +75,7 @@ type PersistentConfig struct { ChainID uint64 } -func Create(ctx context.Context, c *CreateInfo) (*Service, error) { +func Create(ctx context.Context, c *CreateInfo) (service.IService, error) { var err error if c == nil { @@ -92,10 +92,9 @@ func Create(ctx context.Context, c *CreateInfo) (*Service, error) { } s := &Service{} - c.Impl = s c.EnableReschedule = true - err = service.Create(ctx, &c.CreateInfo, &s.Service) + err = service.InitTickServiceTemplate(&c.TickServiceConfigs, &s.TickServiceTemplate, s, s) if err != nil { return nil, fmt.Errorf("creating base service: %w", err) } @@ -142,28 +141,9 @@ func Create(ctx context.Context, c *CreateInfo) (*Service, error) { defaultBlock: nodeConfig.DefaultBlock, } - return s, nil -} - -func (s *Service) Alive() bool { - return true -} - -func (s *Service) Ready() bool { - return true -} + s.LogConfig(c.Config) -func (s *Service) Reload() []error { - return nil -} - -func (s *Service) Stop(bool) []error { - s.SetStopping() - return nil -} - -func (s *Service) String() string { - 
return s.Name + return s, nil } func setupPersistentConfig( diff --git a/internal/claimer/service_test.go b/internal/claimer/service_test.go index cab78d375..fd6e7537c 100644 --- a/internal/claimer/service_test.go +++ b/internal/claimer/service_test.go @@ -35,8 +35,10 @@ func TestCreateUsesPersistedDefaultBlock(t *testing.T) { Return(rawConfig, time.Now(), time.Now(), nil).Once() s, err := Create(ctx, &CreateInfo{ - CreateInfo: service.CreateInfo{ - Context: ctx, + TickServiceConfigs: service.TickServiceConfigs{ + ServiceConfigs: service.ServiceConfigs{ + Context: ctx, + }, PollInterval: time.Hour, }, Config: config.ClaimerConfig{ @@ -48,19 +50,14 @@ func TestCreateUsesPersistedDefaultBlock(t *testing.T) { Repository: repo, }) require.NoError(t, err) - t.Cleanup(func() { - if s.Ticker != nil { - s.Ticker.Stop() - } - if s.Cancel != nil { - s.Cancel() - } - }) + t.Cleanup(func() { s.Stop(false) }) + + impl := s.(*Service) // expose struct API for whitebox testing. - blockchain, ok := s.blockchain.(*claimerBlockchain) + blockchain, ok := impl.blockchain.(*claimerBlockchain) require.True(t, ok) assert.Equal(t, model.DefaultBlock_Latest, blockchain.defaultBlock) - assert.False(t, s.submissionEnabled) + assert.False(t, impl.submissionEnabled) repo.AssertExpectations(t) repo.AssertNumberOfCalls(t, "SaveNodeConfigRaw", 0) diff --git a/internal/evmreader/accounts_drive_proved_test.go b/internal/evmreader/accounts_drive_proved_test.go index bd6451d11..2cd825ef5 100644 --- a/internal/evmreader/accounts_drive_proved_test.go +++ b/internal/evmreader/accounts_drive_proved_test.go @@ -34,10 +34,16 @@ func newPostForeclosureFixture(t *testing.T) ( s := &Service{ repository: repo, } - require.NoError(t, service.Create( - context.Background(), - &service.CreateInfo{Name: "evm-reader", Impl: s, Logger: slog.New(slog.NewTextHandler(os.Stdout, nil))}, - &s.Service, + require.NoError(t, service.InitTickServiceTemplate( + &service.TickServiceConfigs{ + ServiceConfigs: 
service.ServiceConfigs{ + Name: "evm-reader", + Logger: slog.New(slog.NewTextHandler(os.Stdout, nil)), + }, + }, + &s.TickServiceTemplate, + s, + s, )) return s, appContract, repo } diff --git a/internal/evmreader/evmreader_test.go b/internal/evmreader/evmreader_test.go index 1ccb2344d..ec8731ac9 100644 --- a/internal/evmreader/evmreader_test.go +++ b/internal/evmreader/evmreader_test.go @@ -66,15 +66,16 @@ func (s *EvmReaderSuite) SetupTest() { logLevel, err := config.GetLogLevel() s.Require().NoError(err) - serviceArgs := &service.CreateInfo{ - Name: "evm-reader", - Impl: s.evmReader, - LogLevel: logLevel, - Context: s.ctx, - Cancel: s.cancel, + serviceArgs := &service.TickServiceConfigs{ + ServiceConfigs: service.ServiceConfigs{ + Name: "evm-reader", + LogLevel: logLevel, + Context: s.ctx, + Cancel: s.cancel, + }, PollInterval: 100 * time.Millisecond, } - err = service.Create(context.Background(), serviceArgs, &s.evmReader.Service) + err = service.InitTickServiceTemplate(serviceArgs, &s.evmReader.TickServiceTemplate, s.evmReader, s.evmReader) s.Require().NoError(err) s.evmReader.resolver = newApplicationAdapterResolver(s.evmReader.Logger, s.contractFactory) @@ -84,6 +85,18 @@ func (s *EvmReaderSuite) TearDownTest() { s.cancel() } +// Service tests +func (s *EvmReaderSuite) TestItStopsWhenContextIsCanceled() { + errChannel := make(chan error, 1) + go func() { + errChannel <- s.evmReader.Serve() + }() + s.cancel() + + err := <-errChannel + s.Require().Nil(err, "stopped with an error when canceled") +} + func newCallNotification(c *mock.Call) <-chan struct{} { ch := make(chan struct{}) c.Run(func(mock.Arguments) { ch <- struct{}{} }) diff --git a/internal/evmreader/foreclosure_test.go b/internal/evmreader/foreclosure_test.go index b32248797..e94d089d9 100644 --- a/internal/evmreader/foreclosure_test.go +++ b/internal/evmreader/foreclosure_test.go @@ -44,10 +44,16 @@ func newForeclosureServiceFixture(t *testing.T) ( s := &Service{ repository: repo, } - 
require.NoError(t, service.Create( - context.Background(), - &service.CreateInfo{Name: "evm-reader", Impl: s, Logger: slog.New(slog.NewTextHandler(os.Stdout, nil))}, - &s.Service, + require.NoError(t, service.InitTickServiceTemplate( + &service.TickServiceConfigs{ + ServiceConfigs: service.ServiceConfigs{ + Name: "evm-reader", + Logger: slog.New(slog.NewTextHandler(os.Stdout, nil)), + }, + }, + &s.TickServiceTemplate, + s, + s, )) return s, appContract, repo } diff --git a/internal/evmreader/input_scan_units_test.go b/internal/evmreader/input_scan_units_test.go index 3080bfdc2..c821cef3f 100644 --- a/internal/evmreader/input_scan_units_test.go +++ b/internal/evmreader/input_scan_units_test.go @@ -18,7 +18,9 @@ import ( func TestBuildIConsensusInputScanUnits_GroupsByInputBoxAndCursor(t *testing.T) { ctx := context.Background() reader := &Service{ - Service: service.Service{Logger: testLogger(t)}, + TickServiceTemplate: service.TickServiceTemplate{ + ServiceTemplate: service.ServiceTemplate{Logger: testLogger(t)}, + }, } inputBoxA := common.HexToAddress("0x00000000000000000000000000000000000000a1") inputBoxB := common.HexToAddress("0x00000000000000000000000000000000000000b1") @@ -111,7 +113,9 @@ func TestBuildIConsensusInputScanUnits_InitializesBeforeGrouping(t *testing.T) { repo.On("UpdateEventLastCheckBlock", mock.Anything, []int64{int64(1)}, MonitoredEvent_InputAdded, uint64(6)). 
Return(nil).Once() reader := &Service{ - Service: service.Service{Logger: testLogger(t)}, + TickServiceTemplate: service.TickServiceTemplate{ + ServiceTemplate: service.ServiceTemplate{Logger: testLogger(t)}, + }, repository: repo, } inputBox := common.HexToAddress("0x00000000000000000000000000000000000000a1") @@ -130,7 +134,9 @@ func TestBuildIConsensusInputScanUnits_InitializesBeforeGrouping(t *testing.T) { func TestBuildIConsensusInputScanUnits_FailedInitializationExcludesOnlyThatApp(t *testing.T) { ctx := context.Background() reader := &Service{ - Service: service.Service{Logger: testLogger(t)}, + TickServiceTemplate: service.TickServiceTemplate{ + ServiceTemplate: service.ServiceTemplate{Logger: testLogger(t)}, + }, } inputBox := common.HexToAddress("0x00000000000000000000000000000000000000a1") broken := inputUnitApp(1, inputBox, 0, true) diff --git a/internal/evmreader/output_test.go b/internal/evmreader/output_test.go index 15c175a2d..566ee47c8 100644 --- a/internal/evmreader/output_test.go +++ b/internal/evmreader/output_test.go @@ -4,7 +4,6 @@ package evmreader import ( - "context" "errors" "math/big" "time" @@ -609,15 +608,16 @@ func (s *EvmReaderSuite) setupOutputMismatchTest() { logLevel, err := config.GetLogLevel() s.Require().NoError(err) - serviceArgs := &service.CreateInfo{ - Name: "evm-reader", - Impl: s.evmReader, - LogLevel: logLevel, - Context: s.ctx, - Cancel: s.cancel, + serviceArgs := &service.TickServiceConfigs{ + ServiceConfigs: service.ServiceConfigs{ + Name: "evm-reader", + LogLevel: logLevel, + Context: s.ctx, + Cancel: s.cancel, + }, PollInterval: 100 * time.Millisecond, } - err = service.Create(context.Background(), serviceArgs, &s.evmReader.Service) + err = service.InitTickServiceTemplate(serviceArgs, &s.evmReader.TickServiceTemplate, s.evmReader, s.evmReader) s.Require().NoError(err) s.evmReader.resolver = newApplicationAdapterResolver(s.evmReader.Logger, s.contractFactory) diff --git a/internal/evmreader/sealedepochs_test.go 
b/internal/evmreader/sealedepochs_test.go index 4f45f799d..e05345377 100644 --- a/internal/evmreader/sealedepochs_test.go +++ b/internal/evmreader/sealedepochs_test.go @@ -56,8 +56,8 @@ func (s *SealedEpochsSuite) SetupTest() { logLevel, err := config.GetLogLevel() s.Require().NoError(err) - serviceArgs := &service.CreateInfo{Name: "evm-reader", Impl: s.evmReader, LogLevel: logLevel} - err = service.Create(context.Background(), serviceArgs, &s.evmReader.Service) + serviceArgs := &service.ServiceConfigs{Name: "evm-reader", LogLevel: logLevel} + err = service.InitServiceTemplate(serviceArgs, &s.evmReader.ServiceTemplate, s.evmReader) s.Require().NoError(err) } diff --git a/internal/evmreader/service.go b/internal/evmreader/service.go index 29d2429c7..96cfaf7dd 100644 --- a/internal/evmreader/service.go +++ b/internal/evmreader/service.go @@ -19,7 +19,7 @@ import ( ) type CreateInfo struct { - service.CreateInfo + service.TickServiceConfigs Config config.EvmreaderConfig @@ -29,7 +29,7 @@ type CreateInfo struct { } type Service struct { - service.Service + service.TickServiceTemplate client EthClientInterface adapterFactory AdapterFactory @@ -52,16 +52,15 @@ type PersistentConfig struct { ChainID uint64 } -func Create(ctx context.Context, c *CreateInfo) (*Service, error) { +func Create(ctx context.Context, c *CreateInfo) (service.IService, error) { var err error if err = ctx.Err(); err != nil { return nil, err // This returns context.Canceled or context.DeadlineExceeded. 
} s := &Service{} - c.Impl = s - err = service.Create(ctx, &c.CreateInfo, &s.Service) + err = service.InitTickServiceTemplate(&c.TickServiceConfigs, &s.TickServiceTemplate, s, s) if err != nil { return nil, err } @@ -108,6 +107,8 @@ func Create(ctx context.Context, c *CreateInfo) (*Service, error) { } s.resolver = newApplicationAdapterResolver(s.Logger, s.adapterFactory) + s.LogConfig(c.Config) + return s, nil } @@ -119,25 +120,12 @@ func (s *Service) Ready() bool { return s.ready.Load() } -func (s *Service) Reload() []error { - return nil -} - -func (s *Service) Stop(bool) []error { - s.SetStopping() - return nil -} - -func (s *Service) Serve() error { +func (s *Service) OnServe() error { s.alive.Store(true) s.ready.Store(true) defer s.alive.Store(false) defer s.ready.Store(false) - return s.Service.Serve() -} - -func (s *Service) String() string { - return s.Name + return s.TickServiceTemplate.OnServe() } func (s *Service) setupPersistentConfig( diff --git a/internal/evmreader/service_config_test.go b/internal/evmreader/service_config_test.go index f99e1a601..2604c3bb7 100644 --- a/internal/evmreader/service_config_test.go +++ b/internal/evmreader/service_config_test.go @@ -38,7 +38,12 @@ func TestCreateWithNilEthClient(t *testing.T) { require.NoError(t, err) _, err = Create(context.Background(), &CreateInfo{ - CreateInfo: service.CreateInfo{Name: "evm-reader", LogLevel: logLevel}, + TickServiceConfigs: service.TickServiceConfigs{ + ServiceConfigs: service.ServiceConfigs{ + Name: "evm-reader", + LogLevel: logLevel, + }, + }, }) require.ErrorContains(t, err, "EthClient on evmreader service Create is nil") } @@ -74,9 +79,11 @@ func TestCreateAcceptsRequestTimeoutBelowPollingInterval(t *testing.T) { Return(rawConfig, time.Now(), time.Now(), nil).Once() svc, err := Create(context.Background(), &CreateInfo{ - CreateInfo: service.CreateInfo{ - Name: "evm-reader", - LogLevel: logLevel, + TickServiceConfigs: service.TickServiceConfigs{ + ServiceConfigs: 
service.ServiceConfigs{ + Name: "evm-reader", + LogLevel: logLevel, + }, PollInterval: pollInterval, }, Config: config.EvmreaderConfig{ @@ -90,8 +97,7 @@ func TestCreateAcceptsRequestTimeoutBelowPollingInterval(t *testing.T) { Repository: repo, }) require.NoError(t, err) - defer svc.Ticker.Stop() - defer svc.Cancel() + defer svc.Stop(false) repo.AssertExpectations(t) } diff --git a/internal/jsonrpc/service.go b/internal/jsonrpc/service.go index f39ad2157..4c947e6d2 100644 --- a/internal/jsonrpc/service.go +++ b/internal/jsonrpc/service.go @@ -28,7 +28,7 @@ const jsonrpcShutdownTimeout = 5 * time.Second // Service implements the IService interface. type Service struct { - service.Service + service.ServiceTemplate repository repository.Repository server *http.Server admission *service.SemaphoreAdmission @@ -40,23 +40,22 @@ type Service struct { } type CreateInfo struct { - service.CreateInfo + service.ServiceConfigs Config config.JsonrpcConfig Repository repository.Repository } -func Create(ctx context.Context, c *CreateInfo) (*Service, error) { +func Create(ctx context.Context, c *CreateInfo) (service.IService, error) { var err error if err = ctx.Err(); err != nil { return nil, err // This returns context.Canceled or context.DeadlineExceeded. } s := &Service{} - c.Impl = s - err = service.Create(ctx, &c.CreateInfo, &s.Service) + err = service.InitServiceTemplate(&c.ServiceConfigs, &s.ServiceTemplate, s) if err != nil { return nil, err } @@ -97,28 +96,12 @@ func Create(ctx context.Context, c *CreateInfo) (*Service, error) { s.listen = net.Listen } - return s, nil -} - -func (s *Service) Alive() bool { - return true -} - -func (s *Service) Ready() bool { - return true -} - -func (s *Service) Reload() []error { - return nil -} + s.LogConfig(c.Config) -func (s *Service) Tick() []error { - // No periodic tasks. 
- return nil + return s, nil } -func (s *Service) Stop(_ bool) []error { - s.SetStopping() +func (s *Service) OnStop(_ bool) []error { var errs []error s.Logger.Info("Shutting down JSON-RPC HTTP server", "addr", s.server.Addr) ctx, cancel := context.WithTimeout(context.Background(), jsonrpcShutdownTimeout) @@ -129,11 +112,7 @@ func (s *Service) Stop(_ bool) []error { return errs } -func (s *Service) String() string { - return s.Name -} - -func (s *Service) Serve() error { +func (s *Service) OnServe() error { listener, err := s.listen("tcp", s.server.Addr) if err != nil { return err @@ -155,26 +134,18 @@ func (s *Service) Serve() error { serverDone <- err }() - serviceDone := make(chan error, 1) - go func() { - // Run the shared service loop concurrently because it blocks waiting - // for signals/context cancellation while the HTTP server blocks - // waiting for connections. - serviceDone <- s.Service.Serve() - }() - select { case err := <-serverDone: // The HTTP loop exited first. This is unexpected unless the listener // failed or the server was already closed, so cancel the framework // loop and wait for it to observe the cancellation before returning. s.Cancel() - serviceErr := <-serviceDone + <-s.Context.Done() if err != nil { return err } - return serviceErr - case err := <-serviceDone: + return nil + case <-s.Context.Done(): // The framework loop exited first because it handled a shutdown signal // or context cancellation and called Stop(), which should trigger // s.server.Shutdown(). 
Wait for the HTTP loop to finish so Serve() @@ -183,6 +154,6 @@ func (s *Service) Serve() error { if serverErr != nil { return serverErr } - return err + return nil } } diff --git a/internal/jsonrpc/util_test.go b/internal/jsonrpc/util_test.go index e68660282..9e55f88b3 100644 --- a/internal/jsonrpc/util_test.go +++ b/internal/jsonrpc/util_test.go @@ -101,7 +101,7 @@ func newTestServiceFull(t *testing.T, name string, maxInflight uint64, corsOrigi require.NoError(t, err) ci := CreateInfo{ - CreateInfo: service.CreateInfo{ + ServiceConfigs: service.ServiceConfigs{ Name: name, LogLevel: logLevel, LogColor: true, @@ -115,7 +115,7 @@ func newTestServiceFull(t *testing.T, name string, maxInflight uint64, corsOrigi s, err := Create(ctx, &ci) require.NoError(t, err, "on new test service") - return s + return s.(*Service) } func nameToNumber(in string) uint64 { diff --git a/internal/node/node.go b/internal/node/node.go index 872ca8c26..98cda64b9 100644 --- a/internal/node/node.go +++ b/internal/node/node.go @@ -29,7 +29,7 @@ type serviceResult struct { } type CreateInfo struct { - service.CreateInfo + service.ServiceConfigs Config config.NodeConfig @@ -40,23 +40,27 @@ type CreateInfo struct { } type Service struct { - service.Service + service.ServiceTemplate Children []service.IService Repository repository.Repository } -func Create(ctx context.Context, c *CreateInfo) (*Service, error) { +func Create(ctx context.Context, c *CreateInfo) (service.IService, error) { var err error if err = ctx.Err(); err != nil { return nil, err // This returns context.Canceled or context.DeadlineExceeded. } + // setup node and all child services to share the same context. 
+ ctx, cancel := context.WithCancel(context.Background()) + c.ServiceConfigs.Context = ctx + c.ServiceConfigs.Cancel = cancel + s := &Service{} - c.Impl = s - err = service.Create(ctx, &c.CreateInfo, &s.Service) + err = service.InitServiceTemplate(&c.ServiceConfigs, &s.ServiceTemplate, s) if err != nil { return nil, err } @@ -66,6 +70,9 @@ func Create(ctx context.Context, c *CreateInfo) (*Service, error) { s.Logger.Error(fmt.Sprint(err)) return nil, err } + + s.LogConfig(c.Config) + return s, nil } @@ -138,10 +145,7 @@ func (me *Service) Ready() bool { return allReady } -func (s *Service) Reload() []error { return nil } -func (s *Service) Tick() []error { return nil } -func (me *Service) Stop(force bool) []error { - me.SetStopping() +func (me *Service) OnStop(force bool) []error { errs := []error{} for _, s := range me.Children { errs = append(errs, s.Stop(force)...) @@ -149,27 +153,30 @@ func (me *Service) Stop(force bool) []error { return errs } -func (me *Service) Serve() error { +func (me *Service) OnServe() error { for _, s := range me.Children { go s.Serve() } - return me.Service.Serve() + <-me.Context.Done() + return nil } // services creation func newEVMReader(ctx context.Context, c *CreateInfo, s *Service) (service.IService, error) { readerArgs := evmreader.CreateInfo{ - CreateInfo: service.CreateInfo{ - Name: config.ServiceEvmReader, - Context: s.Context, - Cancel: s.Cancel, - LogLevel: config.ResolveServiceLogLevel(config.ServiceEvmReader, c.Config.LogLevel), - LogColor: c.Config.LogColor, - EnableSignalHandling: false, - TelemetryCreate: false, + TickServiceConfigs: service.TickServiceConfigs{ + ServiceConfigs: service.ServiceConfigs{ + Name: config.ServiceEvmReader, + Context: c.ServiceConfigs.Context, + Cancel: c.ServiceConfigs.Cancel, + LogLevel: config.ResolveServiceLogLevel(config.ServiceEvmReader, c.Config.LogLevel), + LogColor: c.Config.LogColor, + EnableSignalHandling: false, + TelemetryCreate: false, + ServeMux: s.ServeMux, + }, PollInterval: 
c.Config.EvmReaderPollingInterval, - ServeMux: s.ServeMux, }, EthClient: c.ReaderClient, Repository: c.Repository, @@ -185,16 +192,17 @@ func newEVMReader(ctx context.Context, c *CreateInfo, s *Service) (service.IServ func newAdvancer(ctx context.Context, c *CreateInfo, s *Service) (service.IService, error) { advancerArgs := advancer.CreateInfo{ - CreateInfo: service.CreateInfo{ - Name: config.ServiceAdvancer, - Context: s.Context, - Cancel: s.Cancel, - LogLevel: config.ResolveServiceLogLevel(config.ServiceAdvancer, c.Config.LogLevel), - LogColor: c.Config.LogColor, - EnableSignalHandling: false, - TelemetryCreate: false, - PollInterval: c.Config.AdvancerPollingInterval, - ServeMux: s.ServeMux, + TickServiceConfigs: service.TickServiceConfigs{ + PollInterval: c.Config.AdvancerPollingInterval, + ServiceConfigs: service.ServiceConfigs{ + Name: config.ServiceAdvancer, + Context: c.ServiceConfigs.Context, + Cancel: c.ServiceConfigs.Cancel, + LogLevel: config.ResolveServiceLogLevel(config.ServiceAdvancer, c.Config.LogLevel), + LogColor: c.Config.LogColor, + EnableSignalHandling: false, + TelemetryCreate: false, + }, }, Repository: c.Repository, Config: *c.Config.ToAdvancerConfig(), @@ -209,16 +217,17 @@ func newAdvancer(ctx context.Context, c *CreateInfo, s *Service) (service.IServi func newValidator(ctx context.Context, c *CreateInfo, s *Service) (service.IService, error) { validatorArgs := validator.CreateInfo{ - CreateInfo: service.CreateInfo{ - Name: config.ServiceValidator, - Context: s.Context, - Cancel: s.Cancel, - LogLevel: config.ResolveServiceLogLevel(config.ServiceValidator, c.Config.LogLevel), - LogColor: c.Config.LogColor, - EnableSignalHandling: false, - TelemetryCreate: false, - PollInterval: c.Config.ValidatorPollingInterval, - ServeMux: s.ServeMux, + TickServiceConfigs: service.TickServiceConfigs{ + PollInterval: c.Config.ValidatorPollingInterval, + ServiceConfigs: service.ServiceConfigs{ + Name: config.ServiceValidator, + Context: 
c.ServiceConfigs.Context, + Cancel: c.ServiceConfigs.Cancel, + LogLevel: config.ResolveServiceLogLevel(config.ServiceValidator, c.Config.LogLevel), + LogColor: c.Config.LogColor, + EnableSignalHandling: false, + TelemetryCreate: false, + }, }, Repository: c.Repository, Config: *c.Config.ToValidatorConfig(), @@ -233,16 +242,17 @@ func newValidator(ctx context.Context, c *CreateInfo, s *Service) (service.IServ func newClaimer(ctx context.Context, c *CreateInfo, s *Service) (service.IService, error) { claimerArgs := claimer.CreateInfo{ - CreateInfo: service.CreateInfo{ - Name: config.ServiceClaimer, - Context: s.Context, - Cancel: s.Cancel, - LogLevel: config.ResolveServiceLogLevel(config.ServiceClaimer, c.Config.LogLevel), - LogColor: c.Config.LogColor, - EnableSignalHandling: false, - TelemetryCreate: false, - PollInterval: c.Config.ClaimerPollingInterval, - ServeMux: s.ServeMux, + TickServiceConfigs: service.TickServiceConfigs{ + PollInterval: c.Config.ClaimerPollingInterval, + ServiceConfigs: service.ServiceConfigs{ + Name: config.ServiceClaimer, + Context: c.ServiceConfigs.Context, + Cancel: c.ServiceConfigs.Cancel, + LogLevel: config.ResolveServiceLogLevel(config.ServiceClaimer, c.Config.LogLevel), + LogColor: c.Config.LogColor, + EnableSignalHandling: false, + TelemetryCreate: false, + }, }, EthConn: c.ClaimerClient, Repository: c.Repository, @@ -258,10 +268,10 @@ func newClaimer(ctx context.Context, c *CreateInfo, s *Service) (service.IServic func newJsonrpc(ctx context.Context, c *CreateInfo, s *Service) (service.IService, error) { jsonrpcArgs := jsonrpc.CreateInfo{ - CreateInfo: service.CreateInfo{ + ServiceConfigs: service.ServiceConfigs{ Name: config.ServiceJsonrpc, - Context: s.Context, - Cancel: s.Cancel, + Context: c.ServiceConfigs.Context, + Cancel: c.ServiceConfigs.Cancel, LogLevel: config.ResolveServiceLogLevel(config.ServiceJsonrpc, c.Config.LogLevel), LogColor: c.Config.LogColor, EnableSignalHandling: false, @@ -281,16 +291,17 @@ func 
newJsonrpc(ctx context.Context, c *CreateInfo, s *Service) (service.IServic func newPrt(ctx context.Context, c *CreateInfo, s *Service) (service.IService, error) { prtArgs := prt.CreateInfo{ - CreateInfo: service.CreateInfo{ - Name: config.ServicePrt, - Context: s.Context, - Cancel: s.Cancel, - LogLevel: config.ResolveServiceLogLevel(config.ServicePrt, c.Config.LogLevel), - LogColor: c.Config.LogColor, - EnableSignalHandling: false, - TelemetryCreate: false, - PollInterval: c.Config.PrtPollingInterval, - ServeMux: s.ServeMux, + TickServiceConfigs: service.TickServiceConfigs{ + PollInterval: c.Config.PrtPollingInterval, + ServiceConfigs: service.ServiceConfigs{ + Name: config.ServicePrt, + Context: c.ServiceConfigs.Context, + Cancel: c.ServiceConfigs.Cancel, + LogLevel: config.ResolveServiceLogLevel(config.ServicePrt, c.Config.LogLevel), + LogColor: c.Config.LogColor, + EnableSignalHandling: false, + TelemetryCreate: false, + }, }, EthClient: c.PrtClient, Repository: c.Repository, diff --git a/internal/prt/handle_foreclosed_test.go b/internal/prt/handle_foreclosed_test.go index 3dde90c24..8948e171a 100644 --- a/internal/prt/handle_foreclosed_test.go +++ b/internal/prt/handle_foreclosed_test.go @@ -113,8 +113,10 @@ func (m *prtRepositoryMock) LoadNodeConfigRaw(context.Context, string) ([]byte, func newPRTServiceMock() (*Service, *prtRepositoryMock) { repo := &prtRepositoryMock{} s := &Service{ - Service: service.Service{ - Logger: slog.New(slog.NewTextHandler(os.Stdout, &slog.HandlerOptions{Level: slog.LevelDebug})), + TickServiceTemplate: service.TickServiceTemplate{ + ServiceTemplate: service.ServiceTemplate{ + Logger: slog.New(slog.NewTextHandler(os.Stdout, &slog.HandlerOptions{Level: slog.LevelDebug})), + }, }, repository: repo, } diff --git a/internal/prt/service.go b/internal/prt/service.go index 4847b054f..e06542a14 100644 --- a/internal/prt/service.go +++ b/internal/prt/service.go @@ -21,7 +21,7 @@ import ( ) type CreateInfo struct { - service.CreateInfo + 
service.TickServiceConfigs Config config.PrtConfig Repository repository.Repository EthClient EthClientInterface @@ -29,7 +29,7 @@ type CreateInfo struct { } type Service struct { - service.Service + service.TickServiceTemplate repository prtRepository client EthClientInterface adapterFactory AdapterFactory @@ -49,16 +49,15 @@ type PersistentConfig struct { ChainID uint64 } -func Create(ctx context.Context, c *CreateInfo) (*Service, error) { +func Create(ctx context.Context, c *CreateInfo) (service.IService, error) { var err error if err = ctx.Err(); err != nil { return nil, err // This returns context.Canceled or context.DeadlineExceeded. } s := &Service{} - c.Impl = s - err = service.Create(ctx, &c.CreateInfo, &s.Service) + err = service.InitTickServiceTemplate(&c.TickServiceConfigs, &s.TickServiceTemplate, s, s) if err != nil { return nil, err } @@ -118,13 +117,11 @@ func Create(ctx context.Context, c *CreateInfo) (*Service, error) { } } + s.LogConfig(c.Config) + return s, nil } -func (s *Service) Alive() bool { return true } -func (s *Service) Ready() bool { return true } -func (s *Service) Reload() []error { return nil } - // Tick executes the Validator main logic of producing claims and/or proofs // for processed epochs of all running applications. 
func (s *Service) Tick() []error { @@ -292,15 +289,6 @@ func (s *Service) forecloseComputedEpochs(ctx context.Context, app *Application) return nil } -func (s *Service) Stop(_ bool) []error { - s.SetStopping() - return nil -} - -func (s *Service) String() string { - return s.Name -} - func (s *Service) setupPersistentConfig( ctx context.Context, c *config.PrtConfig, diff --git a/internal/validator/validator.go b/internal/validator/validator.go index 0578d9fff..aed324b25 100644 --- a/internal/validator/validator.go +++ b/internal/validator/validator.go @@ -23,7 +23,8 @@ import ( ) type Service struct { - service.Service + service.TickServiceTemplate + repository ValidatorRepository // cached constants @@ -32,23 +33,22 @@ type Service struct { } type CreateInfo struct { - service.CreateInfo + service.TickServiceConfigs Config config.ValidatorConfig Repository repository.Repository } -func Create(ctx context.Context, c *CreateInfo) (*Service, error) { +func Create(ctx context.Context, c *CreateInfo) (service.IService, error) { var err error if err = ctx.Err(); err != nil { return nil, err // This returns context.Canceled or context.DeadlineExceeded. } s := &Service{} - c.Impl = s - err = service.Create(ctx, &c.CreateInfo, &s.Service) + err = service.InitTickServiceTemplate(&c.TickServiceConfigs, &s.TickServiceTemplate, s, s) if err != nil { return nil, err } @@ -61,13 +61,11 @@ func Create(ctx context.Context, c *CreateInfo) (*Service, error) { s.pristinePostContext = merkle.CreatePostContext() s.pristineRootHash = s.pristinePostContext[merkle.TREE_DEPTH] + s.LogConfig(c.Config) + return s, nil } -func (s *Service) Alive() bool { return true } -func (s *Service) Ready() bool { return true } -func (s *Service) Reload() []error { return nil } - // Tick executes the Validator main logic of producing claims and/or proofs // for processed epochs of all running applications. 
func (s *Service) Tick() []error { @@ -99,14 +97,6 @@ func (s *Service) Tick() []error { } return errs } -func (s *Service) Stop(_ bool) []error { - s.SetStopping() - return nil -} - -func (s *Service) String() string { - return s.Name -} type ValidatorRepository interface { ListApplications(ctx context.Context, f repository.ApplicationFilter, p repository.Pagination, descending bool) ([]*Application, uint64, error) diff --git a/internal/validator/validator_test.go b/internal/validator/validator_test.go index d866b584a..886baa4fe 100644 --- a/internal/validator/validator_test.go +++ b/internal/validator/validator_test.go @@ -51,8 +51,8 @@ func (s *ValidatorSuite) SetupSubTest() { pristinePostContext: postContext, pristineRootHash: postContext[merkle.TREE_DEPTH], } - serviceArgs := &service.CreateInfo{Name: "validator", Impl: validator} - err := service.Create(context.Background(), serviceArgs, &validator.Service) + serviceArgs := &service.ServiceConfigs{Name: "validator"} + err := service.InitServiceTemplate(serviceArgs, &validator.ServiceTemplate, validator) s.Require().Nil(err) dummyOutputsMerkleRoot := common.HexToHash("0x0a162946e56158bac0673e6dd3bdfdc1e4a0e7744a120fdb640050c8d7abe1c6") dummyEpochs = []Epoch{ diff --git a/pkg/service/service.go b/pkg/service/service.go index d1a30b169..69bc748a0 100644 --- a/pkg/service/service.go +++ b/pkg/service/service.go @@ -6,8 +6,8 @@ // The runtime information is then stored in the Service. // // The recommended way to implement a new service is to: -// - embed a [CreateInfo] struct into a new CreateInfo struct. -// - embed a [Service] struct into a new Service struct. +// - embed a [ServiceConfigs] struct into a new CreateInfo struct. +// - embed a [ServiceTemplate] struct into a new Service struct. // - embed a [Create] call into a new Create function. 
// // To use a service, call its corresponding Create function with a matching CreateInfo and Service, @@ -72,84 +72,46 @@ const telemetryShutdownTimeout = 5 * time.Second var ( ErrInvalid = fmt.Errorf("Invalid Argument") // invalid argument + ErrServiceStopped = fmt.Errorf("Service was stopped") ) -// ServiceImpl is the interface that concrete services must implement. -// -// IMPORTANT: Stop() implementations that shadow Service.Stop() MUST call -// s.SetStopping() as their first action. This sets the stopping flag so that -// a concurrent Tick() can detect shutdown-in-progress via IsStopping() and -// suppress expected teardown errors (e.g., context.Canceled from in-flight -// RPCs). Without this call, the race window between Stop() tearing down -// resources and Tick() observing the cancellation produces spurious errors. -// -// When Stop() is called through the framework's Service.Stop() dispatch, -// the flag is set automatically before Impl.Stop() runs. But the node -// orchestrator calls child.Stop() directly (Go method resolution picks the -// concrete type's Stop, bypassing Service.Stop), so the impl's SetStopping() -// is the only thing that sets the flag on that path. -type ServiceImpl interface { - Alive() bool - Ready() bool - Reload() []error - Tick() []error - Stop(bool) []error -} - +// Public interface with methods to manipulate the service. type IService interface { Alive() bool Ready() bool Reload() []error - Tick() []error Stop(bool) []error Serve() error String() string } -// CreateInfo stores initialization data for the Create function -type CreateInfo struct { - Name string - LogLevel slog.Level - LogColor bool - EnableSignalHandling bool - TelemetryCreate bool - TelemetryAddress string - PollInterval time.Duration - Impl ServiceImpl - Logger *slog.Logger - ServeMux *http.ServeMux - Context context.Context - Cancel context.CancelFunc +/* + * Service template for services that do continuous processing. 
+ */ - // EnableReschedule, when true, creates a self-continuation channel. - // Services that discover remaining work after a Tick() call - // SignalReschedule() to re-tick immediately without waiting for the - // timer interval. - // - // Migration: When the events library (feature/events-library-research) - // ships, Serve() will gain an additional EventChannel case for external - // cross-service notifications. Reschedule remains complementary: - // Reschedule = internal self-continuation ("I have more work"), - // EventChannel = external stimulus ("another service produced work"). - // Both coexist in the select loop alongside the Ticker safety-net. - EnableReschedule bool +// Internal interface with abstract methods called by ServiceTemplate. +// These methods are not part of the public service interface. +type LifecycleImpl interface { + Alive() bool + Ready() bool + OnReload() []error + OnStop(bool) []error + OnServe() error } -// Service stores runtime information. -type Service struct { +// ServiceTemplate stores runtime information. +type ServiceTemplate struct { Running atomic.Bool Name string - Impl ServiceImpl Logger *slog.Logger - Ticker *time.Ticker + lifecycleImpl LifecycleImpl Context context.Context Cancel context.CancelFunc - Sighup chan os.Signal // SIGHUP to reload - SigShutdown chan os.Signal // SIGINT/SIGTERM to exit gracefully + sigHangUp chan os.Signal // SIGHUP to reload + sigShutdown chan os.Signal // SIGINT/SIGTERM to exit gracefully ServeMux *http.ServeMux - Telemetry *http.Server - TelemetryFunc func() error - reschedule chan struct{} // self-continuation signal; see CreateInfo.EnableReschedule + telemetry *http.Server + telemetryFunc func() error // stopping is set to true at the beginning of Stop(), before Impl.Stop() // is called. Services can check this via IsStopping() from Tick() to @@ -158,26 +120,37 @@ type Service struct { // calls). This covers the race window between Stop() being called and // ctx.Cancel() propagating. 
stopping atomic.Bool + + // stopped ensures Stop() runs exactly once, even when Stop() is called + // multiple times (by the child's Serve() loop and by the parent orchestrator). + stopped atomic.Bool } -// Create a service by: -// - using values from s if non zero, -// - using values from c, -// - using default values when applicable -func Create(ctx context.Context, c *CreateInfo, s *Service) error { - if c == nil || c.Impl == nil || c.Impl == s || s == nil { +// ServiceConfigs stores configuration for the InitServiceTemplate function +type ServiceConfigs struct { + Name string + Logger *slog.Logger + LogLevel slog.Level + LogColor bool + Context context.Context + Cancel context.CancelFunc + EnableSignalHandling bool + TelemetryCreate bool + TelemetryAddress string + ServeMux *http.ServeMux // used only for unit testing +} + +// Initialize the 'ServiceTemplate' structure using values from 'ServiceConfigs'. +// 'impl' must be a reference to the concrete service implementation that +// embeds 'ServiceTemplate' +func InitServiceTemplate(c *ServiceConfigs, s *ServiceTemplate, impl LifecycleImpl) error { + if c == nil || s == nil || impl == nil { return ErrInvalid } - if err := ctx.Err(); err != nil { - return err // This returns context.Canceled or context.DeadlineExceeded.
- } - if s.Ticker == nil && c.PollInterval < 0 { - return fmt.Errorf("PollInterval must be non-negative, got %v", c.PollInterval) - } - s.Running.Store(false) + s.lifecycleImpl = impl + s.Name = c.Name - s.Impl = c.Impl s.Logger = c.Logger // log @@ -199,28 +172,15 @@ func Create(ctx context.Context, c *CreateInfo, s *Service) error { s.Cancel = c.Cancel } - // ticker - if s.Ticker == nil { - if c.PollInterval == 0 { - c.PollInterval = time.Minute - } - s.Ticker = time.NewTicker(c.PollInterval) - } - - // self-rescheduling - if c.EnableReschedule { - s.reschedule = make(chan struct{}, 1) - } - // signal handling if c.EnableSignalHandling { - if s.Sighup == nil { - s.Sighup = make(chan os.Signal, 1) - signal.Notify(s.Sighup, syscall.SIGHUP) + if s.sigHangUp == nil { + s.sigHangUp = make(chan os.Signal, 1) + signal.Notify(s.sigHangUp, syscall.SIGHUP) } - if s.SigShutdown == nil { - s.SigShutdown = make(chan os.Signal, 1) - signal.Notify(s.SigShutdown, syscall.SIGINT, syscall.SIGTERM) + if s.sigShutdown == nil { + s.sigShutdown = make(chan os.Signal, 1) + signal.Notify(s.sigShutdown, syscall.SIGINT, syscall.SIGTERM) } } @@ -235,32 +195,36 @@ func Create(ctx context.Context, c *CreateInfo, s *Service) error { if c.TelemetryAddress == "" { c.TelemetryAddress = ":8080" } - s.Telemetry, s.TelemetryFunc = s.CreateDefaultTelemetry(c.TelemetryAddress) + s.telemetry, s.telemetryFunc = s.CreateDefaultTelemetry(c.TelemetryAddress) go func() { - if err := s.TelemetryFunc(); err != nil { + if err := s.telemetryFunc(); err != nil { s.Logger.Error("Telemetry HTTP server failed", "error", err) } }() } s.Logger.Info("Create", "version", version.BuildVersion, "log_level", c.LogLevel, "pid", os.Getpid()) - if s.Telemetry != nil { - s.Logger.Info("Telemetry", "address", s.Telemetry.Addr) + if s.telemetry != nil { + s.Logger.Info("Telemetry", "address", s.telemetry.Addr) } return nil } -func (s *Service) Alive() bool { - return s.Impl.Alive() -} - -func (s *Service) Ready() bool { - 
return s.Impl.Ready() -} +// Default implementation of some abstract methods (except `OnServe`). +// Remove them to force concrete services to provide implementation for them. +func (s *ServiceTemplate) OnReload() []error { return nil } +func (s *ServiceTemplate) OnStop(bool) []error { return nil } +func (s *ServiceTemplate) Alive() bool { return true } +func (s *ServiceTemplate) Ready() bool { return true } +func (s *ServiceTemplate) String() string { return s.Name } + +func (s *ServiceTemplate) Reload() []error { + if s.stopped.Load() { + return []error{ErrServiceStopped} + } -func (s *Service) Reload() []error { start := time.Now() - errs := s.Impl.Reload() + errs := s.lifecycleImpl.OnReload() elapsed := time.Since(start) if len(errs) > 0 { @@ -274,25 +238,9 @@ func (s *Service) Reload() []error { return errs } -func (s *Service) Tick() []error { - start := time.Now() - errs := s.Impl.Tick() - elapsed := time.Since(start) - - if len(errs) > 0 { - s.Logger.Error("Tick", - "duration", elapsed, - "error", errs) - } else { - s.Logger.Debug("Tick", - "duration", elapsed) - } - return errs -} - // IsStopping reports whether Stop() has been called. Services use this in // Tick() to detect shutdown-in-progress and suppress expected teardown errors. -func (s *Service) IsStopping() bool { +func (s *ServiceTemplate) IsStopping() bool { return s.stopping.Load() } @@ -300,31 +248,40 @@ func (s *Service) IsStopping() bool { // Service.Stop() (i.e., every ServiceImpl) must call this at the top of their // Stop so that concurrent Tick goroutines can observe IsStopping() == true // before resources are torn down. -func (s *Service) SetStopping() { +func (s *ServiceTemplate) SetStopping() { s.stopping.Store(true) } -func (s *Service) Stop(force bool) []error { +func (s *ServiceTemplate) Stop(force bool) []error { + // CAS achieves once-semantics: the second caller returns immediately + // (fire-and-forget) rather than blocking like sync.Once. 
This is safe + // because the orchestrator calls Cancel() after Stop() and waits for + // the Serve goroutine to exit. + if !s.stopped.CompareAndSwap(false, true) { + return nil // already stopped + } + s.stopping.Store(true) start := time.Now() - errs := s.Impl.Stop(force) - if s.Telemetry != nil { + errs := s.lifecycleImpl.OnStop(force) + if s.telemetry != nil { shutdownCtx, cancel := context.WithTimeout(context.Background(), telemetryShutdownTimeout) defer cancel() - if err := s.Telemetry.Shutdown(shutdownCtx); err != nil { + if err := s.telemetry.Shutdown(shutdownCtx); err != nil { errs = append(errs, err) } } - if s.SigShutdown != nil { - signal.Stop(s.SigShutdown) + if s.sigShutdown != nil { + signal.Stop(s.sigShutdown) } - if s.Sighup != nil { - signal.Stop(s.Sighup) + if s.sigHangUp != nil { + signal.Stop(s.sigHangUp) } elapsed := time.Since(start) s.Running.Store(false) s.Cancel() + if len(errs) > 0 { s.Logger.Error("Stop", "force", force, @@ -338,17 +295,152 @@ func (s *Service) Stop(force bool) []error { return errs } -// rescheduleChan returns the reschedule channel, or nil if rescheduling is disabled. -// A nil channel in a select case blocks forever, preserving timer-only behavior. -func (s *Service) rescheduleChan() <-chan struct{} { - return s.reschedule +func (s *ServiceTemplate) Serve() error { + if s.stopped.Load() { + return ErrServiceStopped + } + + s.Running.Store(true) + + go func() { + for { + select { + case <-s.sigHangUp: + s.Reload() + case <-s.sigShutdown: + s.Stop(false) // Graceful shutdown; errors are logged by Stop. + return + case <-s.Context.Done(): + s.Stop(true) // Stop logs errors internally. + return + } + } + }() + + defer s.Stop(true) + + return s.lifecycleImpl.OnServe() +} + +// LogConfig logs the service configuration at info level. +// Intended for use by standalone service binaries after Create.
+func (s *ServiceTemplate) LogConfig(config any) { + s.Logger.Info("Starting service", "config", config) +} + +/* + * Alternative service template that implements the tick-based processing. + */ + +type TickImpl interface { + Tick() []error +} + +type TickServiceTemplate struct { + ServiceTemplate + tickImpl TickImpl + ticker *time.Ticker + reschedule chan struct{} // self-continuation signal; see TickServiceConfigs.EnableReschedule +} + +type TickServiceConfigs struct { + ServiceConfigs + PollInterval time.Duration + + // EnableReschedule, when true, creates a self-continuation channel. + // Services that discover remaining work after a Tick() call + // SignalReschedule() to re-tick immediately without waiting for the + // timer interval. + // + // Migration: When the events library (feature/events-library-research) + // ships, Serve() will gain an additional EventChannel case for external + // cross-service notifications. Reschedule remains complementary: + // Reschedule = internal self-continuation ("I have more work"), + // EventChannel = external stimulus ("another service produced work"). + // Both coexist in the select loop alongside the Ticker safety-net.
+ EnableReschedule bool +} + +func InitTickServiceTemplate( + cfg *TickServiceConfigs, + tmpl *TickServiceTemplate, + lifecycleImpl LifecycleImpl, + tickImpl TickImpl, +) error { + if cfg == nil || tmpl == nil || tickImpl == nil { + return ErrInvalid + } + + err := InitServiceTemplate(&cfg.ServiceConfigs, &tmpl.ServiceTemplate, lifecycleImpl) + if err != nil { + return err + } + + tmpl.tickImpl = tickImpl + + // ticker + if cfg.PollInterval < 0 { + return fmt.Errorf("PollInterval must be non-negative, got %v", cfg.PollInterval) + } + if cfg.PollInterval == 0 { + cfg.PollInterval = time.Minute + } + tmpl.ticker = time.NewTicker(cfg.PollInterval) + + // self-rescheduling + if cfg.EnableReschedule { + tmpl.reschedule = make(chan struct{}, 1) + } + + return nil +} + +func (s *TickServiceTemplate) tick() []error { + start := time.Now() + errs := s.tickImpl.Tick() + elapsed := time.Since(start) + + if len(errs) > 0 { + s.Logger.Error("Tick", + "duration", elapsed, + "error", errs) + } else { + s.Logger.Debug("Tick", + "duration", elapsed) + } + return errs +} + +func (s *TickServiceTemplate) OnStop(bool) []error { + s.ticker.Stop() + return nil +} + +func (s *TickServiceTemplate) OnServe() error { + ctx := s.Context + if ctx.Err() != nil { + return nil + } + s.tick() + for { + select { + case <-ctx.Done(): + return nil + case <-s.ticker.C: + s.tick() + // 'reschedule' is nil when rescheduling is disabled thus blocking forever, + // preserving timer-only behavior. + case <-s.reschedule: + s.tick() + } + } } // SignalReschedule performs a non-blocking send on the reschedule channel. // If a signal is already pending, this is a no-op (one wake is sufficient). // Does nothing if rescheduling is not enabled. // INVARIANT: This method must never block. 
-func (s *Service) SignalReschedule() { +func (s *TickServiceTemplate) SignalReschedule() { select { case s.reschedule <- struct{}{}: default: @@ -357,7 +449,7 @@ func (s *Service) SignalReschedule() { // DrainReschedule consumes and discards a pending reschedule signal, if any. // Returns true if a signal was pending. Intended for testing. -func (s *Service) DrainReschedule() bool { +func (s *TickServiceTemplate) DrainReschedule() bool { select { case <-s.reschedule: return true @@ -366,46 +458,9 @@ func (s *Service) DrainReschedule() bool { } } -func (s *Service) Serve() error { - s.Running.Store(true) - - // Check for context cancellation before the first tick. - select { - case <-s.Context.Done(): - s.Stop(true) // Stop logs errors internally. - return nil - default: - } - - s.Tick() - for s.Running.Load() { - select { - case <-s.Sighup: - s.Reload() - case <-s.SigShutdown: - s.Stop(false) // Graceful shutdown; errors are logged by Stop. - return nil - case <-s.Context.Done(): - s.Stop(true) // Stop logs errors internally. - return nil - case <-s.Ticker.C: - s.Tick() - case <-s.rescheduleChan(): - s.Tick() - } - } - return nil -} - -func (s *Service) String() string { - return s.Name -} - -// LogConfig logs the service configuration at debug level. -// Intended for use by standalone service binaries after Create. 
-func (s *Service) LogConfig(config any) { - s.Logger.Info("Starting service", "config", config) -} +/* + * Service Logger + */ func NewLogger(level slog.Level, color bool) *slog.Logger { opts := &tint.Options{ @@ -419,12 +474,15 @@ func NewLogger(level slog.Level, color bool) *slog.Logger { return slog.New(handler) } -func NewServiceLogger(c *CreateInfo) *slog.Logger { +func NewServiceLogger(c *ServiceConfigs) *slog.Logger { return NewLogger(c.LogLevel, c.LogColor).With("service", c.Name) } -// Telemetry -func (s *Service) CreateDefaultTelemetry(addr string) (*http.Server, func() error) { +/* + * Service Telemetry + */ + +func (s *ServiceTemplate) CreateDefaultTelemetry(addr string) (*http.Server, func() error) { s.ServeMux.Handle("/readyz", http.HandlerFunc(s.ReadyHandler)) s.ServeMux.Handle("/livez", http.HandlerFunc(s.AliveHandler)) @@ -450,8 +508,8 @@ func (s *Service) CreateDefaultTelemetry(addr string) (*http.Server, func() erro } // HTTP handler for `/s.Name/readyz` that exposes the value of Ready() -func (s *Service) ReadyHandler(w http.ResponseWriter, r *http.Request) { - if !s.Ready() { +func (s *ServiceTemplate) ReadyHandler(w http.ResponseWriter, r *http.Request) { + if !s.lifecycleImpl.Ready() { http.Error(w, s.Name+": ready check failed", http.StatusInternalServerError) } else { @@ -460,8 +518,8 @@ func (s *Service) ReadyHandler(w http.ResponseWriter, r *http.Request) { } // HTTP handler for `/s.Name/livez` that exposes the value of Alive() -func (s *Service) AliveHandler(w http.ResponseWriter, r *http.Request) { - if !s.Alive() { +func (s *ServiceTemplate) AliveHandler(w http.ResponseWriter, r *http.Request) { + if !s.lifecycleImpl.Alive() { http.Error(w, s.Name+": alive check failed", http.StatusInternalServerError) } else { diff --git a/pkg/service/service_test.go b/pkg/service/service_test.go index 6949c3171..3e79ef51f 100644 --- a/pkg/service/service_test.go +++ b/pkg/service/service_test.go @@ -16,14 +16,13 @@ import ( // mockImpl is a minimal 
ServiceImpl for testing the Serve() loop. type mockImpl struct { + TickServiceTemplate tickCount atomic.Int32 onTick func(n int32) // called on each Tick with the tick count (1-based) } -func (m *mockImpl) Alive() bool { return true } -func (m *mockImpl) Ready() bool { return true } -func (m *mockImpl) Reload() []error { return nil } -func (m *mockImpl) Stop(bool) []error { return nil } +func (m *mockImpl) OnReload() []error { return nil } +func (m *mockImpl) OnStop(bool) []error { return nil } func (m *mockImpl) Tick() []error { n := m.tickCount.Add(1) if m.onTick != nil { @@ -39,21 +38,21 @@ func createTestService( t *testing.T, impl *mockImpl, enableReschedule bool, -) (*Service, context.CancelFunc) { +) (IService, context.CancelFunc) { t.Helper() ctx, cancel := context.WithCancel(context.Background()) - s := &Service{} - err := Create(ctx, &CreateInfo{ - Name: "test", - LogLevel: slog.LevelError, - Impl: impl, + err := InitTickServiceTemplate(&TickServiceConfigs{ + ServiceConfigs: ServiceConfigs{ + Name: "test", + LogLevel: slog.LevelError, + Context: ctx, + Cancel: cancel, + }, PollInterval: 10 * time.Minute, // long: tests control wakeup explicitly - Context: ctx, - Cancel: cancel, EnableReschedule: enableReschedule, - }, s) + }, &impl.TickServiceTemplate, impl, impl) require.NoError(t, err) - return s, cancel + return impl, cancel } type ServeSuite struct { @@ -66,18 +65,19 @@ func TestServe(t *testing.T) { func TestCreateRejectsNegativePollInterval(t *testing.T) { impl := &mockImpl{} - svc := &Service{} + svc := &TickServiceTemplate{} require.NotPanics(t, func() { - err := Create(context.Background(), &CreateInfo{ - Name: "test-negative-poll", - LogLevel: slog.LevelError, - Impl: impl, + err := InitTickServiceTemplate(&TickServiceConfigs{ + ServiceConfigs: ServiceConfigs{ + Name: "test-negative-poll", + LogLevel: slog.LevelError, + }, PollInterval: -time.Second, - }, svc) + }, svc, impl, impl) require.ErrorContains(t, err, "PollInterval must be 
non-negative") }) - require.Nil(t, svc.Ticker) + require.Nil(t, svc.ticker) } func (s *ServeSuite) TestDisabledReschedulePreservesExistingBehavior() { @@ -85,20 +85,20 @@ func (s *ServeSuite) TestDisabledReschedulePreservesExistingBehavior() { // Serve() should tick only on timer fires. impl := &mockImpl{} ctx, cancel := context.WithCancel(context.Background()) - svc := &Service{} - err := Create(ctx, &CreateInfo{ - Name: "test-no-resched", - LogLevel: slog.LevelError, - Impl: impl, + err := InitTickServiceTemplate(&TickServiceConfigs{ + ServiceConfigs: ServiceConfigs{ + Name: "test-no-resched", + LogLevel: slog.LevelError, + Context: ctx, + Cancel: cancel, + }, PollInterval: 20 * time.Millisecond, - Context: ctx, - Cancel: cancel, - }, svc) + }, &impl.TickServiceTemplate, impl, impl) s.Require().NoError(err) done := make(chan struct{}) go func() { - _ = svc.Serve() + _ = impl.Serve() close(done) }() @@ -118,13 +118,13 @@ func (s *ServeSuite) TestDisabledReschedulePreservesExistingBehavior() { func (s *ServeSuite) TestRescheduleTriggersImmediateRetick() { // When SignalReschedule() is called from Tick(), Serve() should call // Tick() again immediately without waiting for the timer. - var svc *Service - impl := &mockImpl{ + var impl *mockImpl + impl = &mockImpl{ onTick: func(n int32) { // Signal reschedule on ticks 1 and 2 (the initial tick // and the first rescheduled tick). Stop on tick 3. if n <= 2 { - svc.SignalReschedule() + impl.SignalReschedule() } }, } @@ -152,7 +152,6 @@ func (s *ServeSuite) TestRescheduleTriggersImmediateRetick() { func (s *ServeSuite) TestRescheduleCoalesces() { // Multiple signals while Tick() is running should result in at most // one extra tick, not one per signal. - var svc *Service tickStarted := make(chan struct{}) tickProceed := make(chan struct{}) @@ -181,7 +180,7 @@ func (s *ServeSuite) TestRescheduleCoalesces() { // Send multiple signals while tick is blocked. Only one fits in the buffer. 
for range 10 { - svc.SignalReschedule() + impl.SignalReschedule() } // Let the first tick complete. @@ -201,10 +200,10 @@ func (s *ServeSuite) TestRescheduleCoalesces() { func (s *ServeSuite) TestContextCancellationExitsPromptly() { // When context is cancelled with a reschedule signal pending, // Serve() should exit promptly. - var svc *Service - impl := &mockImpl{ + var impl *mockImpl + impl = &mockImpl{ onTick: func(_ int32) { - svc.SignalReschedule() + impl.SignalReschedule() }, } @@ -244,37 +243,37 @@ func (s *ServeSuite) TestServeExitsOnContextCancelledBeforeFirstTick() { func (s *ServeSuite) TestRescheduleEnabledCreatesChannel() { impl := &mockImpl{} - svc, cancel := createTestService(s.T(), impl, true) + _, cancel := createTestService(s.T(), impl, true) defer cancel() - s.NotNil(svc.reschedule, "reschedule channel should be created when enabled") + s.NotNil(impl.reschedule, "reschedule channel should be created when enabled") } func (s *ServeSuite) TestRescheduleDisabledLeavesNilChannel() { impl := &mockImpl{} - svc, cancel := createTestService(s.T(), impl, false) + _, cancel := createTestService(s.T(), impl, false) defer cancel() - s.Nil(svc.reschedule, "reschedule channel should be nil when disabled") + s.Nil(impl.reschedule, "reschedule channel should be nil when disabled") } func (s *ServeSuite) TestSignalRescheduleNoopWhenDisabled() { impl := &mockImpl{} - svc, cancel := createTestService(s.T(), impl, false) + _, cancel := createTestService(s.T(), impl, false) defer cancel() // Should not panic on nil channel. 
- s.NotPanics(func() { svc.SignalReschedule() }) + s.NotPanics(func() { impl.SignalReschedule() }) } func (s *ServeSuite) TestDrainReschedule() { impl := &mockImpl{} - svc, cancel := createTestService(s.T(), impl, true) + _, cancel := createTestService(s.T(), impl, true) defer cancel() - s.False(svc.DrainReschedule(), "should be empty initially") + s.False(impl.DrainReschedule(), "should be empty initially") - svc.SignalReschedule() - s.True(svc.DrainReschedule(), "should drain pending signal") - s.False(svc.DrainReschedule(), "should be empty after drain") + impl.SignalReschedule() + s.True(impl.DrainReschedule(), "should drain pending signal") + s.False(impl.DrainReschedule(), "should be empty after drain") } diff --git a/pkg/service/telemetry_test.go b/pkg/service/telemetry_test.go index c854bda3a..c7f6293a4 100644 --- a/pkg/service/telemetry_test.go +++ b/pkg/service/telemetry_test.go @@ -14,13 +14,13 @@ import ( // newTelemetryTestService returns a *Service ready to have CreateDefaultTelemetry // called on it. It wires a ServeMux, a mockImpl, and a discard logger. 
-func newTelemetryTestService() *Service { +func newTelemetryTestService() *ServiceTemplate { impl := &mockImpl{} - return &Service{ - Name: "test", - Logger: discardLogger(), - ServeMux: http.NewServeMux(), - Impl: impl, + return &ServiceTemplate{ + Name: "test", + Logger: discardLogger(), + ServeMux: http.NewServeMux(), + lifecycleImpl: impl, } } @@ -96,3 +96,26 @@ func TestCreateDefaultTelemetry_PanicRecovered(t *testing.T) { require.Contains(t, rr.Body.String(), "Internal server error") require.NotContains(t, rr.Body.String(), "kaboom") } + +type falseLifecycleImpl struct{ ServiceTemplate } + +func (*falseLifecycleImpl) Alive() bool { return false } +func (*falseLifecycleImpl) Ready() bool { return false } +func (*falseLifecycleImpl) OnServe() error { return nil } + +func TestCreateDefaultTelemetry_Returns500WhenLifecycleFails(t *testing.T) { + service := &ServiceTemplate{ + Name: "test", + Logger: discardLogger(), + ServeMux: http.NewServeMux(), + lifecycleImpl: &falseLifecycleImpl{}, + } + + srv, _ := service.CreateDefaultTelemetry(":0") + + for _, path := range []string{"/readyz", "/livez"} { + rr := httptest.NewRecorder() + srv.Handler.ServeHTTP(rr, httptest.NewRequest(http.MethodGet, path, nil)) + require.Equal(t, http.StatusInternalServerError, rr.Code, "path=%s", path) + } +} diff --git a/test/validator/validator_test.go b/test/validator/validator_test.go index 105696e6c..c76f43593 100644 --- a/test/validator/validator_test.go +++ b/test/validator/validator_test.go @@ -62,13 +62,16 @@ func (s *ValidatorRepositoryIntegrationSuite) SetupSubTest() { s.Require().Nil(err) serviceArgs := validator.CreateInfo{ - CreateInfo: service.CreateInfo{ - Name: "validator", - LogLevel: slog.LevelDebug, + TickServiceConfigs: service.TickServiceConfigs{ + ServiceConfigs: service.ServiceConfigs{ + Name: "validator", + LogLevel: slog.LevelDebug, + }, }, Repository: s.repository, } - s.validator, err = validator.Create(context.Background(), &serviceArgs) + srv, err := 
validator.Create(context.Background(), &serviceArgs) + s.validator = srv.(*validator.Service) s.Require().Nil(err) } From 369a4ca92a1cfa796cf3074cbdb2d4c1fb45bb4c Mon Sep 17 00:00:00 2001 From: Renato Maia <1887792+renatomaia@users.noreply.github.com> Date: Mon, 18 May 2026 11:09:41 -0300 Subject: [PATCH 07/16] refactor(services): use context from argument instead of an object field --- internal/advancer/advancer.go | 2 +- internal/advancer/advancer_test.go | 47 ++++++++++++----- internal/advancer/service.go | 10 ++-- internal/claimer/accept.go | 58 ++++++++++++--------- internal/claimer/accept_test.go | 60 +++++++++++---------- internal/claimer/claim_status.go | 11 ++-- internal/claimer/claim_status_test.go | 3 +- internal/claimer/claimer.go | 24 ++++----- internal/claimer/claimer_test.go | 11 ++-- internal/claimer/divergence.go | 39 +++++++++----- internal/claimer/divergence_test.go | 3 +- internal/claimer/foreclosed_apps_test.go | 32 ++++++------ internal/claimer/foreclosure.go | 15 +++--- internal/claimer/inflight.go | 38 ++++++++------ internal/claimer/inflight_test.go | 31 +++++------ internal/claimer/reverts.go | 24 +++++---- internal/claimer/reverts_test.go | 27 +++++----- internal/claimer/stage.go | 24 +++++---- internal/claimer/stage_test.go | 17 +++--- internal/claimer/submit.go | 62 ++++++++++++---------- internal/claimer/submit_test.go | 66 ++++++++++++------------ internal/evmreader/evmreader.go | 6 +-- internal/evmreader/evmreader_test.go | 6 +-- internal/evmreader/service.go | 4 +- internal/jsonrpc/service.go | 8 +-- internal/node/node.go | 4 +- internal/prt/service.go | 10 ++-- internal/validator/validator.go | 6 +-- pkg/service/service.go | 45 +++++++--------- pkg/service/service_test.go | 2 +- pkg/service/telemetry_test.go | 7 +-- test/validator/validator_test.go | 8 +-- 32 files changed, 399 insertions(+), 311 deletions(-) diff --git a/internal/advancer/advancer.go b/internal/advancer/advancer.go index 8efec4f63..e2991a0a5 100644 --- 
a/internal/advancer/advancer.go +++ b/internal/advancer/advancer.go @@ -315,7 +315,7 @@ func (s *Service) processInputs(ctx context.Context, app *Application, inputs [] "epoch", input.EpochIndex, "index", input.Index, "error", err) - s.Cancel() // triggers graceful shutdown of all services + s.Stop(false) // triggers graceful shutdown of all services return err } diff --git a/internal/advancer/advancer_test.go b/internal/advancer/advancer_test.go index da59ca2aa..f4985f692 100644 --- a/internal/advancer/advancer_test.go +++ b/internal/advancer/advancer_test.go @@ -34,14 +34,33 @@ func TestAdvancer(t *testing.T) { type AdvancerSuite struct{ suite.Suite } +const defaultBatchSize = 500 + func newMockAdvancerService(machineManager *MockMachineManager, repo *MockRepository) (*Service, error) { - return newMockAdvancerServiceWithBatchSize(machineManager, repo, 500) + return newMockAdvancerServiceWithBatchSize(machineManager, repo, defaultBatchSize) } func newMockAdvancerServiceWithBatchSize( machineManager *MockMachineManager, repo *MockRepository, batchSize uint64, +) (*Service, error) { + ctx, cf := context.WithCancel(context.Background()) + return newMockAdvancerServiceWithContextAndBatchSize( + ctx, + cf, + machineManager, + repo, + batchSize, + ) +} + +func newMockAdvancerServiceWithContextAndBatchSize( + ctx context.Context, + cancelCtx context.CancelFunc, + machineManager *MockMachineManager, + repo *MockRepository, + batchSize uint64, ) (*Service, error) { s := &Service{ inputBatchSize: batchSize, @@ -49,8 +68,10 @@ func newMockAdvancerServiceWithBatchSize( repository: repo, } serviceArgs := &service.TickServiceConfigs{ - ServiceConfigs: service.ServiceConfigs{ + ServiceConfigs: service.ServiceConfigs{ Name: "advancer", + Context: ctx, + Cancel: cancelCtx, }, EnableReschedule: true, } @@ -65,6 +86,7 @@ func newMockAdvancerServiceWithBatchSize( // the mock machine manager, and the mock repository. 
type testEnv struct { service *Service + ctx context.Context app *MockMachineImpl mm *MockMachineManager repo *MockRepository @@ -73,13 +95,14 @@ type testEnv struct { // setupOneApp creates a standard test environment with one application. // The repository is empty; callers can configure it after the call. func (s *AdvancerSuite) setupOneApp() testEnv { + ctx, cf := context.WithCancel(context.Background()) mm := newMockMachineManager() app := newMockMachine(1) mm.Map[1] = newMockInstance(app) repo := &MockRepository{} - svc, err := newMockAdvancerService(mm, repo) + svc, err := newMockAdvancerServiceWithContextAndBatchSize(ctx, cf, mm, repo, defaultBatchSize) s.Require().NoError(err) - return testEnv{service: svc, app: app, mm: mm, repo: repo} + return testEnv{service: svc, ctx: ctx, app: app, mm: mm, repo: repo} } func (s *AdvancerSuite) TestServiceInterface() { @@ -103,12 +126,12 @@ func (s *AdvancerSuite) TestServiceInterface() { repository.GetEpochsReturn = map[common.Address][]*Epoch{ machineManager.Map[1].application.IApplicationAddress: {}, } - tickErrors := advancer.Tick() + tickErrors := advancer.Tick(context.Background()) require.Empty(tickErrors) // Test Tick with error repository.GetEpochsError = errors.New("list epochs error") - tickErrors = advancer.Tick() + tickErrors = advancer.Tick(context.Background()) require.NotEmpty(tickErrors) require.Contains(tickErrors[0].Error(), "list epochs error") @@ -411,7 +434,7 @@ func (s *AdvancerSuite) TestProcess() { require.Len(env.repo.StoredResults, 1) // Verify that the node shutdown was triggered (context cancelled) - require.Error(env.service.Context.Err(), "shared context should be cancelled") + require.Error(env.ctx.Err(), "shared context should be cancelled") }) }) } @@ -509,7 +532,7 @@ func (s *AdvancerSuite) TestErrorRecovery() { err := env.service.processInputs(context.Background(), env.app.Application, inputs) require.Error(err) require.Contains(err.Error(), "temporary failure") - 
require.Error(env.service.Context.Err(), "shared context should be cancelled") + require.Error(env.ctx.Err(), "shared context should be cancelled") }) } @@ -1574,7 +1597,7 @@ func (s *AdvancerSuite) TestSelfWakeOnSuccess() { require.NoError(err) // Call Tick() which internally calls Step() and signals reschedule. - svc.Tick() + svc.Tick(context.Background()) // The reschedule channel should have a pending signal. require.True(svc.DrainReschedule(), @@ -1598,7 +1621,7 @@ func (s *AdvancerSuite) TestNoSelfWakeWhenIdle() { svc, err := newMockAdvancerService(mm, repo) require.NoError(err) - svc.Tick() + svc.Tick(context.Background()) require.False(svc.DrainReschedule(), "reschedule channel should be empty when no work exists") @@ -1617,7 +1640,7 @@ func (s *AdvancerSuite) TestNoSelfWakeOnError() { svc, err := newMockAdvancerService(mm, repo) require.NoError(err) - errs := svc.Tick() + errs := svc.Tick(context.Background()) require.NotEmpty(errs) require.False(svc.DrainReschedule(), @@ -1658,7 +1681,7 @@ func (s *AdvancerSuite) TestPartialSuccessStillReschedules() { // Call Tick — app1 fails, app2 succeeds with more work remaining (batch limit hit). // Tick should surface the error AND signal reschedule for app2's pending work. - errs := svc.Tick() + errs := svc.Tick(context.Background()) require.NotEmpty(errs, "Tick should surface app1's error") // Reschedule SHOULD fire: app2 had work, and one failing app must not diff --git a/internal/advancer/service.go b/internal/advancer/service.go index a009cf6bd..1b4a36807 100644 --- a/internal/advancer/service.go +++ b/internal/advancer/service.go @@ -107,8 +107,8 @@ func Create(ctx context.Context, c *CreateInfo) (service.IService, error) { } // Service interface implementation -func (s *Service) Tick() []error { - hadWork, err := s.Step(s.Context) +func (s *Service) Tick(ctx context.Context) []error { + hadWork, err := s.Step(ctx) // Signal reschedule whenever work was done, even if some apps errored. 
// Failed apps are marked Failed and removed by the machine manager, @@ -157,14 +157,14 @@ func (s *Service) OnStop(b bool) []error { } return append(errs, s.TickServiceTemplate.OnStop(b)...) } -func (s *Service) OnServe() error { +func (s *Service) OnServe(ctx context.Context) error { if s.inspector != nil { go func() { if err := s.inspector.Serve(); err != nil && !errors.Is(err, http.ErrServerClosed) { s.Logger.Error("Inspect HTTP server failed — shutting down", "error", err) - s.Cancel() + s.Stop(true) } }() } - return s.TickServiceTemplate.OnServe() + return s.TickServiceTemplate.OnServe(ctx) } diff --git a/internal/claimer/accept.go b/internal/claimer/accept.go index 9499a1278..21fbb7a47 100644 --- a/internal/claimer/accept.go +++ b/internal/claimer/accept.go @@ -57,7 +57,7 @@ func (s *Service) findClaimAcceptedEventAndSucc( } matches, ok := claimAcceptedEventMatches(app, prevEpoch, prevClaimAcceptanceEvent) if !ok { - err = s.markMatcherPrecondFailure(app, prevEpoch, "findClaimAcceptedEventAndSucc(prev)") + err = s.markMatcherPrecondFailure(ctx, app, prevEpoch, "findClaimAcceptedEventAndSucc(prev)") return nil, nil, nil, err } if !matches { @@ -83,6 +83,7 @@ func (s *Service) findClaimAcceptedEventAndSucc( // // It returns the number of successful state changes and any errors. 
func (s *Service) acceptClaimsAndUpdateDatabase( + ctx context.Context, acceptedEpochs map[int64]*model.Epoch, stagedEpochs map[int64]*model.Epoch, apps map[int64]*model.Application, @@ -92,7 +93,7 @@ func (s *Service) acceptClaimsAndUpdateDatabase( errs := []error{} for key, currEpoch := range stagedEpochs { - result := s.processAcceptedClaimEvent(stagedClaimWork{ + result := s.processAcceptedClaimEvent(ctx, stagedClaimWork{ app: apps[key], prevEpoch: acceptedEpochs[key], epoch: currEpoch, @@ -109,6 +110,7 @@ func (s *Service) acceptClaimsAndUpdateDatabase( } func (s *Service) processAcceptedClaimEvent( + ctx context.Context, work stagedClaimWork, defaultBlockNumber *big.Int, ) claimStepResult { @@ -116,7 +118,7 @@ func (s *Service) processAcceptedClaimEvent( currEpoch := work.epoch prevEpoch := work.prevEpoch - if err := s.checkConsensusForAddressChange(app, defaultBlockNumber); err != nil { + if err := s.checkConsensusForAddressChange(ctx, app, defaultBlockNumber); err != nil { return claimDropped(err) } @@ -124,11 +126,11 @@ func (s *Service) processAcceptedClaimEvent( var err error if prevEpoch != nil { _, _, currEvent, err = s.findClaimAcceptedEventAndSucc( - s.Context, app, prevEpoch, currEpoch, prevEpoch.LastBlock+1, defaultBlockNumber.Uint64(), + ctx, app, prevEpoch, currEpoch, prevEpoch.LastBlock+1, defaultBlockNumber.Uint64(), ) } else { _, currEvent, _, err = s.blockchain.findClaimAcceptedEventAndSucc( - s.Context, app, currEpoch, currEpoch.LastBlock+1, defaultBlockNumber.Uint64(), + ctx, app, currEpoch, currEpoch.LastBlock+1, defaultBlockNumber.Uint64(), ) } if err != nil { @@ -146,10 +148,10 @@ func (s *Service) processAcceptedClaimEvent( ) matches, ok := claimAcceptedEventMatches(app, currEpoch, currEvent) if !ok { - return claimDropped(s.markMatcherPrecondFailure(app, currEpoch, "acceptClaimsAndUpdateDatabase")) + return claimDropped(s.markMatcherPrecondFailure(ctx, app, currEpoch, "acceptClaimsAndUpdateDatabase")) } if !matches { - return 
claimDropped(s.markAcceptedDivergence(app, currEpoch, currEvent, "acceptClaimsAndUpdateDatabase")) + return claimDropped(s.markAcceptedDivergence(ctx, app, currEpoch, currEvent, "acceptClaimsAndUpdateDatabase")) } s.Logger.Debug("Updating claim status to accepted", "app", app.IApplicationAddress, @@ -157,7 +159,7 @@ func (s *Service) processAcceptedClaimEvent( "last_block", currEpoch.LastBlock, ) txHash := currEvent.Raw.TxHash - err = s.repository.UpdateEpochWithAcceptedClaim(s.Context, currEpoch.ApplicationID, currEpoch.Index, &txHash) + err = s.repository.UpdateEpochWithAcceptedClaim(ctx, currEpoch.ApplicationID, currEpoch.Index, &txHash) if err != nil { return claimDropped(err) } @@ -184,6 +186,7 @@ func (s *Service) processAcceptedClaimEvent( // In reader mode (submissionEnabled=false), this function does not send // transactions; the later ClaimAccepted event scan performs the DB update. func (s *Service) acceptStagedClaimsAndIssueAcceptTx( + ctx context.Context, stagedEpochs map[int64]*model.Epoch, apps map[int64]*model.Application, defaultBlockNumber *big.Int, @@ -192,7 +195,7 @@ func (s *Service) acceptStagedClaimsAndIssueAcceptTx( errs := []error{} for key, currEpoch := range stagedEpochs { - result := s.processStagedClaim(stagedClaimWork{ + result := s.processStagedClaim(ctx, stagedClaimWork{ app: apps[key], epoch: currEpoch, }, defaultBlockNumber) @@ -208,20 +211,21 @@ func (s *Service) acceptStagedClaimsAndIssueAcceptTx( } func (s *Service) processStagedClaim( + ctx context.Context, work stagedClaimWork, defaultBlockNumber *big.Int, ) claimStepResult { app := work.app currEpoch := work.epoch - currentBlock, result, done := s.stagedClaimReadyForAccept(app, currEpoch, defaultBlockNumber) + currentBlock, result, done := s.stagedClaimReadyForAccept(ctx, app, currEpoch, defaultBlockNumber) if done { return result } // Read the claim state before sending acceptClaim. Use the same block // number for all reads in this tick. 
- claim, err := s.blockchain.getClaimStatus(s.Context, app, currEpoch, defaultBlockNumber) + claim, err := s.blockchain.getClaimStatus(ctx, app, currEpoch, defaultBlockNumber) if err != nil { return claimRetryLater(fmt.Errorf("getClaim before acceptClaim (app=%v, epoch=%d): %w", app.IApplicationAddress, currEpoch.Index, err)) @@ -232,15 +236,16 @@ func (s *Service) processStagedClaim( // the app FAILED on an UNSTAGED reading and leave the drain behind a // re-enable loop). if app.ForecloseBlock != 0 && claim.Status != claimStatusAccepted { - return s.terminalizeForeclosedStagedClaim(app, currEpoch) + return s.terminalizeForeclosedStagedClaim(ctx, app, currEpoch) } - if result, done := s.handlePreAcceptClaimStatus(app, currEpoch, claim, currentBlock); done { + if result, done := s.handlePreAcceptClaimStatus(ctx, app, currEpoch, claim, currentBlock); done { return result } - return s.broadcastAcceptClaimOrReconcileRevert(app, currEpoch, defaultBlockNumber) + return s.broadcastAcceptClaimOrReconcileRevert(ctx, app, currEpoch, defaultBlockNumber) } func (s *Service) stagedClaimReadyForAccept( + ctx context.Context, app *model.Application, currEpoch *model.Epoch, defaultBlockNumber *big.Int, @@ -250,14 +255,14 @@ func (s *Service) stagedClaimReadyForAccept( return 0, claimNoProgress(), true } - if err := s.checkConsensusForAddressChange(app, defaultBlockNumber); err != nil { + if err := s.checkConsensusForAddressChange(ctx, app, defaultBlockNumber); err != nil { return 0, claimRetryLater(err), true } if currEpoch.StagedAtBlock == nil { // Invariant: CLAIM_STAGED rows must have staged_at_block. The database // CHECK should stop this from happening. 
- err := s.setApplicationCorrupted(s.Context, app, + err := s.setApplicationCorrupted(ctx, app, "epoch %d (%d) is CLAIM_STAGED but staged_at_block is nil", currEpoch.Index, currEpoch.VirtualIndex) return 0, claimRetryLater(err), true @@ -283,6 +288,7 @@ func (s *Service) stagedClaimReadyForAccept( } func (s *Service) handlePreAcceptClaimStatus( + ctx context.Context, app *model.Application, currEpoch *model.Epoch, claim iconsensus.IConsensusClaim, @@ -290,7 +296,7 @@ func (s *Service) handlePreAcceptClaimStatus( ) (claimStepResult, bool) { switch claim.Status { case claimStatusAccepted: // Another party accepted first; update our DB. - err := s.updateEpochAcceptedFromClaimStatus(app, currEpoch, claim, "acceptStagedClaimsAndIssueAcceptTx") + err := s.updateEpochAcceptedFromClaimStatus(ctx, app, currEpoch, claim, "acceptStagedClaimsAndIssueAcceptTx") if err != nil { return claimRetryLater(err), true } @@ -306,7 +312,7 @@ func (s *Service) handlePreAcceptClaimStatus( // match a STAGED or ACCEPTED claim on chain. If the chain says UNSTAGED, // the node is probably reading the wrong chain, an old block, or stale // node_config. Mark the app FAILED so the operator sees the problem. - if ferr := appstatus.SetFailedf(s.Context, s.Logger, s.repository, app, + if ferr := appstatus.SetFailedf(ctx, s.Logger, s.repository, app, "getClaim returned UNSTAGED for epoch %d (%d) recorded as CLAIM_STAGED at block %d; "+ "current block %d. Likely a misconfigured default block or stale node_config — "+ "verify CARTESI_BLOCKCHAIN_DEFAULT_BLOCK is 'finalized' or 'safe' and that the "+ @@ -320,7 +326,7 @@ func (s *Service) handlePreAcceptClaimStatus( // Defense-in-depth invariant check. The chain says our claim is still // STAGED, so its outputs root must match our local epoch. If it does // not match, this node and the chain disagree about the claim data. 
- if vErr := s.verifyClaimOutputsMatch(app, currEpoch, claim, "acceptStagedClaimsAndIssueAcceptTx"); vErr != nil { + if vErr := s.verifyClaimOutputsMatch(ctx, app, currEpoch, claim, "acceptStagedClaimsAndIssueAcceptTx"); vErr != nil { return claimRetryLater(vErr), true } return claimNoProgress(), false @@ -330,7 +336,7 @@ func (s *Service) handlePreAcceptClaimStatus( // view call, so the IConsensus contract is most likely a newer version // than this node. Mark the app FAILED (recoverable) so an operator sees // it, rather than skipping silently every tick forever. - if ferr := appstatus.SetFailedf(s.Context, s.Logger, s.repository, app, + if ferr := appstatus.SetFailedf(ctx, s.Logger, s.repository, app, "getClaim returned unmodeled ClaimStatus %d for epoch %d (%d) — this node models "+ "only 0/1/2. The IConsensus contract may be newer than this node supports; "+ "upgrade the node or verify the contract before re-enabling.", @@ -343,10 +349,11 @@ func (s *Service) handlePreAcceptClaimStatus( } func (s *Service) terminalizeForeclosedStagedClaim( + ctx context.Context, app *model.Application, currEpoch *model.Epoch, ) claimStepResult { - if ferr := s.forecloseClaim(app, currEpoch, "acceptStagedClaimsAndIssueAcceptTx"); ferr != nil { + if ferr := s.forecloseClaim(ctx, app, currEpoch, "acceptStagedClaimsAndIssueAcceptTx"); ferr != nil { return claimRetryLater(ferr) } s.dropAcceptAttempt(acceptAttemptKey{currEpoch.ApplicationID, currEpoch.Index}) @@ -354,6 +361,7 @@ func (s *Service) terminalizeForeclosedStagedClaim( } func (s *Service) broadcastAcceptClaimOrReconcileRevert( + ctx context.Context, app *model.Application, currEpoch *model.Epoch, defaultBlockNumber *big.Int, @@ -365,7 +373,7 @@ func (s *Service) broadcastAcceptClaimOrReconcileRevert( attempts := s.incrementAcceptAttempt(attemptKey) if attempts > s.maxAcceptAttempts { var err error - if ferr := appstatus.SetFailedf(s.Context, s.Logger, s.repository, app, + if ferr := appstatus.SetFailedf(ctx, s.Logger, 
s.repository, app, "acceptClaim has failed %d consecutive times for epoch %d (%d); "+ "inspect logs and the chain state, then re-enable. "+ "Common causes: gas estimation issues, signer not authorised, "+ @@ -380,7 +388,7 @@ func (s *Service) broadcastAcceptClaimOrReconcileRevert( txHash, err := s.blockchain.acceptClaimOnBlockchain(app, currEpoch) if err != nil { - outcome, stateErr := s.handleAcceptClaimRevert(err, app, currEpoch) + outcome, stateErr := s.handleAcceptClaimRevert(ctx, err, app, currEpoch) switch outcome { case acceptClaimRetryLater: return claimNoProgress() @@ -388,7 +396,7 @@ func (s *Service) broadcastAcceptClaimOrReconcileRevert( s.dropAcceptAttempt(attemptKey) return claimRetryLater(stateErr) case acceptClaimReconciledAccepted: - claim, gerr := s.blockchain.getClaimStatus(s.Context, app, currEpoch, defaultBlockNumber) + claim, gerr := s.blockchain.getClaimStatus(ctx, app, currEpoch, defaultBlockNumber) if gerr != nil { return claimRetryLater(fmt.Errorf("getClaim after acceptClaim front-run revert (app=%v, epoch=%d): %w", app.IApplicationAddress, currEpoch.Index, gerr)) @@ -402,7 +410,7 @@ func (s *Service) broadcastAcceptClaimOrReconcileRevert( s.dropAcceptAttempt(attemptKey) return claimNoProgress() } - err = s.updateEpochAcceptedFromClaimStatus(app, currEpoch, claim, "acceptClaimReconciledAccepted") + err = s.updateEpochAcceptedFromClaimStatus(ctx, app, currEpoch, claim, "acceptClaimReconciledAccepted") if err != nil { return claimRetryLater(err) } diff --git a/internal/claimer/accept_test.go b/internal/claimer/accept_test.go index 7084c3630..138c9d0ae 100644 --- a/internal/claimer/accept_test.go +++ b/internal/claimer/accept_test.go @@ -4,6 +4,7 @@ package claimer import ( + "context" "fmt" "math/big" "strings" @@ -29,13 +30,14 @@ func TestAcceptFirstClaim(t *testing.T) { currEpoch := makeSubmittedEpoch(app, 3) var prevEvent *iconsensus.IConsensusClaimAccepted = nil currEvent := makeAcceptedEvent(app, currEpoch) + ctx := context.Background() 
b.On("findClaimAcceptedEventAndSucc", mock.Anything, app, currEpoch, currEpoch.LastBlock+1, endBlock.Uint64()). Return(&iconsensus.IConsensus{}, prevEvent, currEvent, nil).Once() b.On("getConsensusAddress", mock.Anything, app, mock.Anything). Return(app.IConsensusAddress, nil).Once() - _, errs := m.acceptClaimsAndUpdateDatabase(makeEpochMap(), makeEpochMap(currEpoch), makeApplicationMap(app), endBlock) + _, errs := m.acceptClaimsAndUpdateDatabase(ctx, makeEpochMap(), makeEpochMap(currEpoch), makeApplicationMap(app), endBlock) assert.Equal(t, len(errs), 0) } @@ -58,7 +60,7 @@ func TestAcceptClaimWithAntecessor(t *testing.T) { r.On("UpdateEpochWithAcceptedClaim", mock.Anything, app.ID, currEpoch.Index, mock.Anything). Return(nil).Once() - transitions, errs := m.acceptClaimsAndUpdateDatabase(makeEpochMap(prevEpoch), makeEpochMap(currEpoch), makeApplicationMap(app), endBlock) + transitions, errs := m.acceptClaimsAndUpdateDatabase(context.Background(), makeEpochMap(prevEpoch), makeEpochMap(currEpoch), makeApplicationMap(app), endBlock) assert.Equal(t, 0, len(errs)) assert.Equal(t, 1, transitions, "accepting a claim counts as a transition") } @@ -84,7 +86,7 @@ func TestFindClaimAcceptedEventAndSuccFailure0(t *testing.T) { b.On("findClaimAcceptedEventAndSucc", mock.Anything, app, currEpoch, currEpoch.LastBlock+1, endBlock.Uint64()). Return(&iconsensus.IConsensus{}, prevEvent, currEvent, expectedErr).Once() - _, errs := m.acceptClaimsAndUpdateDatabase(makeEpochMap(), makeEpochMap(currEpoch), makeApplicationMap(app), endBlock) + _, errs := m.acceptClaimsAndUpdateDatabase(context.Background(), makeEpochMap(), makeEpochMap(currEpoch), makeApplicationMap(app), endBlock) assert.Equal(t, 1, len(errs)) } @@ -107,7 +109,7 @@ func TestFindClaimAcceptedEventAndSuccFailure1(t *testing.T) { b.On("findClaimAcceptedEventAndSucc", mock.Anything, app, prevEpoch, prevEpoch.LastBlock+1, endBlock.Uint64()). 
Return(&iconsensus.IConsensus{}, prevEvent, currEvent, expectedErr).Once() - _, errs := m.acceptClaimsAndUpdateDatabase(makeEpochMap(prevEpoch), makeEpochMap(currEpoch), makeApplicationMap(app), endBlock) + _, errs := m.acceptClaimsAndUpdateDatabase(context.Background(), makeEpochMap(prevEpoch), makeEpochMap(currEpoch), makeApplicationMap(app), endBlock) assert.Equal(t, 1, len(errs)) } @@ -138,7 +140,7 @@ func TestAcceptClaimWithAntecessorMismatch(t *testing.T) { r.On("UpdateApplicationStatus", mock.Anything, mock.Anything, model.ApplicationStatus_Diverged, mock.Anything). Return(nil).Once() - _, errs := m.acceptClaimsAndUpdateDatabase(makeEpochMap(prevEpoch), makeEpochMap(currEpoch), makeApplicationMap(app), endBlock) + _, errs := m.acceptClaimsAndUpdateDatabase(context.Background(), makeEpochMap(prevEpoch), makeEpochMap(currEpoch), makeApplicationMap(app), endBlock) assert.Equal(t, 1, len(errs)) } @@ -163,7 +165,7 @@ func TestAcceptClaimWithEventMismatch(t *testing.T) { r.On("UpdateApplicationStatus", mock.Anything, mock.Anything, model.ApplicationStatus_Diverged, mock.Anything). Return(nil).Once() - _, errs := m.acceptClaimsAndUpdateDatabase(makeEpochMap(prevEpoch), makeEpochMap(currEpoch), makeApplicationMap(app), endBlock) + _, errs := m.acceptClaimsAndUpdateDatabase(context.Background(), makeEpochMap(prevEpoch), makeEpochMap(currEpoch), makeApplicationMap(app), endBlock) assert.Equal(t, 1, len(errs)) } @@ -183,7 +185,7 @@ func TestAcceptClaimWithAntecessorOutOfOrder(t *testing.T) { Return(nil). 
Once() - _, errs := m.acceptClaimsAndUpdateDatabase(makeEpochMap(wrongEpoch), makeEpochMap(currEpoch), makeApplicationMap(app), big.NewInt(0)) + _, errs := m.acceptClaimsAndUpdateDatabase(context.Background(), makeEpochMap(wrongEpoch), makeEpochMap(currEpoch), makeApplicationMap(app), big.NewInt(0)) assert.Equal(t, 1, len(errs)) } @@ -206,7 +208,7 @@ func TestErrAcceptedMissingEvent(t *testing.T) { r.On("UpdateApplicationStatus", mock.Anything, mock.Anything, model.ApplicationStatus_Corrupted, mock.Anything). Return(nil).Once() - _, errs := m.acceptClaimsAndUpdateDatabase(makeEpochMap(prevEpoch), makeEpochMap(currEpoch), makeApplicationMap(app), endBlock) + _, errs := m.acceptClaimsAndUpdateDatabase(context.Background(), makeEpochMap(prevEpoch), makeEpochMap(currEpoch), makeApplicationMap(app), endBlock) assert.Equal(t, 1, len(errs)) } @@ -231,7 +233,7 @@ func TestUpdateEpochWithAcceptedClaimFailed(t *testing.T) { r.On("UpdateEpochWithAcceptedClaim", mock.Anything, app.ID, currEpoch.Index, mock.Anything). Return(expectedErr).Once() - _, errs := m.acceptClaimsAndUpdateDatabase(makeEpochMap(prevEpoch), makeEpochMap(currEpoch), makeApplicationMap(app), endBlock) + _, errs := m.acceptClaimsAndUpdateDatabase(context.Background(), makeEpochMap(prevEpoch), makeEpochMap(currEpoch), makeApplicationMap(app), endBlock) assert.Equal(t, 1, len(errs)) } @@ -253,7 +255,7 @@ func TestConsensusAddressChangedOnAcceptedClaims(t *testing.T) { Return(nil). Once() - _, errs := m.acceptClaimsAndUpdateDatabase(makeEpochMap(), makeEpochMap(currEpoch), makeApplicationMap(app), endBlock) + _, errs := m.acceptClaimsAndUpdateDatabase(context.Background(), makeEpochMap(), makeEpochMap(currEpoch), makeApplicationMap(app), endBlock) assert.Equal(t, len(errs), 1) } @@ -275,7 +277,7 @@ func TestAcceptStagedFrontRunner(t *testing.T) { r.On("UpdateEpochWithAcceptedClaim", mock.Anything, app.ID, currEpoch.Index, mock.Anything). 
Return(nil).Once() - transitions, errs := m.acceptStagedClaimsAndIssueAcceptTx(makeEpochMap(currEpoch), makeApplicationMap(app), endBlock) + transitions, errs := m.acceptStagedClaimsAndIssueAcceptTx(context.Background(), makeEpochMap(currEpoch), makeApplicationMap(app), endBlock) assert.Equal(t, 0, len(errs)) assert.Equal(t, 1, transitions) assert.Equal(t, 0, len(m.acceptsInFlight)) @@ -300,7 +302,7 @@ func TestAcceptStagedBroadcastsWhenClaimStillStaged(t *testing.T) { b.On("acceptClaimOnBlockchain", app, currEpoch). Return(txHash, nil).Once() - transitions, errs := m.acceptStagedClaimsAndIssueAcceptTx(makeEpochMap(currEpoch), makeApplicationMap(app), endBlock) + transitions, errs := m.acceptStagedClaimsAndIssueAcceptTx(context.Background(), makeEpochMap(currEpoch), makeApplicationMap(app), endBlock) assert.Equal(t, 0, len(errs)) assert.Equal(t, 0, transitions, "broadcasting acceptClaim records in-flight work but does not update DB yet") @@ -331,7 +333,7 @@ func TestAcceptStagedFrontRunnerOutputsMismatchSetsDiverged(t *testing.T) { r.On("UpdateApplicationStatus", mock.Anything, app.ID, model.ApplicationStatus_Diverged, mock.Anything). Return(nil).Once() - transitions, errs := m.acceptStagedClaimsAndIssueAcceptTx(makeEpochMap(currEpoch), makeApplicationMap(app), endBlock) + transitions, errs := m.acceptStagedClaimsAndIssueAcceptTx(context.Background(), makeEpochMap(currEpoch), makeApplicationMap(app), endBlock) assert.Equal(t, 1, len(errs)) assert.Equal(t, 0, transitions) assert.Equal(t, 0, len(m.acceptsInFlight)) @@ -359,7 +361,7 @@ func TestAcceptStagedUnmodeledClaimStatusFailsClosed(t *testing.T) { r.On("UpdateApplicationStatus", mock.Anything, app.ID, model.ApplicationStatus_Failed, mock.Anything). 
Return(nil).Once() - transitions, errs := m.acceptStagedClaimsAndIssueAcceptTx(makeEpochMap(currEpoch), makeApplicationMap(app), endBlock) + transitions, errs := m.acceptStagedClaimsAndIssueAcceptTx(context.Background(), makeEpochMap(currEpoch), makeApplicationMap(app), endBlock) assert.Equal(t, 0, transitions) // SetFailedf returns nil on success; the FAILED write is asserted by the mock. assert.Equal(t, 0, len(errs)) @@ -389,8 +391,9 @@ func TestAcceptStagedForeclosesForeclosedApp(t *testing.T) { // CRITICAL: no acceptClaimOnBlockchain expectation — testify reports // an unexpected call if the guard fails. + ctx := context.Background() transitions, errs := m.acceptStagedClaimsAndIssueAcceptTx( - makeEpochMap(currEpoch), makeApplicationMap(app), endBlock) + ctx, makeEpochMap(currEpoch), makeApplicationMap(app), endBlock) assert.Equal(t, 0, len(errs)) assert.Equal(t, 1, transitions) @@ -428,8 +431,9 @@ func TestAcceptStagedForeclosesForeclosedAppOnUnstaged(t *testing.T) { // unexpected call if the foreclosure guard fails and any FAILED/DIVERGED/ // CORRUPTED write is attempted. + ctx := context.Background() transitions, errs := m.acceptStagedClaimsAndIssueAcceptTx( - makeEpochMap(currEpoch), makeApplicationMap(app), endBlock) + ctx, makeEpochMap(currEpoch), makeApplicationMap(app), endBlock) assert.Equal(t, 0, len(errs)) assert.Equal(t, 1, transitions) @@ -464,7 +468,7 @@ func TestAcceptStagedCapEnforced(t *testing.T) { r.On("UpdateApplicationStatus", mock.Anything, app.ID, model.ApplicationStatus_Failed, mock.Anything). Return(nil).Once() - _, errs := m.acceptStagedClaimsAndIssueAcceptTx(makeEpochMap(currEpoch), makeApplicationMap(app), endBlock) + _, errs := m.acceptStagedClaimsAndIssueAcceptTx(context.Background(), makeEpochMap(currEpoch), makeApplicationMap(app), endBlock) // SetFailedf returns nil on success — no error surfaced. 
assert.Equal(t, 0, len(errs)) assert.Equal(t, 0, len(m.acceptsInFlight)) @@ -485,6 +489,7 @@ func TestAcceptStagedUnknownBroadcastErrorsIncrementAttemptsUntilCap(t *testing. currEpoch := makeStagedEpoch(app, 3, stagedAt) attemptKey := acceptAttemptKey{currEpoch.ApplicationID, currEpoch.Index} broadcastErr := fmt.Errorf("gas estimation failed") + ctx := context.Background() for i := uint64(1); i <= m.maxAcceptAttempts; i++ { b.On("getConsensusAddress", mock.Anything, app, mock.Anything). @@ -494,7 +499,7 @@ func TestAcceptStagedUnknownBroadcastErrorsIncrementAttemptsUntilCap(t *testing. b.On("acceptClaimOnBlockchain", app, currEpoch). Return(common.Hash{}, broadcastErr).Once() - transitions, errs := m.acceptStagedClaimsAndIssueAcceptTx(makeEpochMap(currEpoch), makeApplicationMap(app), endBlock) + transitions, errs := m.acceptStagedClaimsAndIssueAcceptTx(ctx, makeEpochMap(currEpoch), makeApplicationMap(app), endBlock) assert.Equal(t, 0, transitions) require.Equal(t, 1, len(errs)) @@ -512,7 +517,7 @@ func TestAcceptStagedUnknownBroadcastErrorsIncrementAttemptsUntilCap(t *testing. })). Return(nil).Once() - transitions, errs := m.acceptStagedClaimsAndIssueAcceptTx(makeEpochMap(currEpoch), makeApplicationMap(app), endBlock) + transitions, errs := m.acceptStagedClaimsAndIssueAcceptTx(ctx, makeEpochMap(currEpoch), makeApplicationMap(app), endBlock) assert.Equal(t, 0, transitions) assert.Equal(t, 0, len(errs), "marking FAILED after the cap is a state transition outcome, not a tick error") @@ -545,7 +550,8 @@ func TestAcceptClaimNotStagedAcceptedRechecksOutputsMismatch(t *testing.T) { r.On("UpdateApplicationStatus", mock.Anything, app.ID, model.ApplicationStatus_Diverged, mock.Anything). 
Return(nil).Once() - transitions, errs := m.acceptStagedClaimsAndIssueAcceptTx(makeEpochMap(currEpoch), makeApplicationMap(app), endBlock) + ctx := context.Background() + transitions, errs := m.acceptStagedClaimsAndIssueAcceptTx(ctx, makeEpochMap(currEpoch), makeApplicationMap(app), endBlock) assert.Equal(t, 1, len(errs)) assert.Equal(t, 0, transitions) assert.Equal(t, 0, len(m.acceptsInFlight)) @@ -565,7 +571,7 @@ func TestAcceptStagedPeriodNotElapsed(t *testing.T) { Return(app.IConsensusAddress, nil).Once() endBlock := big.NewInt(60) // only 10 blocks elapsed; need 100. - transitions, errs := m.acceptStagedClaimsAndIssueAcceptTx(makeEpochMap(currEpoch), makeApplicationMap(app), endBlock) + transitions, errs := m.acceptStagedClaimsAndIssueAcceptTx(context.Background(), makeEpochMap(currEpoch), makeApplicationMap(app), endBlock) assert.Equal(t, 0, len(errs)) assert.Equal(t, 0, transitions) assert.Equal(t, 0, len(m.acceptsInFlight)) @@ -588,7 +594,7 @@ func TestAcceptStagedReaderMode(t *testing.T) { b.On("getConsensusAddress", mock.Anything, app, mock.Anything). Return(app.IConsensusAddress, nil).Once() - transitions, errs := m.acceptStagedClaimsAndIssueAcceptTx(makeEpochMap(currEpoch), makeApplicationMap(app), endBlock) + transitions, errs := m.acceptStagedClaimsAndIssueAcceptTx(context.Background(), makeEpochMap(currEpoch), makeApplicationMap(app), endBlock) assert.Equal(t, 0, len(errs)) assert.Equal(t, 0, transitions) assert.Equal(t, 0, len(m.acceptsInFlight)) @@ -625,7 +631,7 @@ func TestAcceptanceDivergence_QuorumStagedDoesNotRejectEpoch(t *testing.T) { })). 
Return(nil).Once() - _, errs := m.acceptClaimsAndUpdateDatabase(makeEpochMap(), makeEpochMap(currEpoch), makeApplicationMap(app), endBlock) + _, errs := m.acceptClaimsAndUpdateDatabase(context.Background(), makeEpochMap(), makeEpochMap(currEpoch), makeApplicationMap(app), endBlock) assert.Equal(t, 1, len(errs)) assert.Equal(t, model.ApplicationStatus_Diverged, app.Status) assert.Equal(t, model.EpochStatus_ClaimStaged, currEpoch.Status) @@ -657,7 +663,7 @@ func TestAcceptanceDivergence_QuorumComputedRejectsEpoch(t *testing.T) { })). Return(nil).Once() - _, errs := m.submitClaimsAndUpdateDatabase(makeEpochMap(), makeEpochMap(currEpoch), makeApplicationMap(app), endBlock) + _, errs := m.submitClaimsAndUpdateDatabase(context.Background(), makeEpochMap(), makeEpochMap(currEpoch), makeApplicationMap(app), endBlock) assert.Equal(t, 1, len(errs)) assert.Equal(t, model.ApplicationStatus_Diverged, app.Status) assert.Equal(t, model.EpochStatus_ClaimRejected, currEpoch.Status) @@ -688,7 +694,7 @@ func TestAcceptanceDivergence_AuthorityComputedSetsDivergedWithoutRejectingEpoch })). Return(nil).Once() - _, errs := m.submitClaimsAndUpdateDatabase(makeEpochMap(), makeEpochMap(currEpoch), makeApplicationMap(app), endBlock) + _, errs := m.submitClaimsAndUpdateDatabase(context.Background(), makeEpochMap(), makeEpochMap(currEpoch), makeApplicationMap(app), endBlock) assert.Equal(t, 1, len(errs)) assert.Equal(t, model.ApplicationStatus_Diverged, app.Status) assert.Equal(t, model.EpochStatus_ClaimComputed, currEpoch.Status) @@ -718,7 +724,7 @@ func TestAcceptanceDivergence_AuthorityDoesNotRejectEpoch(t *testing.T) { r.On("UpdateApplicationStatus", mock.Anything, app.ID, model.ApplicationStatus_Diverged, mock.Anything). 
Return(nil).Once() - _, errs := m.acceptClaimsAndUpdateDatabase(makeEpochMap(), makeEpochMap(currEpoch), makeApplicationMap(app), endBlock) + _, errs := m.acceptClaimsAndUpdateDatabase(context.Background(), makeEpochMap(), makeEpochMap(currEpoch), makeApplicationMap(app), endBlock) assert.Equal(t, 1, len(errs)) assert.Equal(t, model.ApplicationStatus_Diverged, app.Status) assert.Equal(t, model.EpochStatus_ClaimStaged, currEpoch.Status) @@ -757,7 +763,7 @@ func TestAcceptanceDivergenceReaderMode_Quorum(t *testing.T) { })). Return(nil).Once() - _, errs := m.acceptClaimsAndUpdateDatabase(makeEpochMap(), makeEpochMap(currEpoch), makeApplicationMap(app), endBlock) + _, errs := m.acceptClaimsAndUpdateDatabase(context.Background(), makeEpochMap(), makeEpochMap(currEpoch), makeApplicationMap(app), endBlock) assert.Equal(t, 1, len(errs), "acceptance divergence detection must fire in reader mode") assert.Equal(t, model.EpochStatus_ClaimStaged, currEpoch.Status) } diff --git a/internal/claimer/claim_status.go b/internal/claimer/claim_status.go index a122fe6a2..cb120d46f 100644 --- a/internal/claimer/claim_status.go +++ b/internal/claimer/claim_status.go @@ -4,6 +4,7 @@ package claimer import ( + "context" "fmt" "github.com/cartesi/rollups-node/internal/model" @@ -11,31 +12,33 @@ import ( ) func (s *Service) updateEpochAcceptedFromClaimStatus( + ctx context.Context, app *model.Application, epoch *model.Epoch, claim iconsensus.IConsensusClaim, site string, ) error { - if err := s.verifyClaimOutputsMatch(app, epoch, claim, site); err != nil { + if err := s.verifyClaimOutputsMatch(ctx, app, epoch, claim, site); err != nil { return err } // getClaim is read-only. It tells us the claim state, but not the // transaction hash that accepted the claim. Store NULL for the hash; the // DB accepts this for reconciled claims. 
if err := s.repository.UpdateEpochWithAcceptedClaim( - s.Context, epoch.ApplicationID, epoch.Index, nil); err != nil { + ctx, epoch.ApplicationID, epoch.Index, nil); err != nil { return err } return nil } func (s *Service) updateEpochStagedFromClaimStatus( + ctx context.Context, app *model.Application, epoch *model.Epoch, claim iconsensus.IConsensusClaim, site string, ) (uint64, error) { - if err := s.verifyClaimOutputsMatch(app, epoch, claim, site); err != nil { + if err := s.verifyClaimOutputsMatch(ctx, app, epoch, claim, site); err != nil { return 0, err } if claim.StagingBlockNumber == nil { @@ -44,7 +47,7 @@ func (s *Service) updateEpochStagedFromClaimStatus( } stagingBlock := claim.StagingBlockNumber.Uint64() if err := s.repository.UpdateEpochReconciledStaged( - s.Context, epoch.ApplicationID, epoch.Index, stagingBlock); err != nil { + ctx, epoch.ApplicationID, epoch.Index, stagingBlock); err != nil { return 0, err } return stagingBlock, nil diff --git a/internal/claimer/claim_status_test.go b/internal/claimer/claim_status_test.go index d9d79c364..9ab51c47c 100644 --- a/internal/claimer/claim_status_test.go +++ b/internal/claimer/claim_status_test.go @@ -4,6 +4,7 @@ package claimer import ( + "context" "testing" "github.com/stretchr/testify/require" @@ -17,7 +18,7 @@ func TestUpdateEpochStagedFromClaimStatus_NilStagingBlock_ReturnsError(t *testin epoch := makeComputedEpoch(app, 1) claim := makeClaimStatus(claimStatusStaged, epoch, 0) - _, err := m.updateEpochStagedFromClaimStatus(app, epoch, claim, "test") + _, err := m.updateEpochStagedFromClaimStatus(context.Background(), app, epoch, claim, "test") require.Error(t, err) require.Contains(t, err.Error(), "nil staging block") diff --git a/internal/claimer/claimer.go b/internal/claimer/claimer.go index 0f9f421e7..263f04458 100644 --- a/internal/claimer/claimer.go +++ b/internal/claimer/claimer.go @@ -47,13 +47,13 @@ import ( "errors" ) -func (s *Service) Tick() []error { +func (s *Service) Tick(ctx 
context.Context) []error { errs := []error{} // Use the same finalized block number for all chain reads in this tick. // This is one RPC per tick even when there is no DB work. The call is // cheap, and Tick already runs on a polling interval. - defaultBlockNumber, err := s.blockchain.getDefaultBlockNumber(s.Context) + defaultBlockNumber, err := s.blockchain.getDefaultBlockNumber(ctx) if err != nil { // During shutdown, the parent context is canceled and RPC/DB calls // return context.Canceled. Ignore only that normal shutdown case. Other @@ -76,7 +76,7 @@ func (s *Service) Tick() []error { // Stage 1: submit. COMPUTED -> SUBMITTED, or directly to STAGED when the // transaction receipt already contains ClaimStaged. - prevSubmittedOrStaged, computedEpochs, computedApps, errComputed := s.repository.SelectClaimsToSubmitPerApp(s.Context) + prevSubmittedOrStaged, computedEpochs, computedApps, errComputed := s.repository.SelectClaimsToSubmitPerApp(ctx) if errComputed != nil { if s.IsStopping() && errors.Is(errComputed, context.Canceled) { s.Logger.Warn("Tick interrupted by shutdown", "stage", "SelectClaimsToSubmitPerApp", "error", errComputed) @@ -85,11 +85,11 @@ func (s *Service) Tick() []error { errs = append(errs, errComputed) return errs } - submitted, submitErrs := s.submitClaimsAndUpdateDatabase(prevSubmittedOrStaged, computedEpochs, computedApps, defaultBlockNumber) + submitted, submitErrs := s.submitClaimsAndUpdateDatabase(ctx, prevSubmittedOrStaged, computedEpochs, computedApps, defaultBlockNumber) errs = append(errs, submitErrs...) // Stage 2: stage. SUBMITTED -> STAGED. This read sees stage 1 updates. 
- prevAcceptedForSubmitted, submittedEpochs, submittedApps, errSubmitted := s.repository.SelectClaimsToStagePerApp(s.Context) + prevAcceptedForSubmitted, submittedEpochs, submittedApps, errSubmitted := s.repository.SelectClaimsToStagePerApp(ctx) if errSubmitted != nil { if s.IsStopping() && errors.Is(errSubmitted, context.Canceled) { s.Logger.Warn("Tick interrupted by shutdown", "stage", "SelectClaimsToStagePerApp", "error", errSubmitted) @@ -98,13 +98,13 @@ func (s *Service) Tick() []error { errs = append(errs, errSubmitted) return errs } - staged, stageErrs := s.stageClaimsAndUpdateDatabase(prevAcceptedForSubmitted, submittedEpochs, submittedApps, defaultBlockNumber) + staged, stageErrs := s.stageClaimsAndUpdateDatabase(ctx, prevAcceptedForSubmitted, submittedEpochs, submittedApps, defaultBlockNumber) errs = append(errs, stageErrs...) // Stages 3, 4, and 5: accept. STAGED -> ACCEPTED by our own transaction, // another party's event, or a getClaim read before we send acceptClaim. // This read sees stage 1 and stage 2 updates. - prevAcceptedForStaged, stagedEpochs, stagedApps, errStaged := s.repository.SelectClaimsToAcceptPerApp(s.Context) + prevAcceptedForStaged, stagedEpochs, stagedApps, errStaged := s.repository.SelectClaimsToAcceptPerApp(ctx) if errStaged != nil { if s.IsStopping() && errors.Is(errStaged, context.Canceled) { s.Logger.Warn("Tick interrupted by shutdown", "stage", "SelectClaimsToAcceptPerApp", "error", errStaged) @@ -127,7 +127,7 @@ func (s *Service) Tick() []error { // work, so operators can still see drain and reconciliation progress. Once // drained, the app remains enabled for L1 observation with foreclose_block // set. - foreclosed, listErr := s.listEnabledForeclosedNonPRTApps() + foreclosed, listErr := s.listEnabledForeclosedNonPRTApps(ctx) if listErr != nil { errs = append(errs, listErr) } @@ -135,20 +135,20 @@ func (s *Service) Tick() []error { // Finish the accept side of the lifecycle. 
First send acceptClaim for // staged epochs that are ready. Then check acceptClaim transactions sent in // previous ticks. Finally, scan for ClaimAccepted events from any party. - issuedAccepts, issueErrs := s.acceptStagedClaimsAndIssueAcceptTx(stagedEpochs, stagedApps, defaultBlockNumber) + issuedAccepts, issueErrs := s.acceptStagedClaimsAndIssueAcceptTx(ctx, stagedEpochs, stagedApps, defaultBlockNumber) errs = append(errs, issueErrs...) - confirmedAccepts, confirmErr := s.checkAcceptsInFlight(stagedEpochs, stagedApps, defaultBlockNumber) + confirmedAccepts, confirmErr := s.checkAcceptsInFlight(ctx, stagedEpochs, stagedApps, defaultBlockNumber) if confirmErr != nil { errs = append(errs, confirmErr) } - accepted, acceptErrs := s.acceptClaimsAndUpdateDatabase(prevAcceptedForStaged, stagedEpochs, stagedApps, defaultBlockNumber) + accepted, acceptErrs := s.acceptClaimsAndUpdateDatabase(ctx, prevAcceptedForStaged, stagedEpochs, stagedApps, defaultBlockNumber) errs = append(errs, acceptErrs...) // Keep logging foreclosed apps until all pre-foreclosure work is done. // After that, processForeclosedApps has nothing else to change. - forecloseErrs := s.processForeclosedApps(foreclosed) + forecloseErrs := s.processForeclosedApps(ctx, foreclosed) errs = append(errs, forecloseErrs...) 
s.cleanupOrphanedInFlight(computedApps, stagedApps, stagedEpochs) diff --git a/internal/claimer/claimer_test.go b/internal/claimer/claimer_test.go index 18867f723..99610bb65 100644 --- a/internal/claimer/claimer_test.go +++ b/internal/claimer/claimer_test.go @@ -4,6 +4,7 @@ package claimer import ( + "context" "math/big" "testing" "time" @@ -25,7 +26,7 @@ func TestDoNothing(t *testing.T) { prevEpochs := makeEpochMap() currEpochs := makeEpochMap() - transitions, errs := m.submitClaimsAndUpdateDatabase(prevEpochs, currEpochs, makeApplicationMap(), big.NewInt(0)) + transitions, errs := m.submitClaimsAndUpdateDatabase(context.Background(), prevEpochs, currEpochs, makeApplicationMap(), big.NewInt(0)) assert.Equal(t, 0, len(errs)) assert.Equal(t, 0, transitions, "no transitions when no epochs to process") } @@ -35,8 +36,12 @@ func TestTickInterleavesStagesWithPinnedBlockAndReschedulesOnProgress(t *testing defer r.AssertExpectations(t) defer b.AssertExpectations(t) + ctx := context.Background() err := service.InitTickServiceTemplate(&service.TickServiceConfigs{ - ServiceConfigs: service.ServiceConfigs{Name: "claimer-test"}, + ServiceConfigs: service.ServiceConfigs{ + Name: "claimer-test", + Context: ctx, + }, PollInterval: time.Hour, EnableReschedule: true, }, &m.TickServiceTemplate, m, m) @@ -76,7 +81,7 @@ func TestTickInterleavesStagesWithPinnedBlockAndReschedulesOnProgress(t *testing }), repository.Pagination{}, false). 
Return([]*model.Application{}, 0, nil).Once() - errs := m.Tick() + errs := m.Tick(ctx) require.Empty(t, errs) assert.True(t, m.DrainReschedule(), "a successful stage transition should request an immediate follow-up tick") diff --git a/internal/claimer/divergence.go b/internal/claimer/divergence.go index 77a7befb3..2c4dbdaa5 100644 --- a/internal/claimer/divergence.go +++ b/internal/claimer/divergence.go @@ -63,6 +63,7 @@ func divergenceBucket(c model.Consensus, stage divergenceStage) string { // later disagrees, the whole app is unsafe and becomes DIVERGED. Authority // has only one submitter, so any divergence is app-level. func (s *Service) markDivergence( + ctx context.Context, app *model.Application, epoch *model.Epoch, stage divergenceStage, @@ -72,15 +73,16 @@ func (s *Service) markDivergence( stage != divergenceStageSubmit && epoch.Status != model.EpochStatus_ClaimStaged if rejectable { - return s.rejectEpochAndSetApplicationDiverged(app, epoch, reasonText) + return s.rejectEpochAndSetApplicationDiverged(ctx, app, epoch, reasonText) } - return s.setApplicationDiverged(s.Context, app, "%s", reasonText) + return s.setApplicationDiverged(ctx, app, "%s", reasonText) } // markStagingDivergence handles a ClaimStaged event for our epoch whose data // differs from our local claim. markDivergence decides whether this is only an // epoch reject or an app-level failure. func (s *Service) markStagingDivergence( + ctx context.Context, app *model.Application, epoch *model.Epoch, event *iconsensus.IConsensusClaimStaged, @@ -102,13 +104,14 @@ func (s *Service) markStagingDivergence( ourMachineMerkleRoot.Hex(), epoch.Index, epoch.LastBlock, ) - return s.markDivergence(app, epoch, divergenceStageStaging, reason) + return s.markDivergence(ctx, app, epoch, divergenceStageStaging, reason) } // markSubmittedDivergence handles a ClaimSubmitted event whose data differs // from our local claim. This is always an app-level problem. 
Even in Quorum, // if the submitted claim later gets staged, our local claim is wrong. func (s *Service) markSubmittedDivergence( + ctx context.Context, app *model.Application, epoch *model.Epoch, event *iconsensus.IConsensusClaimSubmitted, @@ -133,13 +136,14 @@ func (s *Service) markSubmittedDivergence( ourOutputsMerkleRoot.Hex(), ourMachineMerkleRoot.Hex(), epoch.Index, epoch.LastBlock, ) - return s.markDivergence(app, epoch, divergenceStageSubmit, reason) + return s.markDivergence(ctx, app, epoch, divergenceStageSubmit, reason) } // markAcceptedDivergence handles a ClaimAccepted event whose data differs from // our local claim. markDivergence decides whether this rejects only the epoch // or marks the whole app DIVERGED. func (s *Service) markAcceptedDivergence( + ctx context.Context, app *model.Application, epoch *model.Epoch, event *iconsensus.IConsensusClaimAccepted, @@ -166,7 +170,7 @@ func (s *Service) markAcceptedDivergence( ourOutputsMerkleRoot.Hex(), ourMachineMerkleRoot.Hex(), epoch.Index, epoch.LastBlock, ) - return s.markDivergence(app, epoch, divergenceStageAcceptance, reason) + return s.markDivergence(ctx, app, epoch, divergenceStageAcceptance, reason) } func (s *Service) setApplicationDiverged( @@ -188,6 +192,7 @@ func (s *Service) setApplicationCorrupted( } func (s *Service) rejectEpochAndSetApplicationDiverged( + ctx context.Context, app *model.Application, epoch *model.Epoch, reason string, @@ -198,7 +203,7 @@ func (s *Service) rejectEpochAndSetApplicationDiverged( "reason", reason) err := s.repository.RejectEpochAndSetApplicationDiverged( - s.Context, app.ID, epoch.Index, reason) + ctx, app.ID, epoch.Index, reason) reasonErr := errors.New(reason) if err != nil { s.Logger.Error("failed to reject epoch and update application status", @@ -222,8 +227,13 @@ func (s *Service) rejectEpochAndSetApplicationDiverged( // later. 
The node cannot safely continue because it cannot compare claims, and // the missing data cannot be reconstructed by restarting — the status is // terminal. -func (s *Service) markMatcherPrecondFailure(app *model.Application, epoch *model.Epoch, site string) error { - return s.setApplicationCorrupted(s.Context, app, +func (s *Service) markMatcherPrecondFailure( + ctx context.Context, + app *model.Application, + epoch *model.Epoch, + site string, +) error { + return s.setApplicationCorrupted(ctx, app, "%s: cannot compare epoch %d (%d) against chain event — local row is missing "+ "outputs_merkle_root or machine_hash. This is terminal; inspect the epoch row.", site, epoch.Index, epoch.VirtualIndex) @@ -241,6 +251,7 @@ func (s *Service) markMatcherPrecondFailure(app *model.Application, epoch *model // Returns nil when the outputs match or when there is not enough local data to // check. Returns an error after marking the app DIVERGED when they differ. func (s *Service) verifyClaimOutputsMatch( + ctx context.Context, app *model.Application, epoch *model.Epoch, claim iconsensus.IConsensusClaim, @@ -262,7 +273,7 @@ func (s *Service) verifyClaimOutputsMatch( case claimStatusAccepted: status = "ACCEPTED" } - return s.setApplicationDiverged(s.Context, app, + return s.setApplicationDiverged(ctx, app, "chain_claim_outputs_mismatch: %s — getClaim returned %s for our "+ "(app, lpbn, machineMerkleRoot) tuple but with stagedOutputsMerkleRoot=%s "+ "while our local outputs_merkle_root is %s. Epoch %d (lastBlock %d). 
"+ @@ -280,6 +291,7 @@ type consensusAddressCheckKey struct { } func (s *Service) checkConsensusForAddressChange( + ctx context.Context, app *model.Application, defaultBlockNumber *big.Int, ) error { @@ -295,24 +307,25 @@ func (s *Service) checkConsensusForAddressChange( if err, ok := s.consensusAddressChecks[key]; ok { return err } - err := s.checkConsensusForAddressChangeUncached(app, defaultBlockNumber) + err := s.checkConsensusForAddressChangeUncached(ctx, app, defaultBlockNumber) s.consensusAddressChecks[key] = err return err } - return s.checkConsensusForAddressChangeUncached(app, defaultBlockNumber) + return s.checkConsensusForAddressChangeUncached(ctx, app, defaultBlockNumber) } func (s *Service) checkConsensusForAddressChangeUncached( + ctx context.Context, app *model.Application, defaultBlockNumber *big.Int, ) error { - newConsensusAddress, err := s.blockchain.getConsensusAddress(s.Context, app, defaultBlockNumber) + newConsensusAddress, err := s.blockchain.getConsensusAddress(ctx, app, defaultBlockNumber) if err != nil { return fmt.Errorf("getting consensus address for app %v: %w", app.IApplicationAddress, err) } if app.IConsensusAddress != newConsensusAddress { err = s.setApplicationCorrupted( - s.Context, + ctx, app, "consensus change detected. application: %v.", app.IApplicationAddress, diff --git a/internal/claimer/divergence_test.go b/internal/claimer/divergence_test.go index fd26390aa..57c56bdc7 100644 --- a/internal/claimer/divergence_test.go +++ b/internal/claimer/divergence_test.go @@ -4,6 +4,7 @@ package claimer import ( + "context" "math/big" "testing" @@ -36,7 +37,7 @@ func TestVerifyClaimOutputsMismatch(t *testing.T) { r.On("UpdateApplicationStatus", mock.Anything, app.ID, model.ApplicationStatus_Diverged, mock.Anything). 
Return(nil).Once() - _, errs := m.acceptStagedClaimsAndIssueAcceptTx(makeEpochMap(currEpoch), makeApplicationMap(app), endBlock) + _, errs := m.acceptStagedClaimsAndIssueAcceptTx(context.Background(), makeEpochMap(currEpoch), makeApplicationMap(app), endBlock) assert.Equal(t, 1, len(errs), "chain_claim_outputs_mismatch must surface as an error") assert.Equal(t, 0, len(m.acceptsInFlight)) } diff --git a/internal/claimer/foreclosed_apps_test.go b/internal/claimer/foreclosed_apps_test.go index 0a88324bd..d82596613 100644 --- a/internal/claimer/foreclosed_apps_test.go +++ b/internal/claimer/foreclosed_apps_test.go @@ -69,7 +69,7 @@ func TestListEnabledForeclosedNonPRTApps_UsesAuthorityQuorumFilter(t *testing.T) mock.Anything, ).Return([]*model.Application{auth, quorum}, 2, nil).Once() - got, err := s.listEnabledForeclosedNonPRTApps() + got, err := s.listEnabledForeclosedNonPRTApps(context.Background()) require.NoError(t, err) require.Len(t, got, 2) assert.Contains(t, got, auth.ID) @@ -98,7 +98,7 @@ func TestListEnabledForeclosedNonPRTApps_ExcludesTerminalStatuses(t *testing.T) mock.Anything, ).Return([]*model.Application{}, 0, nil).Once() - got, err := s.listEnabledForeclosedNonPRTApps() + got, err := s.listEnabledForeclosedNonPRTApps(context.Background()) require.NoError(t, err) require.Empty(t, got) } @@ -115,7 +115,7 @@ func TestProcessForeclosedApps_DefersWhenUnreconciled(t *testing.T) { defer r.AssertExpectations(t) app := foreclosedAppHelper(1, 100, model.Consensus_Authority) - s.Context = context.Background() + ctx := context.Background() r.On("ForecloseUnacceptedEpochsAtOrAfterBlock", mock.Anything, app.ID, app.ForecloseBlock, @@ -130,7 +130,7 @@ func TestProcessForeclosedApps_DefersWhenUnreconciled(t *testing.T) { // No UpdateApplicationStatus expectation — if it fires, the mock // assertion fails the test because we registered no Setup for it. 
- errs := s.processForeclosedApps(map[int64]*model.Application{app.ID: app}) + errs := s.processForeclosedApps(ctx, map[int64]*model.Application{app.ID: app}) assert.Empty(t, errs, "deferral is not an error") } @@ -143,7 +143,7 @@ func TestProcessForeclosedApps_DefersWhenUnreconciled(t *testing.T) { func TestProcessForeclosedApps_DrainCheckErrorsAppendAndContinue(t *testing.T) { s, r, _ := newServiceMock() defer r.AssertExpectations(t) - s.Context = context.Background() + ctx := context.Background() app1 := foreclosedAppHelper(1, 100, model.Consensus_Authority) app2 := foreclosedAppHelper(2, 100, model.Consensus_Authority) @@ -158,7 +158,7 @@ func TestProcessForeclosedApps_DrainCheckErrorsAppendAndContinue(t *testing.T) { // ForecloseUnacceptedEpochsAtOrAfterBlock nor HasUnreconciledClaimsBeforeBlock // is reached — no expectation registered for either. - errs := s.processForeclosedApps(map[int64]*model.Application{app1.ID: app1, app2.ID: app2}) + errs := s.processForeclosedApps(ctx, map[int64]*model.Application{app1.ID: app1, app2.ID: app2}) assert.Len(t, errs, 2, "each app's drain error is appended; the pass does not abort early") } @@ -177,7 +177,7 @@ func TestProcessForeclosedApps_NoTransitionWhenDrained(t *testing.T) { defer r.AssertExpectations(t) app := foreclosedAppHelper(1, 100, model.Consensus_Authority) - s.Context = context.Background() + ctx := context.Background() r.On("ForecloseUnacceptedEpochsAtOrAfterBlock", mock.Anything, app.ID, app.ForecloseBlock, @@ -191,7 +191,7 @@ func TestProcessForeclosedApps_NoTransitionWhenDrained(t *testing.T) { // No UpdateApplicationStatus expectation — the assertion is by negation. 
- errs := s.processForeclosedApps(map[int64]*model.Application{app.ID: app}) + errs := s.processForeclosedApps(ctx, map[int64]*model.Application{app.ID: app}) assert.Empty(t, errs) } @@ -209,7 +209,7 @@ func TestProcessForeclosedApps_DefersWhenInputsUndrained(t *testing.T) { defer r.AssertExpectations(t) app := foreclosedAppHelper(1, 100, model.Consensus_Authority) - s.Context = context.Background() + ctx := context.Background() r.On("HasUndrainedEpochsBeforeBlock", mock.Anything, app.ID, app.ForecloseBlock, @@ -218,7 +218,7 @@ func TestProcessForeclosedApps_DefersWhenInputsUndrained(t *testing.T) { // HasUnreconciledClaimsBeforeBlock: an undrained input defers the whole pass // before terminalization and before claim reconciliation. - errs := s.processForeclosedApps(map[int64]*model.Application{app.ID: app}) + errs := s.processForeclosedApps(ctx, map[int64]*model.Application{app.ID: app}) assert.Empty(t, errs, "input-drain deferral is not an error") } @@ -231,7 +231,7 @@ func TestProcessForeclosedApps_TerminalizesUnacceptedOverlapAfterDrain(t *testin defer r.AssertExpectations(t) app := foreclosedAppHelper(1, 100, model.Consensus_Authority) - s.Context = context.Background() + ctx := context.Background() // Pin the sequence: the drain check MUST run before terminalization (else a // straddling-epoch input is stranded — the bug this ordering prevents), and @@ -247,7 +247,7 @@ func TestProcessForeclosedApps_TerminalizesUnacceptedOverlapAfterDrain(t *testin ).Return(false, nil).Once() mock.InOrder(drain, terminalize, reconcile) - errs := s.processForeclosedApps(map[int64]*model.Application{app.ID: app}) + errs := s.processForeclosedApps(ctx, map[int64]*model.Application{app.ID: app}) assert.Empty(t, errs) } @@ -261,10 +261,10 @@ func TestProcessForeclosedApps_SkipsZeroForecloseBlock(t *testing.T) { defer r.AssertExpectations(t) app := &model.Application{ID: 99, ConsensusType: model.Consensus_Authority} - s.Context = context.Background() + ctx := context.Background() 
// No mock expectations — the loop must skip before any repo call. - errs := s.processForeclosedApps(map[int64]*model.Application{app.ID: app}) + errs := s.processForeclosedApps(ctx, map[int64]*model.Application{app.ID: app}) assert.Empty(t, errs) } @@ -286,8 +286,8 @@ func TestProcessForeclosedApps_DefersWhenStillBackfilling(t *testing.T) { app := foreclosedAppHelper(1, 100, model.Consensus_Authority) app.LastInputCheckBlock = 50 // scanner well below the foreclose block - s.Context = context.Background() + ctx := context.Background() - errs := s.processForeclosedApps(map[int64]*model.Application{app.ID: app}) + errs := s.processForeclosedApps(ctx, map[int64]*model.Application{app.ID: app}) assert.Empty(t, errs, "bootstrap deferral is not an error") } diff --git a/internal/claimer/foreclosure.go b/internal/claimer/foreclosure.go index e8bccd81e..ea9ae6b1c 100644 --- a/internal/claimer/foreclosure.go +++ b/internal/claimer/foreclosure.go @@ -4,6 +4,7 @@ package claimer import ( + "context" "fmt" "github.com/cartesi/rollups-node/internal/model" @@ -16,9 +17,9 @@ import ( // Some foreclosed apps no longer have pending claim work, but operators still // need to see whether pre-foreclosure work is fully drained. This query keeps // those apps visible to processForeclosedApps. -func (s *Service) listEnabledForeclosedNonPRTApps() (map[int64]*model.Application, error) { +func (s *Service) listEnabledForeclosedNonPRTApps(ctx context.Context) (map[int64]*model.Application, error) { apps, _, err := s.repository.ListApplications( - s.Context, + ctx, foreclosedClaimDrainApplicationsFilter(), repository.Pagination{}, false, @@ -57,6 +58,7 @@ func foreclosedClaimDrainApplicationsFilter() repository.ApplicationFilter { // broadcasts when foreclose_block is set. Once all drain checks pass there is no // final action here. 
func (s *Service) processForeclosedApps( + ctx context.Context, apps map[int64]*model.Application, ) []error { var errs []error @@ -92,7 +94,7 @@ func (s *Service) processForeclosedApps( // then terminalize. PRT gates terminalization the same way // (internal/prt/service.go handleForeclosedApp). undrained, err := s.repository.HasUndrainedEpochsBeforeBlock( - s.Context, app.ID, app.ForecloseBlock, + ctx, app.ID, app.ForecloseBlock, ) if err != nil { errs = append(errs, fmt.Errorf( @@ -110,7 +112,7 @@ func (s *Service) processForeclosedApps( continue } terminalized, err := s.repository.ForecloseUnacceptedEpochsAtOrAfterBlock( - s.Context, app.ID, app.ForecloseBlock, + ctx, app.ID, app.ForecloseBlock, ) if err != nil { errs = append(errs, fmt.Errorf( @@ -128,7 +130,7 @@ func (s *Service) processForeclosedApps( ) } unreconciled, err := s.repository.HasUnreconciledClaimsBeforeBlock( - s.Context, app.ID, app.ForecloseBlock, + ctx, app.ID, app.ForecloseBlock, ) if err != nil { errs = append(errs, fmt.Errorf( @@ -152,6 +154,7 @@ func (s *Service) processForeclosedApps( } func (s *Service) forecloseClaim( + ctx context.Context, app *model.Application, epoch *model.Epoch, site string, @@ -167,7 +170,7 @@ func (s *Service) forecloseClaim( ) if err := s.repository.UpdateEpochWithForeclosedClaim( - s.Context, app.ID, epoch.Index); err != nil { + ctx, app.ID, epoch.Index); err != nil { return fmt.Errorf("marking epoch %d (%d) CLAIM_FORECLOSED: %w", epoch.Index, epoch.VirtualIndex, err) } diff --git a/internal/claimer/inflight.go b/internal/claimer/inflight.go index b118919e3..260bf2a92 100644 --- a/internal/claimer/inflight.go +++ b/internal/claimer/inflight.go @@ -4,6 +4,7 @@ package claimer import ( + "context" "errors" "fmt" "math/big" @@ -50,6 +51,7 @@ func (tx inFlightTx) ageAt(blockNumber *big.Int) uint64 { // // It returns the number of confirmed state changes and any error. 
func (s *Service) checkClaimsInFlight( + ctx context.Context, computedEpochs map[int64]*model.Epoch, apps map[int64]*model.Application, endBlock *big.Int, @@ -57,7 +59,7 @@ func (s *Service) checkClaimsInFlight( confirmed := 0 var errs []error for appID, tx := range s.claimsInFlight { - result := s.processSubmitInFlight(appID, tx, submitInFlightWork{ + result := s.processSubmitInFlight(ctx, appID, tx, submitInFlightWork{ app: apps[appID], epoch: computedEpochs[appID], }, endBlock) @@ -73,13 +75,14 @@ func (s *Service) checkClaimsInFlight( } func (s *Service) processSubmitInFlight( + ctx context.Context, appID int64, tx inFlightTx, work submitInFlightWork, endBlock *big.Int, ) claimStepResult { txHash := tx.txHash - ready, receipt, err := s.blockchain.pollTransaction(s.Context, txHash, endBlock) + ready, receipt, err := s.blockchain.pollTransaction(ctx, txHash, endBlock) if err != nil { s.Logger.Warn("Claim submission receipt lookup failed; keeping tx in flight.", "txHash", txHash, @@ -115,10 +118,11 @@ func (s *Service) processSubmitInFlight( s.dropClaimInFlight(appID) return claimNoProgress() } - return s.handleConfirmedSubmitInFlight(appID, txHash, receipt, work) + return s.handleConfirmedSubmitInFlight(ctx, appID, txHash, receipt, work) } func (s *Service) handleConfirmedSubmitInFlight( + ctx context.Context, appID int64, txHash common.Hash, receipt *types.Receipt, @@ -138,7 +142,7 @@ func (s *Service) handleConfirmedSubmitInFlight( outcome := stageReceiptNoMatch var divErr error if app != nil { - outcome, divErr = s.tryStageFromReceipt(receipt, app, computedEpoch) + outcome, divErr = s.tryStageFromReceipt(ctx, receipt, app, computedEpoch) } switch outcome { case stageReceiptStaged: @@ -187,7 +191,7 @@ func (s *Service) handleConfirmedSubmitInFlight( return claimRetryLater(divErr) case stageReceiptNoMatch: err := s.repository.UpdateEpochWithSubmittedClaim( - s.Context, + ctx, computedEpoch.ApplicationID, computedEpoch.Index, receipt.TxHash, @@ -213,6 +217,7 @@ 
func (s *Service) handleConfirmedSubmitInFlight( // When a transaction is confirmed, the matching epoch can move to // CLAIM_ACCEPTED. func (s *Service) checkAcceptsInFlight( + ctx context.Context, stagedEpochs map[int64]*model.Epoch, apps map[int64]*model.Application, endBlock *big.Int, @@ -220,7 +225,7 @@ func (s *Service) checkAcceptsInFlight( confirmed := 0 var errs []error for appID, tx := range s.acceptsInFlight { - result, pollErr := s.processAcceptInFlight(appID, tx, acceptInFlightWork{ + result, pollErr := s.processAcceptInFlight(ctx, appID, tx, acceptInFlightWork{ app: apps[appID], epoch: stagedEpochs[appID], }, endBlock) @@ -240,13 +245,14 @@ func (s *Service) checkAcceptsInFlight( } func (s *Service) processAcceptInFlight( + ctx context.Context, appID int64, tx inFlightTx, work acceptInFlightWork, endBlock *big.Int, ) (claimStepResult, error) { txHash := tx.txHash - ready, receipt, err := s.blockchain.pollTransaction(s.Context, txHash, endBlock) + ready, receipt, err := s.blockchain.pollTransaction(ctx, txHash, endBlock) if err != nil { s.Logger.Warn("Accept submission receipt lookup failed; keeping tx in flight.", "txHash", txHash, "err", err) @@ -278,12 +284,13 @@ func (s *Service) processAcceptInFlight( appAddress = work.app.IApplicationAddress } if receipt.Status == 0 { - return s.handleRevertedAcceptInFlight(appID, txHash, work, endBlock, appAddress), nil + return s.handleRevertedAcceptInFlight(ctx, appID, txHash, work, endBlock, appAddress), nil } - return s.handleConfirmedAcceptInFlight(appID, txHash, receipt, work, appAddress), nil + return s.handleConfirmedAcceptInFlight(ctx, appID, txHash, receipt, work, appAddress), nil } func (s *Service) handleRevertedAcceptInFlight( + ctx context.Context, appID int64, txHash common.Hash, work acceptInFlightWork, @@ -302,7 +309,7 @@ func (s *Service) handleRevertedAcceptInFlight( "id", appID, "tx", txHash) return claimNoProgress() } - claim, gerr := s.blockchain.getClaimStatus(s.Context, app, stagedEpoch, 
endBlock) + claim, gerr := s.blockchain.getClaimStatus(ctx, app, stagedEpoch, endBlock) if gerr != nil { s.Logger.Warn("Accept tx reverted; classifying getClaim failed, will retry next tick", "app", appAddress, "tx", txHash, "err", gerr) @@ -311,11 +318,11 @@ func (s *Service) handleRevertedAcceptInFlight( if app.ForecloseBlock != 0 && claim.Status != claimStatusAccepted { // Foreclosed: no on-chain path remains for a non-accepted claim, so // terminalize to CLAIM_FORECLOSED rather than marking the app FAILED. - return s.terminalizeForeclosedStagedClaim(app, stagedEpoch) + return s.terminalizeForeclosedStagedClaim(ctx, app, stagedEpoch) } switch claim.Status { case claimStatusAccepted: - if err := s.updateEpochAcceptedFromClaimStatus(app, stagedEpoch, claim, "checkAcceptsInFlight"); err != nil { + if err := s.updateEpochAcceptedFromClaimStatus(ctx, app, stagedEpoch, claim, "checkAcceptsInFlight"); err != nil { return claimRetryLater(fmt.Errorf("reconciling accept-revert front-run for epoch %d (%d): %w", stagedEpoch.Index, stagedEpoch.VirtualIndex, err)) } @@ -328,7 +335,7 @@ func (s *Service) handleRevertedAcceptInFlight( case claimStatusStaged: // Our claim is still STAGED. The transaction reverted for some other // reason. The next tick can send acceptClaim again. - if err := s.verifyClaimOutputsMatch(app, stagedEpoch, claim, "checkAcceptsInFlight"); err != nil { + if err := s.verifyClaimOutputsMatch(ctx, app, stagedEpoch, claim, "checkAcceptsInFlight"); err != nil { return claimRetryLater(fmt.Errorf("staged-outputs mismatch on accept-revert classification: %w", err)) } s.Logger.Warn("Accept tx reverted but claim still STAGED on chain; will retry next tick", @@ -338,7 +345,7 @@ func (s *Service) handleRevertedAcceptInFlight( // The DB says CLAIM_STAGED, but the contract says UNSTAGED. This should // not happen when reading a finalized block. Mark the app FAILED so the // operator can fix configuration and re-enable it. 
- if ferr := appstatus.SetFailedf(s.Context, s.Logger, s.repository, app, + if ferr := appstatus.SetFailedf(ctx, s.Logger, s.repository, app, "accept tx %v reverted and getClaim reports UNSTAGED for our "+ "(app, lpbn, machine) tuple — DB inconsistent with chain; check "+ "default block and node_config, then re-enable", @@ -352,6 +359,7 @@ func (s *Service) handleRevertedAcceptInFlight( } func (s *Service) handleConfirmedAcceptInFlight( + ctx context.Context, appID int64, txHash common.Hash, receipt *types.Receipt, @@ -362,7 +370,7 @@ func (s *Service) handleConfirmedAcceptInFlight( // Normal path: claim_transaction_hash was set when the epoch moved to // CLAIM_SUBMITTED. Pass nil so the repository keeps that hash. err := s.repository.UpdateEpochWithAcceptedClaim( - s.Context, stagedEpoch.ApplicationID, stagedEpoch.Index, nil) + ctx, stagedEpoch.ApplicationID, stagedEpoch.Index, nil) if err != nil { return claimRetryLater(fmt.Errorf("updating epoch %d (%d) with accepted claim: %w", stagedEpoch.Index, stagedEpoch.VirtualIndex, err)) diff --git a/internal/claimer/inflight_test.go b/internal/claimer/inflight_test.go index cd46cb886..b9bf81287 100644 --- a/internal/claimer/inflight_test.go +++ b/internal/claimer/inflight_test.go @@ -4,6 +4,7 @@ package claimer import ( + "context" "fmt" "math/big" "strings" @@ -51,7 +52,7 @@ func TestInFlightCompleted(t *testing.T) { r.On("UpdateEpochThroughStaging", mock.Anything, app.ID, currEpoch.Index, txHash, receiptBlock). Return(nil).Once() - transitions, errs := m.submitClaimsAndUpdateDatabase(makeEpochMap(), makeEpochMap(currEpoch), makeApplicationMap(app), endBlock) + transitions, errs := m.submitClaimsAndUpdateDatabase(context.Background(), makeEpochMap(), makeEpochMap(currEpoch), makeApplicationMap(app), endBlock) assert.Equal(t, 0, len(errs)) assert.Equal(t, 0, len(m.claimsInFlight)) // v3 fast path: submitted (1) + staged (1) = 2 transitions. 
@@ -93,7 +94,7 @@ func TestInFlightCompleted_QuorumNonDeciding(t *testing.T) { r.On("UpdateEpochWithSubmittedClaim", mock.Anything, app.ID, currEpoch.Index, txHash). Return(nil).Once() - transitions, errs := m.submitClaimsAndUpdateDatabase(makeEpochMap(), makeEpochMap(currEpoch), makeApplicationMap(app), endBlock) + transitions, errs := m.submitClaimsAndUpdateDatabase(context.Background(), makeEpochMap(), makeEpochMap(currEpoch), makeApplicationMap(app), endBlock) assert.Equal(t, 0, len(errs)) assert.Equal(t, 0, len(m.claimsInFlight)) // Fall-back path: one transition (COMPUTED → SUBMITTED), not the fast-path's two. @@ -132,7 +133,7 @@ func TestInFlightReverted(t *testing.T) { b.On("submitClaimToBlockchain", mock.Anything, app, currEpoch). Return(common.HexToHash("0x10"), nil).Once() - _, errs := m.submitClaimsAndUpdateDatabase(makeEpochMap(), makeEpochMap(currEpoch), makeApplicationMap(app), endBlock) + _, errs := m.submitClaimsAndUpdateDatabase(context.Background(), makeEpochMap(), makeEpochMap(currEpoch), makeApplicationMap(app), endBlock) assert.Equal(t, len(errs), 0) assert.Equal(t, len(m.claimsInFlight), 1) } @@ -152,7 +153,7 @@ func TestClaimInFlightMissingFromCurrClaims(t *testing.T) { b.On("pollTransaction", mock.Anything, reqHash, endBlock). 
Return(true, receipt, nil).Once() - _, errs := m.submitClaimsAndUpdateDatabase(makeEpochMap(), makeEpochMap(), makeApplicationMap(app), endBlock) + _, errs := m.submitClaimsAndUpdateDatabase(context.Background(), makeEpochMap(), makeEpochMap(), makeApplicationMap(app), endBlock) assert.Equal(t, len(errs), 0) } @@ -176,7 +177,7 @@ func TestClaimInFlightPollErrorKeepsTrackingAndStopsDuplicateSubmit(t *testing.T Return(false, nilReceipt, expectedErr).Once() transitions, errs := m.submitClaimsAndUpdateDatabase( - makeEpochMap(prevEpoch), makeEpochMap(currEpoch), makeApplicationMap(app), endBlock) + context.Background(), makeEpochMap(prevEpoch), makeEpochMap(currEpoch), makeApplicationMap(app), endBlock) require.Equal(t, 1, len(errs)) assert.ErrorIs(t, errs[0], expectedErr) assert.Equal(t, 0, transitions) @@ -210,7 +211,7 @@ func TestClaimInFlightPollErrorsDoNotStopOtherApps(t *testing.T) { b.On("pollTransaction", mock.Anything, tx2, endBlock). Return(false, nilReceipt, err2).Once() - transitions, err := m.checkClaimsInFlight(makeEpochMap(epoch1, epoch2), makeApplicationMap(app1, app2), endBlock) + transitions, err := m.checkClaimsInFlight(context.Background(), makeEpochMap(epoch1, epoch2), makeApplicationMap(app1, app2), endBlock) require.Error(t, err) assert.ErrorIs(t, err, err1) assert.ErrorIs(t, err, err2) @@ -240,7 +241,7 @@ func TestClaimInFlightReceiptNotFoundBeforeTimeoutKeepsTrackingAndStopsDuplicate Return(false, nilReceipt, nil).Once() transitions, errs := m.submitClaimsAndUpdateDatabase( - makeEpochMap(prevEpoch), makeEpochMap(currEpoch), makeApplicationMap(app), endBlock) + context.Background(), makeEpochMap(prevEpoch), makeEpochMap(currEpoch), makeApplicationMap(app), endBlock) require.Empty(t, errs) assert.Equal(t, 0, transitions) assert.Contains(t, m.claimsInFlight, app.ID, @@ -275,7 +276,7 @@ func TestClaimInFlightReceiptNotFoundAfterTimeoutClearsAndRetries(t *testing.T) b.On("submitClaimToBlockchain", mock.Anything, app, currEpoch). 
Return(newTxHash, nil).Once() - transitions, errs := m.submitClaimsAndUpdateDatabase(makeEpochMap(), makeEpochMap(currEpoch), makeApplicationMap(app), endBlock) + transitions, errs := m.submitClaimsAndUpdateDatabase(context.Background(), makeEpochMap(), makeEpochMap(currEpoch), makeApplicationMap(app), endBlock) require.Empty(t, errs) assert.Equal(t, 1, transitions, "stale in-flight tx should allow the normal submit path to retry") got, ok := m.claimsInFlight[app.ID] @@ -302,7 +303,7 @@ func TestAcceptInFlightPollErrorKeepsTracking(t *testing.T) { b.On("pollTransaction", mock.Anything, txHash, endBlock). Return(false, nilReceipt, expectedErr).Once() - transitions, err := m.checkAcceptsInFlight(makeEpochMap(currEpoch), makeApplicationMap(app), endBlock) + transitions, err := m.checkAcceptsInFlight(context.Background(), makeEpochMap(currEpoch), makeApplicationMap(app), endBlock) require.ErrorIs(t, err, expectedErr) assert.Equal(t, 0, transitions) assert.Contains(t, m.acceptsInFlight, app.ID, @@ -342,7 +343,7 @@ func TestAcceptInFlightErrorsDoNotStopOtherAppsOrDropPollErrors(t *testing.T) { r.On("UpdateEpochWithAcceptedClaim", mock.Anything, app2.ID, epoch2.Index, (*common.Hash)(nil)). Return(updateErr).Once() - transitions, err := m.checkAcceptsInFlight(makeEpochMap(epoch1, epoch2), makeApplicationMap(app1, app2), endBlock) + transitions, err := m.checkAcceptsInFlight(context.Background(), makeEpochMap(epoch1, epoch2), makeApplicationMap(app1, app2), endBlock) require.Error(t, err) assert.ErrorIs(t, err, pollErr) assert.ErrorIs(t, err, updateErr) @@ -371,7 +372,7 @@ func TestAcceptInFlightReceiptNotFoundAfterTimeoutClearsTracking(t *testing.T) { b.On("pollTransaction", mock.Anything, txHash, endBlock). 
Return(false, nilReceipt, nil).Once() - transitions, err := m.checkAcceptsInFlight(makeEpochMap(currEpoch), makeApplicationMap(app), endBlock) + transitions, err := m.checkAcceptsInFlight(context.Background(), makeEpochMap(currEpoch), makeApplicationMap(app), endBlock) require.NoError(t, err) assert.Equal(t, 0, transitions) assert.NotContains(t, m.acceptsInFlight, app.ID, @@ -403,7 +404,7 @@ func TestAcceptInFlightSuccessUpdatesEpochAndClearsTracking(t *testing.T) { r.On("UpdateEpochWithAcceptedClaim", mock.Anything, app.ID, currEpoch.Index, (*common.Hash)(nil)). Return(nil).Once() - transitions, err := m.checkAcceptsInFlight(stagedEpochs, makeApplicationMap(app), endBlock) + transitions, err := m.checkAcceptsInFlight(context.Background(), stagedEpochs, makeApplicationMap(app), endBlock) require.NoError(t, err) assert.Equal(t, 1, transitions) assert.NotContains(t, m.acceptsInFlight, app.ID) @@ -438,7 +439,7 @@ func TestAcceptInFlightRevertedAcceptedReconcilesEpoch(t *testing.T) { r.On("UpdateEpochWithAcceptedClaim", mock.Anything, app.ID, currEpoch.Index, (*common.Hash)(nil)). Return(nil).Once() - transitions, err := m.checkAcceptsInFlight(stagedEpochs, makeApplicationMap(app), endBlock) + transitions, err := m.checkAcceptsInFlight(context.Background(), stagedEpochs, makeApplicationMap(app), endBlock) require.NoError(t, err) assert.Equal(t, 1, transitions) assert.NotContains(t, m.acceptsInFlight, app.ID) @@ -472,7 +473,7 @@ func TestAcceptInFlightRevertedUnstagedMarksApplicationFailed(t *testing.T) { })). 
Return(nil).Once() - transitions, err := m.checkAcceptsInFlight(makeEpochMap(currEpoch), makeApplicationMap(app), endBlock) + transitions, err := m.checkAcceptsInFlight(context.Background(), makeEpochMap(currEpoch), makeApplicationMap(app), endBlock) require.NoError(t, err) assert.Equal(t, 0, transitions) assert.Equal(t, model.ApplicationStatus_Failed, app.Status) @@ -509,7 +510,7 @@ func TestAcceptInFlightRevertedForeclosedTerminalizes(t *testing.T) { // CRITICAL: no UpdateApplicationStatus expectation — any FAILED/DIVERGED/ // CORRUPTED write trips the mock as an unexpected call. - transitions, err := m.checkAcceptsInFlight(makeEpochMap(currEpoch), makeApplicationMap(app), endBlock) + transitions, err := m.checkAcceptsInFlight(context.Background(), makeEpochMap(currEpoch), makeApplicationMap(app), endBlock) require.NoError(t, err) assert.Equal(t, 1, transitions, "terminalization counts as completed work") assert.Equal(t, model.EpochStatus_ClaimForeclosed, currEpoch.Status) diff --git a/internal/claimer/reverts.go b/internal/claimer/reverts.go index 3659c5e51..3c0e588f9 100644 --- a/internal/claimer/reverts.go +++ b/internal/claimer/reverts.go @@ -4,6 +4,7 @@ package claimer import ( + "context" "fmt" "math/big" "reflect" @@ -96,6 +97,7 @@ const ( // ClaimNotStaged and ClaimStagingPeriodNotOverYet only come from acceptClaim, // so handleAcceptClaimRevert handles them. 
func (s *Service) handleSubmitClaimRevert( + ctx context.Context, err error, app *model.Application, epoch *model.Epoch, @@ -162,7 +164,7 @@ func (s *Service) handleSubmitClaimRevert( case isCustomConsensusError(err, "InvalidOutputsMerkleRootProofSize"): stateErr := s.setApplicationCorrupted( - s.Context, app, + ctx, app, "submitClaim reverted with InvalidOutputsMerkleRootProofSize for "+ "epoch %d (%d), last_block %d — outputs_merkle_proof in DB is "+ "the wrong length for the machine memory tree.", @@ -174,7 +176,7 @@ func (s *Service) handleSubmitClaimRevert( // Operator configuration error: the signing key is not a Quorum // validator. The operator can fix the key, so use FAILED rather than a // terminal DIVERGED/CORRUPTED status. - stateErr := appstatus.SetFailedf(s.Context, s.Logger, s.repository, app, + stateErr := appstatus.SetFailedf(ctx, s.Logger, s.repository, app, "submitClaim reverted with CallerIsNotValidator: the configured "+ "signing key is not a member of the Quorum for app %s. "+ "Check the validator key configuration.", @@ -188,7 +190,7 @@ func (s *Service) handleSubmitClaimRevert( // not fit the tree. Like InvalidOutputsMerkleRootProofSize, this means // the proof stored locally is bad — local data corruption. 
stateErr := s.setApplicationCorrupted( - s.Context, app, + ctx, app, "submitClaim reverted with InvalidNodeIndex for "+ "epoch %d (%d), last_block %d — outputs_merkle_proof in DB does "+ "not form a valid replacement proof for the machine memory tree.", @@ -197,7 +199,7 @@ func (s *Service) handleSubmitClaimRevert( return submitClaimAppHalted, stateErr } - switch action, stateErr := s.classifySharedConsensusRevert("submitClaim", err, app, epoch); action { + switch action, stateErr := s.classifySharedConsensusRevert(ctx, "submitClaim", err, app, epoch); action { case sharedRevertAppHalted: return submitClaimAppHalted, stateErr case sharedRevertRetryLater: @@ -233,6 +235,7 @@ const ( // processed block number — NotEpochFinalBlock (FAILED) and NotPastBlock // (retry later). call names the reverting method in reasons and logs. func (s *Service) classifySharedConsensusRevert( + ctx context.Context, call string, err error, app *model.Application, @@ -245,7 +248,7 @@ func (s *Service) classifySharedConsensusRevert( // from ApplicationReverted. A provider simulating against a block just // before a fresh deployment could fire this transiently, but FAILED is // recoverable, so that rare case only costs an operator re-enable. - stateErr := appstatus.SetFailedf(s.Context, s.Logger, s.repository, app, + stateErr := appstatus.SetFailedf(ctx, s.Logger, s.repository, app, "%s reverted with ApplicationNotDeployed for app %s, "+ "epoch %d (%d), last_block %d: no contract code exists at the "+ "application address. Verify the application address and that "+ @@ -262,7 +265,7 @@ func (s *Service) classifySharedConsensusRevert( // reverted — and an adversarial application contract can revert // selectively (e.g. per tx.origin) to suppress a targeted validator's // votes — so the reason carries it. 
- stateErr := appstatus.SetFailedf(s.Context, s.Logger, s.repository, app, + stateErr := appstatus.SetFailedf(ctx, s.Logger, s.repository, app, "%s reverted with ApplicationReverted for app %s, "+ "epoch %d (%d), last_block %d: the application contract reverted "+ "when the consensus contract queried it. Verify the deployed "+ @@ -274,7 +277,7 @@ func (s *Service) classifySharedConsensusRevert( return sharedRevertAppHalted, stateErr case isCustomConsensusError(err, "IllformedApplicationReturnData"): - stateErr := appstatus.SetFailedf(s.Context, s.Logger, s.repository, app, + stateErr := appstatus.SetFailedf(ctx, s.Logger, s.repository, app, "%s reverted with IllformedApplicationReturnData for app %s, "+ "epoch %d (%d), last_block %d: the application contract returned "+ "malformed data when the consensus contract queried it. Verify "+ @@ -286,7 +289,7 @@ func (s *Service) classifySharedConsensusRevert( return sharedRevertAppHalted, stateErr case isCustomConsensusError(err, "NotEpochFinalBlock"): - stateErr := appstatus.SetFailedf(s.Context, s.Logger, s.repository, app, + stateErr := appstatus.SetFailedf(ctx, s.Logger, s.repository, app, "%s reverted with NotEpochFinalBlock for app %s, "+ "epoch %d (%d), last_block %d: the node submitted a "+ "lastProcessedBlockNumber that the consensus contract does not "+ @@ -343,6 +346,7 @@ func (s *Service) classifySharedConsensusRevert( // / NotEpochFinalBlock (all FAILED) and NotPastBlock (retry later) reverts — // classified by classifySharedConsensusRevert. func (s *Service) handleAcceptClaimRevert( + ctx context.Context, err error, app *model.Application, epoch *model.Epoch, @@ -410,7 +414,7 @@ func (s *Service) handleAcceptClaimRevert( // newer version than this node. Surface it as FAILED (recoverable: an // operator can upgrade the node and re-enable) instead of spinning // silently every tick. 
- stateErr := appstatus.SetFailedf(s.Context, s.Logger, s.repository, app, + stateErr := appstatus.SetFailedf(ctx, s.Logger, s.repository, app, "acceptClaim reverted with ClaimNotStaged carrying unmodeled ClaimStatus %d for "+ "epoch %d (%d) — this node models only 0/1/2. The IConsensus contract may be "+ "newer than this node supports; upgrade the node or verify the contract.", @@ -437,7 +441,7 @@ func (s *Service) handleAcceptClaimRevert( return acceptClaimRetryLater, nil } - switch action, stateErr := s.classifySharedConsensusRevert("acceptClaim", err, app, epoch); action { + switch action, stateErr := s.classifySharedConsensusRevert(ctx, "acceptClaim", err, app, epoch); action { case sharedRevertAppHalted: return acceptClaimAppHalted, stateErr case sharedRevertRetryLater: diff --git a/internal/claimer/reverts_test.go b/internal/claimer/reverts_test.go index 058986d49..904f5d694 100644 --- a/internal/claimer/reverts_test.go +++ b/internal/claimer/reverts_test.go @@ -4,6 +4,7 @@ package claimer import ( + "context" "fmt" "math/big" "strings" @@ -96,7 +97,7 @@ func TestNotFirstClaimHandledGracefully(t *testing.T) { Return(common.Hash{}, notFirstClaimError()).Once() _, errs := m.submitClaimsAndUpdateDatabase( - makeEpochMap(), makeEpochMap(currEpoch), makeApplicationMap(app), endBlock) + context.Background(), makeEpochMap(), makeEpochMap(currEpoch), makeApplicationMap(app), endBlock) assert.Equal(t, 0, len(errs)) assert.Equal(t, 0, len(m.claimsInFlight)) } @@ -121,7 +122,7 @@ func TestNotFirstClaimQuorumRetriesForEventSync(t *testing.T) { Return(common.Hash{}, notFirstClaimError()).Once() _, errs := m.submitClaimsAndUpdateDatabase( - makeEpochMap(), makeEpochMap(currEpoch), makeApplicationMap(app), endBlock) + context.Background(), makeEpochMap(), makeEpochMap(currEpoch), makeApplicationMap(app), endBlock) assert.Equal(t, 0, len(errs)) assert.Equal(t, 0, len(m.claimsInFlight)) } @@ -146,7 +147,7 @@ func TestApplicationForeclosedIsTransient(t *testing.T) { 
currEpochs := makeEpochMap(currEpoch) transitions, errs := m.submitClaimsAndUpdateDatabase( - makeEpochMap(), currEpochs, makeApplicationMap(app), endBlock) + context.Background(), makeEpochMap(), currEpochs, makeApplicationMap(app), endBlock) assert.Equal(t, 0, transitions, "no DB transition on transient revert") assert.Equal(t, 0, len(errs), "ApplicationForeclosed must not surface as an error") assert.Equal(t, 1, len(currEpochs), "epoch must remain in work map for retry") @@ -173,7 +174,7 @@ func TestInvalidOutputsMerkleRootProofSizeSetsCorrupted(t *testing.T) { currEpochs := makeEpochMap(currEpoch) _, errs := m.submitClaimsAndUpdateDatabase( - makeEpochMap(), currEpochs, makeApplicationMap(app), endBlock) + context.Background(), makeEpochMap(), currEpochs, makeApplicationMap(app), endBlock) assert.Equal(t, 1, len(errs), "CORRUPTED transition must surface a terminal error") assert.Equal(t, 0, len(currEpochs), "epoch must be dropped from work map") assert.Equal(t, 0, len(m.claimsInFlight)) @@ -200,7 +201,7 @@ func TestCallerIsNotValidatorSetsFailed(t *testing.T) { currEpochs := makeEpochMap(currEpoch) _, errs := m.submitClaimsAndUpdateDatabase( - makeEpochMap(), currEpochs, makeApplicationMap(app), endBlock) + context.Background(), makeEpochMap(), currEpochs, makeApplicationMap(app), endBlock) // SetFailedf returns nil on success — the call site only surfaces an // error when state-update itself failed, so no error is expected here. 
assert.Equal(t, 0, len(errs)) @@ -228,7 +229,7 @@ func TestNotPastBlockRetriesLater(t *testing.T) { currEpochs := makeEpochMap(currEpoch) transitions, errs := m.submitClaimsAndUpdateDatabase( - makeEpochMap(), currEpochs, makeApplicationMap(app), endBlock) + context.Background(), makeEpochMap(), currEpochs, makeApplicationMap(app), endBlock) assert.Equal(t, 0, transitions, "no DB transition on transient revert") assert.Equal(t, 0, len(errs), "NotPastBlock must not surface as an error") assert.Equal(t, 1, len(currEpochs), "epoch must remain in work map for retry") @@ -305,7 +306,7 @@ func TestSubmitClaimRevertsSetApplicationFailed(t *testing.T) { currEpochs := makeEpochMap(currEpoch) transitions, errs := m.submitClaimsAndUpdateDatabase( - makeEpochMap(), currEpochs, makeApplicationMap(app), endBlock) + context.Background(), makeEpochMap(), currEpochs, makeApplicationMap(app), endBlock) // SetFailedf returns nil on success — the call site only surfaces // an error when the status update itself failed. 
assert.Equal(t, 0, transitions, "FAILED is not a claim transition") @@ -336,7 +337,7 @@ func TestSubmitClaimFailedRevertWithDBError(t *testing.T) { currEpochs := makeEpochMap(currEpoch) _, errs := m.submitClaimsAndUpdateDatabase( - makeEpochMap(), currEpochs, makeApplicationMap(app), endBlock) + context.Background(), makeEpochMap(), currEpochs, makeApplicationMap(app), endBlock) assert.Equal(t, 1, len(errs), "status-update failure must surface as an error") if len(errs) == 1 { assert.ErrorContains(t, errs[0], "db down", "the surfaced error must be the DB error") @@ -413,7 +414,7 @@ func TestHandleAcceptClaimRevert(t *testing.T) { m, _, _ := newServiceMock() app := makeApplication() epoch := makeStagedEpoch(app, 3, 50) - outcome, stateErr := m.handleAcceptClaimRevert(tc.err, app, epoch) + outcome, stateErr := m.handleAcceptClaimRevert(context.Background(), tc.err, app, epoch) assert.Equal(t, tc.want, outcome) assert.Nil(t, stateErr, "classifier must not mutate state") }) @@ -459,7 +460,7 @@ func TestClaimNotStagedUnmodeledStatusFailsClosed(t *testing.T) { r.On("UpdateApplicationStatus", mock.Anything, app.ID, model.ApplicationStatus_Failed, mock.Anything). Return(nil).Once() - outcome, stateErr := m.handleAcceptClaimRevert(claimNotStagedError(3), app, epoch) + outcome, stateErr := m.handleAcceptClaimRevert(context.Background(), claimNotStagedError(3), app, epoch) assert.Equal(t, acceptClaimAppHalted, outcome, "a cleanly-decoded unmodeled ClaimStatus must escalate, not retry") // SetFailedf returns nil on success; the FAILED write itself is asserted by @@ -504,7 +505,7 @@ func TestAcceptClaimRevertsSetApplicationFailed(t *testing.T) { })). 
Return(nil).Once() - outcome, stateErr := m.handleAcceptClaimRevert(consensusRevertError(revertName), app, epoch) + outcome, stateErr := m.handleAcceptClaimRevert(context.Background(), consensusRevertError(revertName), app, epoch) assert.Equal(t, acceptClaimAppHalted, outcome) // SetFailedf returns nil on success; the FAILED write itself is // asserted by the mock expectation above. @@ -526,7 +527,7 @@ func TestInvalidNodeIndexSetsCorrupted(t *testing.T) { r.On("UpdateApplicationStatus", mock.Anything, app.ID, model.ApplicationStatus_Corrupted, mock.Anything). Return(nil).Once() - outcome, stateErr := m.handleSubmitClaimRevert(consensusRevertError("InvalidNodeIndex"), app, epoch) + outcome, stateErr := m.handleSubmitClaimRevert(context.Background(), consensusRevertError("InvalidNodeIndex"), app, epoch) assert.Equal(t, submitClaimAppHalted, outcome) assert.Error(t, stateErr, "CORRUPTED is terminal; the handler must return the reason error") } @@ -586,7 +587,7 @@ func TestHandleSubmitClaimRevert(t *testing.T) { // covered by the existing end-to-end pipeline tests. app.ConsensusType = model.Consensus_Authority epoch := makeEpoch(app.ID, model.EpochStatus_ClaimComputed, 3) - outcome, _ := m.handleSubmitClaimRevert(tc.err, app, epoch) + outcome, _ := m.handleSubmitClaimRevert(context.Background(), tc.err, app, epoch) assert.Equal(t, tc.want, outcome) }) } diff --git a/internal/claimer/stage.go b/internal/claimer/stage.go index 72336df21..195addb20 100644 --- a/internal/claimer/stage.go +++ b/internal/claimer/stage.go @@ -4,6 +4,7 @@ package claimer import ( + "context" "fmt" "math/big" @@ -61,6 +62,7 @@ const ( // always a separate transaction. Code that tries to accept from the submit // receipt is using the wrong contract model. 
func (s *Service) tryStageFromReceipt( + ctx context.Context, receipt *types.Receipt, app *model.Application, epoch *model.Epoch, @@ -87,7 +89,7 @@ func (s *Service) tryStageFromReceipt( } matches, ok := claimStagedEventMatches(app, epoch, event) if !ok { - pErr := s.markMatcherPrecondFailure(app, epoch, "tryStageFromReceipt") + pErr := s.markMatcherPrecondFailure(ctx, app, epoch, "tryStageFromReceipt") return stageReceiptPrecondFailure, pErr } if !matches { @@ -97,11 +99,11 @@ func (s *Service) tryStageFromReceipt( // state here is also a fault. Mark the app DIVERGED and return a // special result so the caller does not log success or fall back to // the plain SUBMITTED update. - divErr := s.markStagingDivergence(app, epoch, event, "tryStageFromReceipt") + divErr := s.markStagingDivergence(ctx, app, epoch, event, "tryStageFromReceipt") return stageReceiptDivergent, divErr } err = s.repository.UpdateEpochThroughStaging( - s.Context, epoch.ApplicationID, epoch.Index, + ctx, epoch.ApplicationID, epoch.Index, receipt.TxHash, log.BlockNumber) if err != nil { return stageReceiptDBPending, fmt.Errorf( @@ -130,6 +132,7 @@ func (s *Service) tryStageFromReceipt( // set to DIVERGED. The reason tells the guardian to call foreclose() before // the staging period ends. 
func (s *Service) stageClaimsAndUpdateDatabase( + ctx context.Context, acceptedEpochs map[int64]*model.Epoch, submittedEpochs map[int64]*model.Epoch, apps map[int64]*model.Application, @@ -139,7 +142,7 @@ func (s *Service) stageClaimsAndUpdateDatabase( errs := []error{} for key, currEpoch := range submittedEpochs { - result := s.processSubmittedClaim(submittedClaimWork{ + result := s.processSubmittedClaim(ctx, submittedClaimWork{ app: apps[key], prevEpoch: acceptedEpochs[key], epoch: currEpoch, @@ -156,6 +159,7 @@ func (s *Service) stageClaimsAndUpdateDatabase( } func (s *Service) processSubmittedClaim( + ctx context.Context, work submittedClaimWork, defaultBlockNumber *big.Int, ) claimStepResult { @@ -163,7 +167,7 @@ func (s *Service) processSubmittedClaim( currEpoch := work.epoch prevEpoch := work.prevEpoch - if err := s.checkConsensusForAddressChange(app, defaultBlockNumber); err != nil { + if err := s.checkConsensusForAddressChange(ctx, app, defaultBlockNumber); err != nil { return claimDropped(err) } @@ -173,7 +177,7 @@ func (s *Service) processSubmittedClaim( } _, currEvent, _, err := s.blockchain.findClaimStagedEventAndSucc( - s.Context, app, currEpoch, fromBlock, defaultBlockNumber.Uint64(), + ctx, app, currEpoch, fromBlock, defaultBlockNumber.Uint64(), ) if err != nil { return claimDropped(err) @@ -188,13 +192,13 @@ func (s *Service) processSubmittedClaim( ) matches, ok := claimStagedEventMatches(app, currEpoch, currEvent) if !ok { - return claimDropped(s.markMatcherPrecondFailure(app, currEpoch, "stageClaimsAndUpdateDatabase")) + return claimDropped(s.markMatcherPrecondFailure(ctx, app, currEpoch, "stageClaimsAndUpdateDatabase")) } if !matches { - return claimDropped(s.markStagingDivergence(app, currEpoch, currEvent, "stageClaimsAndUpdateDatabase")) + return claimDropped(s.markStagingDivergence(ctx, app, currEpoch, currEvent, "stageClaimsAndUpdateDatabase")) } err = s.repository.UpdateEpochToStaged( - s.Context, currEpoch.ApplicationID, currEpoch.Index, + 
ctx, currEpoch.ApplicationID, currEpoch.Index, currEvent.Raw.BlockNumber) if err != nil { return claimDropped(err) @@ -212,7 +216,7 @@ func (s *Service) processSubmittedClaim( // exists up to this block, this submitted claim has no remaining on-chain // path. if app.ForecloseBlock != 0 { - if ferr := s.forecloseClaim(app, currEpoch, "stageClaimsAndUpdateDatabase"); ferr != nil { + if ferr := s.forecloseClaim(ctx, app, currEpoch, "stageClaimsAndUpdateDatabase"); ferr != nil { return claimDropped(ferr) } return claimWorkCompleted(1) diff --git a/internal/claimer/stage_test.go b/internal/claimer/stage_test.go index e5c8f64db..4db0fcc76 100644 --- a/internal/claimer/stage_test.go +++ b/internal/claimer/stage_test.go @@ -4,6 +4,7 @@ package claimer import ( + "context" "fmt" "math/big" "strings" @@ -50,7 +51,7 @@ func TestStagingFastPathDivergence(t *testing.T) { r.On("UpdateApplicationStatus", mock.Anything, app.ID, model.ApplicationStatus_Diverged, mock.Anything). Return(nil).Once() - _, errs := m.submitClaimsAndUpdateDatabase(makeEpochMap(), makeEpochMap(currEpoch), makeApplicationMap(app), endBlock) + _, errs := m.submitClaimsAndUpdateDatabase(context.Background(), makeEpochMap(), makeEpochMap(currEpoch), makeApplicationMap(app), endBlock) // The fast-path consumed the receipt and triggered DIVERGED. 
The // divergence error is surfaced (matching the convention used by other // terminal-status setters); @@ -100,7 +101,7 @@ func TestStagingFastPathDBPending(t *testing.T) { computedEpochs := makeEpochMap(currEpoch) _, errs := m.submitClaimsAndUpdateDatabase( - makeEpochMap(), computedEpochs, makeApplicationMap(app), endBlock) + context.Background(), makeEpochMap(), computedEpochs, makeApplicationMap(app), endBlock) require.Equal(t, 1, len(errs), "DB-pending must surface as a tick-level error") assert.ErrorIs(t, errs[0], dbErr) @@ -131,7 +132,7 @@ func TestStageByObservation(t *testing.T) { r.On("UpdateEpochToStaged", mock.Anything, app.ID, currEpoch.Index, currEvent.Raw.BlockNumber). Return(nil).Once() - transitions, errs := m.stageClaimsAndUpdateDatabase(makeEpochMap(), makeEpochMap(currEpoch), makeApplicationMap(app), endBlock) + transitions, errs := m.stageClaimsAndUpdateDatabase(context.Background(), makeEpochMap(), makeEpochMap(currEpoch), makeApplicationMap(app), endBlock) assert.Equal(t, 0, len(errs)) assert.Equal(t, 1, transitions) } @@ -152,7 +153,7 @@ func TestStageForeclosesSubmittedForeclosedApp(t *testing.T) { r.On("UpdateEpochWithForeclosedClaim", mock.Anything, app.ID, currEpoch.Index). Return(nil).Once() - transitions, errs := m.stageClaimsAndUpdateDatabase(makeEpochMap(), makeEpochMap(currEpoch), makeApplicationMap(app), endBlock) + transitions, errs := m.stageClaimsAndUpdateDatabase(context.Background(), makeEpochMap(), makeEpochMap(currEpoch), makeApplicationMap(app), endBlock) assert.Equal(t, 0, len(errs)) assert.Equal(t, 1, transitions) assert.Equal(t, model.EpochStatus_ClaimForeclosed, currEpoch.Status) @@ -189,7 +190,7 @@ func TestStagingDivergence_Quorum(t *testing.T) { })). 
Return(nil).Once() - _, errs := m.stageClaimsAndUpdateDatabase(makeEpochMap(), makeEpochMap(currEpoch), makeApplicationMap(app), endBlock) + _, errs := m.stageClaimsAndUpdateDatabase(context.Background(), makeEpochMap(), makeEpochMap(currEpoch), makeApplicationMap(app), endBlock) assert.Equal(t, 1, len(errs)) assert.Equal(t, model.ApplicationStatus_Diverged, app.Status) assert.Equal(t, model.EpochStatus_ClaimRejected, currEpoch.Status) @@ -218,7 +219,7 @@ func TestStagingDivergence_AuthorityDoesNotRejectEpoch(t *testing.T) { r.On("UpdateApplicationStatus", mock.Anything, app.ID, model.ApplicationStatus_Diverged, mock.Anything). Return(nil).Once() - _, errs := m.stageClaimsAndUpdateDatabase(makeEpochMap(), makeEpochMap(currEpoch), makeApplicationMap(app), endBlock) + _, errs := m.stageClaimsAndUpdateDatabase(context.Background(), makeEpochMap(), makeEpochMap(currEpoch), makeApplicationMap(app), endBlock) assert.Equal(t, 1, len(errs)) assert.Equal(t, model.ApplicationStatus_Diverged, app.Status) assert.Equal(t, model.EpochStatus_ClaimSubmitted, currEpoch.Status) @@ -244,7 +245,7 @@ func TestStagingMatcherPreconditionFailureMarksApplicationCorrupted(t *testing.T })). Return(nil).Once() - _, errs := m.stageClaimsAndUpdateDatabase(makeEpochMap(), makeEpochMap(currEpoch), makeApplicationMap(app), endBlock) + _, errs := m.stageClaimsAndUpdateDatabase(context.Background(), makeEpochMap(), makeEpochMap(currEpoch), makeApplicationMap(app), endBlock) assert.Equal(t, 1, len(errs)) assert.Equal(t, model.ApplicationStatus_Corrupted, app.Status) } @@ -280,7 +281,7 @@ func TestStagingDivergenceReaderMode_Quorum(t *testing.T) { })). 
Return(nil).Once() - _, errs := m.stageClaimsAndUpdateDatabase(makeEpochMap(), makeEpochMap(currEpoch), makeApplicationMap(app), endBlock) + _, errs := m.stageClaimsAndUpdateDatabase(context.Background(), makeEpochMap(), makeEpochMap(currEpoch), makeApplicationMap(app), endBlock) assert.Equal(t, 1, len(errs), "divergence detection must fire in reader mode") assert.Equal(t, model.EpochStatus_ClaimRejected, currEpoch.Status) } diff --git a/internal/claimer/submit.go b/internal/claimer/submit.go index c2c2c490f..87da00219 100644 --- a/internal/claimer/submit.go +++ b/internal/claimer/submit.go @@ -29,7 +29,7 @@ func (s *Service) findClaimSubmittedEventAndSucc( err := checkEpochSequenceConstraint(prevEpoch, currEpoch) if err != nil { err = s.setApplicationCorrupted( - s.Context, + ctx, app, "%v. epoch: %v (%v).", err, @@ -50,7 +50,7 @@ func (s *Service) findClaimSubmittedEventAndSucc( if claimSubmittedEventMatchesEpoch(app, prevEpoch, event) { matches, ok := claimSubmittedEventMatches(app, prevEpoch, event) if !ok { - err = s.markMatcherPrecondFailure(app, prevEpoch, "findClaimSubmittedEventAndSucc(prev)") + err = s.markMatcherPrecondFailure(ctx, app, prevEpoch, "findClaimSubmittedEventAndSucc(prev)") return nil, nil, err } if matches { @@ -61,7 +61,7 @@ func (s *Service) findClaimSubmittedEventAndSucc( continue } err = s.setApplicationDiverged( - s.Context, + ctx, app, "application has an invalid epoch: %v (%v), missing claim submitted event (%v).", prevEpoch.Index, @@ -73,7 +73,7 @@ func (s *Service) findClaimSubmittedEventAndSucc( } if prevClaimSubmissionEvent == nil { err = s.setApplicationCorrupted( - s.Context, + ctx, app, "application has an invalid epoch: %v (%v). 
No claim submission event to match.", prevEpoch.Index, @@ -86,6 +86,7 @@ func (s *Service) findClaimSubmittedEventAndSucc( } func (s *Service) classifyClaimSubmittedEvents( + ctx context.Context, app *model.Application, epoch *model.Epoch, events []*iconsensus.IConsensusClaimSubmitted, @@ -106,7 +107,7 @@ func (s *Service) classifyClaimSubmittedEvents( ) matches, ok := claimSubmittedEventMatches(app, epoch, event) if !ok { - return nil, true, s.markMatcherPrecondFailure(app, epoch, site) + return nil, true, s.markMatcherPrecondFailure(ctx, app, epoch, site) } if matches { if !s.shouldRecordMatchingClaimSubmitted(app, epoch, event) { @@ -118,7 +119,7 @@ func (s *Service) classifyClaimSubmittedEvents( if s.shouldIgnoreQuorumSubmittedMismatch(app, epoch, event, site) { continue } - return nil, true, s.markSubmittedDivergence(app, epoch, event, site) + return nil, true, s.markSubmittedDivergence(ctx, app, epoch, event, site) } return nil, false, nil } @@ -189,12 +190,13 @@ func (s *Service) shouldRecordMatchingClaimSubmitted( // CLAIM_SUBMITTED. It returns the number of successful state changes and any // errors. 
func (s *Service) submitClaimsAndUpdateDatabase( + ctx context.Context, acceptedOrSubmittedEpochs map[int64]*model.Epoch, computedEpochs map[int64]*model.Epoch, apps map[int64]*model.Application, defaultBlockNumber *big.Int, ) (int, []error) { - confirmed, err := s.checkClaimsInFlight(computedEpochs, apps, defaultBlockNumber) + confirmed, err := s.checkClaimsInFlight(ctx, computedEpochs, apps, defaultBlockNumber) if err != nil { return confirmed, []error{err} } @@ -202,7 +204,7 @@ func (s *Service) submitClaimsAndUpdateDatabase( transitions := confirmed errs := []error{} for key, currEpoch := range computedEpochs { - result := s.processComputedClaim(computedClaimWork{ + result := s.processComputedClaim(ctx, computedClaimWork{ app: apps[key], prevEpoch: acceptedOrSubmittedEpochs[key], epoch: currEpoch, @@ -219,6 +221,7 @@ func (s *Service) submitClaimsAndUpdateDatabase( } func (s *Service) processComputedClaim( + ctx context.Context, work computedClaimWork, defaultBlockNumber *big.Int, ) claimStepResult { @@ -232,21 +235,21 @@ func (s *Service) processComputedClaim( } // Stop if the consensus contract address changed on chain. 
- if err := s.checkConsensusForAddressChange(app, defaultBlockNumber); err != nil { + if err := s.checkConsensusForAddressChange(ctx, app, defaultBlockNumber); err != nil { return claimDropped(err) } - if result, done := s.reconcileComputedAcceptedEvent(work, defaultBlockNumber); done { + if result, done := s.reconcileComputedAcceptedEvent(ctx, work, defaultBlockNumber); done { return result } - ic, submittedEvents, result, done := s.findSubmittedEventsForComputedClaim(work, defaultBlockNumber) + ic, submittedEvents, result, done := s.findSubmittedEventsForComputedClaim(ctx, work, defaultBlockNumber) if done { return result } currEvent, shouldDrop, err := s.classifyClaimSubmittedEvents( - app, currEpoch, submittedEvents, "submitClaimsAndUpdateDatabase(ClaimSubmitted)") + ctx, app, currEpoch, submittedEvents, "submitClaimsAndUpdateDatabase(ClaimSubmitted)") if shouldDrop { return claimDropped(err) } @@ -254,7 +257,7 @@ func (s *Service) processComputedClaim( return claimRetryLater(err) } if currEvent != nil { - return s.recordSubmittedEvent(app, currEpoch, currEvent) + return s.recordSubmittedEvent(ctx, app, currEpoch, currEvent) } if prevEpoch != nil && prevEpoch.Status != model.EpochStatus_ClaimAccepted { @@ -275,7 +278,7 @@ func (s *Service) processComputedClaim( // This read also runs for foreclosed apps. A claim accepted before // foreclosure must still be copied into the DB. Only the new submitClaim // transaction is skipped for foreclosed apps. - if reconciled, err := s.reconcileBeforeSubmit(app, currEpoch, defaultBlockNumber); reconciled { + if reconciled, err := s.reconcileBeforeSubmit(ctx, app, currEpoch, defaultBlockNumber); reconciled { if err != nil { return claimDropped(err) } @@ -288,7 +291,7 @@ func (s *Service) processComputedClaim( // already showed that this claim is not STAGED or ACCEPTED, so it has // no remaining on-chain path. 
if app.ForecloseBlock != 0 { - if ferr := s.forecloseClaim(app, currEpoch, "submitClaimsAndUpdateDatabase"); ferr != nil { + if ferr := s.forecloseClaim(ctx, app, currEpoch, "submitClaimsAndUpdateDatabase"); ferr != nil { return claimDropped(ferr) } return claimWorkCompleted(1) @@ -296,12 +299,13 @@ func (s *Service) processComputedClaim( } if s.submissionEnabled { - return s.broadcastComputedClaim(ic, app, currEpoch, defaultBlockNumber) + return s.broadcastComputedClaim(ctx, ic, app, currEpoch, defaultBlockNumber) } return claimNoProgress() } func (s *Service) reconcileComputedAcceptedEvent( + ctx context.Context, work computedClaimWork, defaultBlockNumber *big.Int, ) (claimStepResult, bool) { @@ -325,7 +329,7 @@ func (s *Service) reconcileComputedAcceptedEvent( acceptScanFrom = prevEpoch.LastBlock + 1 } _, foreignAccepted, _, err := s.blockchain.findClaimAcceptedEventAndSucc( - s.Context, app, currEpoch, acceptScanFrom, defaultBlockNumber.Uint64(), + ctx, app, currEpoch, acceptScanFrom, defaultBlockNumber.Uint64(), ) if err != nil { return claimDropped(fmt.Errorf( @@ -337,14 +341,14 @@ func (s *Service) reconcileComputedAcceptedEvent( } matches, ok := claimAcceptedEventMatches(app, currEpoch, foreignAccepted) if !ok { - return claimDropped(s.markMatcherPrecondFailure(app, currEpoch, "submitClaimsAndUpdateDatabase(ClaimAccepted)")), true + return claimDropped(s.markMatcherPrecondFailure(ctx, app, currEpoch, "submitClaimsAndUpdateDatabase(ClaimAccepted)")), true } if !matches { - return claimDropped(s.markAcceptedDivergence(app, currEpoch, foreignAccepted, "submitClaimsAndUpdateDatabase")), true + return claimDropped(s.markAcceptedDivergence(ctx, app, currEpoch, foreignAccepted, "submitClaimsAndUpdateDatabase")), true } acceptedTxHash := foreignAccepted.Raw.TxHash if err := s.repository.UpdateEpochWithAcceptedClaim( - s.Context, currEpoch.ApplicationID, currEpoch.Index, &acceptedTxHash); err != nil { + ctx, currEpoch.ApplicationID, currEpoch.Index, 
&acceptedTxHash); err != nil { return claimDropped(fmt.Errorf( "reconciling COMPUTED→ACCEPTED for epoch %d (%d): %w", currEpoch.Index, currEpoch.VirtualIndex, err)), true @@ -359,6 +363,7 @@ func (s *Service) reconcileComputedAcceptedEvent( } func (s *Service) findSubmittedEventsForComputedClaim( + ctx context.Context, work computedClaimWork, defaultBlockNumber *big.Int, ) (*iconsensus.IConsensus, []*iconsensus.IConsensusClaimSubmitted, claimStepResult, bool) { @@ -371,11 +376,11 @@ func (s *Service) findSubmittedEventsForComputedClaim( var err error if prevEpoch != nil { ic, submittedEvents, err = s.findClaimSubmittedEventAndSucc( - s.Context, app, prevEpoch, currEpoch, prevEpoch.LastBlock+1, defaultBlockNumber.Uint64(), + ctx, app, prevEpoch, currEpoch, prevEpoch.LastBlock+1, defaultBlockNumber.Uint64(), ) } else { ic, submittedEvents, err = s.blockchain.findClaimSubmittedEventAndSucc( - s.Context, app, currEpoch, currEpoch.LastBlock+1, defaultBlockNumber.Uint64(), + ctx, app, currEpoch, currEpoch.LastBlock+1, defaultBlockNumber.Uint64(), ) } if err != nil { @@ -385,6 +390,7 @@ func (s *Service) findSubmittedEventsForComputedClaim( } func (s *Service) recordSubmittedEvent( + ctx context.Context, app *model.Application, currEpoch *model.Epoch, currEvent *iconsensus.IConsensusClaimSubmitted, @@ -396,7 +402,7 @@ func (s *Service) recordSubmittedEvent( ) txHash := currEvent.Raw.TxHash err := s.repository.UpdateEpochWithSubmittedClaim( - s.Context, + ctx, currEpoch.ApplicationID, currEpoch.Index, txHash, @@ -415,6 +421,7 @@ func (s *Service) recordSubmittedEvent( } func (s *Service) broadcastComputedClaim( + ctx context.Context, ic *iconsensus.IConsensus, app *model.Application, currEpoch *model.Epoch, @@ -427,7 +434,7 @@ func (s *Service) broadcastComputedClaim( ) txHash, err := s.blockchain.submitClaimToBlockchain(ic, app, currEpoch) if err != nil { - switch outcome, stateErr := s.handleSubmitClaimRevert(err, app, currEpoch); outcome { + switch outcome, stateErr := 
s.handleSubmitClaimRevert(ctx, err, app, currEpoch); outcome { case submitClaimAlreadyOnChain: return claimNoProgress() case submitClaimRetryLater: @@ -466,18 +473,19 @@ func (s *Service) broadcastComputedClaim( // // All chain reads in one tick use the same finalized block number. func (s *Service) reconcileBeforeSubmit( + ctx context.Context, app *model.Application, currEpoch *model.Epoch, defaultBlockNumber *big.Int, ) (bool, error) { - claim, err := s.blockchain.getClaimStatus(s.Context, app, currEpoch, defaultBlockNumber) + claim, err := s.blockchain.getClaimStatus(ctx, app, currEpoch, defaultBlockNumber) if err != nil { return false, fmt.Errorf("pre-submit getClaim (app=%v, epoch=%d): %w", app.IApplicationAddress, currEpoch.Index, err) } switch claim.Status { case claimStatusAccepted: - if err := s.updateEpochAcceptedFromClaimStatus(app, currEpoch, claim, "reconcileBeforeSubmit"); err != nil { + if err := s.updateEpochAcceptedFromClaimStatus(ctx, app, currEpoch, claim, "reconcileBeforeSubmit"); err != nil { return false, fmt.Errorf("reconciling epoch %d (%d) to ACCEPTED: %w", currEpoch.Index, currEpoch.VirtualIndex, err) } @@ -489,7 +497,7 @@ func (s *Service) reconcileBeforeSubmit( ) return true, nil case claimStatusStaged: - stagingBlock, err := s.updateEpochStagedFromClaimStatus(app, currEpoch, claim, "reconcileBeforeSubmit") + stagingBlock, err := s.updateEpochStagedFromClaimStatus(ctx, app, currEpoch, claim, "reconcileBeforeSubmit") if err != nil { return false, fmt.Errorf("reconciling epoch %d (%d) to STAGED: %w", currEpoch.Index, currEpoch.VirtualIndex, err) diff --git a/internal/claimer/submit_test.go b/internal/claimer/submit_test.go index 951edd865..733ad93cf 100644 --- a/internal/claimer/submit_test.go +++ b/internal/claimer/submit_test.go @@ -4,6 +4,7 @@ package claimer import ( + "context" "math/big" "testing" @@ -37,7 +38,7 @@ func TestSubmitFirstClaim(t *testing.T) { b.On("submitClaimToBlockchain", mock.Anything, app, currEpoch). 
Return(common.HexToHash("0x10"), nil).Once() - transitions, errs := m.submitClaimsAndUpdateDatabase(makeEpochMap(), makeEpochMap(currEpoch), makeApplicationMap(app), endBlock) + transitions, errs := m.submitClaimsAndUpdateDatabase(context.Background(), makeEpochMap(), makeEpochMap(currEpoch), makeApplicationMap(app), endBlock) assert.Equal(t, 0, len(errs)) assert.Equal(t, 1, len(m.claimsInFlight)) assert.Equal(t, 1, transitions, "submitting a claim counts as a transition") @@ -70,7 +71,7 @@ func TestSubmitClaimForeclosesUnstagedForeclosedApp(t *testing.T) { computedEpochs := makeEpochMap(currEpoch) transitions, errs := m.submitClaimsAndUpdateDatabase( - makeEpochMap(), computedEpochs, makeApplicationMap(app), endBlock) + context.Background(), makeEpochMap(), computedEpochs, makeApplicationMap(app), endBlock) assert.Equal(t, 0, len(errs), "foreclosing an impossible claim is not an error") assert.Equal(t, 1, transitions, "CLAIM_FORECLOSED is a local status transition") @@ -100,7 +101,7 @@ func TestSubmitClaimForeclosesUnstagedForeclosedAppWhenSubmissionDisabled(t *tes Return(nil).Once() transitions, errs := m.submitClaimsAndUpdateDatabase( - makeEpochMap(), makeEpochMap(currEpoch), makeApplicationMap(app), endBlock) + context.Background(), makeEpochMap(), makeEpochMap(currEpoch), makeApplicationMap(app), endBlock) assert.Equal(t, 0, len(errs)) assert.Equal(t, 1, transitions) @@ -147,7 +148,7 @@ func TestSubmitClaimForecloseMidFlight(t *testing.T) { Return(tick1TxHash, nil).Once() transitions1, errs1 := m.submitClaimsAndUpdateDatabase( - makeEpochMap(), makeEpochMap(epochN), makeApplicationMap(app), endBlock) + context.Background(), makeEpochMap(), makeEpochMap(epochN), makeApplicationMap(app), endBlock) require.Empty(t, errs1) require.Equal(t, 1, transitions1, "tick 1: broadcast counts as a transition") require.Len(t, m.claimsInFlight, 1, "tick 1: claim enters in-flight set") @@ -177,7 +178,7 @@ func TestSubmitClaimForecloseMidFlight(t *testing.T) { // see the 
now-populated ForecloseBlock. transitions2, errs2 := m.submitClaimsAndUpdateDatabase( - makeEpochMap(), makeEpochMap(epochNPlus1), makeApplicationMap(app), endBlock) + context.Background(), makeEpochMap(), makeEpochMap(epochNPlus1), makeApplicationMap(app), endBlock) require.Empty(t, errs2, "foreclosing an impossible claim is not an error") assert.Equal(t, 1, transitions2, "tick 2: claim becomes CLAIM_FORECLOSED") assert.Equal(t, model.EpochStatus_ClaimForeclosed, epochNPlus1.Status) @@ -216,7 +217,7 @@ func TestSubmitClaimReconcilesAcceptedForForeclosedApp(t *testing.T) { computedEpochs := makeEpochMap(currEpoch) transitions, errs := m.submitClaimsAndUpdateDatabase( - makeEpochMap(), computedEpochs, makeApplicationMap(app), endBlock) + context.Background(), makeEpochMap(), computedEpochs, makeApplicationMap(app), endBlock) assert.Equal(t, 0, len(errs)) assert.Equal(t, 1, transitions, "ACCEPTED reconciliation counts as a transition") @@ -247,7 +248,7 @@ func TestSubmitClaimReconcilesStagedBeforeBroadcast(t *testing.T) { computedEpochs := makeEpochMap(currEpoch) transitions, errs := m.submitClaimsAndUpdateDatabase( - makeEpochMap(), computedEpochs, makeApplicationMap(app), endBlock) + context.Background(), makeEpochMap(), computedEpochs, makeApplicationMap(app), endBlock) assert.Equal(t, 0, len(errs)) assert.Equal(t, 1, transitions, "STAGED reconciliation counts as a transition") @@ -278,7 +279,7 @@ func TestReconcileBeforeSubmitAcceptedOutputsMismatchSetsDiverged(t *testing.T) Return(nil).Once() transitions, errs := m.submitClaimsAndUpdateDatabase( - makeEpochMap(), makeEpochMap(currEpoch), makeApplicationMap(app), endBlock) + context.Background(), makeEpochMap(), makeEpochMap(currEpoch), makeApplicationMap(app), endBlock) assert.Equal(t, 1, len(errs)) assert.Equal(t, 0, transitions) assert.Equal(t, 0, len(m.claimsInFlight)) @@ -305,7 +306,7 @@ func TestSubmitClaimWithAntecessor(t *testing.T) { b.On("submitClaimToBlockchain", mock.Anything, app, currEpoch). 
Return(common.HexToHash("0x10"), nil).Once() - transitions, errs := m.submitClaimsAndUpdateDatabase(makeEpochMap(prevEpoch), makeEpochMap(currEpoch), makeApplicationMap(app), endBlock) + transitions, errs := m.submitClaimsAndUpdateDatabase(context.Background(), makeEpochMap(prevEpoch), makeEpochMap(currEpoch), makeApplicationMap(app), endBlock) assert.Equal(t, 0, len(errs)) assert.Equal(t, 1, len(m.claimsInFlight)) assert.Equal(t, 1, transitions, "submitting a claim counts as a transition") @@ -334,7 +335,7 @@ func TestSubmitClaimWithAcceptedAntecessorWithoutClaimTransactionHash(t *testing Return(common.HexToHash("0x10"), nil).Once() transitions, errs := m.submitClaimsAndUpdateDatabase( - makeEpochMap(prevEpoch), makeEpochMap(currEpoch), makeApplicationMap(app), endBlock) + context.Background(), makeEpochMap(prevEpoch), makeEpochMap(currEpoch), makeApplicationMap(app), endBlock) require.Empty(t, errs) assert.Len(t, m.claimsInFlight, 1) assert.Equal(t, 1, transitions, "accepted predecessor with unknown tx hash must not block submission") @@ -359,7 +360,7 @@ func TestSkipSubmitClaimWithStagedAntecessor(t *testing.T) { Return(&iconsensus.IConsensus{}, prevEvent, currEvent, nil).Once() transitions, errs := m.submitClaimsAndUpdateDatabase( - makeEpochMap(prevEpoch), makeEpochMap(currEpoch), makeApplicationMap(app), endBlock) + context.Background(), makeEpochMap(prevEpoch), makeEpochMap(currEpoch), makeApplicationMap(app), endBlock) assert.Equal(t, 0, len(errs)) assert.Equal(t, 0, len(m.claimsInFlight)) assert.Equal(t, 0, transitions, "staged predecessor must block newer claim submission") @@ -383,7 +384,7 @@ func TestSkipSubmitFirstClaim(t *testing.T) { b.On("findClaimSubmittedEventAndSucc", mock.Anything, app, currEpoch, currEpoch.LastBlock+1, endBlock.Uint64()). 
Return(&iconsensus.IConsensus{}, prevEvent, currEvent, nil).Once() - transitions, errs := m.submitClaimsAndUpdateDatabase(makeEpochMap(), makeEpochMap(currEpoch), makeApplicationMap(app), endBlock) + transitions, errs := m.submitClaimsAndUpdateDatabase(context.Background(), makeEpochMap(), makeEpochMap(currEpoch), makeApplicationMap(app), endBlock) assert.Equal(t, 0, len(errs)) assert.Equal(t, 0, len(m.claimsInFlight)) assert.Equal(t, 0, transitions, "no transition when submission is disabled") @@ -408,7 +409,7 @@ func TestSkipSubmitClaimWithAntecessor(t *testing.T) { b.On("findClaimSubmittedEventAndSucc", mock.Anything, app, prevEpoch, prevEpoch.LastBlock+1, endBlock.Uint64()). Return(&iconsensus.IConsensus{}, prevEvent, currEvent, nil).Once() - _, errs := m.submitClaimsAndUpdateDatabase(makeEpochMap(prevEpoch), makeEpochMap(currEpoch), makeApplicationMap(app), endBlock) + _, errs := m.submitClaimsAndUpdateDatabase(context.Background(), makeEpochMap(prevEpoch), makeEpochMap(currEpoch), makeApplicationMap(app), endBlock) assert.Equal(t, len(errs), 0) assert.Equal(t, len(m.claimsInFlight), 0) } @@ -432,7 +433,7 @@ func TestUpdateFirstClaim(t *testing.T) { r.On("UpdateEpochWithSubmittedClaim", mock.Anything, app.ID, currEpoch.Index, currEvent.Raw.TxHash). Return(nil).Once() - transitions, errs := m.submitClaimsAndUpdateDatabase(makeEpochMap(), makeEpochMap(currEpoch), makeApplicationMap(app), endBlock) + transitions, errs := m.submitClaimsAndUpdateDatabase(context.Background(), makeEpochMap(), makeEpochMap(currEpoch), makeApplicationMap(app), endBlock) assert.Equal(t, 0, len(errs)) assert.Equal(t, 0, len(m.claimsInFlight)) assert.Equal(t, 1, transitions, "finding on-chain event counts as a transition") @@ -458,7 +459,7 @@ func TestUpdateClaimWithAntecessor(t *testing.T) { r.On("UpdateEpochWithSubmittedClaim", mock.Anything, app.ID, currEpoch.Index, currEvent.Raw.TxHash). 
Return(nil).Once() - _, errs := m.submitClaimsAndUpdateDatabase(makeEpochMap(prevEpoch), makeEpochMap(currEpoch), makeApplicationMap(app), endBlock) + _, errs := m.submitClaimsAndUpdateDatabase(context.Background(), makeEpochMap(prevEpoch), makeEpochMap(currEpoch), makeApplicationMap(app), endBlock) assert.Equal(t, len(errs), 0) assert.Equal(t, len(m.claimsInFlight), 0) } @@ -489,7 +490,7 @@ func TestQuorumSubmittedEventsIgnoresForeignDifferentOutputsAndUpdatesMatchingEv r.On("UpdateEpochWithSubmittedClaim", mock.Anything, app.ID, currEpoch.Index, currEvent.Raw.TxHash). Return(nil).Once() - transitions, errs := m.submitClaimsAndUpdateDatabase(makeEpochMap(), makeEpochMap(currEpoch), makeApplicationMap(app), endBlock) + transitions, errs := m.submitClaimsAndUpdateDatabase(context.Background(), makeEpochMap(), makeEpochMap(currEpoch), makeApplicationMap(app), endBlock) assert.Equal(t, 0, len(errs)) assert.Equal(t, 0, len(m.claimsInFlight)) assert.Equal(t, 1, transitions, "matching later event counts as a transition") @@ -521,7 +522,7 @@ func TestQuorumDifferentOutputSubmittedEventStillSubmitsLocalClaim(t *testing.T) b.On("submitClaimToBlockchain", mock.Anything, app, currEpoch). Return(txHash, nil).Once() - transitions, errs := m.submitClaimsAndUpdateDatabase(makeEpochMap(), makeEpochMap(currEpoch), makeApplicationMap(app), endBlock) + transitions, errs := m.submitClaimsAndUpdateDatabase(context.Background(), makeEpochMap(), makeEpochMap(currEpoch), makeApplicationMap(app), endBlock) assert.Equal(t, 0, len(errs)) assert.Equal(t, txHash, m.claimsInFlight[app.ID].txHash) assert.Equal(t, 1, transitions) @@ -549,7 +550,7 @@ func TestQuorumForeignMatchingSubmittedEventStillSubmitsLocalClaim(t *testing.T) b.On("submitClaimToBlockchain", mock.Anything, app, currEpoch). 
Return(txHash, nil).Once() - transitions, errs := m.submitClaimsAndUpdateDatabase(makeEpochMap(), makeEpochMap(currEpoch), makeApplicationMap(app), endBlock) + transitions, errs := m.submitClaimsAndUpdateDatabase(context.Background(), makeEpochMap(), makeEpochMap(currEpoch), makeApplicationMap(app), endBlock) assert.Equal(t, 0, len(errs)) assert.Equal(t, txHash, m.claimsInFlight[app.ID].txHash) assert.Equal(t, 1, transitions) @@ -576,7 +577,7 @@ func TestQuorumReaderModeRecordsForeignMatchingSubmittedEvent(t *testing.T) { r.On("UpdateEpochWithSubmittedClaim", mock.Anything, app.ID, currEpoch.Index, foreignEvent.Raw.TxHash). Return(nil).Once() - transitions, errs := m.submitClaimsAndUpdateDatabase(makeEpochMap(), makeEpochMap(currEpoch), makeApplicationMap(app), endBlock) + transitions, errs := m.submitClaimsAndUpdateDatabase(context.Background(), makeEpochMap(), makeEpochMap(currEpoch), makeApplicationMap(app), endBlock) assert.Equal(t, 0, len(errs)) assert.Equal(t, 0, len(m.claimsInFlight)) assert.Equal(t, 1, transitions, "reader mode must mirror a matching Quorum ClaimSubmitted from any validator") @@ -616,7 +617,7 @@ func TestQuorumSubmittedEventsIgnoresForeignAdversarialProofAndSubmitsLocalClaim b.On("submitClaimToBlockchain", mock.Anything, app, currEpoch). 
Return(txHash, nil).Once() - transitions, errs := m.submitClaimsAndUpdateDatabase(makeEpochMap(), makeEpochMap(currEpoch), makeApplicationMap(app), endBlock) + transitions, errs := m.submitClaimsAndUpdateDatabase(context.Background(), makeEpochMap(), makeEpochMap(currEpoch), makeApplicationMap(app), endBlock) assert.Equal(t, 0, len(errs)) assert.Equal(t, txHash, m.claimsInFlight[app.ID].txHash) assert.Equal(t, 1, transitions) @@ -648,7 +649,7 @@ func TestQuorumSubmittedEventsOwnMismatchSetsDiverged(t *testing.T) { Return(nil).Once() currEpochs := makeEpochMap(currEpoch) - transitions, errs := m.submitClaimsAndUpdateDatabase(makeEpochMap(), currEpochs, makeApplicationMap(app), endBlock) + transitions, errs := m.submitClaimsAndUpdateDatabase(context.Background(), makeEpochMap(), currEpochs, makeApplicationMap(app), endBlock) assert.Equal(t, 1, len(errs)) assert.Equal(t, 0, len(currEpochs)) assert.Equal(t, 0, len(m.claimsInFlight)) @@ -680,7 +681,7 @@ func TestQuorumReaderModeIgnoresNonMatchingSubmittedEvent(t *testing.T) { b.On("findClaimSubmittedEventAndSucc", mock.Anything, app, currEpoch, currEpoch.LastBlock+1, endBlock.Uint64()). Return(&iconsensus.IConsensus{}, []*iconsensus.IConsensusClaimSubmitted{foreignEvent}, nil).Once() - transitions, errs := m.submitClaimsAndUpdateDatabase(makeEpochMap(), makeEpochMap(currEpoch), makeApplicationMap(app), endBlock) + transitions, errs := m.submitClaimsAndUpdateDatabase(context.Background(), makeEpochMap(), makeEpochMap(currEpoch), makeApplicationMap(app), endBlock) assert.Equal(t, 0, len(errs)) assert.Equal(t, 0, len(m.claimsInFlight)) assert.Equal(t, 0, transitions) @@ -716,7 +717,7 @@ func TestSubmitClaimWithAntecessorMismatch(t *testing.T) { r.On("UpdateApplicationStatus", mock.Anything, int64(0), model.ApplicationStatus_Corrupted, mock.Anything). 
Return(nil).Once() - _, errs := m.submitClaimsAndUpdateDatabase(makeEpochMap(prevEpoch), makeEpochMap(currEpoch), makeApplicationMap(app), endBlock) + _, errs := m.submitClaimsAndUpdateDatabase(context.Background(), makeEpochMap(prevEpoch), makeEpochMap(currEpoch), makeApplicationMap(app), endBlock) assert.Equal(t, 1, len(errs)) } @@ -746,7 +747,7 @@ func TestSubmitClaimWithEventMismatch(t *testing.T) { r.On("UpdateApplicationStatus", mock.Anything, int64(0), model.ApplicationStatus_Diverged, mock.Anything). Return(nil).Once() - _, errs := m.submitClaimsAndUpdateDatabase(makeEpochMap(prevEpoch), makeEpochMap(currEpoch), makeApplicationMap(app), endBlock) + _, errs := m.submitClaimsAndUpdateDatabase(context.Background(), makeEpochMap(prevEpoch), makeEpochMap(currEpoch), makeApplicationMap(app), endBlock) assert.Equal(t, 1, len(errs)) } @@ -780,7 +781,7 @@ func TestQuorumPreviousSubmittedEventsIgnoresForeignMismatchAndSubmitsCurrentCla b.On("submitClaimToBlockchain", mock.Anything, app, currEpoch). 
Return(txHash, nil).Once() - transitions, errs := m.submitClaimsAndUpdateDatabase(makeEpochMap(prevEpoch), makeEpochMap(currEpoch), makeApplicationMap(app), endBlock) + transitions, errs := m.submitClaimsAndUpdateDatabase(context.Background(), makeEpochMap(prevEpoch), makeEpochMap(currEpoch), makeApplicationMap(app), endBlock) assert.Equal(t, 0, len(errs)) assert.Equal(t, txHash, m.claimsInFlight[app.ID].txHash) assert.Equal(t, 1, transitions) @@ -813,7 +814,7 @@ func TestQuorumPreviousSubmittedEventsOwnMismatchSetsDiverged(t *testing.T) { Return(nil).Once() currEpochs := makeEpochMap(currEpoch) - transitions, errs := m.submitClaimsAndUpdateDatabase(makeEpochMap(prevEpoch), currEpochs, makeApplicationMap(app), endBlock) + transitions, errs := m.submitClaimsAndUpdateDatabase(context.Background(), makeEpochMap(prevEpoch), currEpochs, makeApplicationMap(app), endBlock) assert.Equal(t, 1, len(errs)) assert.Equal(t, 0, len(currEpochs)) assert.Equal(t, 0, len(m.claimsInFlight)) @@ -836,7 +837,7 @@ func TestSubmitClaimWithAntecessorOutOfOrder(t *testing.T) { r.On("UpdateApplicationStatus", mock.Anything, int64(0), model.ApplicationStatus_Corrupted, mock.Anything). Return(nil).Once() - _, errs := m.submitClaimsAndUpdateDatabase(makeEpochMap(prevEpoch), makeEpochMap(currEpoch), makeApplicationMap(app), big.NewInt(0)) + _, errs := m.submitClaimsAndUpdateDatabase(context.Background(), makeEpochMap(prevEpoch), makeEpochMap(currEpoch), makeApplicationMap(app), big.NewInt(0)) assert.Equal(t, 1, len(errs)) } @@ -869,7 +870,7 @@ func TestErrSubmittedMissingEvent(t *testing.T) { r.On("UpdateApplicationStatus", mock.Anything, int64(0), model.ApplicationStatus_Corrupted, mock.Anything). 
Return(nil).Once() - _, errs := m.submitClaimsAndUpdateDatabase(makeEpochMap(prevEpoch), makeEpochMap(currEpoch), makeApplicationMap(app), endBlock) + _, errs := m.submitClaimsAndUpdateDatabase(context.Background(), makeEpochMap(prevEpoch), makeEpochMap(currEpoch), makeApplicationMap(app), endBlock) assert.Equal(t, 1, len(errs)) } @@ -890,7 +891,7 @@ func TestConsensusAddressChangedOnSubmittedClaims(t *testing.T) { r.On("UpdateApplicationStatus", mock.Anything, int64(0), model.ApplicationStatus_Corrupted, mock.Anything). Return(nil).Once() - _, errs := m.submitClaimsAndUpdateDatabase(makeEpochMap(), makeEpochMap(currEpoch), makeApplicationMap(app), endBlock) + _, errs := m.submitClaimsAndUpdateDatabase(context.Background(), makeEpochMap(), makeEpochMap(currEpoch), makeApplicationMap(app), endBlock) assert.Equal(t, len(errs), 1) } @@ -907,7 +908,7 @@ func TestCheckConsensusForAddressChangeUsesTickBlock(t *testing.T) { Return(app.IConsensusAddress, nil). Once() - err := m.checkConsensusForAddressChange(app, tickBlock) + err := m.checkConsensusForAddressChange(context.Background(), app, tickBlock) require.NoError(t, err) } @@ -923,9 +924,10 @@ func TestCheckConsensusForAddressChangeCachesTickResult(t *testing.T) { Return(app.IConsensusAddress, nil). Once() - err := m.checkConsensusForAddressChange(app, tickBlock) + ctx := context.Background() + err := m.checkConsensusForAddressChange(ctx, app, tickBlock) require.NoError(t, err) - err = m.checkConsensusForAddressChange(app, tickBlock) + err = m.checkConsensusForAddressChange(ctx, app, tickBlock) require.NoError(t, err) } diff --git a/internal/evmreader/evmreader.go b/internal/evmreader/evmreader.go index 14622fe8a..56f0901f4 100644 --- a/internal/evmreader/evmreader.go +++ b/internal/evmreader/evmreader.go @@ -101,8 +101,8 @@ func (r *Service) setApplicationCorrupted(ctx context.Context, app *Application, return appstatus.SetCorruptedf(ctx, r.Logger, r.repository, app, reasonFmt, args...) 
} -func (r *Service) Tick() []error { - blockNumber, err := r.fetchMostRecentHeader(r.Context, r.defaultBlock) +func (r *Service) Tick(ctx context.Context) []error { + blockNumber, err := r.fetchMostRecentHeader(ctx, r.defaultBlock) if err != nil { if errors.Is(err, context.Canceled) { return nil @@ -117,7 +117,7 @@ func (r *Service) Tick() []error { // Scans run under the service context: cancellable on shutdown, free to take // as long as catch-up needs. Per-request bounds live on the HTTP transport. - r.processBlockHead(r.Context, blockNumber, r.resolver) + r.processBlockHead(ctx, blockNumber, r.resolver) return nil } diff --git a/internal/evmreader/evmreader_test.go b/internal/evmreader/evmreader_test.go index ec8731ac9..f8208b7f5 100644 --- a/internal/evmreader/evmreader_test.go +++ b/internal/evmreader/evmreader_test.go @@ -227,7 +227,7 @@ func (s *EvmReaderSuite) TestTickScansWithServiceContext() { assertValidContext := func(args mock.Arguments) { ctx := args.Get(0).(context.Context) - s.Require().Equal(s.evmReader.Context, ctx) + s.Require().Equal(s.ctx, ctx) s.Require().Nil(ctx.Err()) } @@ -249,7 +249,7 @@ func (s *EvmReaderSuite) TestTickScansWithServiceContext() { s.Require().False(s.evmReader.Ready()) - errs := s.evmReader.Tick() + errs := s.evmReader.Tick(s.ctx) s.Require().Empty(errs) s.client.AssertCalled(s.T(), "HeaderByNumber", mock.Anything, mock.Anything) @@ -280,7 +280,7 @@ func (s *EvmReaderSuite) TestTickReturnsHeaderFetchErrorWithoutLocalErrorLog() { mock.Anything, ).Return(hdr, headerErr).Once() - errs := s.evmReader.Tick() + errs := s.evmReader.Tick(s.ctx) s.Require().Len(errs, 1) s.Require().ErrorIs(errs[0], headerErr) diff --git a/internal/evmreader/service.go b/internal/evmreader/service.go index 96cfaf7dd..b33d0096e 100644 --- a/internal/evmreader/service.go +++ b/internal/evmreader/service.go @@ -120,12 +120,12 @@ func (s *Service) Ready() bool { return s.ready.Load() } -func (s *Service) OnServe() error { +func (s *Service) 
OnServe(ctx context.Context) error { s.alive.Store(true) s.ready.Store(true) defer s.alive.Store(false) defer s.ready.Store(false) - return s.TickServiceTemplate.OnServe() + return s.TickServiceTemplate.OnServe(ctx) } func (s *Service) setupPersistentConfig( diff --git a/internal/jsonrpc/service.go b/internal/jsonrpc/service.go index 4c947e6d2..d4124f906 100644 --- a/internal/jsonrpc/service.go +++ b/internal/jsonrpc/service.go @@ -112,7 +112,7 @@ func (s *Service) OnStop(_ bool) []error { return errs } -func (s *Service) OnServe() error { +func (s *Service) OnServe(ctx context.Context) error { listener, err := s.listen("tcp", s.server.Addr) if err != nil { return err @@ -139,13 +139,13 @@ func (s *Service) OnServe() error { // The HTTP loop exited first. This is unexpected unless the listener // failed or the server was already closed, so cancel the framework // loop and wait for it to observe the cancellation before returning. - s.Cancel() - <-s.Context.Done() + s.Stop(true) + <-ctx.Done() if err != nil { return err } return nil - case <-s.Context.Done(): + case <-ctx.Done(): // The framework loop exited first because it handled a shutdown signal // or context cancellation and called Stop(), which should trigger // s.server.Shutdown(). 
Wait for the HTTP loop to finish so Serve() diff --git a/internal/node/node.go b/internal/node/node.go index 98cda64b9..752b9d33f 100644 --- a/internal/node/node.go +++ b/internal/node/node.go @@ -153,11 +153,11 @@ func (me *Service) OnStop(force bool) []error { return errs } -func (me *Service) OnServe() error { +func (me *Service) OnServe(ctx context.Context) error { for _, s := range me.Children { go s.Serve() } - <-me.Context.Done() + <-ctx.Done() return nil } diff --git a/internal/prt/service.go b/internal/prt/service.go index e06542a14..ba1dee52c 100644 --- a/internal/prt/service.go +++ b/internal/prt/service.go @@ -124,13 +124,13 @@ func Create(ctx context.Context, c *CreateInfo) (service.IService, error) { // Tick executes the Validator main logic of producing claims and/or proofs // for processed epochs of all running applications. -func (s *Service) Tick() []error { +func (s *Service) Tick(ctx context.Context) []error { // Check for shutdown before starting work, consistent with the advancer. if s.IsStopping() { return nil } - apps, _, err := getAllRunningApplications(s.Context, s.repository) + apps, _, err := getAllRunningApplications(ctx, s.repository) if err != nil { // Only suppress context errors during shutdown; surface real DB errors. if s.IsStopping() && errors.Is(err, context.Canceled) { @@ -143,7 +143,7 @@ func (s *Service) Tick() []error { // validate each application errs := []error{} for idx := range apps { - if s.Context.Err() != nil { + if ctx.Err() != nil { return errs } app := apps[idx] @@ -152,7 +152,7 @@ func (s *Service) Tick() []error { // the sole writer of ForecloseBlock; the app keeps health status OK and // remains enabled for L1 observation. 
if app.ForecloseBlock != 0 { - if ferr := s.handleForeclosedApp(s.Context, app); ferr != nil { + if ferr := s.handleForeclosedApp(ctx, app); ferr != nil { if s.IsStopping() && errors.Is(ferr, context.Canceled) { continue } @@ -160,7 +160,7 @@ func (s *Service) Tick() []error { } continue } - if err := s.validateApplication(s.Context, app); err != nil { + if err := s.validateApplication(ctx, app); err != nil { // During shutdown, in-flight L1 requests see context cancellation. // Suppress these to avoid spurious ERR log entries. if s.IsStopping() && errors.Is(err, context.Canceled) { diff --git a/internal/validator/validator.go b/internal/validator/validator.go index aed324b25..e0ada24bf 100644 --- a/internal/validator/validator.go +++ b/internal/validator/validator.go @@ -68,8 +68,8 @@ func Create(ctx context.Context, c *CreateInfo) (service.IService, error) { // Tick executes the Validator main logic of producing claims and/or proofs // for processed epochs of all running applications. -func (s *Service) Tick() []error { - apps, _, err := getAllRunningApplications(s.Context, s.repository) +func (s *Service) Tick(ctx context.Context) []error { + apps, _, err := getAllRunningApplications(ctx, s.repository) if err != nil { // During shutdown the parent context is canceled and every in- // flight DB query returns context.Canceled. Suppress only the @@ -85,7 +85,7 @@ func (s *Service) Tick() []error { // validate each application errs := []error{} for idx := range apps { - if err := s.validateApplication(s.Context, apps[idx]); err != nil { + if err := s.validateApplication(ctx, apps[idx]); err != nil { // Same shutdown-cancellation suppression as above, per-app. 
if s.IsStopping() && errors.Is(err, context.Canceled) { s.Logger.Warn("Tick interrupted by shutdown", diff --git a/pkg/service/service.go b/pkg/service/service.go index 69bc748a0..02d3a1b91 100644 --- a/pkg/service/service.go +++ b/pkg/service/service.go @@ -96,7 +96,7 @@ type LifecycleImpl interface { Ready() bool OnReload() []error OnStop(bool) []error - OnServe() error + OnServe(ctx context.Context) error } // ServiceTemplate stores runtime information. @@ -105,8 +105,8 @@ type ServiceTemplate struct { Name string Logger *slog.Logger lifecycleImpl LifecycleImpl - Context context.Context - Cancel context.CancelFunc + context context.Context + cancelContext context.CancelFunc sigHangUp chan os.Signal // SIGHUP to reload sigShutdown chan os.Signal // SIGINT/SIGTERM to exit gracefully ServeMux *http.ServeMux @@ -118,7 +118,7 @@ type ServiceTemplate struct { // detect that shutdown is in progress and suppress errors that are // expected during teardown (e.g., context.Canceled from in-flight RPC // calls). This covers the race window between Stop() being called and - // ctx.Cancel() propagating. + // cancelContext() propagating. 
stopping atomic.Bool // stopped server Stop() run exactly once, even when Stop() is called @@ -159,18 +159,14 @@ func InitServiceTemplate(c *ServiceConfigs, s *ServiceTemplate, impl LifecycleIm } // context and cancelation - if s.Context == nil { - if c.Context == nil { - c.Context = context.Background() - } - s.Context = c.Context + if c.Context == nil { + c.Context = context.Background() } - if s.Cancel == nil { - if c.Cancel == nil { - s.Context, c.Cancel = context.WithCancel(c.Context) - } - s.Cancel = c.Cancel + s.context = c.Context + if c.Cancel == nil { + s.context, c.Cancel = context.WithCancel(c.Context) } + s.cancelContext = c.Cancel // signal handling if c.EnableSignalHandling { @@ -280,7 +276,7 @@ func (s *ServiceTemplate) Stop(force bool) []error { elapsed := time.Since(start) s.Running.Store(false) - s.Cancel() + s.cancelContext() if len(errs) > 0 { s.Logger.Error("Stop", @@ -310,7 +306,7 @@ func (s *ServiceTemplate) Serve() error { case <-s.sigShutdown: s.Stop(false) // Graceful shutdown; errors are logged by Stop. return - case <-s.Context.Done(): + case <-s.context.Done(): s.Stop(true) // Stop logs errors internally. return } @@ -319,7 +315,7 @@ func (s *ServiceTemplate) Serve() error { defer s.Stop(true) - return s.lifecycleImpl.OnServe() + return s.lifecycleImpl.OnServe(s.context) } // LogConfig logs the service configuration at debug level. 
@@ -333,7 +329,7 @@ func (s *ServiceTemplate) LogConfig(config any) { */ type TickImpl interface { - Tick() []error + Tick(ctx context.Context) []error } type TickServiceTemplate struct { @@ -395,9 +391,9 @@ func InitTickServiceTemplate( return nil } -func (s *TickServiceTemplate) tick() []error { +func (s *TickServiceTemplate) tick(ctx context.Context) []error { start := time.Now() - errs := s.tickImpl.Tick() + errs := s.tickImpl.Tick(ctx) elapsed := time.Since(start) if len(errs) > 0 { @@ -416,22 +412,21 @@ func (s *TickServiceTemplate) OnStop(bool) []error { return nil } -func (s *TickServiceTemplate) OnServe() error { - ctx := s.Context +func (s *TickServiceTemplate) OnServe(ctx context.Context) error { if ctx.Err() != nil { return nil } - s.tick() + s.tick(ctx) for { select { case <-ctx.Done(): return nil case <-s.ticker.C: - s.tick() + s.tick(ctx) // 'reschedule' is nil when rescheduling is disabled thus blocking forever, // preserving timer-only behavior. case <-s.reschedule: - s.tick() + s.tick(ctx) } } } diff --git a/pkg/service/service_test.go b/pkg/service/service_test.go index 3e79ef51f..35a897bf1 100644 --- a/pkg/service/service_test.go +++ b/pkg/service/service_test.go @@ -23,7 +23,7 @@ type mockImpl struct { func (m *mockImpl) OnReload() []error { return nil } func (m *mockImpl) OnStop(bool) []error { return nil } -func (m *mockImpl) Tick() []error { +func (m *mockImpl) Tick(ctx context.Context) []error { n := m.tickCount.Add(1) if m.onTick != nil { m.onTick(n) diff --git a/pkg/service/telemetry_test.go b/pkg/service/telemetry_test.go index c7f6293a4..801d7e56e 100644 --- a/pkg/service/telemetry_test.go +++ b/pkg/service/telemetry_test.go @@ -4,6 +4,7 @@ package service import ( + "context" "io" "net/http" "net/http/httptest" @@ -99,9 +100,9 @@ func TestCreateDefaultTelemetry_PanicRecovered(t *testing.T) { type falseLifecycleImpl struct{ ServiceTemplate } -func (*falseLifecycleImpl) Alive() bool { return false } -func (*falseLifecycleImpl) Ready() 
bool { return false } -func (*falseLifecycleImpl) OnServe() error { return nil } +func (*falseLifecycleImpl) Alive() bool { return false } +func (*falseLifecycleImpl) Ready() bool { return false } +func (*falseLifecycleImpl) OnServe(context.Context) error { return nil } func TestCreateDefaultTelemetry_Returns500WhenLifecycleFails(t *testing.T) { service := &ServiceTemplate{ diff --git a/test/validator/validator_test.go b/test/validator/validator_test.go index c76f43593..442fb684f 100644 --- a/test/validator/validator_test.go +++ b/test/validator/validator_test.go @@ -139,7 +139,7 @@ func (s *ValidatorRepositoryIntegrationSuite) TestItReturnsPristineClaim() { err = s.repository.StoreAdvanceResult(s.ctx, 1, &advanceResult) s.Require().Nil(err) - errs := s.validator.Tick() + errs := s.validator.Tick(s.ctx) s.Require().Equal(0, len(errs)) updatedEpoch, err := s.repository.GetEpoch(s.ctx, app.IApplicationAddress.String(), epoch.Index) @@ -262,7 +262,7 @@ func (s *ValidatorRepositoryIntegrationSuite) TestItReturnsPreviousClaim() { err = s.repository.StoreAdvanceResult(s.ctx, 1, &advanceResult) s.Require().Nil(err) - errs := s.validator.Tick() + errs := s.validator.Tick(s.ctx) s.Require().Equal(0, len(errs)) updatedEpoch, err := s.repository.GetEpoch(s.ctx, app.IApplicationAddress.String(), secondEpoch.Index) @@ -345,7 +345,7 @@ func (s *ValidatorRepositoryIntegrationSuite) TestItReturnsANewClaimAndProofs() err = s.repository.StoreAdvanceResult(s.ctx, 1, &advanceResult) s.Require().Nil(err) - errs := s.validator.Tick() + errs := s.validator.Tick(s.ctx) s.Require().Equal(0, len(errs)) updatedEpoch, err := s.repository.GetEpoch(s.ctx, app.IApplicationAddress.String(), epoch.Index) @@ -499,7 +499,7 @@ func (s *ValidatorRepositoryIntegrationSuite) TestItReturnsANewClaimAndProofs() err = s.repository.StoreAdvanceResult(s.ctx, 1, &advanceResult) s.Require().Nil(err) - errs := s.validator.Tick() + errs := s.validator.Tick(s.ctx) s.Require().Equal(0, len(errs)) 
updatedSecondEpoch, err := s.repository.GetEpoch( From 170d73078f1a31671a76f3d885764b36157f1163 Mon Sep 17 00:00:00 2001 From: Renato Maia <1887792+renatomaia@users.noreply.github.com> Date: Sun, 26 Apr 2026 18:36:51 -0300 Subject: [PATCH 08/16] refactor(services): use context cancellation to indicate service stop --- internal/advancer/advancer.go | 6 +----- internal/advancer/service.go | 2 +- internal/claimer/claimer.go | 8 ++++---- internal/prt/service.go | 8 ++++---- internal/validator/validator.go | 11 +++++------ pkg/service/service.go | 27 --------------------------- 6 files changed, 15 insertions(+), 47 deletions(-) diff --git a/internal/advancer/advancer.go b/internal/advancer/advancer.go index e2991a0a5..287f7efcc 100644 --- a/internal/advancer/advancer.go +++ b/internal/advancer/advancer.go @@ -70,15 +70,11 @@ func getUnprocessedInputs( // potentially has more work. Callers use this to decide whether to re-tick immediately // (via the Reschedule channel) or wait for the next timer/event. func (s *Service) Step(ctx context.Context) (bool, error) { - // Check for context cancellation or shutdown in progress. - // The framework sets Stopping before calling Impl.Stop(), so this + // Check for context cancellation or shutdown in progress. This // prevents starting new work while the machine manager is being torn down. if err := ctx.Err(); err != nil { return false, err } - if s.IsStopping() { - return false, nil - } // Update the machine manager with any new or disabled applications err := s.machineManager.UpdateMachines(ctx) diff --git a/internal/advancer/service.go b/internal/advancer/service.go index 1b4a36807..c90985613 100644 --- a/internal/advancer/service.go +++ b/internal/advancer/service.go @@ -123,7 +123,7 @@ func (s *Service) Tick(ctx context.Context) []error { } // During shutdown, the machine manager is closed and GetMachine() may // return ErrNoApp. Suppress this to avoid spurious ERR log entries. 
- if errors.Is(err, ErrNoApp) && s.IsStopping() { + if errors.Is(err, ErrNoApp) && ctx.Err() != nil { s.Logger.Warn("Tick interrupted by shutdown", "error", err) return nil } diff --git a/internal/claimer/claimer.go b/internal/claimer/claimer.go index 263f04458..51ca26023 100644 --- a/internal/claimer/claimer.go +++ b/internal/claimer/claimer.go @@ -58,7 +58,7 @@ func (s *Service) Tick(ctx context.Context) []error { // During shutdown, the parent context is canceled and RPC/DB calls // return context.Canceled. Ignore only that normal shutdown case. Other // errors, such as deadline exceeded, must still be returned. - if s.IsStopping() && errors.Is(err, context.Canceled) { + if errors.Is(err, context.Canceled) { s.Logger.Warn("Tick interrupted by shutdown", "stage", "getDefaultBlockNumber", "error", err) return nil } @@ -78,7 +78,7 @@ func (s *Service) Tick(ctx context.Context) []error { // transaction receipt already contains ClaimStaged. prevSubmittedOrStaged, computedEpochs, computedApps, errComputed := s.repository.SelectClaimsToSubmitPerApp(ctx) if errComputed != nil { - if s.IsStopping() && errors.Is(errComputed, context.Canceled) { + if errors.Is(errComputed, context.Canceled) { s.Logger.Warn("Tick interrupted by shutdown", "stage", "SelectClaimsToSubmitPerApp", "error", errComputed) return nil } @@ -91,7 +91,7 @@ func (s *Service) Tick(ctx context.Context) []error { // Stage 2: stage. SUBMITTED -> STAGED. This read sees stage 1 updates. prevAcceptedForSubmitted, submittedEpochs, submittedApps, errSubmitted := s.repository.SelectClaimsToStagePerApp(ctx) if errSubmitted != nil { - if s.IsStopping() && errors.Is(errSubmitted, context.Canceled) { + if errors.Is(errSubmitted, context.Canceled) { s.Logger.Warn("Tick interrupted by shutdown", "stage", "SelectClaimsToStagePerApp", "error", errSubmitted) return nil } @@ -106,7 +106,7 @@ func (s *Service) Tick(ctx context.Context) []error { // This read sees stage 1 and stage 2 updates. 
prevAcceptedForStaged, stagedEpochs, stagedApps, errStaged := s.repository.SelectClaimsToAcceptPerApp(ctx) if errStaged != nil { - if s.IsStopping() && errors.Is(errStaged, context.Canceled) { + if errors.Is(errStaged, context.Canceled) { s.Logger.Warn("Tick interrupted by shutdown", "stage", "SelectClaimsToAcceptPerApp", "error", errStaged) return nil } diff --git a/internal/prt/service.go b/internal/prt/service.go index ba1dee52c..7329b2132 100644 --- a/internal/prt/service.go +++ b/internal/prt/service.go @@ -126,14 +126,14 @@ func Create(ctx context.Context, c *CreateInfo) (service.IService, error) { // for processed epochs of all running applications. func (s *Service) Tick(ctx context.Context) []error { // Check for shutdown before starting work, consistent with the advancer. - if s.IsStopping() { + if ctx.Err() != nil { return nil } apps, _, err := getAllRunningApplications(ctx, s.repository) if err != nil { // Only suppress context errors during shutdown; surface real DB errors. - if s.IsStopping() && errors.Is(err, context.Canceled) { + if errors.Is(err, context.Canceled) { s.Logger.Warn("Tick interrupted by shutdown", "error", err) return nil } @@ -153,7 +153,7 @@ func (s *Service) Tick(ctx context.Context) []error { // remains enabled for L1 observation. if app.ForecloseBlock != 0 { if ferr := s.handleForeclosedApp(ctx, app); ferr != nil { - if s.IsStopping() && errors.Is(ferr, context.Canceled) { + if errors.Is(ferr, context.Canceled) { continue } errs = append(errs, ferr) @@ -163,7 +163,7 @@ func (s *Service) Tick(ctx context.Context) []error { if err := s.validateApplication(ctx, app); err != nil { // During shutdown, in-flight L1 requests see context cancellation. // Suppress these to avoid spurious ERR log entries. 
- if s.IsStopping() && errors.Is(err, context.Canceled) { + if errors.Is(err, context.Canceled) { s.Logger.Warn("Tick interrupted by shutdown", "application", app.IApplicationAddress, "error", err) continue diff --git a/internal/validator/validator.go b/internal/validator/validator.go index e0ada24bf..64697702c 100644 --- a/internal/validator/validator.go +++ b/internal/validator/validator.go @@ -75,7 +75,7 @@ func (s *Service) Tick(ctx context.Context) []error { // flight DB query returns context.Canceled. Suppress only the // graceful-shutdown case; deadline-exceeded (real failure) still // propagates. Mirrors internal/prt/service.go's Tick pattern. - if s.IsStopping() && errors.Is(err, context.Canceled) { + if errors.Is(err, context.Canceled) { s.Logger.Warn("Tick interrupted by shutdown", "error", err) return nil } @@ -87,7 +87,7 @@ func (s *Service) Tick(ctx context.Context) []error { for idx := range apps { if err := s.validateApplication(ctx, apps[idx]); err != nil { // Same shutdown-cancellation suppression as above, per-app. - if s.IsStopping() && errors.Is(err, context.Canceled) { + if errors.Is(err, context.Canceled) { s.Logger.Warn("Tick interrupted by shutdown", "application", apps[idx].IApplicationAddress, "error", err) continue @@ -176,10 +176,9 @@ func (s *Service) validateApplication(ctx context.Context, app *Application) err if err != nil { // Don't log shutdown-cancellation at ERR — every in-flight DB // query returns context.Canceled and Tick's outer suppression - // (s.IsStopping() && errors.Is(err, context.Canceled)) handles - // the propagation. DeadlineExceeded is a real failure and - // must still be logged. - if !(s.IsStopping() && errors.Is(err, context.Canceled)) { + // (errors.Is(err, context.Canceled)) handles the propagation. + // DeadlineExceeded is a real failure and must still be logged. 
+ if !errors.Is(err, context.Canceled) { s.Logger.Error("failed to create claim and proofs.", "error", err) } return err diff --git a/pkg/service/service.go b/pkg/service/service.go index 02d3a1b91..09a375f81 100644 --- a/pkg/service/service.go +++ b/pkg/service/service.go @@ -101,7 +101,6 @@ type LifecycleImpl interface { // ServiceTemplate stores runtime information. type ServiceTemplate struct { - Running atomic.Bool Name string Logger *slog.Logger lifecycleImpl LifecycleImpl @@ -113,14 +112,6 @@ type ServiceTemplate struct { telemetry *http.Server telemetryFunc func() error - // stopping is set to true at the beginning of Stop(), before Impl.Stop() - // is called. Services can check this via IsStopping() from Tick() to - // detect that shutdown is in progress and suppress errors that are - // expected during teardown (e.g., context.Canceled from in-flight RPC - // calls). This covers the race window between Stop() being called and - // cancelContext() propagating. - stopping atomic.Bool - // stopped server Stop() run exactly once, even when Stop() is called // multiple times (by the child's Serve() loop and by the parent orchestrator). stopped atomic.Bool @@ -234,20 +225,6 @@ func (s *ServiceTemplate) Reload() []error { return errs } -// IsStopping reports whether Stop() has been called. Services use this in -// Tick() to detect shutdown-in-progress and suppress expected teardown errors. -func (s *ServiceTemplate) IsStopping() bool { - return s.stopping.Load() -} - -// SetStopping sets the stopping flag. Services whose Stop() method shadows -// Service.Stop() (i.e., every ServiceImpl) must call this at the top of their -// Stop so that concurrent Tick goroutines can observe IsStopping() == true -// before resources are torn down. 
-func (s *ServiceTemplate) SetStopping() { - s.stopping.Store(true) -} - func (s *ServiceTemplate) Stop(force bool) []error { // CAS achieves once-semantics: the second caller returns immediately // (fire-and-forget) rather than blocking like sync.Once. This is safe @@ -257,7 +234,6 @@ func (s *ServiceTemplate) Stop(force bool) []error { return nil // already stopped } - s.stopping.Store(true) start := time.Now() errs := s.lifecycleImpl.OnStop(force) if s.telemetry != nil { @@ -275,7 +251,6 @@ func (s *ServiceTemplate) Stop(force bool) []error { } elapsed := time.Since(start) - s.Running.Store(false) s.cancelContext() if len(errs) > 0 { @@ -296,8 +271,6 @@ func (s *ServiceTemplate) Serve() error { return ErrServiceStopped } - s.Running.Store(true) - go func() { for { select { From f51403296e90646a5aabdbf85503f1789f3f6ae3 Mon Sep 17 00:00:00 2001 From: Renato Maia <1887792+renatomaia@users.noreply.github.com> Date: Mon, 4 May 2026 09:31:23 -0300 Subject: [PATCH 09/16] refactor(services): replace 'SignalSchedule' method by a return value in 'Tick' functions --- internal/advancer/advancer_test.go | 21 +++-- internal/advancer/service.go | 17 ++-- internal/claimer/claimer.go | 22 +++--- internal/claimer/claimer_test.go | 5 +- internal/claimer/service.go | 1 - internal/evmreader/evmreader.go | 8 +- internal/evmreader/evmreader_test.go | 4 +- internal/prt/service.go | 12 +-- internal/validator/validator.go | 8 +- pkg/service/service.go | 68 ++++------------ pkg/service/service_test.go | 112 +++------------------------ test/validator/validator_test.go | 8 +- 12 files changed, 76 insertions(+), 210 deletions(-) diff --git a/internal/advancer/advancer_test.go b/internal/advancer/advancer_test.go index f4985f692..426ee4bdc 100644 --- a/internal/advancer/advancer_test.go +++ b/internal/advancer/advancer_test.go @@ -73,7 +73,6 @@ func newMockAdvancerServiceWithContextAndBatchSize( Context: ctx, Cancel: cancelCtx, }, - EnableReschedule: true, } err := 
service.InitTickServiceTemplate(serviceArgs, &s.TickServiceTemplate, s, s) if err != nil { @@ -126,12 +125,12 @@ func (s *AdvancerSuite) TestServiceInterface() { repository.GetEpochsReturn = map[common.Address][]*Epoch{ machineManager.Map[1].application.IApplicationAddress: {}, } - tickErrors := advancer.Tick(context.Background()) + _, tickErrors := advancer.Tick(context.Background()) require.Empty(tickErrors) // Test Tick with error repository.GetEpochsError = errors.New("list epochs error") - tickErrors = advancer.Tick(context.Background()) + _, tickErrors = advancer.Tick(context.Background()) require.NotEmpty(tickErrors) require.Contains(tickErrors[0].Error(), "list epochs error") @@ -1597,10 +1596,10 @@ func (s *AdvancerSuite) TestSelfWakeOnSuccess() { require.NoError(err) // Call Tick() which internally calls Step() and signals reschedule. - svc.Tick(context.Background()) + reschedule, _ := svc.Tick(context.Background()) // The reschedule channel should have a pending signal. - require.True(svc.DrainReschedule(), + require.True(reschedule, "reschedule channel should have a pending signal after Tick with work") } @@ -1621,9 +1620,9 @@ func (s *AdvancerSuite) TestNoSelfWakeWhenIdle() { svc, err := newMockAdvancerService(mm, repo) require.NoError(err) - svc.Tick(context.Background()) + reschedule, _ := svc.Tick(context.Background()) - require.False(svc.DrainReschedule(), + require.False(reschedule, "reschedule channel should be empty when no work exists") } @@ -1640,10 +1639,10 @@ func (s *AdvancerSuite) TestNoSelfWakeOnError() { svc, err := newMockAdvancerService(mm, repo) require.NoError(err) - errs := svc.Tick(context.Background()) + reschedule, errs := svc.Tick(context.Background()) require.NotEmpty(errs) - require.False(svc.DrainReschedule(), + require.False(reschedule, "reschedule should NOT be signaled on error") } @@ -1681,12 +1680,12 @@ func (s *AdvancerSuite) TestPartialSuccessStillReschedules() { // Call Tick — app1 fails, app2 succeeds with more work 
remaining (batch limit hit). // Tick should surface the error AND signal reschedule for app2's pending work. - errs := svc.Tick(context.Background()) + reschedule, errs := svc.Tick(context.Background()) require.NotEmpty(errs, "Tick should surface app1's error") // Reschedule SHOULD fire: app2 had work, and one failing app must not // delay healthy apps by suppressing the reschedule signal. - require.True(svc.DrainReschedule(), + require.True(reschedule, "reschedule should be signaled when hadWork is true, even with errors") } diff --git a/internal/advancer/service.go b/internal/advancer/service.go index c90985613..8829b31a9 100644 --- a/internal/advancer/service.go +++ b/internal/advancer/service.go @@ -52,7 +52,6 @@ func Create(ctx context.Context, c *CreateInfo) (service.IService, error) { } s := &Service{} - c.EnableReschedule = true err = service.InitTickServiceTemplate(&c.TickServiceConfigs, &s.TickServiceTemplate, s, s) if err != nil { @@ -107,25 +106,21 @@ func Create(ctx context.Context, c *CreateInfo) (service.IService, error) { } // Service interface implementation -func (s *Service) Tick(ctx context.Context) []error { - hadWork, err := s.Step(ctx) - +func (s *Service) Tick(ctx context.Context) (bool, []error) { // Signal reschedule whenever work was done, even if some apps errored. // Failed apps are marked Failed and removed by the machine manager, // so they won't cause amplified retries on the next tick. // Without this, one failing app delays all healthy apps by a full poll interval. - if hadWork { - s.SignalReschedule() - } + hadWork, err := s.Step(ctx) if err == nil { - return nil + return hadWork, nil } // During shutdown, the machine manager is closed and GetMachine() may // return ErrNoApp. Suppress this to avoid spurious ERR log entries. 
if errors.Is(err, ErrNoApp) && ctx.Err() != nil { s.Logger.Warn("Tick interrupted by shutdown", "error", err) - return nil + return hadWork, nil } // Canceled is graceful per the project convention: code paths that // wrap cancellation (e.g. handleSnapshot → createSnapshot → @@ -134,9 +129,9 @@ func (s *Service) Tick(ctx context.Context) []error { // real failure and is propagated. if errors.Is(err, context.Canceled) { s.Logger.Debug("Tick cancelled (shutdown)", "error", err) - return nil + return hadWork, nil } - return []error{err} + return hadWork, []error{err} } func (s *Service) OnStop(b bool) []error { diff --git a/internal/claimer/claimer.go b/internal/claimer/claimer.go index 51ca26023..8a07be3a6 100644 --- a/internal/claimer/claimer.go +++ b/internal/claimer/claimer.go @@ -47,7 +47,7 @@ import ( "errors" ) -func (s *Service) Tick(ctx context.Context) []error { +func (s *Service) Tick(ctx context.Context) (bool, []error) { errs := []error{} // Use the same finalized block number for all chain reads in this tick. @@ -60,10 +60,10 @@ func (s *Service) Tick(ctx context.Context) []error { // errors, such as deadline exceeded, must still be returned. 
if errors.Is(err, context.Canceled) { s.Logger.Warn("Tick interrupted by shutdown", "stage", "getDefaultBlockNumber", "error", err) - return nil + return false, nil } errs = append(errs, err) - return errs + return false, errs } s.consensusAddressChecks = map[consensusAddressCheckKey]error{} defer func() { @@ -80,10 +80,10 @@ func (s *Service) Tick(ctx context.Context) []error { if errComputed != nil { if errors.Is(errComputed, context.Canceled) { s.Logger.Warn("Tick interrupted by shutdown", "stage", "SelectClaimsToSubmitPerApp", "error", errComputed) - return nil + return false, nil } errs = append(errs, errComputed) - return errs + return false, errs } submitted, submitErrs := s.submitClaimsAndUpdateDatabase(ctx, prevSubmittedOrStaged, computedEpochs, computedApps, defaultBlockNumber) errs = append(errs, submitErrs...) @@ -93,10 +93,10 @@ func (s *Service) Tick(ctx context.Context) []error { if errSubmitted != nil { if errors.Is(errSubmitted, context.Canceled) { s.Logger.Warn("Tick interrupted by shutdown", "stage", "SelectClaimsToStagePerApp", "error", errSubmitted) - return nil + return false, nil } errs = append(errs, errSubmitted) - return errs + return false, errs } staged, stageErrs := s.stageClaimsAndUpdateDatabase(ctx, prevAcceptedForSubmitted, submittedEpochs, submittedApps, defaultBlockNumber) errs = append(errs, stageErrs...) @@ -108,10 +108,10 @@ func (s *Service) Tick(ctx context.Context) []error { if errStaged != nil { if errors.Is(errStaged, context.Canceled) { s.Logger.Warn("Tick interrupted by shutdown", "stage", "SelectClaimsToAcceptPerApp", "error", errStaged) - return nil + return false, nil } errs = append(errs, errStaged) - return errs + return false, errs } // Foreclosed apps still need some read-only claim work. A claim accepted @@ -161,7 +161,7 @@ func (s *Service) Tick(ctx context.Context) []error { // Signal reschedule whenever pipeline progress was made, even with errors. 
if submitted > 0 || staged > 0 || issuedAccepts > 0 || confirmedAccepts > 0 || accepted > 0 { - s.SignalReschedule() + return true, errs } - return errs + return false, errs } diff --git a/internal/claimer/claimer_test.go b/internal/claimer/claimer_test.go index 99610bb65..09e80964b 100644 --- a/internal/claimer/claimer_test.go +++ b/internal/claimer/claimer_test.go @@ -43,7 +43,6 @@ func TestTickInterleavesStagesWithPinnedBlockAndReschedulesOnProgress(t *testing Context: ctx, }, PollInterval: time.Hour, - EnableReschedule: true, }, &m.TickServiceTemplate, m, m) require.NoError(t, err) t.Cleanup(func() { m.Stop(false) }) @@ -81,8 +80,8 @@ func TestTickInterleavesStagesWithPinnedBlockAndReschedulesOnProgress(t *testing }), repository.Pagination{}, false). Return([]*model.Application{}, 0, nil).Once() - errs := m.Tick(ctx) + reschedule, errs := m.Tick(ctx) require.Empty(t, errs) - assert.True(t, m.DrainReschedule(), "a successful stage transition should request an immediate follow-up tick") + assert.True(t, reschedule, "a successful stage transition should request an immediate follow-up tick") } diff --git a/internal/claimer/service.go b/internal/claimer/service.go index 2a0c87460..5f62e9c74 100644 --- a/internal/claimer/service.go +++ b/internal/claimer/service.go @@ -92,7 +92,6 @@ func Create(ctx context.Context, c *CreateInfo) (service.IService, error) { } s := &Service{} - c.EnableReschedule = true err = service.InitTickServiceTemplate(&c.TickServiceConfigs, &s.TickServiceTemplate, s, s) if err != nil { diff --git a/internal/evmreader/evmreader.go b/internal/evmreader/evmreader.go index 56f0901f4..331efe888 100644 --- a/internal/evmreader/evmreader.go +++ b/internal/evmreader/evmreader.go @@ -101,13 +101,13 @@ func (r *Service) setApplicationCorrupted(ctx context.Context, app *Application, return appstatus.SetCorruptedf(ctx, r.Logger, r.repository, app, reasonFmt, args...) 
} -func (r *Service) Tick(ctx context.Context) []error { +func (r *Service) Tick(ctx context.Context) (bool, []error) { blockNumber, err := r.fetchMostRecentHeader(ctx, r.defaultBlock) if err != nil { if errors.Is(err, context.Canceled) { - return nil + return false, nil } - return []error{err} + return false, []error{err} } if blockNumber != r.lastBlockNumber.Load() { @@ -119,7 +119,7 @@ func (r *Service) Tick(ctx context.Context) []error { // as long as catch-up needs. Per-request bounds live on the HTTP transport. r.processBlockHead(ctx, blockNumber, r.resolver) - return nil + return false, nil } func (r *Service) processBlockHead( diff --git a/internal/evmreader/evmreader_test.go b/internal/evmreader/evmreader_test.go index f8208b7f5..3d46d214e 100644 --- a/internal/evmreader/evmreader_test.go +++ b/internal/evmreader/evmreader_test.go @@ -249,7 +249,7 @@ func (s *EvmReaderSuite) TestTickScansWithServiceContext() { s.Require().False(s.evmReader.Ready()) - errs := s.evmReader.Tick(s.ctx) + _, errs := s.evmReader.Tick(s.ctx) s.Require().Empty(errs) s.client.AssertCalled(s.T(), "HeaderByNumber", mock.Anything, mock.Anything) @@ -280,7 +280,7 @@ func (s *EvmReaderSuite) TestTickReturnsHeaderFetchErrorWithoutLocalErrorLog() { mock.Anything, ).Return(hdr, headerErr).Once() - errs := s.evmReader.Tick(s.ctx) + _, errs := s.evmReader.Tick(s.ctx) s.Require().Len(errs, 1) s.Require().ErrorIs(errs[0], headerErr) diff --git a/internal/prt/service.go b/internal/prt/service.go index 7329b2132..b60acb362 100644 --- a/internal/prt/service.go +++ b/internal/prt/service.go @@ -124,10 +124,10 @@ func Create(ctx context.Context, c *CreateInfo) (service.IService, error) { // Tick executes the Validator main logic of producing claims and/or proofs // for processed epochs of all running applications. 
-func (s *Service) Tick(ctx context.Context) []error { +func (s *Service) Tick(ctx context.Context) (bool, []error) { // Check for shutdown before starting work, consistent with the advancer. if ctx.Err() != nil { - return nil + return false, nil } apps, _, err := getAllRunningApplications(ctx, s.repository) @@ -135,16 +135,16 @@ func (s *Service) Tick(ctx context.Context) []error { // Only suppress context errors during shutdown; surface real DB errors. if errors.Is(err, context.Canceled) { s.Logger.Warn("Tick interrupted by shutdown", "error", err) - return nil + return false, nil } - return []error{fmt.Errorf("failed to get running applications. %w", err)} + return false, []error{fmt.Errorf("failed to get running applications. %w", err)} } // validate each application errs := []error{} for idx := range apps { if ctx.Err() != nil { - return errs + return false, errs } app := apps[idx] // Foreclosed apps: run the drain path (reconcile accepted epochs, @@ -171,7 +171,7 @@ func (s *Service) Tick(ctx context.Context) []error { errs = append(errs, err) } } - return errs + return false, errs } // handleForeclosedApp drains a foreclosed DaveConsensus application's epochs to diff --git a/internal/validator/validator.go b/internal/validator/validator.go index 64697702c..4a89f6b0c 100644 --- a/internal/validator/validator.go +++ b/internal/validator/validator.go @@ -68,7 +68,7 @@ func Create(ctx context.Context, c *CreateInfo) (service.IService, error) { // Tick executes the Validator main logic of producing claims and/or proofs // for processed epochs of all running applications. -func (s *Service) Tick(ctx context.Context) []error { +func (s *Service) Tick(ctx context.Context) (bool, []error) { apps, _, err := getAllRunningApplications(ctx, s.repository) if err != nil { // During shutdown the parent context is canceled and every in- @@ -77,9 +77,9 @@ func (s *Service) Tick(ctx context.Context) []error { // propagates. Mirrors internal/prt/service.go's Tick pattern. 
if errors.Is(err, context.Canceled) { s.Logger.Warn("Tick interrupted by shutdown", "error", err) - return nil + return false, nil } - return []error{fmt.Errorf("failed to get running applications. %w", err)} + return false, []error{fmt.Errorf("failed to get running applications. %w", err)} } // validate each application @@ -95,7 +95,7 @@ func (s *Service) Tick(ctx context.Context) []error { errs = append(errs, err) } } - return errs + return false, errs } type ValidatorRepository interface { diff --git a/pkg/service/service.go b/pkg/service/service.go index 09a375f81..449aeb810 100644 --- a/pkg/service/service.go +++ b/pkg/service/service.go @@ -302,32 +302,18 @@ func (s *ServiceTemplate) LogConfig(config any) { */ type TickImpl interface { - Tick(ctx context.Context) []error + Tick(ctx context.Context) (bool, []error) } type TickServiceTemplate struct { ServiceTemplate tickImpl TickImpl ticker *time.Ticker - reschedule chan struct{} // self-continuation signal; see CreateInfo.EnableReschedule } type TickServiceConfigs struct { ServiceConfigs PollInterval time.Duration - - // EnableReschedule, when true, creates a self-continuation channel. - // Services that discover remaining work after a Tick() call - // SignalReschedule() to re-tick immediately without waiting for the - // timer interval. - // - // Migration: When the events library (feature/events-library-research) - // ships, Serve() will gain an additional EventChannel case for external - // cross-service notifications. Reschedule remains complementary: - // Reschedule = internal self-continuation ("I have more work"), - // EventChannel = external stimulus ("another service produced work"). - // Both coexist in the select loop alongside the Ticker safety-net. 
- EnableReschedule bool } func InitTickServiceTemplate( @@ -356,28 +342,30 @@ func InitTickServiceTemplate( } tmpl.ticker = time.NewTicker(cfg.PollInterval) - // self-rescheduling - if cfg.EnableReschedule { - tmpl.reschedule = make(chan struct{}, 1) - } - return nil } -func (s *TickServiceTemplate) tick(ctx context.Context) []error { +func (s *TickServiceTemplate) tick(ctx context.Context) bool { + if ctx.Err() != nil { + return false + } start := time.Now() - errs := s.tickImpl.Tick(ctx) + reschedule, errs := s.tickImpl.Tick(ctx) elapsed := time.Since(start) if len(errs) > 0 { s.Logger.Error("Tick", "duration", elapsed, - "error", errs) + "reschedule", reschedule, + "error", errs, + ) } else { s.Logger.Debug("Tick", - "duration", elapsed) + "duration", elapsed, + "reschedule", reschedule, + ) } - return errs + return reschedule } func (s *TickServiceTemplate) OnStop(bool) []error { @@ -389,43 +377,17 @@ func (s *TickServiceTemplate) OnServe(ctx context.Context) error { if ctx.Err() != nil { return nil } - s.tick(ctx) + for s.tick(ctx) {} for { select { case <-ctx.Done(): return nil case <-s.ticker.C: - s.tick(ctx) - // 'reschedule' is nil when rescheduling is disabled thus blocking forever, - // preserving timer-only behavior. - case <-s.reschedule: - s.tick(ctx) + for s.tick(ctx) {} } } } -// SignalReschedule performs a non-blocking send on the reschedule channel. -// If a signal is already pending, this is a no-op (one wake is sufficient). -// Does nothing if rescheduling is not enabled. -// INVARIANT: This method must never block. -func (s *TickServiceTemplate) SignalReschedule() { - select { - case s.reschedule <- struct{}{}: - default: - } -} - -// DrainReschedule consumes and discards a pending reschedule signal, if any. -// Returns true if a signal was pending. Intended for testing. 
-func (s *TickServiceTemplate) DrainReschedule() bool { - select { - case <-s.reschedule: - return true - default: - return false - } -} - /* * Service Logger */ diff --git a/pkg/service/service_test.go b/pkg/service/service_test.go index 35a897bf1..87da1a7d1 100644 --- a/pkg/service/service_test.go +++ b/pkg/service/service_test.go @@ -18,17 +18,18 @@ import ( type mockImpl struct { TickServiceTemplate tickCount atomic.Int32 - onTick func(n int32) // called on each Tick with the tick count (1-based) + onTick func(n int32) bool // called on each Tick with the tick count (1-based) } func (m *mockImpl) OnReload() []error { return nil } func (m *mockImpl) OnStop(bool) []error { return nil } -func (m *mockImpl) Tick(ctx context.Context) []error { +func (m *mockImpl) Tick(ctx context.Context) (bool, []error) { n := m.tickCount.Add(1) + reschedule := false if m.onTick != nil { - m.onTick(n) + reschedule = m.onTick(n) } - return nil + return reschedule, nil } // createTestService creates a Service for testing with the given mock and @@ -37,7 +38,6 @@ func (m *mockImpl) Tick(ctx context.Context) []error { func createTestService( t *testing.T, impl *mockImpl, - enableReschedule bool, ) (IService, context.CancelFunc) { t.Helper() ctx, cancel := context.WithCancel(context.Background()) @@ -49,7 +49,6 @@ func createTestService( Cancel: cancel, }, PollInterval: 10 * time.Minute, // long: tests control wakeup explicitly - EnableReschedule: enableReschedule, }, &impl.TickServiceTemplate, impl, impl) require.NoError(t, err) return impl, cancel @@ -120,16 +119,14 @@ func (s *ServeSuite) TestRescheduleTriggersImmediateRetick() { // Tick() again immediately without waiting for the timer. var impl *mockImpl impl = &mockImpl{ - onTick: func(n int32) { + onTick: func(n int32) bool { // Signal reschedule on ticks 1 and 2 (the initial tick // and the first rescheduled tick). Stop on tick 3. 
- if n <= 2 { - impl.SignalReschedule() - } + return n <= 2 }, } - svc, cancel := createTestService(s.T(), impl, true) + svc, cancel := createTestService(s.T(), impl) defer cancel() done := make(chan struct{}) @@ -149,65 +146,17 @@ func (s *ServeSuite) TestRescheduleTriggersImmediateRetick() { "should have at least 3 ticks: initial + 2 rescheduled") } -func (s *ServeSuite) TestRescheduleCoalesces() { - // Multiple signals while Tick() is running should result in at most - // one extra tick, not one per signal. - tickStarted := make(chan struct{}) - tickProceed := make(chan struct{}) - - impl := &mockImpl{ - onTick: func(n int32) { - if n == 1 { - // Signal that the first tick is running. - close(tickStarted) - // Block the first tick until the test is ready. - <-tickProceed - } - }, - } - - svc, cancel := createTestService(s.T(), impl, true) - defer cancel() - - done := make(chan struct{}) - go func() { - _ = svc.Serve() - close(done) - }() - - // Wait for the first tick to start. - <-tickStarted - - // Send multiple signals while tick is blocked. Only one fits in the buffer. - for range 10 { - impl.SignalReschedule() - } - - // Let the first tick complete. - close(tickProceed) - - // Wait for the rescheduled tick to fire, then shut down. - time.Sleep(50 * time.Millisecond) - cancel() - <-done - - // Should be exactly 2 ticks: the initial one + one rescheduled (coalesced). - ticks := impl.tickCount.Load() - s.Equal(int32(2), ticks, - "should have exactly 2 ticks: initial + 1 coalesced reschedule") -} - func (s *ServeSuite) TestContextCancellationExitsPromptly() { // When context is cancelled with a reschedule signal pending, // Serve() should exit promptly. 
var impl *mockImpl impl = &mockImpl{ - onTick: func(_ int32) { - impl.SignalReschedule() + onTick: func(_ int32) bool { + return true }, } - svc, cancel := createTestService(s.T(), impl, true) + svc, cancel := createTestService(s.T(), impl) done := make(chan struct{}) go func() { @@ -232,7 +181,7 @@ func (s *ServeSuite) TestServeExitsOnContextCancelledBeforeFirstTick() { impl := &mockImpl{} // Create the service with a live context, then cancel before Serve(). - svc, cancel := createTestService(s.T(), impl, false) + svc, cancel := createTestService(s.T(), impl) cancel() err := svc.Serve() @@ -240,40 +189,3 @@ func (s *ServeSuite) TestServeExitsOnContextCancelledBeforeFirstTick() { // No ticks should have fired since context was already cancelled. s.Equal(int32(0), impl.tickCount.Load()) } - -func (s *ServeSuite) TestRescheduleEnabledCreatesChannel() { - impl := &mockImpl{} - _, cancel := createTestService(s.T(), impl, true) - defer cancel() - - s.NotNil(impl.reschedule, "reschedule channel should be created when enabled") -} - -func (s *ServeSuite) TestRescheduleDisabledLeavesNilChannel() { - impl := &mockImpl{} - _, cancel := createTestService(s.T(), impl, false) - defer cancel() - - s.Nil(impl.reschedule, "reschedule channel should be nil when disabled") -} - -func (s *ServeSuite) TestSignalRescheduleNoopWhenDisabled() { - impl := &mockImpl{} - _, cancel := createTestService(s.T(), impl, false) - defer cancel() - - // Should not panic on nil channel. 
- s.NotPanics(func() { impl.SignalReschedule() }) -} - -func (s *ServeSuite) TestDrainReschedule() { - impl := &mockImpl{} - _, cancel := createTestService(s.T(), impl, true) - defer cancel() - - s.False(impl.DrainReschedule(), "should be empty initially") - - impl.SignalReschedule() - s.True(impl.DrainReschedule(), "should drain pending signal") - s.False(impl.DrainReschedule(), "should be empty after drain") -} diff --git a/test/validator/validator_test.go b/test/validator/validator_test.go index 442fb684f..b1c861be2 100644 --- a/test/validator/validator_test.go +++ b/test/validator/validator_test.go @@ -139,7 +139,7 @@ func (s *ValidatorRepositoryIntegrationSuite) TestItReturnsPristineClaim() { err = s.repository.StoreAdvanceResult(s.ctx, 1, &advanceResult) s.Require().Nil(err) - errs := s.validator.Tick(s.ctx) + _, errs := s.validator.Tick(s.ctx) s.Require().Equal(0, len(errs)) updatedEpoch, err := s.repository.GetEpoch(s.ctx, app.IApplicationAddress.String(), epoch.Index) @@ -262,7 +262,7 @@ func (s *ValidatorRepositoryIntegrationSuite) TestItReturnsPreviousClaim() { err = s.repository.StoreAdvanceResult(s.ctx, 1, &advanceResult) s.Require().Nil(err) - errs := s.validator.Tick(s.ctx) + _, errs := s.validator.Tick(s.ctx) s.Require().Equal(0, len(errs)) updatedEpoch, err := s.repository.GetEpoch(s.ctx, app.IApplicationAddress.String(), secondEpoch.Index) @@ -345,7 +345,7 @@ func (s *ValidatorRepositoryIntegrationSuite) TestItReturnsANewClaimAndProofs() err = s.repository.StoreAdvanceResult(s.ctx, 1, &advanceResult) s.Require().Nil(err) - errs := s.validator.Tick(s.ctx) + _, errs := s.validator.Tick(s.ctx) s.Require().Equal(0, len(errs)) updatedEpoch, err := s.repository.GetEpoch(s.ctx, app.IApplicationAddress.String(), epoch.Index) @@ -499,7 +499,7 @@ func (s *ValidatorRepositoryIntegrationSuite) TestItReturnsANewClaimAndProofs() err = s.repository.StoreAdvanceResult(s.ctx, 1, &advanceResult) s.Require().Nil(err) - errs := s.validator.Tick(s.ctx) + _, errs := 
s.validator.Tick(s.ctx) s.Require().Equal(0, len(errs)) updatedSecondEpoch, err := s.repository.GetEpoch( From b11a53757b1556548d60830b381549fc349e20aa Mon Sep 17 00:00:00 2001 From: Renato Maia <1887792+renatomaia@users.noreply.github.com> Date: Fri, 15 May 2026 17:08:04 -0300 Subject: [PATCH 10/16] fix(services): avoid service to end before shutdown is complete --- pkg/service/service.go | 13 +++- pkg/service/service_test.go | 129 ++++++++++++++++++++++++++++++++++++ 2 files changed, 139 insertions(+), 3 deletions(-) diff --git a/pkg/service/service.go b/pkg/service/service.go index 449aeb810..8c894240c 100644 --- a/pkg/service/service.go +++ b/pkg/service/service.go @@ -114,7 +114,8 @@ type ServiceTemplate struct { // stopped server Stop() run exactly once, even when Stop() is called // multiple times (by the child's Serve() loop and by the parent orchestrator). - stopped atomic.Bool + stopped atomic.Bool + stoppedChan chan struct{} } // ServiceConfigs stores configuration for the InitServiceTemplate function @@ -139,6 +140,8 @@ func InitServiceTemplate(c *ServiceConfigs, s *ServiceTemplate, impl LifecycleIm return ErrInvalid } + s.stoppedChan = make(chan struct{}) + s.lifecycleImpl = impl s.Name = c.Name @@ -252,6 +255,7 @@ func (s *ServiceTemplate) Stop(force bool) []error { elapsed := time.Since(start) s.cancelContext() + close(s.stoppedChan) if len(errs) > 0 { s.Logger.Error("Stop", @@ -286,9 +290,12 @@ func (s *ServiceTemplate) Serve() error { } }() - defer s.Stop(true) + err := s.lifecycleImpl.OnServe(s.context) + + go s.Stop(true) + <-s.stoppedChan - return s.lifecycleImpl.OnServe(s.context) + return err } // LogConfig logs the service configuration at debug level. 
diff --git a/pkg/service/service_test.go b/pkg/service/service_test.go index 87da1a7d1..68842f0c0 100644 --- a/pkg/service/service_test.go +++ b/pkg/service/service_test.go @@ -189,3 +189,132 @@ func (s *ServeSuite) TestServeExitsOnContextCancelledBeforeFirstTick() { // No ticks should have fired since context was already cancelled. s.Equal(int32(0), impl.tickCount.Load()) } + +type delayedCloseImpl struct { + ServiceTemplate + onServeInitChan chan struct{} + onStopInitChan chan struct{} +} + +func (s *delayedCloseImpl) OnStop(bool) []error { + <-s.onStopInitChan // wait signal to initiate stop + return nil +} + +func (s *delayedCloseImpl) OnServe(ctx context.Context) error { + close(s.onServeInitChan) // signal service was initiated + <-ctx.Done() + return nil +} + +func (s *ServeSuite) TestServeExitsAfterStopIsComplete() { + svc := &delayedCloseImpl{ + onServeInitChan: make(chan struct{}), + onStopInitChan: make(chan struct{}), + } + + // Create the service with a live context, then cancel before Serve(). + err := InitServiceTemplate(&ServiceConfigs{ + Name: "stopOnChanClose", + LogLevel: slog.LevelError, + }, &svc.ServiceTemplate, svc) + s.NoError(err) + + onServeEndChan := make(chan error) + go func() { + err = svc.Serve() + onServeEndChan <- err // signal service ended and provide error + close(onServeEndChan) + }() + + onStopEndChan := make(chan []error) + select { + case <-svc.onServeInitChan: // wait service to initiate, so can be stopped. + // initiate service shutdown through context cancelation + go func() { + errs := svc.Stop(true) + onStopEndChan <- errs // signal stop ended and provide the errors + close(onStopEndChan) + }() + case <-time.After(2 * time.Second): + s.Fail("Serve() did not start within 2 seconds") + } + + // Serve() nor Stop() should not exit just yet. 
+ select { + case <-onServeEndChan: + s.Fail("Serve() exited before 'OnStop' completion") + case <-onStopEndChan: + s.Fail("Stop() exited before 'OnStop' completion") + case <-time.After(100 * time.Millisecond): + // OK + } + + close(svc.onStopInitChan) // signal that stop shall initiate and eventually complete + + // Serve() should exit without errors. + select { + case err = <-onServeEndChan: + s.NoError(err) + case <-time.After(2 * time.Second): + s.Fail("Serve() did not exit within 2 seconds after 'OnStop' concluded") + } + + // Stop() should exit without errors. + select { + case errs := <-onStopEndChan: + s.Empty(errs) + case <-time.After(2 * time.Second): + s.Fail("Stop() did not exit within 2 seconds after 'OnStop' concluded") + } +} + +func (s *ServeSuite) TestServeExitsAfterStopIsCompleteOnContextCancelation() { + svc := &delayedCloseImpl{ + onServeInitChan: make(chan struct{}), + onStopInitChan: make(chan struct{}), + } + + ctx, cancel := context.WithCancel(context.Background()) + + // Create the service with a live context, then cancel before Serve(). + err := InitServiceTemplate(&ServiceConfigs{ + Name: "stopOnChanClose", + LogLevel: slog.LevelError, + Context: ctx, + Cancel: cancel, + }, &svc.ServiceTemplate, svc) + s.NoError(err) + + onServeEndChan := make(chan error) + go func() { + err = svc.Serve() + onServeEndChan <- err // signal service ended and provide error + close(onServeEndChan) + }() + + select { + case <-svc.onServeInitChan: // wait service to initiate, so can be stopped. + cancel() // initiate service shutdown through context cancelation + case <-time.After(2 * time.Second): + s.Fail("Serve() did not start within 2 seconds") + } + + // Serve() should not exit just yet. 
+ select { + case <-onServeEndChan: + s.Fail("Serve() exited before 'OnStop' completion") + case <-time.After(100 * time.Millisecond): + // OK + } + + close(svc.onStopInitChan) // signal that stop shall initiate and eventually complete + + // Serve() should exit without errors. + select { + case err := <-onServeEndChan: + s.NoError(err) + case <-time.After(2 * time.Second): + s.Fail("Serve() did not exit within 2 seconds after 'OnStop' concluded") + } +} From 529bccece0df232ff163e0f792fa0e90524814fa Mon Sep 17 00:00:00 2001 From: Renato Maia <1887792+renatomaia@users.noreply.github.com> Date: Mon, 18 May 2026 11:24:26 -0300 Subject: [PATCH 11/16] fix(services): propagate errors of child services in single-process node --- internal/node/node.go | 22 ++++- internal/node/node_test.go | 173 +++++++++++++++++++++++++++++++++++++ 2 files changed, 192 insertions(+), 3 deletions(-) create mode 100644 internal/node/node_test.go diff --git a/internal/node/node.go b/internal/node/node.go index 752b9d33f..17ac2f9a9 100644 --- a/internal/node/node.go +++ b/internal/node/node.go @@ -5,6 +5,7 @@ package node import ( "context" + "errors" "fmt" "github.com/cartesi/rollups-node/pkg/service" @@ -154,10 +155,25 @@ func (me *Service) OnStop(force bool) []error { } func (me *Service) OnServe(ctx context.Context) error { - for _, s := range me.Children { - go s.Serve() + childrenCount := len(me.Children) + errCh := make(chan error, childrenCount) + for _, child := range me.Children { + child := child + go func() { errCh <- child.Serve() }() + } + + errs := make([]error, 0) + for range childrenCount { + err := <-errCh + if err != nil && !errors.Is(err, context.Canceled) { + me.Stop(true) + errs = append(errs, err) + } + } + if len(errs) > 0 { + return errors.Join(errs...) 
} - <-ctx.Done() + return nil } diff --git a/internal/node/node_test.go b/internal/node/node_test.go new file mode 100644 index 000000000..65130d10d --- /dev/null +++ b/internal/node/node_test.go @@ -0,0 +1,173 @@ +// (c) Cartesi and individual authors (see AUTHORS) +// SPDX-License-Identifier: Apache-2.0 (see LICENSE) + +package node + +import ( + "context" + "fmt" + "log/slog" + "sync" + "testing" + "time" + + "github.com/cartesi/rollups-node/pkg/service" + + "github.com/stretchr/testify/require" + "github.com/stretchr/testify/suite" +) + +type blockingChildImpl struct { + service.ServiceTemplate + started chan struct{} + done chan struct{} + once sync.Once +} + +func (c *blockingChildImpl) OnServe(ctx context.Context) error { + close(c.started) + <-ctx.Done() + c.once.Do(func() { close(c.done) }) + return nil +} + +func createBlockingChild(t *testing.T, cfg *service.ServiceConfigs, name string) *blockingChildImpl { + t.Helper() + childCfg := *cfg + childCfg.Name = name + + child := &blockingChildImpl{ + started: make(chan struct{}), + done: make(chan struct{}), + } + require.NoError(t, service.InitServiceTemplate(&childCfg, &child.ServiceTemplate, child)) + return child +} + +type NodeSuite struct { + suite.Suite +} + +func TestServe(t *testing.T) { + suite.Run(t, new(NodeSuite)) +} + +func (s *NodeSuite) TestNodeStopCancelsChildContexts() { + ctx, cancel := context.WithCancel(context.Background()) + parentCfg := service.ServiceConfigs{ + Name: "node", + LogLevel: slog.LevelError, + Context: ctx, + Cancel: cancel, + } + + nodeSvc := &Service{} + require.NoError(s.T(), service.InitServiceTemplate(&parentCfg, &nodeSvc.ServiceTemplate, nodeSvc)) + + child1 := createBlockingChild(s.T(), &parentCfg, "child-1") + child2 := createBlockingChild(s.T(), &parentCfg, "child-2") + nodeSvc.Children = []service.IService{child1, child2} + + done := make(chan struct{}) + go func() { + _ = nodeSvc.Serve() + close(done) + }() + + select { + case <-child1.started: + case 
<-time.After(2 * time.Second): + s.Fail("child-1 did not start") + } + select { + case <-child2.started: + case <-time.After(2 * time.Second): + s.Fail("child-2 did not start") + } + + nodeSvc.Stop(false) + + select { + case <-child1.done: + case <-time.After(2 * time.Second): + s.Fail("child-1 did not observe ctx.Done()") + } + select { + case <-child2.done: + case <-time.After(2 * time.Second): + s.Fail("child-2 did not observe ctx.Done()") + } + + select { + case <-done: + case <-time.After(2 * time.Second): + s.Fail("node did not exit after Stop()") + } +} + +type errorChildImpl struct { + service.ServiceTemplate + started chan struct{} +} + +func (c *errorChildImpl) OnServe(ctx context.Context) error { + close(c.started) + time.Sleep(10 * time.Millisecond) + return fmt.Errorf("Oops %s!", c.Name) +} + +func createErrorChild(t *testing.T, cfg *service.ServiceConfigs, name string) *errorChildImpl { + t.Helper() + childCfg := *cfg + childCfg.Name = name + + child := &errorChildImpl{ + started: make(chan struct{}), + } + require.NoError(t, service.InitServiceTemplate(&childCfg, &child.ServiceTemplate, child)) + return child +} + +func (s *NodeSuite) TestNodeReturnChildErrors() { + + ctx, cancel := context.WithCancel(context.Background()) + parentCfg := service.ServiceConfigs{ + Name: "node", + LogLevel: slog.LevelError, + Context: ctx, + Cancel: cancel, + } + + nodeSvc := &Service{} + require.NoError(s.T(), service.InitServiceTemplate(&parentCfg, &nodeSvc.ServiceTemplate, nodeSvc)) + + child1 := createErrorChild(s.T(), &parentCfg, "child-1") + child2 := createErrorChild(s.T(), &parentCfg, "child-2") + nodeSvc.Children = []service.IService{child1, child2} + + done := make(chan error) + go func() { + err := nodeSvc.Serve() + done <- err + close(done) + }() + + select { + case <-child1.started: + case <-time.After(2 * time.Second): + s.Fail("child-1 did not start") + } + select { + case <-child2.started: + case <-time.After(2 * time.Second): + s.Fail("child-2 did not 
start") + } + + select { + case err := <-done: + s.ErrorContains(err, "Oops child-1!") + s.ErrorContains(err, "Oops child-2!") + case <-time.After(2 * time.Second): + s.Fail("node did not exit after child errors") + } +} From 28f72c6538bd974cf6974b73f206799bb746eea4 Mon Sep 17 00:00:00 2001 From: Renato Maia <1887792+renatomaia@users.noreply.github.com> Date: Fri, 19 Jun 2026 20:26:46 -0300 Subject: [PATCH 12/16] refactor(services): support better multiple services in single process --- cmd/cartesi-rollups-advancer/root/root.go | 28 +- cmd/cartesi-rollups-claimer/root/root.go | 27 +- cmd/cartesi-rollups-evm-reader/root/root.go | 29 +- cmd/cartesi-rollups-jsonrpc-api/root/root.go | 24 +- cmd/cartesi-rollups-node/root/root.go | 19 +- cmd/cartesi-rollups-prt/root/root.go | 28 +- cmd/cartesi-rollups-validator/root/root.go | 25 +- internal/advancer/advancer.go | 2 +- internal/advancer/advancer_test.go | 51 +-- internal/advancer/service.go | 16 +- internal/claimer/claimer_test.go | 13 +- internal/claimer/service.go | 7 +- internal/claimer/service_test.go | 9 +- internal/cli/cobra.go | 17 +- .../evmreader/accounts_drive_proved_test.go | 1 - internal/evmreader/edge_cases_test.go | 4 +- internal/evmreader/evmreader_test.go | 43 ++- internal/evmreader/foreclosure_test.go | 1 - internal/evmreader/output_test.go | 13 +- internal/evmreader/sealedepochs_test.go | 2 +- internal/evmreader/service.go | 11 +- internal/jsonrpc/service.go | 20 +- internal/node/node.go | 177 +++------ internal/node/node_test.go | 173 --------- internal/prt/service.go | 7 +- internal/validator/validator.go | 7 +- internal/validator/validator_test.go | 2 +- pkg/service/service.go | 360 ++---------------- pkg/service/service_test.go | 215 ++--------- pkg/service/supervisor.go | 329 ++++++++++++++++ pkg/service/supervisor_test.go | 256 +++++++++++++ pkg/service/telemetry_test.go | 62 +-- pkg/service/tick.go | 97 +++++ pkg/service/tick_test.go | 173 +++++++++ 34 files changed, 1229 insertions(+), 1019 
deletions(-) delete mode 100644 internal/node/node_test.go create mode 100644 pkg/service/supervisor.go create mode 100644 pkg/service/supervisor_test.go create mode 100644 pkg/service/tick.go create mode 100644 pkg/service/tick_test.go diff --git a/cmd/cartesi-rollups-advancer/root/root.go b/cmd/cartesi-rollups-advancer/root/root.go index 65a37964a..bd802bacf 100644 --- a/cmd/cartesi-rollups-advancer/root/root.go +++ b/cmd/cartesi-rollups-advancer/root/root.go @@ -79,22 +79,27 @@ func run(cmd *cobra.Command, args []string) { ctx, cancel := context.WithTimeout(context.Background(), cfg.MaxStartupTime) defer cancel() + svcName := config.ServiceAdvancer + logLevel := config.ResolveServiceLogLevel(config.ServiceAdvancer, cfg.LogLevel) + logColor := cfg.LogColor + logger := service.NewLogger(logLevel, logColor).With("service", svcName) + createInfo := advancer.CreateInfo{ + SupervisorConfigs: service.SupervisorConfigs{ + Logger: logger, + EnableSignalHandling: true, + TelemetryCreate: true, + TelemetryAddress: cfg.AdvancerTelemetryAddress, + }, TickServiceConfigs: service.TickServiceConfigs{ PollInterval: cfg.AdvancerPollingInterval, ServiceConfigs: service.ServiceConfigs{ - Name: config.ServiceAdvancer, - LogLevel: config.ResolveServiceLogLevel(config.ServiceAdvancer, cfg.LogLevel), - LogColor: cfg.LogColor, - EnableSignalHandling: true, - TelemetryCreate: true, - TelemetryAddress: cfg.AdvancerTelemetryAddress, + Name: svcName, + Logger: logger, }, }, Config: *cfg, } - logger := service.NewServiceLogger(&createInfo.ServiceConfigs) - createInfo.ServiceConfigs.Logger = logger var err error createInfo.Repository, err = factory.NewRepositoryFromConnectionString(ctx, cfg.DatabaseConnection.Raw()) @@ -104,5 +109,10 @@ func run(cmd *cobra.Command, args []string) { advancerService, err := advancer.Create(ctx, &createInfo) cli.CheckErr(logger, err) - cli.CheckErr(logger, advancerService.Serve()) + createInfo.SupervisorConfigs.Services = []service.ServiceImpl{ advancerService } 
+ supervisor := &service.ServicesSupervisor{} + err = service.InitServicesSupervisor(&createInfo.SupervisorConfigs, supervisor) + cli.CheckErr(logger, err) + + cli.CheckErrs(logger, supervisor.Serve()) } diff --git a/cmd/cartesi-rollups-claimer/root/root.go b/cmd/cartesi-rollups-claimer/root/root.go index 96eb4cdc8..e77b0b2bb 100644 --- a/cmd/cartesi-rollups-claimer/root/root.go +++ b/cmd/cartesi-rollups-claimer/root/root.go @@ -80,22 +80,27 @@ func run(cmd *cobra.Command, args []string) { ctx, cancel := context.WithTimeout(context.Background(), cfg.MaxStartupTime) defer cancel() + svcName := config.ServiceClaimer + logLevel := config.ResolveServiceLogLevel(config.ServiceClaimer, cfg.LogLevel) + logColor := cfg.LogColor + logger := service.NewLogger(logLevel, logColor).With("service", svcName) + createInfo := claimer.CreateInfo{ + SupervisorConfigs: service.SupervisorConfigs{ + Logger: logger, + EnableSignalHandling: true, + TelemetryCreate: true, + TelemetryAddress: cfg.ClaimerTelemetryAddress, + }, TickServiceConfigs: service.TickServiceConfigs{ PollInterval: cfg.ClaimerPollingInterval, ServiceConfigs: service.ServiceConfigs{ - Name: config.ServiceClaimer, - LogLevel: config.ResolveServiceLogLevel(config.ServiceClaimer, cfg.LogLevel), - LogColor: cfg.LogColor, - EnableSignalHandling: true, - TelemetryCreate: true, - TelemetryAddress: cfg.ClaimerTelemetryAddress, + Name: svcName, + Logger: logger, }, }, Config: *cfg, } - logger := service.NewServiceLogger(&createInfo.ServiceConfigs) - createInfo.ServiceConfigs.Logger = logger authOpt, err := config.HTTPAuthorizationOption() cli.CheckErr(logger, err) @@ -116,6 +121,10 @@ func run(cmd *cobra.Command, args []string) { claimerService, err := claimer.Create(ctx, &createInfo) cli.CheckErr(logger, err) - err = claimerService.Serve() + createInfo.SupervisorConfigs.Services = []service.ServiceImpl{ claimerService } + supervisor := &service.ServicesSupervisor{} + err = 
service.InitServicesSupervisor(&createInfo.SupervisorConfigs, supervisor) cli.CheckErr(logger, err) + + cli.CheckErrs(logger, supervisor.Serve()) } diff --git a/cmd/cartesi-rollups-evm-reader/root/root.go b/cmd/cartesi-rollups-evm-reader/root/root.go index bb40f0d8a..ac051b956 100644 --- a/cmd/cartesi-rollups-evm-reader/root/root.go +++ b/cmd/cartesi-rollups-evm-reader/root/root.go @@ -80,22 +80,26 @@ func run(cmd *cobra.Command, args []string) { ctx, cancel := context.WithTimeout(context.Background(), cfg.MaxStartupTime) defer cancel() + svcName := config.ServiceEvmReader + logLevel := config.ResolveServiceLogLevel(config.ServiceEvmReader, cfg.LogLevel) + logColor := cfg.LogColor + logger := service.NewLogger(logLevel, logColor).With("service", svcName) + createInfo := evmreader.CreateInfo{ + SupervisorConfigs: service.SupervisorConfigs{ + Logger: logger, + EnableSignalHandling: true, + TelemetryCreate: true, + TelemetryAddress: cfg.EvmReaderTelemetryAddress, + }, TickServiceConfigs: service.TickServiceConfigs{ ServiceConfigs: service.ServiceConfigs{ - Name: config.ServiceEvmReader, - LogLevel: config.ResolveServiceLogLevel(config.ServiceEvmReader, cfg.LogLevel), - LogColor: cfg.LogColor, - EnableSignalHandling: true, - TelemetryCreate: true, - TelemetryAddress: cfg.EvmReaderTelemetryAddress, + Name: svcName, + Logger: logger, }, - PollInterval: cfg.EvmReaderPollingInterval, }, Config: *cfg, } - logger := service.NewServiceLogger(&createInfo.ServiceConfigs) - createInfo.ServiceConfigs.Logger = logger var err error authOpt, err := config.HTTPAuthorizationOption() @@ -118,5 +122,10 @@ func run(cmd *cobra.Command, args []string) { readerService, err := evmreader.Create(ctx, &createInfo) cli.CheckErr(logger, err) - cli.CheckErr(logger, readerService.Serve()) + createInfo.SupervisorConfigs.Services = []service.ServiceImpl{ readerService } + supervisor := &service.ServicesSupervisor{} + err = service.InitServicesSupervisor(&createInfo.SupervisorConfigs, supervisor) + 
cli.CheckErr(logger, err) + + cli.CheckErrs(logger, supervisor.Serve()) } diff --git a/cmd/cartesi-rollups-jsonrpc-api/root/root.go b/cmd/cartesi-rollups-jsonrpc-api/root/root.go index a18b984bf..b540c3e73 100644 --- a/cmd/cartesi-rollups-jsonrpc-api/root/root.go +++ b/cmd/cartesi-rollups-jsonrpc-api/root/root.go @@ -67,19 +67,24 @@ func run(cmd *cobra.Command, args []string) { ctx, cancel := context.WithTimeout(context.Background(), cfg.MaxStartupTime) defer cancel() + svcName := config.ServiceJsonrpc + logLevel := config.ResolveServiceLogLevel(config.ServiceJsonrpc, cfg.LogLevel) + logColor := cfg.LogColor + logger := service.NewLogger(logLevel, logColor).With("service", svcName) + createInfo := jsonrpc.CreateInfo{ - ServiceConfigs: service.ServiceConfigs{ - Name: config.ServiceJsonrpc, - LogLevel: config.ResolveServiceLogLevel(config.ServiceJsonrpc, cfg.LogLevel), - LogColor: cfg.LogColor, + SupervisorConfigs: service.SupervisorConfigs{ + Logger: logger, EnableSignalHandling: true, TelemetryCreate: true, TelemetryAddress: cfg.JsonrpcTelemetryAddress, }, + ServiceConfigs: service.ServiceConfigs{ + Name: svcName, + Logger: logger, + }, Config: *cfg, } - logger := service.NewServiceLogger(&createInfo.ServiceConfigs) - createInfo.ServiceConfigs.Logger = logger var err error createInfo.Repository, err = factory.NewRepositoryFromConnectionString(ctx, cfg.DatabaseConnection.Raw()) @@ -89,5 +94,10 @@ func run(cmd *cobra.Command, args []string) { jsonrpcService, err := jsonrpc.Create(ctx, &createInfo) cli.CheckErr(logger, err) - cli.CheckErr(logger, jsonrpcService.Serve()) + createInfo.SupervisorConfigs.Services = []service.ServiceImpl{ jsonrpcService } + supervisor := &service.ServicesSupervisor{} + err = service.InitServicesSupervisor(&createInfo.SupervisorConfigs, supervisor) + cli.CheckErr(logger, err) + + cli.CheckErrs(logger, supervisor.Serve()) } diff --git a/cmd/cartesi-rollups-node/root/root.go b/cmd/cartesi-rollups-node/root/root.go index 32ce499ca..63252aa20 
100644 --- a/cmd/cartesi-rollups-node/root/root.go +++ b/cmd/cartesi-rollups-node/root/root.go @@ -151,19 +151,24 @@ func run(cmd *cobra.Command, args []string) { ctx, cancel := context.WithTimeout(context.Background(), cfg.MaxStartupTime) defer cancel() + svcName := config.ServiceNode + logLevel := cfg.LogLevel + logColor := cfg.LogColor + logger := service.NewLogger(logLevel, logColor).With("service", svcName) + createInfo := node.CreateInfo{ - ServiceConfigs: service.ServiceConfigs{ - Name: config.ServiceNode, - LogLevel: cfg.LogLevel, - LogColor: cfg.LogColor, + SupervisorConfigs: service.SupervisorConfigs{ + Logger: logger, EnableSignalHandling: true, TelemetryCreate: true, TelemetryAddress: cfg.NodeTelemetryAddress, }, + ServiceConfigs: service.ServiceConfigs{ + Name: config.ServiceNode, + Logger: logger, + }, Config: *cfg, } - logger := service.NewServiceLogger(&createInfo.ServiceConfigs) - createInfo.ServiceConfigs.Logger = logger var err error createInfo.ReaderClient, err = newEthClient(ctx, config.ServiceEvmReader, cfg.BlockchainHttpRequestTimeout) @@ -182,5 +187,5 @@ func run(cmd *cobra.Command, args []string) { nodeService, err := node.Create(ctx, &createInfo) cli.CheckErr(logger, err) - cli.CheckErr(logger, nodeService.Serve()) + cli.CheckErrs(logger, nodeService.Serve()) } diff --git a/cmd/cartesi-rollups-prt/root/root.go b/cmd/cartesi-rollups-prt/root/root.go index fb50771c4..8f77a81b2 100644 --- a/cmd/cartesi-rollups-prt/root/root.go +++ b/cmd/cartesi-rollups-prt/root/root.go @@ -68,22 +68,27 @@ func run(cmd *cobra.Command, args []string) { ctx, cancel := context.WithTimeout(context.Background(), cfg.MaxStartupTime) defer cancel() + svcName := config.ServicePrt + logLevel := config.ResolveServiceLogLevel(config.ServicePrt, cfg.LogLevel) + logColor := cfg.LogColor + logger := service.NewLogger(logLevel, logColor).With("service", svcName) + createInfo := prt.CreateInfo{ + SupervisorConfigs: service.SupervisorConfigs{ + Logger: logger, + 
EnableSignalHandling: true, + TelemetryCreate: true, + TelemetryAddress: cfg.PrtTelemetryAddress, + }, TickServiceConfigs: service.TickServiceConfigs{ PollInterval: cfg.PrtPollingInterval, ServiceConfigs: service.ServiceConfigs{ - Name: config.ServicePrt, - LogLevel: config.ResolveServiceLogLevel(config.ServicePrt, cfg.LogLevel), - LogColor: cfg.LogColor, - EnableSignalHandling: true, - TelemetryCreate: true, - TelemetryAddress: cfg.PrtTelemetryAddress, + Name: svcName, + Logger: logger, }, }, Config: *cfg, } - logger := service.NewServiceLogger(&createInfo.ServiceConfigs) - createInfo.ServiceConfigs.Logger = logger var err error authOpt, err := config.HTTPAuthorizationOption() @@ -105,5 +110,10 @@ func run(cmd *cobra.Command, args []string) { prtService, err := prt.Create(ctx, &createInfo) cli.CheckErr(logger, err) - cli.CheckErr(logger, prtService.Serve()) + createInfo.SupervisorConfigs.Services = []service.ServiceImpl{ prtService } + supervisor := &service.ServicesSupervisor{} + err = service.InitServicesSupervisor(&createInfo.SupervisorConfigs, supervisor) + cli.CheckErr(logger, err) + + cli.CheckErrs(logger, supervisor.Serve()) } diff --git a/cmd/cartesi-rollups-validator/root/root.go b/cmd/cartesi-rollups-validator/root/root.go index 7c0347529..c0ef3bf4e 100644 --- a/cmd/cartesi-rollups-validator/root/root.go +++ b/cmd/cartesi-rollups-validator/root/root.go @@ -67,22 +67,26 @@ func run(cmd *cobra.Command, args []string) { ctx, cancel := context.WithTimeout(context.Background(), cfg.MaxStartupTime) defer cancel() + svcName := config.ServiceValidator + logLevel := config.ResolveServiceLogLevel(config.ServiceValidator, cfg.LogLevel) + logColor := cfg.LogColor + logger := service.NewLogger(logLevel, logColor).With("service", svcName) + createInfo := validator.CreateInfo{ + SupervisorConfigs: service.SupervisorConfigs{ + EnableSignalHandling: true, + TelemetryCreate: true, + TelemetryAddress: cfg.ValidatorTelemetryAddress, + }, TickServiceConfigs: 
service.TickServiceConfigs{ PollInterval: cfg.ValidatorPollingInterval, ServiceConfigs: service.ServiceConfigs{ Name: config.ServiceValidator, - LogLevel: config.ResolveServiceLogLevel(config.ServiceValidator, cfg.LogLevel), - LogColor: cfg.LogColor, - EnableSignalHandling: true, - TelemetryCreate: true, - TelemetryAddress: cfg.ValidatorTelemetryAddress, + Logger: logger, }, }, Config: *cfg, } - logger := service.NewServiceLogger(&createInfo.ServiceConfigs) - createInfo.ServiceConfigs.Logger = logger var err error createInfo.Repository, err = factory.NewRepositoryFromConnectionString(ctx, cfg.DatabaseConnection.Raw()) @@ -92,5 +96,10 @@ func run(cmd *cobra.Command, args []string) { validatorService, err := validator.Create(ctx, &createInfo) cli.CheckErr(logger, err) - cli.CheckErr(logger, validatorService.Serve()) + createInfo.SupervisorConfigs.Services = []service.ServiceImpl{ validatorService } + supervisor := &service.ServicesSupervisor{} + err = service.InitServicesSupervisor(&createInfo.SupervisorConfigs, supervisor) + cli.CheckErr(logger, err) + + cli.CheckErrs(logger, supervisor.Serve()) } diff --git a/internal/advancer/advancer.go b/internal/advancer/advancer.go index 287f7efcc..1593cc81d 100644 --- a/internal/advancer/advancer.go +++ b/internal/advancer/advancer.go @@ -311,7 +311,7 @@ func (s *Service) processInputs(ctx context.Context, app *Application, inputs [] "epoch", input.EpochIndex, "index", input.Index, "error", err) - s.Stop(false) // triggers graceful shutdown of all services + s.Supervisor.Stop(true) // triggers graceful shutdown of all services return err } diff --git a/internal/advancer/advancer_test.go b/internal/advancer/advancer_test.go index 426ee4bdc..054621f2b 100644 --- a/internal/advancer/advancer_test.go +++ b/internal/advancer/advancer_test.go @@ -44,23 +44,6 @@ func newMockAdvancerServiceWithBatchSize( machineManager *MockMachineManager, repo *MockRepository, batchSize uint64, -) (*Service, error) { - ctx, cf := 
context.WithCancel(context.Background()) - return newMockAdvancerServiceWithContextAndBatchSize( - ctx, - cf, - machineManager, - repo, - batchSize, - ) -} - -func newMockAdvancerServiceWithContextAndBatchSize( - ctx context.Context, - cancelCtx context.CancelFunc, - machineManager *MockMachineManager, - repo *MockRepository, - batchSize uint64, ) (*Service, error) { s := &Service{ inputBatchSize: batchSize, @@ -68,13 +51,15 @@ func newMockAdvancerServiceWithContextAndBatchSize( repository: repo, } serviceArgs := &service.TickServiceConfigs{ - ServiceConfigs: service.ServiceConfigs{ - Name: "advancer", - Context: ctx, - Cancel: cancelCtx, - }, + ServiceConfigs: service.ServiceConfigs{Name: "advancer"}, + } + err := service.InitTickServiceTemplate(serviceArgs, &s.TickServiceTemplate, s) + if err != nil { + return nil, err } - err := service.InitTickServiceTemplate(serviceArgs, &s.TickServiceTemplate, s, s) + supCfg := service.SupervisorConfigs{ Services: []service.ServiceImpl{s} } + supervisor := &service.ServicesSupervisor{} + err = service.InitServicesSupervisor(&supCfg, supervisor) if err != nil { return nil, err } @@ -85,7 +70,6 @@ func newMockAdvancerServiceWithContextAndBatchSize( // the mock machine manager, and the mock repository. type testEnv struct { service *Service - ctx context.Context app *MockMachineImpl mm *MockMachineManager repo *MockRepository @@ -94,14 +78,13 @@ type testEnv struct { // setupOneApp creates a standard test environment with one application. // The repository is empty; callers can configure it after the call. 
func (s *AdvancerSuite) setupOneApp() testEnv { - ctx, cf := context.WithCancel(context.Background()) mm := newMockMachineManager() app := newMockMachine(1) mm.Map[1] = newMockInstance(app) repo := &MockRepository{} - svc, err := newMockAdvancerServiceWithContextAndBatchSize(ctx, cf, mm, repo, defaultBatchSize) + svc, err := newMockAdvancerServiceWithBatchSize(mm, repo, defaultBatchSize) s.Require().NoError(err) - return testEnv{service: svc, ctx: ctx, app: app, mm: mm, repo: repo} + return testEnv{service: svc, app: app, mm: mm, repo: repo} } func (s *AdvancerSuite) TestServiceInterface() { @@ -136,7 +119,7 @@ func (s *AdvancerSuite) TestServiceInterface() { // Stop must be called last to cleanly shut down the service. // It should complete without returning any errors. - require.Empty(advancer.Stop(false)) + require.Empty(advancer.Supervisor.Stop(false)) }) } @@ -433,7 +416,7 @@ func (s *AdvancerSuite) TestProcess() { require.Len(env.repo.StoredResults, 1) // Verify that the node shutdown was triggered (context cancelled) - require.Error(env.ctx.Err(), "shared context should be cancelled") + require.Error(env.service.Supervisor.Context().Err(), "shared context should be cancelled") }) }) } @@ -531,7 +514,7 @@ func (s *AdvancerSuite) TestErrorRecovery() { err := env.service.processInputs(context.Background(), env.app.Application, inputs) require.Error(err) require.Contains(err.Error(), "temporary failure") - require.Error(env.ctx.Err(), "shared context should be cancelled") + require.Error(env.service.Supervisor.Context().Err(), "shared context should be cancelled") }) } @@ -1053,7 +1036,7 @@ func (s *AdvancerSuite) TestRemoveSnapshot() { tmpDir := s.T().TempDir() advancer := &Service{snapshotsDir: tmpDir} serviceArgs := &service.TickServiceConfigs{ServiceConfigs: service.ServiceConfigs{Name: "advancer"}} - require.Nil(service.InitTickServiceTemplate(serviceArgs, &advancer.TickServiceTemplate, advancer, advancer)) + 
require.Nil(service.InitTickServiceTemplate(serviceArgs, &advancer.TickServiceTemplate, advancer)) // Create a snapshot directory snapshotPath := filepath.Join(tmpDir, "myapp_epoch0_input0") @@ -1072,7 +1055,7 @@ func (s *AdvancerSuite) TestRemoveSnapshot() { tmpDir := s.T().TempDir() advancer := &Service{snapshotsDir: tmpDir} serviceArgs := &service.TickServiceConfigs{ServiceConfigs: service.ServiceConfigs{Name: "advancer"}} - require.Nil(service.InitTickServiceTemplate(serviceArgs, &advancer.TickServiceTemplate, advancer, advancer)) + require.Nil(service.InitTickServiceTemplate(serviceArgs, &advancer.TickServiceTemplate, advancer)) snapshotPath := filepath.Join(tmpDir, "myapp_epoch0_input0") err := advancer.removeSnapshot(snapshotPath, "myapp") @@ -1085,7 +1068,7 @@ func (s *AdvancerSuite) TestRemoveSnapshot() { tmpDir := s.T().TempDir() advancer := &Service{snapshotsDir: tmpDir} serviceArgs := &service.TickServiceConfigs{ServiceConfigs: service.ServiceConfigs{Name: "advancer"}} - require.Nil(service.InitTickServiceTemplate(serviceArgs, &advancer.TickServiceTemplate, advancer, advancer)) + require.Nil(service.InitTickServiceTemplate(serviceArgs, &advancer.TickServiceTemplate, advancer)) // Try to traverse outside snapshotsDir maliciousPath := filepath.Join(tmpDir, "..", "outside", "myapp_evil") @@ -1100,7 +1083,7 @@ func (s *AdvancerSuite) TestRemoveSnapshot() { tmpDir := s.T().TempDir() advancer := &Service{snapshotsDir: tmpDir} serviceArgs := &service.TickServiceConfigs{ServiceConfigs: service.ServiceConfigs{Name: "advancer"}} - require.Nil(service.InitTickServiceTemplate(serviceArgs, &advancer.TickServiceTemplate, advancer, advancer)) + require.Nil(service.InitTickServiceTemplate(serviceArgs, &advancer.TickServiceTemplate, advancer)) snapshotPath := filepath.Join(tmpDir, "otherapp_epoch0_input0") err := advancer.removeSnapshot(snapshotPath, "myapp") diff --git a/internal/advancer/service.go b/internal/advancer/service.go index 8829b31a9..ce4f1fc82 100644 --- 
a/internal/advancer/service.go +++ b/internal/advancer/service.go @@ -39,13 +39,14 @@ type Service struct { // CreateInfo contains the configuration for creating an advancer service type CreateInfo struct { + service.SupervisorConfigs service.TickServiceConfigs Config config.AdvancerConfig Repository repository.Repository } // Create initializes a new advancer service -func Create(ctx context.Context, c *CreateInfo) (service.IService, error) { +func Create(ctx context.Context, c *CreateInfo) (service.ServiceImpl, error) { var err error if err = ctx.Err(); err != nil { return nil, err // This returns context.Canceled or context.DeadlineExceeded. @@ -53,7 +54,7 @@ func Create(ctx context.Context, c *CreateInfo) (service.IService, error) { s := &Service{} - err = service.InitTickServiceTemplate(&c.TickServiceConfigs, &s.TickServiceTemplate, s, s) + err = service.InitTickServiceTemplate(&c.TickServiceConfigs, &s.TickServiceTemplate, s) if err != nil { return nil, err } @@ -100,7 +101,7 @@ func Create(ctx context.Context, c *CreateInfo) (service.IService, error) { s.snapshotsDir = c.Config.SnapshotsDir - s.LogConfig(c.Config) + service.LogConfig(s.Logger, c.Config) return s, nil } @@ -134,7 +135,7 @@ func (s *Service) Tick(ctx context.Context) (bool, []error) { return hadWork, []error{err} } -func (s *Service) OnStop(b bool) []error { +func (s *Service) Stop(b bool) []error { var errs []error if s.inspector != nil { s.Logger.Info("Shutting down inspect HTTP server") @@ -150,9 +151,10 @@ func (s *Service) OnStop(b bool) []error { errs = append(errs, fmt.Errorf("failed to close machine manager: %w", err)) } } - return append(errs, s.TickServiceTemplate.OnStop(b)...) + return append(errs, s.TickServiceTemplate.Stop(b)...) 
} -func (s *Service) OnServe(ctx context.Context) error { + +func (s *Service) Serve() []error { if s.inspector != nil { go func() { if err := s.inspector.Serve(); err != nil && !errors.Is(err, http.ErrServerClosed) { @@ -161,5 +163,5 @@ func (s *Service) OnServe(ctx context.Context) error { } }() } - return s.TickServiceTemplate.OnServe(ctx) + return s.TickServiceTemplate.Serve() } diff --git a/internal/claimer/claimer_test.go b/internal/claimer/claimer_test.go index 09e80964b..c51898c68 100644 --- a/internal/claimer/claimer_test.go +++ b/internal/claimer/claimer_test.go @@ -36,16 +36,11 @@ func TestTickInterleavesStagesWithPinnedBlockAndReschedulesOnProgress(t *testing defer r.AssertExpectations(t) defer b.AssertExpectations(t) - ctx := context.Background() err := service.InitTickServiceTemplate(&service.TickServiceConfigs{ - ServiceConfigs: service.ServiceConfigs{ - Name: "claimer-test", - Context: ctx, - }, - PollInterval: time.Hour, - }, &m.TickServiceTemplate, m, m) + ServiceConfigs: service.ServiceConfigs{Name: "claimer-test"}, + PollInterval: time.Hour, + }, &m.TickServiceTemplate, m) require.NoError(t, err) - t.Cleanup(func() { m.Stop(false) }) tickBlock := big.NewInt(100) app := makeApplication() @@ -80,7 +75,7 @@ func TestTickInterleavesStagesWithPinnedBlockAndReschedulesOnProgress(t *testing }), repository.Pagination{}, false). 
Return([]*model.Application{}, 0, nil).Once() - reschedule, errs := m.Tick(ctx) + reschedule, errs := m.Tick(context.Background()) require.Empty(t, errs) assert.True(t, reschedule, "a successful stage transition should request an immediate follow-up tick") diff --git a/internal/claimer/service.go b/internal/claimer/service.go index 5f62e9c74..566731f17 100644 --- a/internal/claimer/service.go +++ b/internal/claimer/service.go @@ -20,6 +20,7 @@ import ( ) type CreateInfo struct { + service.SupervisorConfigs service.TickServiceConfigs Config config.ClaimerConfig @@ -75,7 +76,7 @@ type PersistentConfig struct { ChainID uint64 } -func Create(ctx context.Context, c *CreateInfo) (service.IService, error) { +func Create(ctx context.Context, c *CreateInfo) (service.ServiceImpl, error) { var err error if c == nil { @@ -93,7 +94,7 @@ func Create(ctx context.Context, c *CreateInfo) (service.IService, error) { s := &Service{} - err = service.InitTickServiceTemplate(&c.TickServiceConfigs, &s.TickServiceTemplate, s, s) + err = service.InitTickServiceTemplate(&c.TickServiceConfigs, &s.TickServiceTemplate, s) if err != nil { return nil, fmt.Errorf("creating base service: %w", err) } @@ -140,7 +141,7 @@ func Create(ctx context.Context, c *CreateInfo) (service.IService, error) { defaultBlock: nodeConfig.DefaultBlock, } - s.LogConfig(c.Config) + service.LogConfig(s.Logger, c.Config) return s, nil } diff --git a/internal/claimer/service_test.go b/internal/claimer/service_test.go index fd6e7537c..bc71dec6f 100644 --- a/internal/claimer/service_test.go +++ b/internal/claimer/service_test.go @@ -19,9 +19,6 @@ import ( ) func TestCreateUsesPersistedDefaultBlock(t *testing.T) { - ctx, cancel := context.WithCancel(context.Background()) - t.Cleanup(cancel) - persistedConfig := PersistentConfig{ DefaultBlock: model.DefaultBlock_Latest, ClaimSubmissionEnabled: false, @@ -34,11 +31,8 @@ func TestCreateUsesPersistedDefaultBlock(t *testing.T) { repo.On("LoadNodeConfigRaw", mock.Anything, 
ClaimerConfigKey). Return(rawConfig, time.Now(), time.Now(), nil).Once() - s, err := Create(ctx, &CreateInfo{ + s, err := Create(context.Background(), &CreateInfo{ TickServiceConfigs: service.TickServiceConfigs{ - ServiceConfigs: service.ServiceConfigs{ - Context: ctx, - }, PollInterval: time.Hour, }, Config: config.ClaimerConfig{ @@ -50,7 +44,6 @@ func TestCreateUsesPersistedDefaultBlock(t *testing.T) { Repository: repo, }) require.NoError(t, err) - t.Cleanup(func() { s.Stop(false) }) impl := s.(*Service) // expose struct API for whitebox testing. diff --git a/internal/cli/cobra.go b/internal/cli/cobra.go index 8da67f4c9..10cf8a01a 100644 --- a/internal/cli/cobra.go +++ b/internal/cli/cobra.go @@ -4,6 +4,8 @@ package cli import ( + "context" + "errors" "fmt" "log/slog" @@ -32,8 +34,8 @@ func AddFlagStrVarP(flags *pflag.FlagSet, varRef *string, flagName string, flagS cobra.CheckErr(viper.BindPFlag(cfgName, flags.Lookup(flagName))) } -func CheckErr(logger *slog.Logger, err error, args ...any) { - if err == nil { +func logErr(logger *slog.Logger, err error, args ...any) { + if err == nil || errors.Is(err, context.Canceled) { return } @@ -55,5 +57,16 @@ func CheckErr(logger *slog.Logger, err error, args ...any) { args = append([]any{"error", err}, args...) logger.Error(msg, args...) +} + +func CheckErr(logger *slog.Logger, err error, args ...any) { + logErr(logger, err, args...) cobra.CheckErr(err) } + +func CheckErrs(logger *slog.Logger, errs []error, args ...any) { + for _, err := range errs { + logErr(logger, err, args...) 
+ } + cobra.CheckErr(errors.Join(errs...)) +} diff --git a/internal/evmreader/accounts_drive_proved_test.go b/internal/evmreader/accounts_drive_proved_test.go index 2cd825ef5..faf7a9b9b 100644 --- a/internal/evmreader/accounts_drive_proved_test.go +++ b/internal/evmreader/accounts_drive_proved_test.go @@ -43,7 +43,6 @@ func newPostForeclosureFixture(t *testing.T) ( }, &s.TickServiceTemplate, s, - s, )) return s, appContract, repo } diff --git a/internal/evmreader/edge_cases_test.go b/internal/evmreader/edge_cases_test.go index cf30f8a4c..aa7072488 100644 --- a/internal/evmreader/edge_cases_test.go +++ b/internal/evmreader/edge_cases_test.go @@ -370,8 +370,8 @@ func (s *EvmReaderSuite) TestAdapterCacheInvalidationOnConfigChange() { done := make(chan struct{}) go func() { - err := s.evmReader.Serve() - s.Require().NoError(err) + errs := s.evmReader.Serve() + s.Require().Empty(errs) close(done) }() diff --git a/internal/evmreader/evmreader_test.go b/internal/evmreader/evmreader_test.go index 3d46d214e..9f1232b41 100644 --- a/internal/evmreader/evmreader_test.go +++ b/internal/evmreader/evmreader_test.go @@ -70,15 +70,22 @@ func (s *EvmReaderSuite) SetupTest() { ServiceConfigs: service.ServiceConfigs{ Name: "evm-reader", LogLevel: logLevel, - Context: s.ctx, - Cancel: s.cancel, }, PollInterval: 100 * time.Millisecond, } - err = service.InitTickServiceTemplate(serviceArgs, &s.evmReader.TickServiceTemplate, s.evmReader, s.evmReader) + err = service.InitTickServiceTemplate(serviceArgs, &s.evmReader.TickServiceTemplate, s.evmReader) s.Require().NoError(err) s.evmReader.resolver = newApplicationAdapterResolver(s.evmReader.Logger, s.contractFactory) + + supCfg := service.SupervisorConfigs{ + Services: []service.ServiceImpl{s.evmReader}, + Context: s.ctx, + Cancel: s.cancel, + } + supervisor := &service.ServicesSupervisor{} + err = service.InitServicesSupervisor(&supCfg, supervisor) + s.Require().NoError(err) } func (s *EvmReaderSuite) TearDownTest() { @@ -87,14 +94,14 @@ 
func (s *EvmReaderSuite) TearDownTest() { // Service tests func (s *EvmReaderSuite) TestItStopsWhenContextIsCanceled() { - errChannel := make(chan error, 1) + errChannel := make(chan []error, 1) go func() { errChannel <- s.evmReader.Serve() }() s.cancel() - err := <-errChannel - s.Require().Nil(err, "stopped with an error when canceled") + errs := <-errChannel + s.Require().Empty(errs, "stopped with an error when canceled") } func newCallNotification(c *mock.Call) <-chan struct{} { @@ -136,8 +143,8 @@ func (s *EvmReaderSuite) TestItStopsWhenContextIsAlreadyCanceled() { done := make(chan struct{}) go func() { s.cancel() - err := s.evmReader.Serve() - s.Require().NoError(err) + errs := s.evmReader.Serve() + s.Require().Empty(errs) close(done) }() @@ -149,8 +156,8 @@ func (s *EvmReaderSuite) TestItStopsWhenContextIsCanceledAfterFirstHeader() { done := make(chan struct{}) go func() { - err := s.evmReader.Serve() - s.Require().NoError(err) + errs := s.evmReader.Serve() + s.Require().Empty(errs) close(done) }() @@ -168,8 +175,8 @@ func (s *EvmReaderSuite) TestReadyReflectsServeLifecycle() { done := make(chan struct{}) go func() { - err := s.evmReader.Serve() - s.Require().NoError(err) + errs := s.evmReader.Serve() + s.Require().Empty(errs) close(done) }() @@ -192,8 +199,8 @@ func (s *EvmReaderSuite) TestReadyDoesNotDependOnPollingSuccess() { done := make(chan struct{}) go func() { - err := s.evmReader.Serve() - s.Require().NoError(err) + errs := s.evmReader.Serve() + s.Require().Empty(errs) close(done) }() @@ -297,8 +304,8 @@ func (s *EvmReaderSuite) TestItRunsWhenConnectionFails() { done := make(chan struct{}) go func() { - err := s.evmReader.Serve() - s.Require().NoError(err) + errs := s.evmReader.Serve() + s.Require().Empty(errs) close(done) }() @@ -318,8 +325,8 @@ func (s *EvmReaderSuite) TestRunResetsRetriesAfterProcessingHeaders() { done := make(chan struct{}) go func() { - err := s.evmReader.Serve() - s.Require().NoError(err) + errs := s.evmReader.Serve() + 
s.Require().Empty(errs) close(done) }() diff --git a/internal/evmreader/foreclosure_test.go b/internal/evmreader/foreclosure_test.go index e94d089d9..8375364fd 100644 --- a/internal/evmreader/foreclosure_test.go +++ b/internal/evmreader/foreclosure_test.go @@ -53,7 +53,6 @@ func newForeclosureServiceFixture(t *testing.T) ( }, &s.TickServiceTemplate, s, - s, )) return s, appContract, repo } diff --git a/internal/evmreader/output_test.go b/internal/evmreader/output_test.go index 566ee47c8..9ec4d8938 100644 --- a/internal/evmreader/output_test.go +++ b/internal/evmreader/output_test.go @@ -612,16 +612,23 @@ func (s *EvmReaderSuite) setupOutputMismatchTest() { ServiceConfigs: service.ServiceConfigs{ Name: "evm-reader", LogLevel: logLevel, - Context: s.ctx, - Cancel: s.cancel, }, PollInterval: 100 * time.Millisecond, } - err = service.InitTickServiceTemplate(serviceArgs, &s.evmReader.TickServiceTemplate, s.evmReader, s.evmReader) + err = service.InitTickServiceTemplate(serviceArgs, &s.evmReader.TickServiceTemplate, s.evmReader) s.Require().NoError(err) s.evmReader.resolver = newApplicationAdapterResolver(s.evmReader.Logger, s.contractFactory) + supCfg := service.SupervisorConfigs{ + Services: []service.ServiceImpl{s.evmReader}, + Context: s.ctx, + Cancel: s.cancel, + } + supervisor := &service.ServicesSupervisor{} + err = service.InitServicesSupervisor(&supCfg, supervisor) + s.Require().NoError(err) + apps := copyApplications(applications) for _, app := range apps { app.LastForecloseCheckBlock = 0x100 diff --git a/internal/evmreader/sealedepochs_test.go b/internal/evmreader/sealedepochs_test.go index e05345377..8e62a5034 100644 --- a/internal/evmreader/sealedepochs_test.go +++ b/internal/evmreader/sealedepochs_test.go @@ -57,7 +57,7 @@ func (s *SealedEpochsSuite) SetupTest() { logLevel, err := config.GetLogLevel() s.Require().NoError(err) serviceArgs := &service.ServiceConfigs{Name: "evm-reader", LogLevel: logLevel} - err = service.InitServiceTemplate(serviceArgs, 
&s.evmReader.ServiceTemplate, s.evmReader) + err = service.InitServiceTemplate(serviceArgs, &s.evmReader.ServiceTemplate) s.Require().NoError(err) } diff --git a/internal/evmreader/service.go b/internal/evmreader/service.go index b33d0096e..3ecc162f6 100644 --- a/internal/evmreader/service.go +++ b/internal/evmreader/service.go @@ -19,6 +19,7 @@ import ( ) type CreateInfo struct { + service.SupervisorConfigs service.TickServiceConfigs Config config.EvmreaderConfig @@ -52,7 +53,7 @@ type PersistentConfig struct { ChainID uint64 } -func Create(ctx context.Context, c *CreateInfo) (service.IService, error) { +func Create(ctx context.Context, c *CreateInfo) (service.ServiceImpl, error) { var err error if err = ctx.Err(); err != nil { return nil, err // This returns context.Canceled or context.DeadlineExceeded. @@ -60,7 +61,7 @@ func Create(ctx context.Context, c *CreateInfo) (service.IService, error) { s := &Service{} - err = service.InitTickServiceTemplate(&c.TickServiceConfigs, &s.TickServiceTemplate, s, s) + err = service.InitTickServiceTemplate(&c.TickServiceConfigs, &s.TickServiceTemplate, s) if err != nil { return nil, err } @@ -107,7 +108,7 @@ func Create(ctx context.Context, c *CreateInfo) (service.IService, error) { } s.resolver = newApplicationAdapterResolver(s.Logger, s.adapterFactory) - s.LogConfig(c.Config) + service.LogConfig(s.Logger, c.Config) return s, nil } @@ -120,12 +121,12 @@ func (s *Service) Ready() bool { return s.ready.Load() } -func (s *Service) OnServe(ctx context.Context) error { +func (s *Service) Serve() []error { s.alive.Store(true) s.ready.Store(true) defer s.alive.Store(false) defer s.ready.Store(false) - return s.TickServiceTemplate.OnServe(ctx) + return s.TickServiceTemplate.Serve() } func (s *Service) setupPersistentConfig( diff --git a/internal/jsonrpc/service.go b/internal/jsonrpc/service.go index d4124f906..0ccdb8563 100644 --- a/internal/jsonrpc/service.go +++ b/internal/jsonrpc/service.go @@ -40,6 +40,7 @@ type Service struct { } 
type CreateInfo struct { + service.SupervisorConfigs service.ServiceConfigs Config config.JsonrpcConfig @@ -47,7 +48,7 @@ type CreateInfo struct { Repository repository.Repository } -func Create(ctx context.Context, c *CreateInfo) (service.IService, error) { +func Create(ctx context.Context, c *CreateInfo) (service.ServiceImpl, error) { var err error if err = ctx.Err(); err != nil { return nil, err // This returns context.Canceled or context.DeadlineExceeded. @@ -55,7 +56,7 @@ func Create(ctx context.Context, c *CreateInfo) (service.IService, error) { s := &Service{} - err = service.InitServiceTemplate(&c.ServiceConfigs, &s.ServiceTemplate, s) + err = service.InitServiceTemplate(&c.ServiceConfigs, &s.ServiceTemplate) if err != nil { return nil, err } @@ -96,12 +97,12 @@ func Create(ctx context.Context, c *CreateInfo) (service.IService, error) { s.listen = net.Listen } - s.LogConfig(c.Config) + service.LogConfig(s.Logger, c.Config) return s, nil } -func (s *Service) OnStop(_ bool) []error { +func (s *Service) Stop(_ bool) []error { var errs []error s.Logger.Info("Shutting down JSON-RPC HTTP server", "addr", s.server.Addr) ctx, cancel := context.WithTimeout(context.Background(), jsonrpcShutdownTimeout) @@ -112,10 +113,11 @@ func (s *Service) OnStop(_ bool) []error { return errs } -func (s *Service) OnServe(ctx context.Context) error { +func (s *Service) Serve() []error { + ctx := s.Supervisor.Context() listener, err := s.listen("tcp", s.server.Addr) if err != nil { - return err + return []error{err} } s.Logger.Info("Listening", "addr", listener.Addr().String()) @@ -139,10 +141,10 @@ func (s *Service) OnServe(ctx context.Context) error { // The HTTP loop exited first. This is unexpected unless the listener // failed or the server was already closed, so cancel the framework // loop and wait for it to observe the cancellation before returning. 
- s.Stop(true) + s.Supervisor.Stop(true) <-ctx.Done() if err != nil { - return err + return []error{err} } return nil case <-ctx.Done(): @@ -152,7 +154,7 @@ func (s *Service) OnServe(ctx context.Context) error { // returns only after the listener is fully closed. serverErr := <-serverDone if serverErr != nil { - return serverErr + return []error{serverErr} } return nil } diff --git a/internal/node/node.go b/internal/node/node.go index 17ac2f9a9..cb1f94f02 100644 --- a/internal/node/node.go +++ b/internal/node/node.go @@ -25,11 +25,12 @@ import ( // serviceResult carries either a successfully created service or an error // back from the goroutines in createServices. type serviceResult struct { - service service.IService + service service.ServiceImpl err error } type CreateInfo struct { + service.SupervisorConfigs service.ServiceConfigs Config config.NodeConfig @@ -40,159 +41,114 @@ type CreateInfo struct { Repository repository.Repository } -type Service struct { - service.ServiceTemplate - - Children []service.IService - Repository repository.Repository -} - -func Create(ctx context.Context, c *CreateInfo) (service.IService, error) { +func Create(ctx context.Context, cfg *CreateInfo) (service.IService, error) { var err error if err = ctx.Err(); err != nil { return nil, err // This returns context.Canceled or context.DeadlineExceeded. } - // setup node and all child services to share the same context. 
- ctx, cancel := context.WithCancel(context.Background()) - c.ServiceConfigs.Context = ctx - c.ServiceConfigs.Cancel = cancel - - s := &Service{} - - err = service.InitServiceTemplate(&c.ServiceConfigs, &s.ServiceTemplate, s) - if err != nil { - return nil, err - } + supervisor := &service.ServicesSupervisor{} - err = createServices(ctx, c, s) + err = createServices(ctx, cfg, supervisor) if err != nil { - s.Logger.Error(fmt.Sprint(err)) + supervisor.Logger.Error(fmt.Sprint(err)) return nil, err } - s.LogConfig(c.Config) + service.LogConfig(supervisor.Logger, cfg.Config) - return s, nil + return supervisor, nil } -type serviceCreator func(context.Context, *CreateInfo, *Service) (service.IService, error) +type serviceCreator func(context.Context, *CreateInfo) (service.ServiceImpl, error) -func createServices(ctx context.Context, c *CreateInfo, s *Service) error { +func createServices( + ctx context.Context, + cfg *CreateInfo, + supervisor *service.ServicesSupervisor, +) error { creators := []serviceCreator{ + newNode, newEVMReader, newAdvancer, newValidator, newClaimer, newPrt, } - if c.Config.FeatureJsonrpcApiEnabled { + if cfg.Config.FeatureJsonrpcApiEnabled { creators = append(creators, newJsonrpc) } ch := make(chan serviceResult, len(creators)) for _, create := range creators { go func() { - svc, err := create(ctx, c, s) + svc, err := create(ctx, cfg) ch <- serviceResult{service: svc, err: err} }() } + services := make([]service.ServiceImpl, 0) + errs := make([]error, 0) for range len(creators) { - select { - case result := <-ch: - if result.err != nil { - stopAndDrain(s.Children, ch, len(creators)-len(s.Children)-1) - return fmt.Errorf("failed to create service: %w", result.err) - } - s.Children = append(s.Children, result.service) - case <-ctx.Done(): - stopAndDrain(s.Children, ch, len(creators)-len(s.Children)) - return fmt.Errorf("failed to create services: %w", ctx.Err()) + result := <-ch + if result.service != nil { + services = append(services, 
result.service) + } else { + errs = append(errs, result.err) } } - return nil -} -// stopAndDrain stops already-created children and drains remaining results -// from the channel, stopping any successful services to prevent resource leaks. -func stopAndDrain(children []service.IService, ch <-chan serviceResult, remaining int) { - for _, child := range children { - child.Stop(true) - } - go func() { - for range remaining { - if r := <-ch; r.err == nil && r.service != nil { - r.service.Stop(true) + if len(errs) > 0 { + for _, svc := range services { + errs := svc.Stop(true) + if errs != nil { + errs = append(errs, errs...) // TODO: replace this by a warn log } } - }() -} - -func (me *Service) Alive() bool { - allAlive := true - for _, s := range me.Children { - allAlive = allAlive && s.Alive() + return errors.Join(errs...) } - return allAlive + + supCfg := cfg.SupervisorConfigs + supCfg.Services = services + return service.InitServicesSupervisor(&supCfg, supervisor) } -func (me *Service) Ready() bool { - allReady := true - for _, s := range me.Children { - allReady = allReady && s.Ready() - } - return allReady +type Service struct { + service.ServiceTemplate + + Children []service.IService + Repository repository.Repository } -func (me *Service) OnStop(force bool) []error { - errs := []error{} - for _, s := range me.Children { - errs = append(errs, s.Stop(force)...) 
- } - return errs +func (me *Service) Serve() []error { + ctx := me.Supervisor.Context() + <-ctx.Done() + return nil } -func (me *Service) OnServe(ctx context.Context) error { - childrenCount := len(me.Children) - errCh := make(chan error, childrenCount) - for _, child := range me.Children { - child := child - go func() { errCh <- child.Serve() }() - } +func newNode(ctx context.Context, c *CreateInfo) (service.ServiceImpl, error) { + s := &Service{} - errs := make([]error, 0) - for range childrenCount { - err := <-errCh - if err != nil && !errors.Is(err, context.Canceled) { - me.Stop(true) - errs = append(errs, err) - } - } - if len(errs) > 0 { - return errors.Join(errs...) + err := service.InitServiceTemplate(&c.ServiceConfigs, &s.ServiceTemplate) + if err != nil { + return nil, err } - return nil + return s, nil } // services creation -func newEVMReader(ctx context.Context, c *CreateInfo, s *Service) (service.IService, error) { +func newEVMReader(ctx context.Context, c *CreateInfo) (service.ServiceImpl, error) { readerArgs := evmreader.CreateInfo{ TickServiceConfigs: service.TickServiceConfigs{ + PollInterval: c.Config.EvmReaderPollingInterval, ServiceConfigs: service.ServiceConfigs{ Name: config.ServiceEvmReader, - Context: c.ServiceConfigs.Context, - Cancel: c.ServiceConfigs.Cancel, LogLevel: config.ResolveServiceLogLevel(config.ServiceEvmReader, c.Config.LogLevel), LogColor: c.Config.LogColor, - EnableSignalHandling: false, - TelemetryCreate: false, - ServeMux: s.ServeMux, }, - PollInterval: c.Config.EvmReaderPollingInterval, }, EthClient: c.ReaderClient, Repository: c.Repository, @@ -206,18 +162,14 @@ func newEVMReader(ctx context.Context, c *CreateInfo, s *Service) (service.IServ return readerService, nil } -func newAdvancer(ctx context.Context, c *CreateInfo, s *Service) (service.IService, error) { +func newAdvancer(ctx context.Context, c *CreateInfo) (service.ServiceImpl, error) { advancerArgs := advancer.CreateInfo{ TickServiceConfigs: 
service.TickServiceConfigs{ PollInterval: c.Config.AdvancerPollingInterval, ServiceConfigs: service.ServiceConfigs{ Name: config.ServiceAdvancer, - Context: c.ServiceConfigs.Context, - Cancel: c.ServiceConfigs.Cancel, LogLevel: config.ResolveServiceLogLevel(config.ServiceAdvancer, c.Config.LogLevel), LogColor: c.Config.LogColor, - EnableSignalHandling: false, - TelemetryCreate: false, }, }, Repository: c.Repository, @@ -231,18 +183,14 @@ func newAdvancer(ctx context.Context, c *CreateInfo, s *Service) (service.IServi return advancerService, nil } -func newValidator(ctx context.Context, c *CreateInfo, s *Service) (service.IService, error) { +func newValidator(ctx context.Context, c *CreateInfo) (service.ServiceImpl, error) { validatorArgs := validator.CreateInfo{ TickServiceConfigs: service.TickServiceConfigs{ PollInterval: c.Config.ValidatorPollingInterval, ServiceConfigs: service.ServiceConfigs{ Name: config.ServiceValidator, - Context: c.ServiceConfigs.Context, - Cancel: c.ServiceConfigs.Cancel, LogLevel: config.ResolveServiceLogLevel(config.ServiceValidator, c.Config.LogLevel), LogColor: c.Config.LogColor, - EnableSignalHandling: false, - TelemetryCreate: false, }, }, Repository: c.Repository, @@ -256,18 +204,14 @@ func newValidator(ctx context.Context, c *CreateInfo, s *Service) (service.IServ return validatorService, nil } -func newClaimer(ctx context.Context, c *CreateInfo, s *Service) (service.IService, error) { +func newClaimer(ctx context.Context, c *CreateInfo) (service.ServiceImpl, error) { claimerArgs := claimer.CreateInfo{ TickServiceConfigs: service.TickServiceConfigs{ PollInterval: c.Config.ClaimerPollingInterval, ServiceConfigs: service.ServiceConfigs{ Name: config.ServiceClaimer, - Context: c.ServiceConfigs.Context, - Cancel: c.ServiceConfigs.Cancel, LogLevel: config.ResolveServiceLogLevel(config.ServiceClaimer, c.Config.LogLevel), LogColor: c.Config.LogColor, - EnableSignalHandling: false, - TelemetryCreate: false, }, }, EthConn: c.ClaimerClient, 
@@ -282,17 +226,12 @@ func newClaimer(ctx context.Context, c *CreateInfo, s *Service) (service.IServic return claimerService, nil } -func newJsonrpc(ctx context.Context, c *CreateInfo, s *Service) (service.IService, error) { +func newJsonrpc(ctx context.Context, c *CreateInfo) (service.ServiceImpl, error) { jsonrpcArgs := jsonrpc.CreateInfo{ ServiceConfigs: service.ServiceConfigs{ Name: config.ServiceJsonrpc, - Context: c.ServiceConfigs.Context, - Cancel: c.ServiceConfigs.Cancel, LogLevel: config.ResolveServiceLogLevel(config.ServiceJsonrpc, c.Config.LogLevel), LogColor: c.Config.LogColor, - EnableSignalHandling: false, - TelemetryCreate: false, - ServeMux: s.ServeMux, }, Repository: c.Repository, Config: *c.Config.ToJsonrpcConfig(), @@ -305,18 +244,14 @@ func newJsonrpc(ctx context.Context, c *CreateInfo, s *Service) (service.IServic return jsonrpcService, nil } -func newPrt(ctx context.Context, c *CreateInfo, s *Service) (service.IService, error) { +func newPrt(ctx context.Context, c *CreateInfo) (service.ServiceImpl, error) { prtArgs := prt.CreateInfo{ TickServiceConfigs: service.TickServiceConfigs{ PollInterval: c.Config.PrtPollingInterval, ServiceConfigs: service.ServiceConfigs{ Name: config.ServicePrt, - Context: c.ServiceConfigs.Context, - Cancel: c.ServiceConfigs.Cancel, LogLevel: config.ResolveServiceLogLevel(config.ServicePrt, c.Config.LogLevel), LogColor: c.Config.LogColor, - EnableSignalHandling: false, - TelemetryCreate: false, }, }, EthClient: c.PrtClient, diff --git a/internal/node/node_test.go b/internal/node/node_test.go deleted file mode 100644 index 65130d10d..000000000 --- a/internal/node/node_test.go +++ /dev/null @@ -1,173 +0,0 @@ -// (c) Cartesi and individual authors (see AUTHORS) -// SPDX-License-Identifier: Apache-2.0 (see LICENSE) - -package node - -import ( - "context" - "fmt" - "log/slog" - "sync" - "testing" - "time" - - "github.com/cartesi/rollups-node/pkg/service" - - "github.com/stretchr/testify/require" - 
"github.com/stretchr/testify/suite" -) - -type blockingChildImpl struct { - service.ServiceTemplate - started chan struct{} - done chan struct{} - once sync.Once -} - -func (c *blockingChildImpl) OnServe(ctx context.Context) error { - close(c.started) - <-ctx.Done() - c.once.Do(func() { close(c.done) }) - return nil -} - -func createBlockingChild(t *testing.T, cfg *service.ServiceConfigs, name string) *blockingChildImpl { - t.Helper() - childCfg := *cfg - childCfg.Name = name - - child := &blockingChildImpl{ - started: make(chan struct{}), - done: make(chan struct{}), - } - require.NoError(t, service.InitServiceTemplate(&childCfg, &child.ServiceTemplate, child)) - return child -} - -type NodeSuite struct { - suite.Suite -} - -func TestServe(t *testing.T) { - suite.Run(t, new(NodeSuite)) -} - -func (s *NodeSuite) TestNodeStopCancelsChildContexts() { - ctx, cancel := context.WithCancel(context.Background()) - parentCfg := service.ServiceConfigs{ - Name: "node", - LogLevel: slog.LevelError, - Context: ctx, - Cancel: cancel, - } - - nodeSvc := &Service{} - require.NoError(s.T(), service.InitServiceTemplate(&parentCfg, &nodeSvc.ServiceTemplate, nodeSvc)) - - child1 := createBlockingChild(s.T(), &parentCfg, "child-1") - child2 := createBlockingChild(s.T(), &parentCfg, "child-2") - nodeSvc.Children = []service.IService{child1, child2} - - done := make(chan struct{}) - go func() { - _ = nodeSvc.Serve() - close(done) - }() - - select { - case <-child1.started: - case <-time.After(2 * time.Second): - s.Fail("child-1 did not start") - } - select { - case <-child2.started: - case <-time.After(2 * time.Second): - s.Fail("child-2 did not start") - } - - nodeSvc.Stop(false) - - select { - case <-child1.done: - case <-time.After(2 * time.Second): - s.Fail("child-1 did not observe ctx.Done()") - } - select { - case <-child2.done: - case <-time.After(2 * time.Second): - s.Fail("child-2 did not observe ctx.Done()") - } - - select { - case <-done: - case <-time.After(2 * time.Second): 
- s.Fail("node did not exit after Stop()") - } -} - -type errorChildImpl struct { - service.ServiceTemplate - started chan struct{} -} - -func (c *errorChildImpl) OnServe(ctx context.Context) error { - close(c.started) - time.Sleep(10 * time.Millisecond) - return fmt.Errorf("Oops %s!", c.Name) -} - -func createErrorChild(t *testing.T, cfg *service.ServiceConfigs, name string) *errorChildImpl { - t.Helper() - childCfg := *cfg - childCfg.Name = name - - child := &errorChildImpl{ - started: make(chan struct{}), - } - require.NoError(t, service.InitServiceTemplate(&childCfg, &child.ServiceTemplate, child)) - return child -} - -func (s *NodeSuite) TestNodeReturnChildErrors() { - - ctx, cancel := context.WithCancel(context.Background()) - parentCfg := service.ServiceConfigs{ - Name: "node", - LogLevel: slog.LevelError, - Context: ctx, - Cancel: cancel, - } - - nodeSvc := &Service{} - require.NoError(s.T(), service.InitServiceTemplate(&parentCfg, &nodeSvc.ServiceTemplate, nodeSvc)) - - child1 := createErrorChild(s.T(), &parentCfg, "child-1") - child2 := createErrorChild(s.T(), &parentCfg, "child-2") - nodeSvc.Children = []service.IService{child1, child2} - - done := make(chan error) - go func() { - err := nodeSvc.Serve() - done <- err - close(done) - }() - - select { - case <-child1.started: - case <-time.After(2 * time.Second): - s.Fail("child-1 did not start") - } - select { - case <-child2.started: - case <-time.After(2 * time.Second): - s.Fail("child-2 did not start") - } - - select { - case err := <-done: - s.ErrorContains(err, "Oops child-1!") - s.ErrorContains(err, "Oops child-2!") - case <-time.After(2 * time.Second): - s.Fail("node did not exit after child errors") - } -} diff --git a/internal/prt/service.go b/internal/prt/service.go index b60acb362..f4d4c4cb9 100644 --- a/internal/prt/service.go +++ b/internal/prt/service.go @@ -21,6 +21,7 @@ import ( ) type CreateInfo struct { + service.SupervisorConfigs service.TickServiceConfigs Config config.PrtConfig 
Repository repository.Repository @@ -49,7 +50,7 @@ type PersistentConfig struct { ChainID uint64 } -func Create(ctx context.Context, c *CreateInfo) (service.IService, error) { +func Create(ctx context.Context, c *CreateInfo) (service.ServiceImpl, error) { var err error if err = ctx.Err(); err != nil { return nil, err // This returns context.Canceled or context.DeadlineExceeded. @@ -57,7 +58,7 @@ func Create(ctx context.Context, c *CreateInfo) (service.IService, error) { s := &Service{} - err = service.InitTickServiceTemplate(&c.TickServiceConfigs, &s.TickServiceTemplate, s, s) + err = service.InitTickServiceTemplate(&c.TickServiceConfigs, &s.TickServiceTemplate, s) if err != nil { return nil, err } @@ -117,7 +118,7 @@ func Create(ctx context.Context, c *CreateInfo) (service.IService, error) { } } - s.LogConfig(c.Config) + service.LogConfig(s.Logger, c.Config) return s, nil } diff --git a/internal/validator/validator.go b/internal/validator/validator.go index 4a89f6b0c..57bb5bfdb 100644 --- a/internal/validator/validator.go +++ b/internal/validator/validator.go @@ -33,6 +33,7 @@ type Service struct { } type CreateInfo struct { + service.SupervisorConfigs service.TickServiceConfigs Config config.ValidatorConfig @@ -40,7 +41,7 @@ type CreateInfo struct { Repository repository.Repository } -func Create(ctx context.Context, c *CreateInfo) (service.IService, error) { +func Create(ctx context.Context, c *CreateInfo) (service.ServiceImpl, error) { var err error if err = ctx.Err(); err != nil { return nil, err // This returns context.Canceled or context.DeadlineExceeded. 
@@ -48,7 +49,7 @@ func Create(ctx context.Context, c *CreateInfo) (service.IService, error) { s := &Service{} - err = service.InitTickServiceTemplate(&c.TickServiceConfigs, &s.TickServiceTemplate, s, s) + err = service.InitTickServiceTemplate(&c.TickServiceConfigs, &s.TickServiceTemplate, s) if err != nil { return nil, err } @@ -61,7 +62,7 @@ func Create(ctx context.Context, c *CreateInfo) (service.IService, error) { s.pristinePostContext = merkle.CreatePostContext() s.pristineRootHash = s.pristinePostContext[merkle.TREE_DEPTH] - s.LogConfig(c.Config) + service.LogConfig(s.Logger, c.Config) return s, nil } diff --git a/internal/validator/validator_test.go b/internal/validator/validator_test.go index 886baa4fe..32d0615f1 100644 --- a/internal/validator/validator_test.go +++ b/internal/validator/validator_test.go @@ -52,7 +52,7 @@ func (s *ValidatorSuite) SetupSubTest() { pristineRootHash: postContext[merkle.TREE_DEPTH], } serviceArgs := &service.ServiceConfigs{Name: "validator"} - err := service.InitServiceTemplate(serviceArgs, &validator.ServiceTemplate, validator) + err := service.InitServiceTemplate(serviceArgs, &validator.ServiceTemplate) s.Require().Nil(err) dummyOutputsMerkleRoot := common.HexToHash("0x0a162946e56158bac0673e6dd3bdfdc1e4a0e7744a120fdb640050c8d7abe1c6") dummyEpochs = []Epoch{ diff --git a/pkg/service/service.go b/pkg/service/service.go index 8c894240c..4359717ce 100644 --- a/pkg/service/service.go +++ b/pkg/service/service.go @@ -53,25 +53,16 @@ package service import ( - "context" - "errors" "fmt" "log/slog" - "net/http" "os" - "os/signal" - "sync/atomic" - "syscall" - "time" "github.com/cartesi/rollups-node/internal/version" "github.com/lmittmann/tint" ) -const telemetryShutdownTimeout = 5 * time.Second - var ( - ErrInvalid = fmt.Errorf("Invalid Argument") // invalid argument + ErrInvalid = fmt.Errorf("Invalid Argument") // invalid argument ErrServiceStopped = fmt.Errorf("Service was stopped") ) @@ -81,318 +72,66 @@ type IService interface { 
Ready() bool Reload() []error Stop(bool) []error - Serve() error + Serve() []error String() string } -/* - * Service template for services that do continuous processing. - */ - -// Internal interface with abstract methods called by ServiceTemplate. -// These methods are not part of the public service interface. -type LifecycleImpl interface { - Alive() bool - Ready() bool - OnReload() []error - OnStop(bool) []error - OnServe(ctx context.Context) error +// Public interface with methods to manipulate the service. +type ServiceImpl interface { + IService + SetSupervisor(*ServicesSupervisor) } -// ServiceTemplate stores runtime information. +// Supervisor of multiple services under a single management. type ServiceTemplate struct { Name string + Supervisor *ServicesSupervisor Logger *slog.Logger - lifecycleImpl LifecycleImpl - context context.Context - cancelContext context.CancelFunc - sigHangUp chan os.Signal // SIGHUP to reload - sigShutdown chan os.Signal // SIGINT/SIGTERM to exit gracefully - ServeMux *http.ServeMux - telemetry *http.Server - telemetryFunc func() error - - // stopped server Stop() run exactly once, even when Stop() is called - // multiple times (by the child's Serve() loop and by the parent orchestrator). - stopped atomic.Bool - stoppedChan chan struct{} + LogLevel slog.Level + LogColor bool } // ServiceConfigs stores configuration for the InitServiceTemplate function type ServiceConfigs struct { - Name string - Logger *slog.Logger - LogLevel slog.Level - LogColor bool - Context context.Context - Cancel context.CancelFunc - EnableSignalHandling bool - TelemetryCreate bool - TelemetryAddress string - ServeMux *http.ServeMux // used only for unit testing + Name string + Logger *slog.Logger + LogLevel slog.Level + LogColor bool } -// Initialize the 'ServiceTemplate' structure using values from 'CreateInfo'. +// Initialize the 'ServiceTemplate' structure using values from 'ServiceConfigs'. 
// 'impl' must be a reference to the concrete service implementation that // embeds 'ServiceTemplate' -func InitServiceTemplate(c *ServiceConfigs, s *ServiceTemplate, impl LifecycleImpl) error { - if c == nil || s == nil || impl == nil { - return ErrInvalid - } - - s.stoppedChan = make(chan struct{}) - - s.lifecycleImpl = impl - +func InitServiceTemplate(c *ServiceConfigs, s *ServiceTemplate) error { s.Name = c.Name s.Logger = c.Logger // log if s.Logger == nil { - s.Logger = NewServiceLogger(c) - } - - // context and cancelation - if c.Context == nil { - c.Context = context.Background() - } - s.context = c.Context - if c.Cancel == nil { - s.context, c.Cancel = context.WithCancel(c.Context) - } - s.cancelContext = c.Cancel - - // signal handling - if c.EnableSignalHandling { - if s.sigHangUp == nil { - s.sigHangUp = make(chan os.Signal, 1) - signal.Notify(s.sigHangUp, syscall.SIGHUP) - } - if s.sigShutdown == nil { - s.sigShutdown = make(chan os.Signal, 1) - signal.Notify(s.sigShutdown, syscall.SIGINT, syscall.SIGTERM) - } - } - - // telemetry - if c.TelemetryCreate { - if s.ServeMux == nil { - if c.ServeMux == nil { - c.ServeMux = http.NewServeMux() - } - s.ServeMux = c.ServeMux - } - if c.TelemetryAddress == "" { - c.TelemetryAddress = ":8080" - } - s.telemetry, s.telemetryFunc = s.CreateDefaultTelemetry(c.TelemetryAddress) - go func() { - if err := s.telemetryFunc(); err != nil { - s.Logger.Error("Telemetry HTTP server failed", "error", err) - } - }() + s.Logger = NewLogger(c.LogLevel, c.LogColor).With("service", c.Name) } s.Logger.Info("Create", "version", version.BuildVersion, "log_level", c.LogLevel, "pid", os.Getpid()) - if s.telemetry != nil { - s.Logger.Info("Telemetry", "address", s.telemetry.Addr) - } + return nil } -// Default implementation of some abstract methods (except `OnServe`). +// Default implementation of some abstract methods (except `Serve`). // Remove them to force concrete services to provide implementation for them. 
-func (s *ServiceTemplate) OnReload() []error { return nil } -func (s *ServiceTemplate) OnStop(bool) []error { return nil } +func (s *ServiceTemplate) Reload() []error { return nil } +func (s *ServiceTemplate) Stop(bool) []error { return nil } func (s *ServiceTemplate) Alive() bool { return true } func (s *ServiceTemplate) Ready() bool { return true } func (s *ServiceTemplate) String() string { return s.Name } -func (s *ServiceTemplate) Reload() []error { - if s.stopped.Load() { - return []error{ErrServiceStopped} - } - - start := time.Now() - errs := s.lifecycleImpl.OnReload() - elapsed := time.Since(start) - - if len(errs) > 0 { - s.Logger.Error("Reload", - "duration", elapsed, - "error", errs) - } else { - s.Logger.Info("Reload", - "duration", elapsed) - } - return errs -} - -func (s *ServiceTemplate) Stop(force bool) []error { - // CAS achieves once-semantics: the second caller returns immediately - // (fire-and-forget) rather than blocking like sync.Once. This is safe - // because the orchestrator calls Cancel() after Stop() and waits for - // the Serve goroutine to exit. 
- if !s.stopped.CompareAndSwap(false, true) { - return nil // already stopped - } - - start := time.Now() - errs := s.lifecycleImpl.OnStop(force) - if s.telemetry != nil { - shutdownCtx, cancel := context.WithTimeout(context.Background(), telemetryShutdownTimeout) - defer cancel() - if err := s.telemetry.Shutdown(shutdownCtx); err != nil { - errs = append(errs, err) - } - } - if s.sigShutdown != nil { - signal.Stop(s.sigShutdown) - } - if s.sigHangUp != nil { - signal.Stop(s.sigHangUp) - } - elapsed := time.Since(start) - - s.cancelContext() - close(s.stoppedChan) - - if len(errs) > 0 { - s.Logger.Error("Stop", - "force", force, - "duration", elapsed, - "error", errs) - } else { - s.Logger.Info("Stop", - "force", force, - "duration", elapsed) - } - return errs -} - -func (s *ServiceTemplate) Serve() error { - if s.stopped.Load() { - return ErrServiceStopped - } - - go func() { - for { - select { - case <-s.sigHangUp: - s.Reload() - case <-s.sigShutdown: - s.Stop(false) // Graceful shutdown; errors are logged by Stop. - return - case <-s.context.Done(): - s.Stop(true) // Stop logs errors internally. - return - } - } - }() - - err := s.lifecycleImpl.OnServe(s.context) - - go s.Stop(true) - <-s.stoppedChan - - return err +func (s *ServiceTemplate) SetSupervisor(supervisor *ServicesSupervisor) { + s.Supervisor = supervisor } // LogConfig logs the service configuration at debug level. // Intended for use by standalone service binaries after Create. -func (s *ServiceTemplate) LogConfig(config any) { - s.Logger.Info("Starting service", "config", config) -} - -/* - * Alternative service template that implements the tick-based processing. 
- */ - -type TickImpl interface { - Tick(ctx context.Context) (bool, []error) -} - -type TickServiceTemplate struct { - ServiceTemplate - tickImpl TickImpl - ticker *time.Ticker -} - -type TickServiceConfigs struct { - ServiceConfigs - PollInterval time.Duration -} - -func InitTickServiceTemplate( - cfg *TickServiceConfigs, - tmpl *TickServiceTemplate, - lifecycleImpl LifecycleImpl, - tickImpl TickImpl, -) error { - if cfg == nil || tmpl == nil || tickImpl == nil { - return ErrInvalid - } - - err := InitServiceTemplate(&cfg.ServiceConfigs, &tmpl.ServiceTemplate, lifecycleImpl) - if err != nil { - return err - } - - tmpl.tickImpl = tickImpl - - // ticker - if cfg.PollInterval < 0 { - return fmt.Errorf("PollInterval must be non-negative, got %v", cfg.PollInterval) - } - if cfg.PollInterval == 0 { - cfg.PollInterval = time.Minute - } - tmpl.ticker = time.NewTicker(cfg.PollInterval) - - return nil -} - -func (s *TickServiceTemplate) tick(ctx context.Context) bool { - if ctx.Err() != nil { - return false - } - start := time.Now() - reschedule, errs := s.tickImpl.Tick(ctx) - elapsed := time.Since(start) - - if len(errs) > 0 { - s.Logger.Error("Tick", - "duration", elapsed, - "reschedule", reschedule, - "error", errs, - ) - } else { - s.Logger.Debug("Tick", - "duration", elapsed, - "reschedule", reschedule, - ) - } - return reschedule -} - -func (s *TickServiceTemplate) OnStop(bool) []error { - s.ticker.Stop() - return nil -} - -func (s *TickServiceTemplate) OnServe(ctx context.Context) error { - if ctx.Err() != nil { - return nil - } - for s.tick(ctx) {} - for { - select { - case <-ctx.Done(): - return nil - case <-s.ticker.C: - for s.tick(ctx) {} - } - } +func LogConfig(logger *slog.Logger, config any) { + logger.Info("Starting service", "config", config) } /* @@ -414,52 +153,3 @@ func NewLogger(level slog.Level, color bool) *slog.Logger { func NewServiceLogger(c *ServiceConfigs) *slog.Logger { return NewLogger(c.LogLevel, c.LogColor).With("service", c.Name) } - -/* - * 
Service Telemetry - */ - -func (s *ServiceTemplate) CreateDefaultTelemetry(addr string) (*http.Server, func() error) { - s.ServeMux.Handle("/readyz", http.HandlerFunc(s.ReadyHandler)) - s.ServeMux.Handle("/livez", http.HandlerFunc(s.AliveHandler)) - - // Telemetry deliberately omits RequestIDMiddleware. /livez and /readyz are - // hit every few seconds per pod per service by orchestrators like - // Kubernetes; burning a crypto/rand UUID per probe is measurable overhead - // for 1-byte idempotent responses that have nothing to correlate against. - // RecoverMiddleware is kept so panics still become clean 500s. - // A static request ID is set so panic logs show "telemetry" instead of "". - handler := RecoverMiddleware(s.Logger)(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { - w.Header().Set(requestIDHeader, "telemetry") - s.ServeMux.ServeHTTP(w, r) - })) - server := NewHTTPServer(addr, handler, DefaultTelemetryOptions(), s.Logger) - StartupBindWarning(s.Logger, s.Name+"/telemetry", addr) - - return server, func() error { - if err := server.ListenAndServe(); !errors.Is(err, http.ErrServerClosed) { - return err - } - return nil - } -} - -// HTTP handler for `/s.Name/readyz` that exposes the value of Ready() -func (s *ServiceTemplate) ReadyHandler(w http.ResponseWriter, r *http.Request) { - if !s.lifecycleImpl.Ready() { - http.Error(w, s.Name+": ready check failed", - http.StatusInternalServerError) - } else { - fmt.Fprintf(w, "%s: ready\n", s.Name) - } -} - -// HTTP handler for `/s.Name/livez` that exposes the value of Alive() -func (s *ServiceTemplate) AliveHandler(w http.ResponseWriter, r *http.Request) { - if !s.lifecycleImpl.Alive() { - http.Error(w, s.Name+": alive check failed", - http.StatusInternalServerError) - } else { - fmt.Fprintf(w, "%s: alive\n", s.Name) - } -} diff --git a/pkg/service/service_test.go b/pkg/service/service_test.go index 68842f0c0..3de18f111 100644 --- a/pkg/service/service_test.go +++ b/pkg/service/service_test.go @@ 
-5,205 +5,25 @@ package service import ( "context" + "errors" "log/slog" - "sync/atomic" - "testing" "time" - - "github.com/stretchr/testify/require" - "github.com/stretchr/testify/suite" ) -// mockImpl is a minimal ServiceImpl for testing the Serve() loop. -type mockImpl struct { - TickServiceTemplate - tickCount atomic.Int32 - onTick func(n int32) bool // called on each Tick with the tick count (1-based) -} - -func (m *mockImpl) OnReload() []error { return nil } -func (m *mockImpl) OnStop(bool) []error { return nil } -func (m *mockImpl) Tick(ctx context.Context) (bool, []error) { - n := m.tickCount.Add(1) - reschedule := false - if m.onTick != nil { - reschedule = m.onTick(n) - } - return reschedule, nil -} - -// createTestService creates a Service for testing with the given mock and -// optional reschedule support. It uses a long poll interval so timer ticks -// do not interfere with test assertions. -func createTestService( - t *testing.T, - impl *mockImpl, -) (IService, context.CancelFunc) { - t.Helper() - ctx, cancel := context.WithCancel(context.Background()) - err := InitTickServiceTemplate(&TickServiceConfigs{ - ServiceConfigs: ServiceConfigs{ - Name: "test", - LogLevel: slog.LevelError, - Context: ctx, - Cancel: cancel, - }, - PollInterval: 10 * time.Minute, // long: tests control wakeup explicitly - }, &impl.TickServiceTemplate, impl, impl) - require.NoError(t, err) - return impl, cancel -} - -type ServeSuite struct { - suite.Suite -} - -func TestServe(t *testing.T) { - suite.Run(t, new(ServeSuite)) -} - -func TestCreateRejectsNegativePollInterval(t *testing.T) { - impl := &mockImpl{} - svc := &TickServiceTemplate{} - - require.NotPanics(t, func() { - err := InitTickServiceTemplate(&TickServiceConfigs{ - ServiceConfigs: ServiceConfigs{ - Name: "test-negative-poll", - LogLevel: slog.LevelError, - }, - PollInterval: -time.Second, - }, svc, impl, impl) - require.ErrorContains(t, err, "PollInterval must be non-negative") - }) - require.Nil(t, svc.ticker) -} 
- -func (s *ServeSuite) TestDisabledReschedulePreservesExistingBehavior() { - // With rescheduling disabled and a short poll interval, - // Serve() should tick only on timer fires. - impl := &mockImpl{} - ctx, cancel := context.WithCancel(context.Background()) - err := InitTickServiceTemplate(&TickServiceConfigs{ - ServiceConfigs: ServiceConfigs{ - Name: "test-no-resched", - LogLevel: slog.LevelError, - Context: ctx, - Cancel: cancel, - }, - PollInterval: 20 * time.Millisecond, - }, &impl.TickServiceTemplate, impl, impl) - s.Require().NoError(err) - - done := make(chan struct{}) - go func() { - _ = impl.Serve() - close(done) - }() - - // Let a few timer ticks fire. - time.Sleep(90 * time.Millisecond) - cancel() - <-done - - // The initial tick + ~3-4 timer ticks at 20ms intervals over 90ms. - // We just verify it ticked more than once (timer is working) and - // not an unreasonable number (no busy-loop). - ticks := impl.tickCount.Load() - s.GreaterOrEqual(ticks, int32(2), "should have at least 2 ticks (initial + timer)") - s.LessOrEqual(ticks, int32(10), "should not have an unreasonable number of ticks") -} - -func (s *ServeSuite) TestRescheduleTriggersImmediateRetick() { - // When SignalReschedule() is called from Tick(), Serve() should call - // Tick() again immediately without waiting for the timer. - var impl *mockImpl - impl = &mockImpl{ - onTick: func(n int32) bool { - // Signal reschedule on ticks 1 and 2 (the initial tick - // and the first rescheduled tick). Stop on tick 3. - return n <= 2 - }, - } - - svc, cancel := createTestService(s.T(), impl) - defer cancel() - - done := make(chan struct{}) - go func() { - _ = svc.Serve() - close(done) - }() - - // Wait briefly. With a 10-minute poll interval, the only way to get - // 3 ticks quickly is via SignalReschedule. 
- time.Sleep(100 * time.Millisecond) - cancel() - <-done - - ticks := impl.tickCount.Load() - s.GreaterOrEqual(ticks, int32(3), - "should have at least 3 ticks: initial + 2 rescheduled") -} - -func (s *ServeSuite) TestContextCancellationExitsPromptly() { - // When context is cancelled with a reschedule signal pending, - // Serve() should exit promptly. - var impl *mockImpl - impl = &mockImpl{ - onTick: func(_ int32) bool { - return true - }, - } - - svc, cancel := createTestService(s.T(), impl) - - done := make(chan struct{}) - go func() { - _ = svc.Serve() - close(done) - }() - - // Let the initial tick fire and signal reschedule. - time.Sleep(20 * time.Millisecond) - cancel() - - // Serve() should exit promptly. - select { - case <-done: - // OK - case <-time.After(2 * time.Second): - s.Fail("Serve() did not exit within 2 seconds after context cancellation") - } -} - -func (s *ServeSuite) TestServeExitsOnContextCancelledBeforeFirstTick() { - impl := &mockImpl{} - - // Create the service with a live context, then cancel before Serve(). - svc, cancel := createTestService(s.T(), impl) - cancel() - - err := svc.Serve() - s.NoError(err) - // No ticks should have fired since context was already cancelled. 
- s.Equal(int32(0), impl.tickCount.Load()) -} - type delayedCloseImpl struct { ServiceTemplate onServeInitChan chan struct{} onStopInitChan chan struct{} } -func (s *delayedCloseImpl) OnStop(bool) []error { +func (s *delayedCloseImpl) Stop(bool) []error { <-s.onStopInitChan // wait signal to initiate stop return nil } -func (s *delayedCloseImpl) OnServe(ctx context.Context) error { +func (s *delayedCloseImpl) Serve() []error { close(s.onServeInitChan) // signal service was initiated - <-ctx.Done() + <-s.Supervisor.Context().Done() return nil } @@ -217,12 +37,16 @@ func (s *ServeSuite) TestServeExitsAfterStopIsComplete() { err := InitServiceTemplate(&ServiceConfigs{ Name: "stopOnChanClose", LogLevel: slog.LevelError, - }, &svc.ServiceTemplate, svc) + }, &svc.ServiceTemplate) + + supervisorcfg := &SupervisorConfigs{ Services: []ServiceImpl{ svc } } + supervisor := &ServicesSupervisor{} + err = InitServicesSupervisor(supervisorcfg, supervisor) s.NoError(err) onServeEndChan := make(chan error) go func() { - err = svc.Serve() + err = errors.Join(supervisor.Serve()...) onServeEndChan <- err // signal service ended and provide error close(onServeEndChan) }() @@ -232,7 +56,7 @@ func (s *ServeSuite) TestServeExitsAfterStopIsComplete() { case <-svc.onServeInitChan: // wait service to initiate, so can be stopped. // initiate service shutdown through context cancelation go func() { - errs := svc.Stop(true) + errs := supervisor.Stop(true) onStopEndChan <- errs // signal stop ended and provide the errors close(onStopEndChan) }() @@ -275,20 +99,25 @@ func (s *ServeSuite) TestServeExitsAfterStopIsCompleteOnContextCancelation() { onStopInitChan: make(chan struct{}), } - ctx, cancel := context.WithCancel(context.Background()) - // Create the service with a live context, then cancel before Serve(). 
err := InitServiceTemplate(&ServiceConfigs{ Name: "stopOnChanClose", LogLevel: slog.LevelError, - Context: ctx, - Cancel: cancel, - }, &svc.ServiceTemplate, svc) + }, &svc.ServiceTemplate) + + ctx, cancel := context.WithCancel(context.Background()) + supervisorcfg := &SupervisorConfigs{ + Services: []ServiceImpl{ svc }, + Context: ctx, + Cancel: cancel, + } + supervisor := &ServicesSupervisor{} + err = InitServicesSupervisor(supervisorcfg, supervisor) s.NoError(err) onServeEndChan := make(chan error) go func() { - err = svc.Serve() + err = errors.Join(supervisor.Serve()...) onServeEndChan <- err // signal service ended and provide error close(onServeEndChan) }() diff --git a/pkg/service/supervisor.go b/pkg/service/supervisor.go new file mode 100644 index 000000000..da1040c82 --- /dev/null +++ b/pkg/service/supervisor.go @@ -0,0 +1,329 @@ +// (c) Cartesi and individual authors (see AUTHORS) +// SPDX-License-Identifier: Apache-2.0 (see LICENSE) + +package service + +import ( + "context" + "errors" + "fmt" + "log/slog" + "net/http" + "os" + "os/signal" + "slices" + "sync/atomic" + "syscall" + "time" +) + +type SupervisorConfigs struct { + Name string + Services []ServiceImpl + Logger *slog.Logger + LogLevel slog.Level + LogColor bool + Context context.Context + Cancel context.CancelFunc + EnableSignalHandling bool + TelemetryCreate bool + TelemetryAddress string +} + +// Supervisor of multiple services under a single management. +type ServicesSupervisor struct { + Name string + Logger *slog.Logger + ServeMux *http.ServeMux + services []IService + telemetry *http.Server + telemetryFunc func() error + context context.Context + cancelContext context.CancelFunc + sigHangUp chan os.Signal // SIGHUP to reload + sigShutdown chan os.Signal // SIGINT/SIGTERM to exit gracefully + + // stopped server Stop() run exactly once, even when Stop() is called + // multiple times (by the child's Serve() loop and by the parent orchestrator). 
+ stopped atomic.Bool + stoppedChan chan struct{} +} + +func InitServicesSupervisor(c *SupervisorConfigs, s *ServicesSupervisor) error { + s.stoppedChan = make(chan struct{}) + + s.Logger = c.Logger + + // log + if s.Logger == nil { + s.Logger = NewLogger(c.LogLevel, c.LogColor).With("supervisor", c.Name) + } + + // context and cancelation + if c.Context == nil { + c.Context = context.Background() + } + s.context = c.Context + if c.Cancel == nil { + s.context, c.Cancel = context.WithCancel(c.Context) + } + s.cancelContext = c.Cancel + + // signal handling + if c.EnableSignalHandling { + if s.sigHangUp == nil { + s.sigHangUp = make(chan os.Signal, 1) + signal.Notify(s.sigHangUp, syscall.SIGHUP) + } + if s.sigShutdown == nil { + s.sigShutdown = make(chan os.Signal, 1) + signal.Notify(s.sigShutdown, syscall.SIGINT, syscall.SIGTERM) + } + } + + // telemetry + if c.TelemetryCreate { + if s.ServeMux == nil { + s.ServeMux = http.NewServeMux() + } + if c.TelemetryAddress == "" { + c.TelemetryAddress = ":8080" + } + s.telemetry, s.telemetryFunc = s.CreateDefaultTelemetry(c.TelemetryAddress) + go func() { + if err := s.telemetryFunc(); err != nil { + s.Logger.Error("Telemetry HTTP server failed", "error", err) + } + }() + } + + s.services = make([]IService, len(c.Services)) + for i, svc := range c.Services { + s.services[i] = svc + svc.SetSupervisor(s) + } + + if s.telemetry != nil { + s.Logger.Info("Telemetry", "address", s.telemetry.Addr) + } + + return nil +} + +func (s *ServicesSupervisor) Context() context.Context { + return s.context +} + +func (s *ServicesSupervisor) String() string { + return s.Name +} + +func (s *ServicesSupervisor) Alive() bool { + for _, svc := range s.services { + if !svc.Alive() { + s.Logger.Info("Service still not alive", "service", svc.String()) + return false + } + } + return true +} + +func (s *ServicesSupervisor) Ready() bool { + for _, svc := range s.services { + if !svc.Ready() { + s.Logger.Info("Service still not ready", "service", 
svc.String()) + return false + } + } + return true +} + +func (s *ServicesSupervisor) Reload() []error { + if s.stopped.Load() { + return []error{ErrServiceStopped} + } + + var allErrs []error + for _, svc := range s.services { + start := time.Now() + errs := svc.Reload() + elapsed := time.Since(start) + + if len(errs) > 0 { + allErrs = slices.Concat(allErrs, errs) + + s.Logger.Error("Reload", + "service", svc.String(), + "duration", elapsed, + "errors", errs) + } else { + s.Logger.Info("Reload", + "service", svc.String(), + "duration", elapsed) + } + } + + return allErrs +} + +func (s *ServicesSupervisor) Serve() []error { + if s.stopped.Load() { + return nil + } + + // Watch for conditions to shutdown services. + go func() { + for !s.stopped.Load() { + select { + case <-s.sigHangUp: + err := s.Reload() + if err != nil { + s.Logger.Error("Service failed to restart", "error", err) + } + case <-s.sigShutdown: + s.Stop(false) // Graceful shutdown; errors are logged by Stop. + return + case <-s.context.Done(): + s.Stop(true) // Stop logs errors internally. 
+ return + } + } + }() + + svcEndChan := make(chan []error) + + svcCount := 0 + for _, svc := range s.services { + svcCount++ + go func(svc IService) { + var errs []error + + defer func() { svcEndChan <- errs }() + + s.Logger.Info("Starting service", "service", svc.String()) + for _, err := range svc.Serve() { + if err != nil && !errors.Is(err, context.Canceled) { + errs = append(errs, err) + } + } + switch { + case len(errs) > 0: + s.Logger.Error("Service failed, stopping other services", + "service", svc.String(), + "errors", errs, + ) + s.Stop(false) + case s.stopped.Load(): + s.Logger.Info("Service stopped", + "service", svc.String(), + ) + default: + s.Logger.Warn("Service stopped before supervisor termination", + "service", svc.String(), + ) + } + }(svc) + } + + var allErrs []error + for range svcCount { + allErrs = slices.Concat(allErrs, <-svcEndChan) + } + + go s.Stop(true) + <-s.stoppedChan + + return allErrs +} + +func (s *ServicesSupervisor) Stop(force bool) []error { + // CAS achieves once-semantics: the second caller returns immediately + // (fire-and-forget) rather than blocking like sync.Once. This is safe + // because the orchestrator calls Cancel() after Stop() and waits for + // the Serve goroutine to exit. 
+ if !s.stopped.CompareAndSwap(false, true) { + return nil // already stopped + } + + if s.sigShutdown != nil { + signal.Stop(s.sigShutdown) + } + if s.sigHangUp != nil { + signal.Stop(s.sigHangUp) + } + + var allErrs []error + for i := len(s.services)-1; i >= 0; i-- { + svc := s.services[i] + start := time.Now() + errs := svc.Stop(force) + elapsed := time.Since(start) + + if len(errs) > 0 { + s.Logger.Error("Stop", + "force", force, + "duration", elapsed, + "error", errs) + } else { + s.Logger.Info("Stop", + "force", force, + "duration", elapsed) + } + + allErrs = slices.Concat(allErrs, errs) + } + + s.cancelContext() + close(s.stoppedChan) + + return allErrs +} + +/* + * Service Telemetry + */ + +func (s *ServicesSupervisor) CreateDefaultTelemetry(addr string) (*http.Server, func() error) { + s.ServeMux.Handle("/readyz", http.HandlerFunc(s.ReadyHandler)) + s.ServeMux.Handle("/livez", http.HandlerFunc(s.AliveHandler)) + + // Telemetry deliberately omits RequestIDMiddleware. /livez and /readyz are + // hit every few seconds per pod per service by orchestrators like + // Kubernetes; burning a crypto/rand UUID per probe is measurable overhead + // for 1-byte idempotent responses that have nothing to correlate against. + // RecoverMiddleware is kept so panics still become clean 500s. + // A static request ID is set so panic logs show "telemetry" instead of "". 
+ handler := RecoverMiddleware(s.Logger)(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + w.Header().Set(requestIDHeader, "telemetry") + s.ServeMux.ServeHTTP(w, r) + })) + server := NewHTTPServer(addr, handler, DefaultTelemetryOptions(), s.Logger) + StartupBindWarning(s.Logger, s.Name+"/telemetry", addr) + + return server, func() error { + if err := server.ListenAndServe(); !errors.Is(err, http.ErrServerClosed) { + return err + } + return nil + } +} + +// HTTP handler for `/s.Name/readyz` that exposes the value of Ready() +func (s *ServicesSupervisor) ReadyHandler(w http.ResponseWriter, r *http.Request) { + if !s.Ready() { + http.Error(w, s.Name+": ready check failed", + http.StatusInternalServerError) + } else { + fmt.Fprintf(w, "%s: ready\n", s.Name) + } +} + +// HTTP handler for `/s.Name/livez` that exposes the value of Alive() +func (s *ServicesSupervisor) AliveHandler(w http.ResponseWriter, r *http.Request) { + if !s.Alive() { + http.Error(w, s.Name+": alive check failed", + http.StatusInternalServerError) + } else { + fmt.Fprintf(w, "%s: alive\n", s.Name) + } +} diff --git a/pkg/service/supervisor_test.go b/pkg/service/supervisor_test.go new file mode 100644 index 000000000..c81e80ff7 --- /dev/null +++ b/pkg/service/supervisor_test.go @@ -0,0 +1,256 @@ +// (c) Cartesi and individual authors (see AUTHORS) +// SPDX-License-Identifier: Apache-2.0 (see LICENSE) + +package service + +import ( + "errors" + "fmt" + "log/slog" + "sync" + "testing" + "time" + + "github.com/stretchr/testify/require" + "github.com/stretchr/testify/suite" +) + +type blockingChildImpl struct { + ServiceTemplate + started chan struct{} + done chan struct{} + once sync.Once +} + +func (c *blockingChildImpl) Serve() []error { + close(c.started) + <-c.Supervisor.Context().Done() + c.once.Do(func() { close(c.done) }) + return nil +} + +func createBlockingChild(t *testing.T, cfg *SupervisorConfigs, name string) *blockingChildImpl { + t.Helper() + childCfg := ServiceConfigs{ + 
Name: name, + LogLevel: cfg.LogLevel, + } + + child := &blockingChildImpl{ + started: make(chan struct{}), + done: make(chan struct{}), + } + require.NoError(t, InitServiceTemplate(&childCfg, &child.ServiceTemplate)) + return child +} + +type SupervisorSuite struct { + suite.Suite +} + +func TestSupervisor(t *testing.T) { + suite.Run(t, new(SupervisorSuite)) +} + +func (s *SupervisorSuite) TestNodeStopCancelsChildContexts() { + parentCfg := &SupervisorConfigs{ + Name: "node", + LogLevel: slog.LevelError, + } + + child1 := createBlockingChild(s.T(), parentCfg, "child-1") + child2 := createBlockingChild(s.T(), parentCfg, "child-2") + + parentCfg.Services = []ServiceImpl{child1, child2} + supervisor := &ServicesSupervisor{} + require.NoError(s.T(), InitServicesSupervisor(parentCfg, supervisor)) + + done := make(chan struct{}) + go func() { + _ = supervisor.Serve() + close(done) + }() + + select { + case <-child1.started: + case <-time.After(2 * time.Second): + s.Fail("child-1 did not start") + } + select { + case <-child2.started: + case <-time.After(2 * time.Second): + s.Fail("child-2 did not start") + } + + supervisor.Stop(false) + + select { + case <-child1.done: + case <-time.After(2 * time.Second): + s.Fail("child-1 did not observe ctx.Done()") + } + select { + case <-child2.done: + case <-time.After(2 * time.Second): + s.Fail("child-2 did not observe ctx.Done()") + } + + select { + case <-done: + case <-time.After(2 * time.Second): + s.Fail("node did not exit after Stop()") + } +} + +type errorChildImpl struct { + ServiceTemplate + started chan struct{} +} + +func (c *errorChildImpl) Serve() []error { + close(c.started) + time.Sleep(10 * time.Millisecond) + return []error{ fmt.Errorf("Oops %s!", c.Name) } +} + +func createErrorChild(t *testing.T, cfg *SupervisorConfigs, name string) *errorChildImpl { + t.Helper() + childCfg := &ServiceConfigs{ + Name: name, + LogLevel: cfg.LogLevel, + } + + child := &errorChildImpl{ + started: make(chan struct{}), + } + 
require.NoError(t, InitServiceTemplate(childCfg, &child.ServiceTemplate)) + return child +} + +type stopAwareChildImpl struct { + ServiceTemplate + started chan struct{} + stopped chan struct{} + done chan struct{} + stopOnce sync.Once + doneOnce sync.Once +} + +func (c *stopAwareChildImpl) Serve() []error { + close(c.started) + <-c.Supervisor.Context().Done() + c.doneOnce.Do(func() { close(c.done) }) + return nil +} + +func (c *stopAwareChildImpl) Stop(bool) []error { + c.stopOnce.Do(func() { close(c.stopped) }) + return nil +} + +func createStopAwareChild(t *testing.T, cfg *SupervisorConfigs, name string) *stopAwareChildImpl { + t.Helper() + childCfg := &ServiceConfigs{ + Name: name, + LogLevel: cfg.LogLevel, + } + + child := &stopAwareChildImpl{ + started: make(chan struct{}), + stopped: make(chan struct{}), + done: make(chan struct{}), + } + require.NoError(t, InitServiceTemplate(childCfg, &child.ServiceTemplate)) + return child +} + +func (s *SupervisorSuite) TestNodeReturnChildErrors() { + parentCfg := &SupervisorConfigs{ + Name: "node", + LogLevel: slog.LevelError, + } + + child1 := createErrorChild(s.T(), parentCfg, "child-1") + child2 := createErrorChild(s.T(), parentCfg, "child-2") + + parentCfg.Services = []ServiceImpl{child1, child2} + supervisor := &ServicesSupervisor{} + require.NoError(s.T(), InitServicesSupervisor(parentCfg, supervisor)) + + done := make(chan error) + go func() { + errs := supervisor.Serve() + done <- errors.Join(errs...) 
+ close(done) + }() + + select { + case <-child1.started: + case <-time.After(2 * time.Second): + s.Fail("child-1 did not start") + } + select { + case <-child2.started: + case <-time.After(2 * time.Second): + s.Fail("child-2 did not start") + } + + select { + case err := <-done: + s.ErrorContains(err, "Oops child-1!") + s.ErrorContains(err, "Oops child-2!") + case <-time.After(2 * time.Second): + s.Fail("node did not exit after child errors") + } +} + +func (s *SupervisorSuite) TestNodeStopsChildrenWhenOneChildErrors() { + parentCfg := &SupervisorConfigs{ + Name: "node", + LogLevel: slog.LevelError, + } + + healthyChild := createStopAwareChild(s.T(), parentCfg, "healthy-child") + errorChild := createErrorChild(s.T(), parentCfg, "error-child") + + parentCfg.Services = []ServiceImpl{healthyChild, errorChild} + supervisor := &ServicesSupervisor{} + require.NoError(s.T(), InitServicesSupervisor(parentCfg, supervisor)) + + done := make(chan error) + go func() { + errs := supervisor.Serve() + done <- errors.Join(errs...) 
+ close(done) + }() + + select { + case <-healthyChild.started: + case <-time.After(2 * time.Second): + s.Fail("healthy child did not start") + } + select { + case <-errorChild.started: + case <-time.After(2 * time.Second): + s.Fail("error child did not start") + } + + select { + case <-healthyChild.stopped: + case <-time.After(2 * time.Second): + s.Fail("supervisor did not stop the healthy child after the error") + } + + select { + case <-healthyChild.done: + case <-time.After(2 * time.Second): + s.Fail("healthy child did not exit after stop") + } + + select { + case err := <-done: + s.ErrorContains(err, "Oops error-child!") + case <-time.After(2 * time.Second): + s.Fail("supervisor did not exit after child error") + } +} diff --git a/pkg/service/telemetry_test.go b/pkg/service/telemetry_test.go index 801d7e56e..8feb1a99f 100644 --- a/pkg/service/telemetry_test.go +++ b/pkg/service/telemetry_test.go @@ -4,7 +4,6 @@ package service import ( - "context" "io" "net/http" "net/http/httptest" @@ -15,19 +14,22 @@ import ( // newTelemetryTestService returns a *Service ready to have CreateDefaultTelemetry // called on it. It wires a ServeMux, a mockImpl, and a discard logger. 
-func newTelemetryTestService() *ServiceTemplate { - impl := &mockImpl{} - return &ServiceTemplate{ - Name: "test", - Logger: discardLogger(), - ServeMux: http.NewServeMux(), - lifecycleImpl: impl, +func newTelemetryTestService(t *testing.T) *ServicesSupervisor { + cfg := &SupervisorConfigs{ + Name: "test", + Logger: discardLogger(), + TelemetryCreate: true, + TelemetryAddress: ":0", } + supervisor := &ServicesSupervisor{} + err := InitServicesSupervisor(cfg, supervisor) + require.NoError(t, err) + return supervisor } func TestCreateDefaultTelemetry_Hardened(t *testing.T) { - s := newTelemetryTestService() - srv, _ := s.CreateDefaultTelemetry(":0") + s := newTelemetryTestService(t) + srv := s.telemetry opts := DefaultTelemetryOptions() require.Equal(t, opts.ReadHeaderTimeout, srv.ReadHeaderTimeout) @@ -39,8 +41,8 @@ func TestCreateDefaultTelemetry_Hardened(t *testing.T) { } func TestCreateDefaultTelemetry_HandlersWired(t *testing.T) { - s := newTelemetryTestService() - srv, _ := s.CreateDefaultTelemetry(":0") + s := newTelemetryTestService(t) + srv := s.telemetry // /readyz: mockImpl.Ready() is true, so expect 200. rr := httptest.NewRecorder() @@ -61,8 +63,8 @@ func TestCreateDefaultTelemetry_HandlersWired(t *testing.T) { // against. A static "telemetry" sentinel is used instead so panic logs are // greppable without the cost of crypto/rand per probe. 
func TestCreateDefaultTelemetry_StaticRequestID(t *testing.T) { - s := newTelemetryTestService() - srv, _ := s.CreateDefaultTelemetry(":0") + s := newTelemetryTestService(t) + srv := s.telemetry for _, path := range []string{"/livez", "/readyz"} { rr := httptest.NewRecorder() @@ -82,12 +84,12 @@ func TestCreateDefaultTelemetry_StaticRequestID(t *testing.T) { } func TestCreateDefaultTelemetry_PanicRecovered(t *testing.T) { - s := newTelemetryTestService() + s := newTelemetryTestService(t) s.ServeMux.Handle("/boom", http.HandlerFunc(func(_ http.ResponseWriter, _ *http.Request) { panic("kaboom") })) - srv, _ := s.CreateDefaultTelemetry(":0") + srv := s.telemetry rr := httptest.NewRecorder() srv.Handler.ServeHTTP(rr, httptest.NewRequest(http.MethodGet, "/boom", nil)) @@ -100,23 +102,29 @@ func TestCreateDefaultTelemetry_PanicRecovered(t *testing.T) { type falseLifecycleImpl struct{ ServiceTemplate } -func (*falseLifecycleImpl) Alive() bool { return false } -func (*falseLifecycleImpl) Ready() bool { return false } -func (*falseLifecycleImpl) OnServe(context.Context) error { return nil } +func (*falseLifecycleImpl) Alive() bool { return false } +func (*falseLifecycleImpl) Ready() bool { return false } +func (*falseLifecycleImpl) Serve() []error { return nil } func TestCreateDefaultTelemetry_Returns500WhenLifecycleFails(t *testing.T) { - service := &ServiceTemplate{ - Name: "test", - Logger: discardLogger(), - ServeMux: http.NewServeMux(), - lifecycleImpl: &falseLifecycleImpl{}, + svc := &falseLifecycleImpl{} + err := InitServiceTemplate(&ServiceConfigs{}, &svc.ServiceTemplate) + require.NoError(t, err) + + cfg := &SupervisorConfigs{ + Name: "test", + Logger: discardLogger(), + TelemetryCreate: true, + TelemetryAddress: ":0", + Services: []ServiceImpl{ svc }, } - - srv, _ := service.CreateDefaultTelemetry(":0") + supervisor := &ServicesSupervisor{} + err = InitServicesSupervisor(cfg, supervisor) + require.NoError(t, err) for _, path := range []string{"/readyz", "/livez"} 
{ rr := httptest.NewRecorder() - srv.Handler.ServeHTTP(rr, httptest.NewRequest(http.MethodGet, path, nil)) + supervisor.telemetry.Handler.ServeHTTP(rr, httptest.NewRequest(http.MethodGet, path, nil)) require.Equal(t, http.StatusInternalServerError, rr.Code, "path=%s", path) } } diff --git a/pkg/service/tick.go b/pkg/service/tick.go new file mode 100644 index 000000000..1c586b80d --- /dev/null +++ b/pkg/service/tick.go @@ -0,0 +1,97 @@ +// (c) Cartesi and individual authors (see AUTHORS) +// SPDX-License-Identifier: Apache-2.0 (see LICENSE) + +/* + * Alternative service template that implements the tick-based processing. + */ + +package service + +import ( + "context" + "time" +) + +type TickImpl interface { + Tick(ctx context.Context) (bool, []error) +} + +type TickServiceTemplate struct { + ServiceTemplate + tickImpl TickImpl + ticker *time.Ticker +} + +type TickServiceConfigs struct { + ServiceConfigs + PollInterval time.Duration +} + +func InitTickServiceTemplate( + cfg *TickServiceConfigs, + tmpl *TickServiceTemplate, + tickImpl TickImpl, +) error { + if cfg == nil || tmpl == nil || tickImpl == nil { + return ErrInvalid + } + + err := InitServiceTemplate(&cfg.ServiceConfigs, &tmpl.ServiceTemplate) + if err != nil { + return err + } + + tmpl.tickImpl = tickImpl + + // ticker + if cfg.PollInterval == 0 { + cfg.PollInterval = time.Minute + } + tmpl.ticker = time.NewTicker(cfg.PollInterval) + + return nil +} + +func (s *TickServiceTemplate) tick(ctx context.Context) bool { + if ctx.Err() != nil { + return false + } + start := time.Now() + reschedule, errs := s.tickImpl.Tick(ctx) + elapsed := time.Since(start) + + if len(errs) > 0 { + s.Logger.Error("Tick", + "duration", elapsed, + "reschedule", reschedule, + "error", errs, + ) + } else { + s.Logger.Debug("Tick", + "duration", elapsed, + "reschedule", reschedule, + ) + } + return reschedule +} + +func (s *TickServiceTemplate) Stop(bool) []error { + s.ticker.Stop() + return nil +} + +func (s *TickServiceTemplate) 
Serve() []error { + ctx := s.Supervisor.context + if ctx.Err() != nil { + return nil + } + for s.tick(ctx) {} + for { + select { + case <-ctx.Done(): + return nil + case <-s.ticker.C: + for s.tick(ctx) {} + } + } +} diff --git a/pkg/service/tick_test.go b/pkg/service/tick_test.go new file mode 100644 index 000000000..4b4f521c2 --- /dev/null +++ b/pkg/service/tick_test.go @@ -0,0 +1,173 @@ +// (c) Cartesi and individual authors (see AUTHORS) +// SPDX-License-Identifier: Apache-2.0 (see LICENSE) + +package service + +import ( + "context" + "log/slog" + "sync/atomic" + "testing" + "time" + + "github.com/stretchr/testify/require" + "github.com/stretchr/testify/suite" +) + +// mockImpl is a minimal ServiceImpl for testing the Serve() loop. +type mockImpl struct { + TickServiceTemplate + tickCount atomic.Int32 + onTick func(n int32) bool // called on each Tick with the tick count (1-based) +} + +func (m *mockImpl) Tick(ctx context.Context) (bool, []error) { + n := m.tickCount.Add(1) + reschedule := false + if m.onTick != nil { + reschedule = m.onTick(n) + } + return reschedule, nil +} + +// createTestService creates a Service for testing with the given mock and +// optional reschedule support. It uses a long poll interval so timer ticks +// do not interfere with test assertions. 
+func createTestService( + t *testing.T, + impl *mockImpl, + interval time.Duration, +) IService { + t.Helper() + + err := InitTickServiceTemplate(&TickServiceConfigs{ + ServiceConfigs: ServiceConfigs{ + Name: "test", + LogLevel: slog.LevelError, + }, + PollInterval: interval, + }, &impl.TickServiceTemplate, impl) + require.NoError(t, err) + + supervisor := &ServicesSupervisor{} + err = InitServicesSupervisor( + &SupervisorConfigs{ + Name: "supervisor", + Services: []ServiceImpl{ impl }, + }, + supervisor, + ) + require.NoError(t, err) + + return supervisor +} + +type ServeSuite struct { + suite.Suite +} + +func TestServe(t *testing.T) { + suite.Run(t, new(ServeSuite)) +} + +func (s *ServeSuite) TestDisabledReschedulePreservesExistingBehavior() { + // With rescheduling disabled and a short poll interval, + // Serve() should tick only on timer fires. + impl := &mockImpl{} + + svc := createTestService(s.T(), impl, 20 * time.Millisecond) + + done := make(chan struct{}) + go func() { + _ = svc.Serve() + close(done) + }() + + // Let a few timer ticks fire. + time.Sleep(90 * time.Millisecond) + svc.Stop(true) + <-done + + // The initial tick + ~3-4 timer ticks at 20ms intervals over 90ms. + // We just verify it ticked more than once (timer is working) and + // not an unreasonable number (no busy-loop). + ticks := impl.tickCount.Load() + s.GreaterOrEqual(ticks, int32(2), "should have at least 2 ticks (initial + timer)") + s.LessOrEqual(ticks, int32(10), "should not have an unreasonable number of ticks") +} + +func (s *ServeSuite) TestRescheduleTriggersImmediateRetick() { + // When SignalReschedule() is called from Tick(), Serve() should call + // Tick() again immediately without waiting for the timer. + var impl *mockImpl + impl = &mockImpl{ + onTick: func(n int32) bool { + // Signal reschedule on ticks 1 and 2 (the initial tick + // and the first rescheduled tick). Stop on tick 3. 
+ return n <= 2 + }, + } + + svc := createTestService(s.T(), impl, 10 * time.Minute) + defer svc.Stop(true) + + done := make(chan struct{}) + go func() { + _ = svc.Serve() + close(done) + }() + + // Wait briefly. With a 10-minute poll interval, the only way to get + // 3 ticks quickly is via SignalReschedule. + time.Sleep(100 * time.Millisecond) + svc.Stop(true) + <-done + + ticks := impl.tickCount.Load() + s.GreaterOrEqual(ticks, int32(3), + "should have at least 3 ticks: initial + 2 rescheduled") +} + +func (s *ServeSuite) TestContextCancellationExitsPromptly() { + // When context is cancelled with a reschedule signal pending, + // Serve() should exit promptly. + var impl *mockImpl + impl = &mockImpl{ + onTick: func(_ int32) bool { + return true + }, + } + + svc := createTestService(s.T(), impl, 10 * time.Minute) + + done := make(chan struct{}) + go func() { + _ = svc.Serve() + close(done) + }() + + // Let the initial tick fire and signal reschedule. + time.Sleep(20 * time.Millisecond) + svc.Stop(true) + + // Serve() should exit promptly. + select { + case <-done: + // OK + case <-time.After(2 * time.Second): + s.Fail("Serve() did not exit within 2 seconds after context cancellation") + } +} + +func (s *ServeSuite) TestServeExitsOnContextCancelledBeforeFirstTick() { + impl := &mockImpl{} + + // Create the service with a live context, then cancel before Serve(). + svc := createTestService(s.T(), impl, 10 * time.Minute) + svc.Stop(true) + + errs := svc.Serve() + s.Empty(errs) + // No ticks should have fired since context was already cancelled. 
+ s.Equal(int32(0), impl.tickCount.Load()) +} From 6d9faa55a916e67f91a08d2329af8f0014e7dfab Mon Sep 17 00:00:00 2001 From: Renato Maia <1887792+renatomaia@users.noreply.github.com> Date: Fri, 19 Jun 2026 20:12:55 -0300 Subject: [PATCH 13/16] refactor(services): return joined errors instead of '[]error' --- cmd/cartesi-rollups-advancer/root/root.go | 2 +- cmd/cartesi-rollups-claimer/root/root.go | 2 +- cmd/cartesi-rollups-evm-reader/root/root.go | 2 +- cmd/cartesi-rollups-jsonrpc-api/root/root.go | 2 +- cmd/cartesi-rollups-node/root/root.go | 2 +- cmd/cartesi-rollups-prt/root/root.go | 2 +- cmd/cartesi-rollups-validator/root/root.go | 2 +- internal/advancer/advancer_test.go | 28 ++--- internal/advancer/service.go | 13 ++- internal/claimer/accept.go | 17 +-- internal/claimer/accept_test.go | 109 +++++++++---------- internal/claimer/claimer.go | 49 ++++----- internal/claimer/claimer_test.go | 8 +- internal/claimer/divergence_test.go | 4 +- internal/claimer/foreclosed_apps_test.go | 19 +++- internal/claimer/foreclosure.go | 11 +- internal/claimer/inflight_test.go | 21 ++-- internal/claimer/reverts_test.go | 36 +++--- internal/claimer/stage.go | 9 +- internal/claimer/stage_test.go | 34 +++--- internal/claimer/submit.go | 10 +- internal/claimer/submit_test.go | 108 +++++++++--------- internal/cli/cobra.go | 7 -- internal/evmreader/evmreader.go | 4 +- internal/evmreader/evmreader_test.go | 11 +- internal/evmreader/service.go | 2 +- internal/jsonrpc/service.go | 21 ++-- internal/node/node.go | 8 +- internal/prt/service.go | 8 +- internal/validator/validator.go | 6 +- pkg/service/service.go | 10 +- pkg/service/service_test.go | 19 ++-- pkg/service/supervisor.go | 59 +++++----- pkg/service/supervisor_test.go | 17 ++- pkg/service/telemetry_test.go | 2 +- pkg/service/tick.go | 12 +- pkg/service/tick_test.go | 2 +- test/validator/validator_test.go | 16 +-- 38 files changed, 338 insertions(+), 356 deletions(-) diff --git a/cmd/cartesi-rollups-advancer/root/root.go 
b/cmd/cartesi-rollups-advancer/root/root.go index bd802bacf..48b4fd851 100644 --- a/cmd/cartesi-rollups-advancer/root/root.go +++ b/cmd/cartesi-rollups-advancer/root/root.go @@ -114,5 +114,5 @@ func run(cmd *cobra.Command, args []string) { err = service.InitServicesSupervisor(&createInfo.SupervisorConfigs, supervisor) cli.CheckErr(logger, err) - cli.CheckErrs(logger, supervisor.Serve()) + cli.CheckErr(logger, supervisor.Serve()) } diff --git a/cmd/cartesi-rollups-claimer/root/root.go b/cmd/cartesi-rollups-claimer/root/root.go index e77b0b2bb..6a2026602 100644 --- a/cmd/cartesi-rollups-claimer/root/root.go +++ b/cmd/cartesi-rollups-claimer/root/root.go @@ -126,5 +126,5 @@ func run(cmd *cobra.Command, args []string) { err = service.InitServicesSupervisor(&createInfo.SupervisorConfigs, supervisor) cli.CheckErr(logger, err) - cli.CheckErrs(logger, supervisor.Serve()) + cli.CheckErr(logger, supervisor.Serve()) } diff --git a/cmd/cartesi-rollups-evm-reader/root/root.go b/cmd/cartesi-rollups-evm-reader/root/root.go index ac051b956..75fbcf6c6 100644 --- a/cmd/cartesi-rollups-evm-reader/root/root.go +++ b/cmd/cartesi-rollups-evm-reader/root/root.go @@ -127,5 +127,5 @@ func run(cmd *cobra.Command, args []string) { err = service.InitServicesSupervisor(&createInfo.SupervisorConfigs, supervisor) cli.CheckErr(logger, err) - cli.CheckErrs(logger, supervisor.Serve()) + cli.CheckErr(logger, supervisor.Serve()) } diff --git a/cmd/cartesi-rollups-jsonrpc-api/root/root.go b/cmd/cartesi-rollups-jsonrpc-api/root/root.go index b540c3e73..c47aa4eee 100644 --- a/cmd/cartesi-rollups-jsonrpc-api/root/root.go +++ b/cmd/cartesi-rollups-jsonrpc-api/root/root.go @@ -99,5 +99,5 @@ func run(cmd *cobra.Command, args []string) { err = service.InitServicesSupervisor(&createInfo.SupervisorConfigs, supervisor) cli.CheckErr(logger, err) - cli.CheckErrs(logger, supervisor.Serve()) + cli.CheckErr(logger, supervisor.Serve()) } diff --git a/cmd/cartesi-rollups-node/root/root.go 
b/cmd/cartesi-rollups-node/root/root.go index 63252aa20..723870fca 100644 --- a/cmd/cartesi-rollups-node/root/root.go +++ b/cmd/cartesi-rollups-node/root/root.go @@ -187,5 +187,5 @@ func run(cmd *cobra.Command, args []string) { nodeService, err := node.Create(ctx, &createInfo) cli.CheckErr(logger, err) - cli.CheckErrs(logger, nodeService.Serve()) + cli.CheckErr(logger, nodeService.Serve()) } diff --git a/cmd/cartesi-rollups-prt/root/root.go b/cmd/cartesi-rollups-prt/root/root.go index 8f77a81b2..5b40d0ac7 100644 --- a/cmd/cartesi-rollups-prt/root/root.go +++ b/cmd/cartesi-rollups-prt/root/root.go @@ -115,5 +115,5 @@ func run(cmd *cobra.Command, args []string) { err = service.InitServicesSupervisor(&createInfo.SupervisorConfigs, supervisor) cli.CheckErr(logger, err) - cli.CheckErrs(logger, supervisor.Serve()) + cli.CheckErr(logger, supervisor.Serve()) } diff --git a/cmd/cartesi-rollups-validator/root/root.go b/cmd/cartesi-rollups-validator/root/root.go index c0ef3bf4e..76e24d262 100644 --- a/cmd/cartesi-rollups-validator/root/root.go +++ b/cmd/cartesi-rollups-validator/root/root.go @@ -101,5 +101,5 @@ func run(cmd *cobra.Command, args []string) { err = service.InitServicesSupervisor(&createInfo.SupervisorConfigs, supervisor) cli.CheckErr(logger, err) - cli.CheckErrs(logger, supervisor.Serve()) + cli.CheckErr(logger, supervisor.Serve()) } diff --git a/internal/advancer/advancer_test.go b/internal/advancer/advancer_test.go index 054621f2b..88b69bcb8 100644 --- a/internal/advancer/advancer_test.go +++ b/internal/advancer/advancer_test.go @@ -100,7 +100,7 @@ func (s *AdvancerSuite) TestServiceInterface() { // Test service interface methods require.True(advancer.Alive()) require.True(advancer.Ready()) - require.Empty(advancer.Reload()) + require.NoError(advancer.Reload()) require.Equal(advancer.Name, advancer.String()) // Test Tick method @@ -108,18 +108,18 @@ func (s *AdvancerSuite) TestServiceInterface() { repository.GetEpochsReturn = map[common.Address][]*Epoch{ 
machineManager.Map[1].application.IApplicationAddress: {}, } - _, tickErrors := advancer.Tick(context.Background()) - require.Empty(tickErrors) + _, tickErr := advancer.Tick(context.Background()) + require.NoError(tickErr) // Test Tick with error repository.GetEpochsError = errors.New("list epochs error") - _, tickErrors = advancer.Tick(context.Background()) - require.NotEmpty(tickErrors) - require.Contains(tickErrors[0].Error(), "list epochs error") + _, tickErr = advancer.Tick(context.Background()) + require.Error(tickErr) + require.Contains(tickErr.Error(), "list epochs error") // Stop must be called last to cleanly shut down the service. // It should complete without returning any errors. - require.Empty(advancer.Supervisor.Stop(false)) + require.NoError(advancer.Supervisor.Stop(false)) }) } @@ -1579,7 +1579,8 @@ func (s *AdvancerSuite) TestSelfWakeOnSuccess() { require.NoError(err) // Call Tick() which internally calls Step() and signals reschedule. - reschedule, _ := svc.Tick(context.Background()) + reschedule, err := svc.Tick(context.Background()) + require.NoError(err) // The reschedule channel should have a pending signal. 
require.True(reschedule, @@ -1603,7 +1604,8 @@ func (s *AdvancerSuite) TestNoSelfWakeWhenIdle() { svc, err := newMockAdvancerService(mm, repo) require.NoError(err) - reschedule, _ := svc.Tick(context.Background()) + reschedule, err := svc.Tick(context.Background()) + require.NoError(err) require.False(reschedule, "reschedule channel should be empty when no work exists") @@ -1622,8 +1624,8 @@ func (s *AdvancerSuite) TestNoSelfWakeOnError() { svc, err := newMockAdvancerService(mm, repo) require.NoError(err) - reschedule, errs := svc.Tick(context.Background()) - require.NotEmpty(errs) + reschedule, err := svc.Tick(context.Background()) + require.Error(err) require.False(reschedule, "reschedule should NOT be signaled on error") @@ -1663,8 +1665,8 @@ func (s *AdvancerSuite) TestPartialSuccessStillReschedules() { // Call Tick — app1 fails, app2 succeeds with more work remaining (batch limit hit). // Tick should surface the error AND signal reschedule for app2's pending work. - reschedule, errs := svc.Tick(context.Background()) - require.NotEmpty(errs, "Tick should surface app1's error") + reschedule, err := svc.Tick(context.Background()) + require.Error(err, "Tick should surface app1's error") // Reschedule SHOULD fire: app2 had work, and one failing app must not // delay healthy apps by suppressing the reschedule signal. diff --git a/internal/advancer/service.go b/internal/advancer/service.go index ce4f1fc82..889076019 100644 --- a/internal/advancer/service.go +++ b/internal/advancer/service.go @@ -107,7 +107,7 @@ func Create(ctx context.Context, c *CreateInfo) (service.ServiceImpl, error) { } // Service interface implementation -func (s *Service) Tick(ctx context.Context) (bool, []error) { +func (s *Service) Tick(ctx context.Context) (bool, error) { // Signal reschedule whenever work was done, even if some apps errored. // Failed apps are marked Failed and removed by the machine manager, // so they won't cause amplified retries on the next tick. 
@@ -132,10 +132,10 @@ func (s *Service) Tick(ctx context.Context) (bool, []error) { s.Logger.Debug("Tick cancelled (shutdown)", "error", err) return hadWork, nil } - return hadWork, []error{err} + return hadWork, err } -func (s *Service) Stop(b bool) []error { +func (s *Service) Stop(b bool) error { var errs []error if s.inspector != nil { s.Logger.Info("Shutting down inspect HTTP server") @@ -151,10 +151,13 @@ func (s *Service) Stop(b bool) []error { errs = append(errs, fmt.Errorf("failed to close machine manager: %w", err)) } } - return append(errs, s.TickServiceTemplate.Stop(b)...) + if err := s.TickServiceTemplate.Stop(b); err != nil { + errs = append(errs, err) + } + return errors.Join(errs...) } -func (s *Service) Serve() []error { +func (s *Service) Serve() error { if s.inspector != nil { go func() { if err := s.inspector.Serve(); err != nil && !errors.Is(err, http.ErrServerClosed) { diff --git a/internal/claimer/accept.go b/internal/claimer/accept.go index 21fbb7a47..e65fcf6c6 100644 --- a/internal/claimer/accept.go +++ b/internal/claimer/accept.go @@ -5,6 +5,7 @@ package claimer import ( "context" + "errors" "fmt" "math/big" @@ -88,9 +89,9 @@ func (s *Service) acceptClaimsAndUpdateDatabase( stagedEpochs map[int64]*model.Epoch, apps map[int64]*model.Application, defaultBlockNumber *big.Int, -) (int, []error) { +) (int, error) { transitions := 0 - errs := []error{} + var err error for key, currEpoch := range stagedEpochs { result := s.processAcceptedClaimEvent(ctx, stagedClaimWork{ @@ -100,13 +101,13 @@ func (s *Service) acceptClaimsAndUpdateDatabase( }, defaultBlockNumber) transitions += result.progress if result.err != nil { - errs = append(errs, result.err) + err = errors.Join(err, result.err) } if result.drop { delete(stagedEpochs, key) } } - return transitions, errs + return transitions, err } func (s *Service) processAcceptedClaimEvent( @@ -190,9 +191,9 @@ func (s *Service) acceptStagedClaimsAndIssueAcceptTx( stagedEpochs map[int64]*model.Epoch, apps 
map[int64]*model.Application, defaultBlockNumber *big.Int, -) (int, []error) { +) (int, error) { transitions := 0 - errs := []error{} + var err error for key, currEpoch := range stagedEpochs { result := s.processStagedClaim(ctx, stagedClaimWork{ @@ -201,13 +202,13 @@ func (s *Service) acceptStagedClaimsAndIssueAcceptTx( }, defaultBlockNumber) transitions += result.progress if result.err != nil { - errs = append(errs, result.err) + err = errors.Join(err, result.err) } if result.drop { delete(stagedEpochs, key) } } - return transitions, errs + return transitions, err } func (s *Service) processStagedClaim( diff --git a/internal/claimer/accept_test.go b/internal/claimer/accept_test.go index 138c9d0ae..241046ee2 100644 --- a/internal/claimer/accept_test.go +++ b/internal/claimer/accept_test.go @@ -37,8 +37,8 @@ func TestAcceptFirstClaim(t *testing.T) { b.On("getConsensusAddress", mock.Anything, app, mock.Anything). Return(app.IConsensusAddress, nil).Once() - _, errs := m.acceptClaimsAndUpdateDatabase(ctx, makeEpochMap(), makeEpochMap(currEpoch), makeApplicationMap(app), endBlock) - assert.Equal(t, len(errs), 0) + _, err := m.acceptClaimsAndUpdateDatabase(ctx, makeEpochMap(), makeEpochMap(currEpoch), makeApplicationMap(app), endBlock) + assert.NoError(t, err) } func TestAcceptClaimWithAntecessor(t *testing.T) { @@ -60,8 +60,8 @@ func TestAcceptClaimWithAntecessor(t *testing.T) { r.On("UpdateEpochWithAcceptedClaim", mock.Anything, app.ID, currEpoch.Index, mock.Anything). 
Return(nil).Once() - transitions, errs := m.acceptClaimsAndUpdateDatabase(context.Background(), makeEpochMap(prevEpoch), makeEpochMap(currEpoch), makeApplicationMap(app), endBlock) - assert.Equal(t, 0, len(errs)) + transitions, err := m.acceptClaimsAndUpdateDatabase(context.Background(), makeEpochMap(prevEpoch), makeEpochMap(currEpoch), makeApplicationMap(app), endBlock) + assert.NoError(t, err) assert.Equal(t, 1, transitions, "accepting a claim counts as a transition") } @@ -86,8 +86,8 @@ func TestFindClaimAcceptedEventAndSuccFailure0(t *testing.T) { b.On("findClaimAcceptedEventAndSucc", mock.Anything, app, currEpoch, currEpoch.LastBlock+1, endBlock.Uint64()). Return(&iconsensus.IConsensus{}, prevEvent, currEvent, expectedErr).Once() - _, errs := m.acceptClaimsAndUpdateDatabase(context.Background(), makeEpochMap(), makeEpochMap(currEpoch), makeApplicationMap(app), endBlock) - assert.Equal(t, 1, len(errs)) + _, err := m.acceptClaimsAndUpdateDatabase(context.Background(), makeEpochMap(), makeEpochMap(currEpoch), makeApplicationMap(app), endBlock) + assert.Error(t, err) } func TestFindClaimAcceptedEventAndSuccFailure1(t *testing.T) { @@ -109,8 +109,8 @@ func TestFindClaimAcceptedEventAndSuccFailure1(t *testing.T) { b.On("findClaimAcceptedEventAndSucc", mock.Anything, app, prevEpoch, prevEpoch.LastBlock+1, endBlock.Uint64()). 
Return(&iconsensus.IConsensus{}, prevEvent, currEvent, expectedErr).Once() - _, errs := m.acceptClaimsAndUpdateDatabase(context.Background(), makeEpochMap(prevEpoch), makeEpochMap(currEpoch), makeApplicationMap(app), endBlock) - assert.Equal(t, 1, len(errs)) + _, err := m.acceptClaimsAndUpdateDatabase(context.Background(), makeEpochMap(prevEpoch), makeEpochMap(currEpoch), makeApplicationMap(app), endBlock) + assert.Error(t, err) } // !claimAcceptedMatch(prevClaim, prevEvent) @@ -140,8 +140,8 @@ func TestAcceptClaimWithAntecessorMismatch(t *testing.T) { r.On("UpdateApplicationStatus", mock.Anything, mock.Anything, model.ApplicationStatus_Diverged, mock.Anything). Return(nil).Once() - _, errs := m.acceptClaimsAndUpdateDatabase(context.Background(), makeEpochMap(prevEpoch), makeEpochMap(currEpoch), makeApplicationMap(app), endBlock) - assert.Equal(t, 1, len(errs)) + _, err := m.acceptClaimsAndUpdateDatabase(context.Background(), makeEpochMap(prevEpoch), makeEpochMap(currEpoch), makeApplicationMap(app), endBlock) + assert.Error(t, err) } // !claimAcceptedMatch(currClaim, currEvent) @@ -165,8 +165,8 @@ func TestAcceptClaimWithEventMismatch(t *testing.T) { r.On("UpdateApplicationStatus", mock.Anything, mock.Anything, model.ApplicationStatus_Diverged, mock.Anything). Return(nil).Once() - _, errs := m.acceptClaimsAndUpdateDatabase(context.Background(), makeEpochMap(prevEpoch), makeEpochMap(currEpoch), makeApplicationMap(app), endBlock) - assert.Equal(t, 1, len(errs)) + _, err := m.acceptClaimsAndUpdateDatabase(context.Background(), makeEpochMap(prevEpoch), makeEpochMap(currEpoch), makeApplicationMap(app), endBlock) + assert.Error(t, err) } // !checkClaimsConstraint(prevClaim, currClaim) @@ -185,8 +185,8 @@ func TestAcceptClaimWithAntecessorOutOfOrder(t *testing.T) { Return(nil). 
Once() - _, errs := m.acceptClaimsAndUpdateDatabase(context.Background(), makeEpochMap(wrongEpoch), makeEpochMap(currEpoch), makeApplicationMap(app), big.NewInt(0)) - assert.Equal(t, 1, len(errs)) + _, err := m.acceptClaimsAndUpdateDatabase(context.Background(), makeEpochMap(wrongEpoch), makeEpochMap(currEpoch), makeApplicationMap(app), big.NewInt(0)) + assert.Error(t, err) } func TestErrAcceptedMissingEvent(t *testing.T) { @@ -208,8 +208,8 @@ func TestErrAcceptedMissingEvent(t *testing.T) { r.On("UpdateApplicationStatus", mock.Anything, mock.Anything, model.ApplicationStatus_Corrupted, mock.Anything). Return(nil).Once() - _, errs := m.acceptClaimsAndUpdateDatabase(context.Background(), makeEpochMap(prevEpoch), makeEpochMap(currEpoch), makeApplicationMap(app), endBlock) - assert.Equal(t, 1, len(errs)) + _, err := m.acceptClaimsAndUpdateDatabase(context.Background(), makeEpochMap(prevEpoch), makeEpochMap(currEpoch), makeApplicationMap(app), endBlock) + assert.Error(t, err) } func TestUpdateEpochWithAcceptedClaimFailed(t *testing.T) { @@ -233,8 +233,8 @@ func TestUpdateEpochWithAcceptedClaimFailed(t *testing.T) { r.On("UpdateEpochWithAcceptedClaim", mock.Anything, app.ID, currEpoch.Index, mock.Anything). Return(expectedErr).Once() - _, errs := m.acceptClaimsAndUpdateDatabase(context.Background(), makeEpochMap(prevEpoch), makeEpochMap(currEpoch), makeApplicationMap(app), endBlock) - assert.Equal(t, 1, len(errs)) + _, err := m.acceptClaimsAndUpdateDatabase(context.Background(), makeEpochMap(prevEpoch), makeEpochMap(currEpoch), makeApplicationMap(app), endBlock) + assert.Error(t, err) } func TestConsensusAddressChangedOnAcceptedClaims(t *testing.T) { @@ -255,8 +255,8 @@ func TestConsensusAddressChangedOnAcceptedClaims(t *testing.T) { Return(nil). 
Once() - _, errs := m.acceptClaimsAndUpdateDatabase(context.Background(), makeEpochMap(), makeEpochMap(currEpoch), makeApplicationMap(app), endBlock) - assert.Equal(t, len(errs), 1) + _, err := m.acceptClaimsAndUpdateDatabase(context.Background(), makeEpochMap(), makeEpochMap(currEpoch), makeApplicationMap(app), endBlock) + assert.Error(t, err) } func TestAcceptStagedFrontRunner(t *testing.T) { @@ -277,8 +277,8 @@ func TestAcceptStagedFrontRunner(t *testing.T) { r.On("UpdateEpochWithAcceptedClaim", mock.Anything, app.ID, currEpoch.Index, mock.Anything). Return(nil).Once() - transitions, errs := m.acceptStagedClaimsAndIssueAcceptTx(context.Background(), makeEpochMap(currEpoch), makeApplicationMap(app), endBlock) - assert.Equal(t, 0, len(errs)) + transitions, err := m.acceptStagedClaimsAndIssueAcceptTx(context.Background(), makeEpochMap(currEpoch), makeApplicationMap(app), endBlock) + assert.NoError(t, err) assert.Equal(t, 1, transitions) assert.Equal(t, 0, len(m.acceptsInFlight)) } @@ -302,8 +302,8 @@ func TestAcceptStagedBroadcastsWhenClaimStillStaged(t *testing.T) { b.On("acceptClaimOnBlockchain", app, currEpoch). Return(txHash, nil).Once() - transitions, errs := m.acceptStagedClaimsAndIssueAcceptTx(context.Background(), makeEpochMap(currEpoch), makeApplicationMap(app), endBlock) - assert.Equal(t, 0, len(errs)) + transitions, err := m.acceptStagedClaimsAndIssueAcceptTx(context.Background(), makeEpochMap(currEpoch), makeApplicationMap(app), endBlock) + assert.NoError(t, err) assert.Equal(t, 0, transitions, "broadcasting acceptClaim records in-flight work but does not update DB yet") got, ok := m.acceptsInFlight[app.ID] @@ -333,8 +333,8 @@ func TestAcceptStagedFrontRunnerOutputsMismatchSetsDiverged(t *testing.T) { r.On("UpdateApplicationStatus", mock.Anything, app.ID, model.ApplicationStatus_Diverged, mock.Anything). 
Return(nil).Once() - transitions, errs := m.acceptStagedClaimsAndIssueAcceptTx(context.Background(), makeEpochMap(currEpoch), makeApplicationMap(app), endBlock) - assert.Equal(t, 1, len(errs)) + transitions, err := m.acceptStagedClaimsAndIssueAcceptTx(context.Background(), makeEpochMap(currEpoch), makeApplicationMap(app), endBlock) + assert.Error(t, err) assert.Equal(t, 0, transitions) assert.Equal(t, 0, len(m.acceptsInFlight)) } @@ -361,10 +361,10 @@ func TestAcceptStagedUnmodeledClaimStatusFailsClosed(t *testing.T) { r.On("UpdateApplicationStatus", mock.Anything, app.ID, model.ApplicationStatus_Failed, mock.Anything). Return(nil).Once() - transitions, errs := m.acceptStagedClaimsAndIssueAcceptTx(context.Background(), makeEpochMap(currEpoch), makeApplicationMap(app), endBlock) + transitions, err := m.acceptStagedClaimsAndIssueAcceptTx(context.Background(), makeEpochMap(currEpoch), makeApplicationMap(app), endBlock) assert.Equal(t, 0, transitions) // SetFailedf returns nil on success; the FAILED write is asserted by the mock. - assert.Equal(t, 0, len(errs)) + assert.NoError(t, err) assert.Equal(t, 0, len(m.acceptsInFlight), "no acceptClaim broadcast on an unmodeled status") } @@ -392,10 +392,10 @@ func TestAcceptStagedForeclosesForeclosedApp(t *testing.T) { // an unexpected call if the guard fails. ctx := context.Background() - transitions, errs := m.acceptStagedClaimsAndIssueAcceptTx( + transitions, err := m.acceptStagedClaimsAndIssueAcceptTx( ctx, makeEpochMap(currEpoch), makeApplicationMap(app), endBlock) - assert.Equal(t, 0, len(errs)) + assert.NoError(t, err) assert.Equal(t, 1, transitions) assert.Equal(t, model.EpochStatus_ClaimForeclosed, currEpoch.Status) assert.Equal(t, 0, len(m.acceptsInFlight), @@ -432,10 +432,10 @@ func TestAcceptStagedForeclosesForeclosedAppOnUnstaged(t *testing.T) { // CORRUPTED write is attempted. 
ctx := context.Background() - transitions, errs := m.acceptStagedClaimsAndIssueAcceptTx( + transitions, err := m.acceptStagedClaimsAndIssueAcceptTx( ctx, makeEpochMap(currEpoch), makeApplicationMap(app), endBlock) - assert.Equal(t, 0, len(errs)) + assert.NoError(t, err) assert.Equal(t, 1, transitions) assert.Equal(t, model.EpochStatus_ClaimForeclosed, currEpoch.Status) assert.Equal(t, model.ApplicationStatus_OK, app.Status, @@ -468,9 +468,9 @@ func TestAcceptStagedCapEnforced(t *testing.T) { r.On("UpdateApplicationStatus", mock.Anything, app.ID, model.ApplicationStatus_Failed, mock.Anything). Return(nil).Once() - _, errs := m.acceptStagedClaimsAndIssueAcceptTx(context.Background(), makeEpochMap(currEpoch), makeApplicationMap(app), endBlock) + _, err := m.acceptStagedClaimsAndIssueAcceptTx(context.Background(), makeEpochMap(currEpoch), makeApplicationMap(app), endBlock) // SetFailedf returns nil on success — no error surfaced. - assert.Equal(t, 0, len(errs)) + assert.NoError(t, err) assert.Equal(t, 0, len(m.acceptsInFlight)) // Counter cleared once FAILED is set. _, present := m.acceptAttempts[acceptAttemptKey{currEpoch.ApplicationID, currEpoch.Index}] @@ -499,11 +499,10 @@ func TestAcceptStagedUnknownBroadcastErrorsIncrementAttemptsUntilCap(t *testing. b.On("acceptClaimOnBlockchain", app, currEpoch). Return(common.Hash{}, broadcastErr).Once() - transitions, errs := m.acceptStagedClaimsAndIssueAcceptTx(ctx, makeEpochMap(currEpoch), makeApplicationMap(app), endBlock) + transitions, err := m.acceptStagedClaimsAndIssueAcceptTx(ctx, makeEpochMap(currEpoch), makeApplicationMap(app), endBlock) assert.Equal(t, 0, transitions) - require.Equal(t, 1, len(errs)) - assert.ErrorIs(t, errs[0], broadcastErr) + assert.ErrorIs(t, err, broadcastErr) assert.Equal(t, i, m.acceptAttempts[attemptKey]) assert.Equal(t, 0, len(m.acceptsInFlight)) } @@ -517,10 +516,10 @@ func TestAcceptStagedUnknownBroadcastErrorsIncrementAttemptsUntilCap(t *testing. })). 
Return(nil).Once() - transitions, errs := m.acceptStagedClaimsAndIssueAcceptTx(ctx, makeEpochMap(currEpoch), makeApplicationMap(app), endBlock) + transitions, err := m.acceptStagedClaimsAndIssueAcceptTx(ctx, makeEpochMap(currEpoch), makeApplicationMap(app), endBlock) assert.Equal(t, 0, transitions) - assert.Equal(t, 0, len(errs), "marking FAILED after the cap is a state transition outcome, not a tick error") + assert.NoError(t, err, "marking FAILED after the cap is a state transition outcome, not a tick error") assert.Equal(t, model.ApplicationStatus_Failed, app.Status) assert.NotContains(t, m.acceptAttempts, attemptKey) assert.Equal(t, 0, len(m.acceptsInFlight)) @@ -551,8 +550,8 @@ func TestAcceptClaimNotStagedAcceptedRechecksOutputsMismatch(t *testing.T) { Return(nil).Once() ctx := context.Background() - transitions, errs := m.acceptStagedClaimsAndIssueAcceptTx(ctx, makeEpochMap(currEpoch), makeApplicationMap(app), endBlock) - assert.Equal(t, 1, len(errs)) + transitions, err := m.acceptStagedClaimsAndIssueAcceptTx(ctx, makeEpochMap(currEpoch), makeApplicationMap(app), endBlock) + assert.Error(t, err) assert.Equal(t, 0, transitions) assert.Equal(t, 0, len(m.acceptsInFlight)) } @@ -571,8 +570,8 @@ func TestAcceptStagedPeriodNotElapsed(t *testing.T) { Return(app.IConsensusAddress, nil).Once() endBlock := big.NewInt(60) // only 10 blocks elapsed; need 100. - transitions, errs := m.acceptStagedClaimsAndIssueAcceptTx(context.Background(), makeEpochMap(currEpoch), makeApplicationMap(app), endBlock) - assert.Equal(t, 0, len(errs)) + transitions, err := m.acceptStagedClaimsAndIssueAcceptTx(context.Background(), makeEpochMap(currEpoch), makeApplicationMap(app), endBlock) + assert.NoError(t, err) assert.Equal(t, 0, transitions) assert.Equal(t, 0, len(m.acceptsInFlight)) } @@ -594,8 +593,8 @@ func TestAcceptStagedReaderMode(t *testing.T) { b.On("getConsensusAddress", mock.Anything, app, mock.Anything). 
Return(app.IConsensusAddress, nil).Once() - transitions, errs := m.acceptStagedClaimsAndIssueAcceptTx(context.Background(), makeEpochMap(currEpoch), makeApplicationMap(app), endBlock) - assert.Equal(t, 0, len(errs)) + transitions, err := m.acceptStagedClaimsAndIssueAcceptTx(context.Background(), makeEpochMap(currEpoch), makeApplicationMap(app), endBlock) + assert.NoError(t, err) assert.Equal(t, 0, transitions) assert.Equal(t, 0, len(m.acceptsInFlight)) } @@ -631,8 +630,8 @@ func TestAcceptanceDivergence_QuorumStagedDoesNotRejectEpoch(t *testing.T) { })). Return(nil).Once() - _, errs := m.acceptClaimsAndUpdateDatabase(context.Background(), makeEpochMap(), makeEpochMap(currEpoch), makeApplicationMap(app), endBlock) - assert.Equal(t, 1, len(errs)) + _, err := m.acceptClaimsAndUpdateDatabase(context.Background(), makeEpochMap(), makeEpochMap(currEpoch), makeApplicationMap(app), endBlock) + assert.Error(t, err) assert.Equal(t, model.ApplicationStatus_Diverged, app.Status) assert.Equal(t, model.EpochStatus_ClaimStaged, currEpoch.Status) } @@ -663,8 +662,8 @@ func TestAcceptanceDivergence_QuorumComputedRejectsEpoch(t *testing.T) { })). Return(nil).Once() - _, errs := m.submitClaimsAndUpdateDatabase(context.Background(), makeEpochMap(), makeEpochMap(currEpoch), makeApplicationMap(app), endBlock) - assert.Equal(t, 1, len(errs)) + _, err := m.submitClaimsAndUpdateDatabase(context.Background(), makeEpochMap(), makeEpochMap(currEpoch), makeApplicationMap(app), endBlock) + assert.Error(t, err) assert.Equal(t, model.ApplicationStatus_Diverged, app.Status) assert.Equal(t, model.EpochStatus_ClaimRejected, currEpoch.Status) } @@ -694,8 +693,8 @@ func TestAcceptanceDivergence_AuthorityComputedSetsDivergedWithoutRejectingEpoch })). 
Return(nil).Once() - _, errs := m.submitClaimsAndUpdateDatabase(context.Background(), makeEpochMap(), makeEpochMap(currEpoch), makeApplicationMap(app), endBlock) - assert.Equal(t, 1, len(errs)) + _, err := m.submitClaimsAndUpdateDatabase(context.Background(), makeEpochMap(), makeEpochMap(currEpoch), makeApplicationMap(app), endBlock) + assert.Error(t, err) assert.Equal(t, model.ApplicationStatus_Diverged, app.Status) assert.Equal(t, model.EpochStatus_ClaimComputed, currEpoch.Status) } @@ -724,8 +723,8 @@ func TestAcceptanceDivergence_AuthorityDoesNotRejectEpoch(t *testing.T) { r.On("UpdateApplicationStatus", mock.Anything, app.ID, model.ApplicationStatus_Diverged, mock.Anything). Return(nil).Once() - _, errs := m.acceptClaimsAndUpdateDatabase(context.Background(), makeEpochMap(), makeEpochMap(currEpoch), makeApplicationMap(app), endBlock) - assert.Equal(t, 1, len(errs)) + _, err := m.acceptClaimsAndUpdateDatabase(context.Background(), makeEpochMap(), makeEpochMap(currEpoch), makeApplicationMap(app), endBlock) + assert.Error(t, err) assert.Equal(t, model.ApplicationStatus_Diverged, app.Status) assert.Equal(t, model.EpochStatus_ClaimStaged, currEpoch.Status) } @@ -763,8 +762,8 @@ func TestAcceptanceDivergenceReaderMode_Quorum(t *testing.T) { })). 
Return(nil).Once() - _, errs := m.acceptClaimsAndUpdateDatabase(context.Background(), makeEpochMap(), makeEpochMap(currEpoch), makeApplicationMap(app), endBlock) - assert.Equal(t, 1, len(errs), "acceptance divergence detection must fire in reader mode") + _, err := m.acceptClaimsAndUpdateDatabase(context.Background(), makeEpochMap(), makeEpochMap(currEpoch), makeApplicationMap(app), endBlock) + assert.Error(t, err, "acceptance divergence detection must fire in reader mode") assert.Equal(t, model.EpochStatus_ClaimStaged, currEpoch.Status) } diff --git a/internal/claimer/claimer.go b/internal/claimer/claimer.go index 8a07be3a6..2ba877d33 100644 --- a/internal/claimer/claimer.go +++ b/internal/claimer/claimer.go @@ -47,9 +47,7 @@ import ( "errors" ) -func (s *Service) Tick(ctx context.Context) (bool, []error) { - errs := []error{} - +func (s *Service) Tick(ctx context.Context) (bool, error) { // Use the same finalized block number for all chain reads in this tick. // This is one RPC per tick even when there is no DB work. The call is // cheap, and Tick already runs on a polling interval. @@ -62,8 +60,7 @@ func (s *Service) Tick(ctx context.Context) (bool, []error) { s.Logger.Warn("Tick interrupted by shutdown", "stage", "getDefaultBlockNumber", "error", err) return false, nil } - errs = append(errs, err) - return false, errs + return false, err } s.consensusAddressChecks = map[consensusAddressCheckKey]error{} defer func() { @@ -82,11 +79,9 @@ func (s *Service) Tick(ctx context.Context) (bool, []error) { s.Logger.Warn("Tick interrupted by shutdown", "stage", "SelectClaimsToSubmitPerApp", "error", errComputed) return false, nil } - errs = append(errs, errComputed) - return false, errs + return false, errComputed } - submitted, submitErrs := s.submitClaimsAndUpdateDatabase(ctx, prevSubmittedOrStaged, computedEpochs, computedApps, defaultBlockNumber) - errs = append(errs, submitErrs...) 
+ submitted, err := s.submitClaimsAndUpdateDatabase(ctx, prevSubmittedOrStaged, computedEpochs, computedApps, defaultBlockNumber) // Stage 2: stage. SUBMITTED -> STAGED. This read sees stage 1 updates. prevAcceptedForSubmitted, submittedEpochs, submittedApps, errSubmitted := s.repository.SelectClaimsToStagePerApp(ctx) @@ -95,11 +90,10 @@ func (s *Service) Tick(ctx context.Context) (bool, []error) { s.Logger.Warn("Tick interrupted by shutdown", "stage", "SelectClaimsToStagePerApp", "error", errSubmitted) return false, nil } - errs = append(errs, errSubmitted) - return false, errs + return false, errors.Join(err, errSubmitted) } - staged, stageErrs := s.stageClaimsAndUpdateDatabase(ctx, prevAcceptedForSubmitted, submittedEpochs, submittedApps, defaultBlockNumber) - errs = append(errs, stageErrs...) + staged, stageErr := s.stageClaimsAndUpdateDatabase(ctx, prevAcceptedForSubmitted, submittedEpochs, submittedApps, defaultBlockNumber) + err = errors.Join(err, stageErr) // Stages 3, 4, and 5: accept. STAGED -> ACCEPTED by our own transaction, // another party's event, or a getClaim read before we send acceptClaim. @@ -108,10 +102,9 @@ func (s *Service) Tick(ctx context.Context) (bool, []error) { if errStaged != nil { if errors.Is(errStaged, context.Canceled) { s.Logger.Warn("Tick interrupted by shutdown", "stage", "SelectClaimsToAcceptPerApp", "error", errStaged) - return false, nil + return false, nil // TODO:[maia] should we discard the potential database update errors from calls above? } - errs = append(errs, errStaged) - return false, errs + return false, errors.Join(err, errStaged) } // Foreclosed apps still need some read-only claim work. A claim accepted @@ -128,28 +121,24 @@ func (s *Service) Tick(ctx context.Context) (bool, []error) { // drained, the app remains enabled for L1 observation with foreclose_block // set. 
foreclosed, listErr := s.listEnabledForeclosedNonPRTApps(ctx) - if listErr != nil { - errs = append(errs, listErr) - } + err = errors.Join(err, listErr) // Finish the accept side of the lifecycle. First send acceptClaim for // staged epochs that are ready. Then check acceptClaim transactions sent in // previous ticks. Finally, scan for ClaimAccepted events from any party. - issuedAccepts, issueErrs := s.acceptStagedClaimsAndIssueAcceptTx(ctx, stagedEpochs, stagedApps, defaultBlockNumber) - errs = append(errs, issueErrs...) + issuedAccepts, issueErr := s.acceptStagedClaimsAndIssueAcceptTx(ctx, stagedEpochs, stagedApps, defaultBlockNumber) + err = errors.Join(err, issueErr) confirmedAccepts, confirmErr := s.checkAcceptsInFlight(ctx, stagedEpochs, stagedApps, defaultBlockNumber) - if confirmErr != nil { - errs = append(errs, confirmErr) - } + err = errors.Join(err, confirmErr) - accepted, acceptErrs := s.acceptClaimsAndUpdateDatabase(ctx, prevAcceptedForStaged, stagedEpochs, stagedApps, defaultBlockNumber) - errs = append(errs, acceptErrs...) + accepted, acceptErr := s.acceptClaimsAndUpdateDatabase(ctx, prevAcceptedForStaged, stagedEpochs, stagedApps, defaultBlockNumber) + err = errors.Join(err, acceptErr) // Keep logging foreclosed apps until all pre-foreclosure work is done. // After that, processForeclosedApps has nothing else to change. - forecloseErrs := s.processForeclosedApps(ctx, foreclosed) - errs = append(errs, forecloseErrs...) + forecloseErr := s.processForeclosedApps(ctx, foreclosed) + err = errors.Join(err, forecloseErr) s.cleanupOrphanedInFlight(computedApps, stagedApps, stagedEpochs) @@ -161,7 +150,7 @@ func (s *Service) Tick(ctx context.Context) (bool, []error) { // Signal reschedule whenever pipeline progress was made, even with errors. 
if submitted > 0 || staged > 0 || issuedAccepts > 0 || confirmedAccepts > 0 || accepted > 0 { - return true, errs + return true, err } - return false, errs + return false, err } diff --git a/internal/claimer/claimer_test.go b/internal/claimer/claimer_test.go index c51898c68..982596473 100644 --- a/internal/claimer/claimer_test.go +++ b/internal/claimer/claimer_test.go @@ -26,8 +26,8 @@ func TestDoNothing(t *testing.T) { prevEpochs := makeEpochMap() currEpochs := makeEpochMap() - transitions, errs := m.submitClaimsAndUpdateDatabase(context.Background(), prevEpochs, currEpochs, makeApplicationMap(), big.NewInt(0)) - assert.Equal(t, 0, len(errs)) + transitions, err := m.submitClaimsAndUpdateDatabase(context.Background(), prevEpochs, currEpochs, makeApplicationMap(), big.NewInt(0)) + assert.NoError(t, err) assert.Equal(t, 0, transitions, "no transitions when no epochs to process") } @@ -75,8 +75,8 @@ func TestTickInterleavesStagesWithPinnedBlockAndReschedulesOnProgress(t *testing }), repository.Pagination{}, false). Return([]*model.Application{}, 0, nil).Once() - reschedule, errs := m.Tick(context.Background()) + reschedule, err := m.Tick(context.Background()) - require.Empty(t, errs) + require.NoError(t, err) assert.True(t, reschedule, "a successful stage transition should request an immediate follow-up tick") } diff --git a/internal/claimer/divergence_test.go b/internal/claimer/divergence_test.go index 57c56bdc7..d77417edc 100644 --- a/internal/claimer/divergence_test.go +++ b/internal/claimer/divergence_test.go @@ -37,8 +37,8 @@ func TestVerifyClaimOutputsMismatch(t *testing.T) { r.On("UpdateApplicationStatus", mock.Anything, app.ID, model.ApplicationStatus_Diverged, mock.Anything). 
Return(nil).Once() - _, errs := m.acceptStagedClaimsAndIssueAcceptTx(context.Background(), makeEpochMap(currEpoch), makeApplicationMap(app), endBlock) - assert.Equal(t, 1, len(errs), "chain_claim_outputs_mismatch must surface as an error") + _, err := m.acceptStagedClaimsAndIssueAcceptTx(context.Background(), makeEpochMap(currEpoch), makeApplicationMap(app), endBlock) + assert.Error(t, err, "chain_claim_outputs_mismatch must surface as an error") assert.Equal(t, 0, len(m.acceptsInFlight)) } diff --git a/internal/claimer/foreclosed_apps_test.go b/internal/claimer/foreclosed_apps_test.go index d82596613..18f071460 100644 --- a/internal/claimer/foreclosed_apps_test.go +++ b/internal/claimer/foreclosed_apps_test.go @@ -16,6 +16,22 @@ import ( "github.com/stretchr/testify/require" ) +func unjoinError(err error) []error { + type unwrapper interface { + Unwrap() []error + } + + if uw, ok := err.(unwrapper); ok { + return uw.Unwrap() + } + + if err != nil { + return []error{err} + } + + return nil +} + // foreclosedAppHelper builds a foreclosed Application instance, optionally // with a PRT consensus type. ForecloseBlock is non-zero, mirroring what // the evmreader's checkForForeclosure would have persisted. @@ -158,7 +174,8 @@ func TestProcessForeclosedApps_DrainCheckErrorsAppendAndContinue(t *testing.T) { // ForecloseUnacceptedEpochsAtOrAfterBlock nor HasUnreconciledClaimsBeforeBlock // is reached — no expectation registered for either. 
- errs := s.processForeclosedApps(ctx, map[int64]*model.Application{app1.ID: app1, app2.ID: app2}) + err := s.processForeclosedApps(ctx, map[int64]*model.Application{app1.ID: app1, app2.ID: app2}) + errs := unjoinError(err) assert.Len(t, errs, 2, "each app's drain error is appended; the pass does not abort early") } diff --git a/internal/claimer/foreclosure.go b/internal/claimer/foreclosure.go index ea9ae6b1c..56b9ef616 100644 --- a/internal/claimer/foreclosure.go +++ b/internal/claimer/foreclosure.go @@ -5,6 +5,7 @@ package claimer import ( "context" + "errors" "fmt" "github.com/cartesi/rollups-node/internal/model" @@ -60,8 +61,8 @@ func foreclosedClaimDrainApplicationsFilter() repository.ApplicationFilter { func (s *Service) processForeclosedApps( ctx context.Context, apps map[int64]*model.Application, -) []error { - var errs []error +) error { + var errs error for _, app := range apps { if app.ForecloseBlock == 0 { // This should have been filtered by the query. @@ -97,7 +98,7 @@ func (s *Service) processForeclosedApps( ctx, app.ID, app.ForecloseBlock, ) if err != nil { - errs = append(errs, fmt.Errorf( + errs = errors.Join(errs, fmt.Errorf( "checking input drain progress for foreclosed app %s: %w", app.IApplicationAddress, err)) continue @@ -115,7 +116,7 @@ func (s *Service) processForeclosedApps( ctx, app.ID, app.ForecloseBlock, ) if err != nil { - errs = append(errs, fmt.Errorf( + errs = errors.Join(errs, fmt.Errorf( "terminalizing unaccepted epochs for foreclosed app %s: %w", app.IApplicationAddress, err)) continue @@ -133,7 +134,7 @@ func (s *Service) processForeclosedApps( ctx, app.ID, app.ForecloseBlock, ) if err != nil { - errs = append(errs, fmt.Errorf( + errs = errors.Join(errs, fmt.Errorf( "checking drain progress for foreclosed app %s: %w", app.IApplicationAddress, err)) continue diff --git a/internal/claimer/inflight_test.go b/internal/claimer/inflight_test.go index b9bf81287..198f51a9e 100644 --- a/internal/claimer/inflight_test.go +++ 
b/internal/claimer/inflight_test.go @@ -52,8 +52,8 @@ func TestInFlightCompleted(t *testing.T) { r.On("UpdateEpochThroughStaging", mock.Anything, app.ID, currEpoch.Index, txHash, receiptBlock). Return(nil).Once() - transitions, errs := m.submitClaimsAndUpdateDatabase(context.Background(), makeEpochMap(), makeEpochMap(currEpoch), makeApplicationMap(app), endBlock) - assert.Equal(t, 0, len(errs)) + transitions, err := m.submitClaimsAndUpdateDatabase(context.Background(), makeEpochMap(), makeEpochMap(currEpoch), makeApplicationMap(app), endBlock) + assert.NoError(t, err) assert.Equal(t, 0, len(m.claimsInFlight)) // v3 fast path: submitted (1) + staged (1) = 2 transitions. assert.Equal(t, 2, transitions) @@ -94,8 +94,8 @@ func TestInFlightCompleted_QuorumNonDeciding(t *testing.T) { r.On("UpdateEpochWithSubmittedClaim", mock.Anything, app.ID, currEpoch.Index, txHash). Return(nil).Once() - transitions, errs := m.submitClaimsAndUpdateDatabase(context.Background(), makeEpochMap(), makeEpochMap(currEpoch), makeApplicationMap(app), endBlock) - assert.Equal(t, 0, len(errs)) + transitions, err := m.submitClaimsAndUpdateDatabase(context.Background(), makeEpochMap(), makeEpochMap(currEpoch), makeApplicationMap(app), endBlock) + assert.NoError(t, err) assert.Equal(t, 0, len(m.claimsInFlight)) // Fall-back path: one transition (COMPUTED → SUBMITTED), not the fast-path's two. assert.Equal(t, 1, transitions) @@ -133,8 +133,8 @@ func TestInFlightReverted(t *testing.T) { b.On("submitClaimToBlockchain", mock.Anything, app, currEpoch). 
Return(common.HexToHash("0x10"), nil).Once() - _, errs := m.submitClaimsAndUpdateDatabase(context.Background(), makeEpochMap(), makeEpochMap(currEpoch), makeApplicationMap(app), endBlock) - assert.Equal(t, len(errs), 0) + _, err := m.submitClaimsAndUpdateDatabase(context.Background(), makeEpochMap(), makeEpochMap(currEpoch), makeApplicationMap(app), endBlock) + assert.NoError(t, err) assert.Equal(t, len(m.claimsInFlight), 1) } @@ -153,8 +153,8 @@ func TestClaimInFlightMissingFromCurrClaims(t *testing.T) { b.On("pollTransaction", mock.Anything, reqHash, endBlock). Return(true, receipt, nil).Once() - _, errs := m.submitClaimsAndUpdateDatabase(context.Background(), makeEpochMap(), makeEpochMap(), makeApplicationMap(app), endBlock) - assert.Equal(t, len(errs), 0) + _, err := m.submitClaimsAndUpdateDatabase(context.Background(), makeEpochMap(), makeEpochMap(), makeApplicationMap(app), endBlock) + assert.NoError(t, err) } func TestClaimInFlightPollErrorKeepsTrackingAndStopsDuplicateSubmit(t *testing.T) { @@ -176,10 +176,9 @@ func TestClaimInFlightPollErrorKeepsTrackingAndStopsDuplicateSubmit(t *testing.T b.On("pollTransaction", mock.Anything, reqHash, endBlock). 
Return(false, nilReceipt, expectedErr).Once() - transitions, errs := m.submitClaimsAndUpdateDatabase( + transitions, err := m.submitClaimsAndUpdateDatabase( context.Background(), makeEpochMap(prevEpoch), makeEpochMap(currEpoch), makeApplicationMap(app), endBlock) - require.Equal(t, 1, len(errs)) - assert.ErrorIs(t, errs[0], expectedErr) + assert.ErrorIs(t, err, expectedErr) assert.Equal(t, 0, transitions) assert.Contains(t, m.claimsInFlight, app.ID, "receipt lookup errors do not prove the tx failed; keep in-flight tracking") diff --git a/internal/claimer/reverts_test.go b/internal/claimer/reverts_test.go index 904f5d694..8bec4b1ef 100644 --- a/internal/claimer/reverts_test.go +++ b/internal/claimer/reverts_test.go @@ -96,9 +96,9 @@ func TestNotFirstClaimHandledGracefully(t *testing.T) { b.On("submitClaimToBlockchain", mock.Anything, app, currEpoch). Return(common.Hash{}, notFirstClaimError()).Once() - _, errs := m.submitClaimsAndUpdateDatabase( + _, err := m.submitClaimsAndUpdateDatabase( context.Background(), makeEpochMap(), makeEpochMap(currEpoch), makeApplicationMap(app), endBlock) - assert.Equal(t, 0, len(errs)) + assert.NoError(t, err) assert.Equal(t, 0, len(m.claimsInFlight)) } @@ -121,9 +121,9 @@ func TestNotFirstClaimQuorumRetriesForEventSync(t *testing.T) { b.On("submitClaimToBlockchain", mock.Anything, app, currEpoch). 
Return(common.Hash{}, notFirstClaimError()).Once() - _, errs := m.submitClaimsAndUpdateDatabase( + _, err := m.submitClaimsAndUpdateDatabase( context.Background(), makeEpochMap(), makeEpochMap(currEpoch), makeApplicationMap(app), endBlock) - assert.Equal(t, 0, len(errs)) + assert.NoError(t, err) assert.Equal(t, 0, len(m.claimsInFlight)) } @@ -146,10 +146,10 @@ func TestApplicationForeclosedIsTransient(t *testing.T) { Return(common.Hash{}, consensusRevertError("ApplicationForeclosed")).Once() currEpochs := makeEpochMap(currEpoch) - transitions, errs := m.submitClaimsAndUpdateDatabase( + transitions, err := m.submitClaimsAndUpdateDatabase( context.Background(), makeEpochMap(), currEpochs, makeApplicationMap(app), endBlock) assert.Equal(t, 0, transitions, "no DB transition on transient revert") - assert.Equal(t, 0, len(errs), "ApplicationForeclosed must not surface as an error") + assert.NoError(t, err, "ApplicationForeclosed must not surface as an error") assert.Equal(t, 1, len(currEpochs), "epoch must remain in work map for retry") assert.Equal(t, 0, len(m.claimsInFlight), "no claim in flight") } @@ -173,9 +173,9 @@ func TestInvalidOutputsMerkleRootProofSizeSetsCorrupted(t *testing.T) { Return(nil).Once() currEpochs := makeEpochMap(currEpoch) - _, errs := m.submitClaimsAndUpdateDatabase( + _, err := m.submitClaimsAndUpdateDatabase( context.Background(), makeEpochMap(), currEpochs, makeApplicationMap(app), endBlock) - assert.Equal(t, 1, len(errs), "CORRUPTED transition must surface a terminal error") + assert.Error(t, err, "CORRUPTED transition must surface a terminal error") assert.Equal(t, 0, len(currEpochs), "epoch must be dropped from work map") assert.Equal(t, 0, len(m.claimsInFlight)) } @@ -200,11 +200,11 @@ func TestCallerIsNotValidatorSetsFailed(t *testing.T) { Return(nil).Once() currEpochs := makeEpochMap(currEpoch) - _, errs := m.submitClaimsAndUpdateDatabase( + _, err := m.submitClaimsAndUpdateDatabase( context.Background(), makeEpochMap(), currEpochs, 
makeApplicationMap(app), endBlock) // SetFailedf returns nil on success — the call site only surfaces an // error when state-update itself failed, so no error is expected here. - assert.Equal(t, 0, len(errs)) + assert.NoError(t, err) assert.Equal(t, 0, len(currEpochs), "epoch must be dropped from work map") } @@ -228,10 +228,10 @@ func TestNotPastBlockRetriesLater(t *testing.T) { Return(common.Hash{}, notPastBlockError(currEpoch.LastBlock, currEpoch.LastBlock-1)).Once() currEpochs := makeEpochMap(currEpoch) - transitions, errs := m.submitClaimsAndUpdateDatabase( + transitions, err := m.submitClaimsAndUpdateDatabase( context.Background(), makeEpochMap(), currEpochs, makeApplicationMap(app), endBlock) assert.Equal(t, 0, transitions, "no DB transition on transient revert") - assert.Equal(t, 0, len(errs), "NotPastBlock must not surface as an error") + assert.NoError(t, err, "NotPastBlock must not surface as an error") assert.Equal(t, 1, len(currEpochs), "epoch must remain in work map for retry") assert.Equal(t, 0, len(m.claimsInFlight), "no claim in flight") } @@ -305,12 +305,12 @@ func TestSubmitClaimRevertsSetApplicationFailed(t *testing.T) { Return(nil).Once() currEpochs := makeEpochMap(currEpoch) - transitions, errs := m.submitClaimsAndUpdateDatabase( + transitions, err := m.submitClaimsAndUpdateDatabase( context.Background(), makeEpochMap(), currEpochs, makeApplicationMap(app), endBlock) // SetFailedf returns nil on success — the call site only surfaces // an error when the status update itself failed. 
assert.Equal(t, 0, transitions, "FAILED is not a claim transition") - assert.Equal(t, 0, len(errs)) + assert.NoError(t, err) assert.Equal(t, 0, len(currEpochs), "epoch must be dropped from work map") assert.Equal(t, 0, len(m.claimsInFlight), "no claim in flight") }) @@ -336,12 +336,10 @@ func TestSubmitClaimFailedRevertWithDBError(t *testing.T) { Return(fmt.Errorf("db down")).Once() currEpochs := makeEpochMap(currEpoch) - _, errs := m.submitClaimsAndUpdateDatabase( + _, err := m.submitClaimsAndUpdateDatabase( context.Background(), makeEpochMap(), currEpochs, makeApplicationMap(app), endBlock) - assert.Equal(t, 1, len(errs), "status-update failure must surface as an error") - if len(errs) == 1 { - assert.ErrorContains(t, errs[0], "db down", "the surfaced error must be the DB error") - } + assert.Error(t, err, "status-update failure must surface as an error") + assert.ErrorContains(t, err, "db down", "the surfaced error must be the DB error") assert.Equal(t, 0, len(currEpochs), "epoch must be dropped from work map") assert.Equal(t, 0, len(m.claimsInFlight), "no claim in flight") } diff --git a/internal/claimer/stage.go b/internal/claimer/stage.go index 195addb20..f3fa211c7 100644 --- a/internal/claimer/stage.go +++ b/internal/claimer/stage.go @@ -5,6 +5,7 @@ package claimer import ( "context" + "errors" "fmt" "math/big" @@ -137,9 +138,9 @@ func (s *Service) stageClaimsAndUpdateDatabase( submittedEpochs map[int64]*model.Epoch, apps map[int64]*model.Application, defaultBlockNumber *big.Int, -) (int, []error) { +) (int, error) { transitions := 0 - errs := []error{} + var err error for key, currEpoch := range submittedEpochs { result := s.processSubmittedClaim(ctx, submittedClaimWork{ @@ -149,13 +150,13 @@ func (s *Service) stageClaimsAndUpdateDatabase( }, defaultBlockNumber) transitions += result.progress if result.err != nil { - errs = append(errs, result.err) + err = errors.Join(err, result.err) } if result.drop { delete(submittedEpochs, key) } } - return transitions, 
errs + return transitions, err } func (s *Service) processSubmittedClaim( diff --git a/internal/claimer/stage_test.go b/internal/claimer/stage_test.go index 4db0fcc76..93251b449 100644 --- a/internal/claimer/stage_test.go +++ b/internal/claimer/stage_test.go @@ -18,7 +18,6 @@ import ( "github.com/stretchr/testify/assert" "github.com/stretchr/testify/mock" - "github.com/stretchr/testify/require" ) func TestStagingFastPathDivergence(t *testing.T) { @@ -51,12 +50,12 @@ func TestStagingFastPathDivergence(t *testing.T) { r.On("UpdateApplicationStatus", mock.Anything, app.ID, model.ApplicationStatus_Diverged, mock.Anything). Return(nil).Once() - _, errs := m.submitClaimsAndUpdateDatabase(context.Background(), makeEpochMap(), makeEpochMap(currEpoch), makeApplicationMap(app), endBlock) + _, err := m.submitClaimsAndUpdateDatabase(context.Background(), makeEpochMap(), makeEpochMap(currEpoch), makeApplicationMap(app), endBlock) // The fast-path consumed the receipt and triggered DIVERGED. The // divergence error is surfaced (matching the convention used by other // terminal-status setters); // UpdateEpochThroughStaging is NOT called and the in-flight tx is dropped. - assert.Equal(t, 1, len(errs), "divergence at staging fast-path must surface as an error") + assert.Error(t, err, "divergence at staging fast-path must surface as an error") assert.Equal(t, 0, len(m.claimsInFlight)) } @@ -100,11 +99,10 @@ func TestStagingFastPathDBPending(t *testing.T) { // that UpdateEpochThroughStaging guarantees in a single transaction. 
computedEpochs := makeEpochMap(currEpoch) - _, errs := m.submitClaimsAndUpdateDatabase( + _, err := m.submitClaimsAndUpdateDatabase( context.Background(), makeEpochMap(), computedEpochs, makeApplicationMap(app), endBlock) - require.Equal(t, 1, len(errs), "DB-pending must surface as a tick-level error") - assert.ErrorIs(t, errs[0], dbErr) + assert.ErrorIs(t, err, dbErr, "DB-pending must surface as a tick-level error") // Both work-tracking entries must remain so the next tick can retry // from the same receipt. assert.Contains(t, m.claimsInFlight, app.ID, @@ -132,8 +130,8 @@ func TestStageByObservation(t *testing.T) { r.On("UpdateEpochToStaged", mock.Anything, app.ID, currEpoch.Index, currEvent.Raw.BlockNumber). Return(nil).Once() - transitions, errs := m.stageClaimsAndUpdateDatabase(context.Background(), makeEpochMap(), makeEpochMap(currEpoch), makeApplicationMap(app), endBlock) - assert.Equal(t, 0, len(errs)) + transitions, err := m.stageClaimsAndUpdateDatabase(context.Background(), makeEpochMap(), makeEpochMap(currEpoch), makeApplicationMap(app), endBlock) + assert.NoError(t, err) assert.Equal(t, 1, transitions) } @@ -153,8 +151,8 @@ func TestStageForeclosesSubmittedForeclosedApp(t *testing.T) { r.On("UpdateEpochWithForeclosedClaim", mock.Anything, app.ID, currEpoch.Index). Return(nil).Once() - transitions, errs := m.stageClaimsAndUpdateDatabase(context.Background(), makeEpochMap(), makeEpochMap(currEpoch), makeApplicationMap(app), endBlock) - assert.Equal(t, 0, len(errs)) + transitions, err := m.stageClaimsAndUpdateDatabase(context.Background(), makeEpochMap(), makeEpochMap(currEpoch), makeApplicationMap(app), endBlock) + assert.NoError(t, err) assert.Equal(t, 1, transitions) assert.Equal(t, model.EpochStatus_ClaimForeclosed, currEpoch.Status) } @@ -190,8 +188,8 @@ func TestStagingDivergence_Quorum(t *testing.T) { })). 
Return(nil).Once() - _, errs := m.stageClaimsAndUpdateDatabase(context.Background(), makeEpochMap(), makeEpochMap(currEpoch), makeApplicationMap(app), endBlock) - assert.Equal(t, 1, len(errs)) + _, err := m.stageClaimsAndUpdateDatabase(context.Background(), makeEpochMap(), makeEpochMap(currEpoch), makeApplicationMap(app), endBlock) + assert.Error(t, err) assert.Equal(t, model.ApplicationStatus_Diverged, app.Status) assert.Equal(t, model.EpochStatus_ClaimRejected, currEpoch.Status) } @@ -219,8 +217,8 @@ func TestStagingDivergence_AuthorityDoesNotRejectEpoch(t *testing.T) { r.On("UpdateApplicationStatus", mock.Anything, app.ID, model.ApplicationStatus_Diverged, mock.Anything). Return(nil).Once() - _, errs := m.stageClaimsAndUpdateDatabase(context.Background(), makeEpochMap(), makeEpochMap(currEpoch), makeApplicationMap(app), endBlock) - assert.Equal(t, 1, len(errs)) + _, err := m.stageClaimsAndUpdateDatabase(context.Background(), makeEpochMap(), makeEpochMap(currEpoch), makeApplicationMap(app), endBlock) + assert.Error(t, err) assert.Equal(t, model.ApplicationStatus_Diverged, app.Status) assert.Equal(t, model.EpochStatus_ClaimSubmitted, currEpoch.Status) } @@ -245,8 +243,8 @@ func TestStagingMatcherPreconditionFailureMarksApplicationCorrupted(t *testing.T })). Return(nil).Once() - _, errs := m.stageClaimsAndUpdateDatabase(context.Background(), makeEpochMap(), makeEpochMap(currEpoch), makeApplicationMap(app), endBlock) - assert.Equal(t, 1, len(errs)) + _, err := m.stageClaimsAndUpdateDatabase(context.Background(), makeEpochMap(), makeEpochMap(currEpoch), makeApplicationMap(app), endBlock) + assert.Error(t, err) assert.Equal(t, model.ApplicationStatus_Corrupted, app.Status) } @@ -281,8 +279,8 @@ func TestStagingDivergenceReaderMode_Quorum(t *testing.T) { })). 
Return(nil).Once() - _, errs := m.stageClaimsAndUpdateDatabase(context.Background(), makeEpochMap(), makeEpochMap(currEpoch), makeApplicationMap(app), endBlock) - assert.Equal(t, 1, len(errs), "divergence detection must fire in reader mode") + _, err := m.stageClaimsAndUpdateDatabase(context.Background(), makeEpochMap(), makeEpochMap(currEpoch), makeApplicationMap(app), endBlock) + assert.Error(t, err, "divergence detection must fire in reader mode") assert.Equal(t, model.EpochStatus_ClaimRejected, currEpoch.Status) } diff --git a/internal/claimer/submit.go b/internal/claimer/submit.go index 87da00219..ccdfe6baa 100644 --- a/internal/claimer/submit.go +++ b/internal/claimer/submit.go @@ -5,6 +5,7 @@ package claimer import ( "context" + "errors" "fmt" "math/big" @@ -195,14 +196,13 @@ func (s *Service) submitClaimsAndUpdateDatabase( computedEpochs map[int64]*model.Epoch, apps map[int64]*model.Application, defaultBlockNumber *big.Int, -) (int, []error) { +) (int, error) { confirmed, err := s.checkClaimsInFlight(ctx, computedEpochs, apps, defaultBlockNumber) if err != nil { - return confirmed, []error{err} + return confirmed, err } transitions := confirmed - errs := []error{} for key, currEpoch := range computedEpochs { result := s.processComputedClaim(ctx, computedClaimWork{ app: apps[key], @@ -211,13 +211,13 @@ func (s *Service) submitClaimsAndUpdateDatabase( }, defaultBlockNumber) transitions += result.progress if result.err != nil { - errs = append(errs, result.err) + err = errors.Join(err, result.err) } if result.drop { delete(computedEpochs, key) } } - return transitions, errs + return transitions, err } func (s *Service) processComputedClaim( diff --git a/internal/claimer/submit_test.go b/internal/claimer/submit_test.go index 733ad93cf..2d764fbdf 100644 --- a/internal/claimer/submit_test.go +++ b/internal/claimer/submit_test.go @@ -38,8 +38,8 @@ func TestSubmitFirstClaim(t *testing.T) { b.On("submitClaimToBlockchain", mock.Anything, app, currEpoch). 
Return(common.HexToHash("0x10"), nil).Once() - transitions, errs := m.submitClaimsAndUpdateDatabase(context.Background(), makeEpochMap(), makeEpochMap(currEpoch), makeApplicationMap(app), endBlock) - assert.Equal(t, 0, len(errs)) + transitions, err := m.submitClaimsAndUpdateDatabase(context.Background(), makeEpochMap(), makeEpochMap(currEpoch), makeApplicationMap(app), endBlock) + assert.NoError(t, err) assert.Equal(t, 1, len(m.claimsInFlight)) assert.Equal(t, 1, transitions, "submitting a claim counts as a transition") } @@ -70,10 +70,10 @@ func TestSubmitClaimForeclosesUnstagedForeclosedApp(t *testing.T) { // an unexpected call if the guard fails. computedEpochs := makeEpochMap(currEpoch) - transitions, errs := m.submitClaimsAndUpdateDatabase( + transitions, err := m.submitClaimsAndUpdateDatabase( context.Background(), makeEpochMap(), computedEpochs, makeApplicationMap(app), endBlock) - assert.Equal(t, 0, len(errs), "foreclosing an impossible claim is not an error") + assert.NoError(t, err, "foreclosing an impossible claim is not an error") assert.Equal(t, 1, transitions, "CLAIM_FORECLOSED is a local status transition") assert.Equal(t, model.EpochStatus_ClaimForeclosed, currEpoch.Status) assert.Equal(t, 0, len(m.claimsInFlight), @@ -100,10 +100,10 @@ func TestSubmitClaimForeclosesUnstagedForeclosedAppWhenSubmissionDisabled(t *tes r.On("UpdateEpochWithForeclosedClaim", mock.Anything, app.ID, currEpoch.Index). 
Return(nil).Once() - transitions, errs := m.submitClaimsAndUpdateDatabase( + transitions, err := m.submitClaimsAndUpdateDatabase( context.Background(), makeEpochMap(), makeEpochMap(currEpoch), makeApplicationMap(app), endBlock) - assert.Equal(t, 0, len(errs)) + assert.NoError(t, err) assert.Equal(t, 1, transitions) assert.Equal(t, model.EpochStatus_ClaimForeclosed, currEpoch.Status) assert.Equal(t, 0, len(m.claimsInFlight)) @@ -216,10 +216,10 @@ func TestSubmitClaimReconcilesAcceptedForForeclosedApp(t *testing.T) { Return(nil).Once() computedEpochs := makeEpochMap(currEpoch) - transitions, errs := m.submitClaimsAndUpdateDatabase( + transitions, err := m.submitClaimsAndUpdateDatabase( context.Background(), makeEpochMap(), computedEpochs, makeApplicationMap(app), endBlock) - assert.Equal(t, 0, len(errs)) + assert.NoError(t, err) assert.Equal(t, 1, transitions, "ACCEPTED reconciliation counts as a transition") assert.Equal(t, 0, len(m.claimsInFlight)) } @@ -247,10 +247,10 @@ func TestSubmitClaimReconcilesStagedBeforeBroadcast(t *testing.T) { Return(nil).Once() computedEpochs := makeEpochMap(currEpoch) - transitions, errs := m.submitClaimsAndUpdateDatabase( + transitions, err := m.submitClaimsAndUpdateDatabase( context.Background(), makeEpochMap(), computedEpochs, makeApplicationMap(app), endBlock) - assert.Equal(t, 0, len(errs)) + assert.NoError(t, err) assert.Equal(t, 1, transitions, "STAGED reconciliation counts as a transition") assert.Empty(t, computedEpochs, "reconciled epoch must leave the computed work map") assert.Equal(t, 0, len(m.claimsInFlight), "reconciled staged claim must not be submitted again") @@ -278,9 +278,9 @@ func TestReconcileBeforeSubmitAcceptedOutputsMismatchSetsDiverged(t *testing.T) r.On("UpdateApplicationStatus", mock.Anything, app.ID, model.ApplicationStatus_Diverged, mock.Anything). 
Return(nil).Once() - transitions, errs := m.submitClaimsAndUpdateDatabase( + transitions, err := m.submitClaimsAndUpdateDatabase( context.Background(), makeEpochMap(), makeEpochMap(currEpoch), makeApplicationMap(app), endBlock) - assert.Equal(t, 1, len(errs)) + assert.Error(t, err) assert.Equal(t, 0, transitions) assert.Equal(t, 0, len(m.claimsInFlight)) } @@ -306,8 +306,8 @@ func TestSubmitClaimWithAntecessor(t *testing.T) { b.On("submitClaimToBlockchain", mock.Anything, app, currEpoch). Return(common.HexToHash("0x10"), nil).Once() - transitions, errs := m.submitClaimsAndUpdateDatabase(context.Background(), makeEpochMap(prevEpoch), makeEpochMap(currEpoch), makeApplicationMap(app), endBlock) - assert.Equal(t, 0, len(errs)) + transitions, err := m.submitClaimsAndUpdateDatabase(context.Background(), makeEpochMap(prevEpoch), makeEpochMap(currEpoch), makeApplicationMap(app), endBlock) + assert.NoError(t, err) assert.Equal(t, 1, len(m.claimsInFlight)) assert.Equal(t, 1, transitions, "submitting a claim counts as a transition") } @@ -334,9 +334,9 @@ func TestSubmitClaimWithAcceptedAntecessorWithoutClaimTransactionHash(t *testing b.On("submitClaimToBlockchain", mock.Anything, app, currEpoch). Return(common.HexToHash("0x10"), nil).Once() - transitions, errs := m.submitClaimsAndUpdateDatabase( + transitions, err := m.submitClaimsAndUpdateDatabase( context.Background(), makeEpochMap(prevEpoch), makeEpochMap(currEpoch), makeApplicationMap(app), endBlock) - require.Empty(t, errs) + require.NoError(t, err) assert.Len(t, m.claimsInFlight, 1) assert.Equal(t, 1, transitions, "accepted predecessor with unknown tx hash must not block submission") } @@ -359,9 +359,9 @@ func TestSkipSubmitClaimWithStagedAntecessor(t *testing.T) { b.On("findClaimSubmittedEventAndSucc", mock.Anything, app, prevEpoch, prevEpoch.LastBlock+1, endBlock.Uint64()). 
Return(&iconsensus.IConsensus{}, prevEvent, currEvent, nil).Once() - transitions, errs := m.submitClaimsAndUpdateDatabase( + transitions, err := m.submitClaimsAndUpdateDatabase( context.Background(), makeEpochMap(prevEpoch), makeEpochMap(currEpoch), makeApplicationMap(app), endBlock) - assert.Equal(t, 0, len(errs)) + assert.NoError(t, err) assert.Equal(t, 0, len(m.claimsInFlight)) assert.Equal(t, 0, transitions, "staged predecessor must block newer claim submission") } @@ -384,8 +384,8 @@ func TestSkipSubmitFirstClaim(t *testing.T) { b.On("findClaimSubmittedEventAndSucc", mock.Anything, app, currEpoch, currEpoch.LastBlock+1, endBlock.Uint64()). Return(&iconsensus.IConsensus{}, prevEvent, currEvent, nil).Once() - transitions, errs := m.submitClaimsAndUpdateDatabase(context.Background(), makeEpochMap(), makeEpochMap(currEpoch), makeApplicationMap(app), endBlock) - assert.Equal(t, 0, len(errs)) + transitions, err := m.submitClaimsAndUpdateDatabase(context.Background(), makeEpochMap(), makeEpochMap(currEpoch), makeApplicationMap(app), endBlock) + assert.NoError(t, err) assert.Equal(t, 0, len(m.claimsInFlight)) assert.Equal(t, 0, transitions, "no transition when submission is disabled") } @@ -409,8 +409,8 @@ func TestSkipSubmitClaimWithAntecessor(t *testing.T) { b.On("findClaimSubmittedEventAndSucc", mock.Anything, app, prevEpoch, prevEpoch.LastBlock+1, endBlock.Uint64()). 
Return(&iconsensus.IConsensus{}, prevEvent, currEvent, nil).Once() - _, errs := m.submitClaimsAndUpdateDatabase(context.Background(), makeEpochMap(prevEpoch), makeEpochMap(currEpoch), makeApplicationMap(app), endBlock) - assert.Equal(t, len(errs), 0) + _, err := m.submitClaimsAndUpdateDatabase(context.Background(), makeEpochMap(prevEpoch), makeEpochMap(currEpoch), makeApplicationMap(app), endBlock) + assert.NoError(t, err) assert.Equal(t, len(m.claimsInFlight), 0) } @@ -433,8 +433,8 @@ func TestUpdateFirstClaim(t *testing.T) { r.On("UpdateEpochWithSubmittedClaim", mock.Anything, app.ID, currEpoch.Index, currEvent.Raw.TxHash). Return(nil).Once() - transitions, errs := m.submitClaimsAndUpdateDatabase(context.Background(), makeEpochMap(), makeEpochMap(currEpoch), makeApplicationMap(app), endBlock) - assert.Equal(t, 0, len(errs)) + transitions, err := m.submitClaimsAndUpdateDatabase(context.Background(), makeEpochMap(), makeEpochMap(currEpoch), makeApplicationMap(app), endBlock) + assert.NoError(t, err) assert.Equal(t, 0, len(m.claimsInFlight)) assert.Equal(t, 1, transitions, "finding on-chain event counts as a transition") } @@ -459,8 +459,8 @@ func TestUpdateClaimWithAntecessor(t *testing.T) { r.On("UpdateEpochWithSubmittedClaim", mock.Anything, app.ID, currEpoch.Index, currEvent.Raw.TxHash). Return(nil).Once() - _, errs := m.submitClaimsAndUpdateDatabase(context.Background(), makeEpochMap(prevEpoch), makeEpochMap(currEpoch), makeApplicationMap(app), endBlock) - assert.Equal(t, len(errs), 0) + _, err := m.submitClaimsAndUpdateDatabase(context.Background(), makeEpochMap(prevEpoch), makeEpochMap(currEpoch), makeApplicationMap(app), endBlock) + assert.NoError(t, err) assert.Equal(t, len(m.claimsInFlight), 0) } @@ -490,8 +490,8 @@ func TestQuorumSubmittedEventsIgnoresForeignDifferentOutputsAndUpdatesMatchingEv r.On("UpdateEpochWithSubmittedClaim", mock.Anything, app.ID, currEpoch.Index, currEvent.Raw.TxHash). 
Return(nil).Once() - transitions, errs := m.submitClaimsAndUpdateDatabase(context.Background(), makeEpochMap(), makeEpochMap(currEpoch), makeApplicationMap(app), endBlock) - assert.Equal(t, 0, len(errs)) + transitions, err := m.submitClaimsAndUpdateDatabase(context.Background(), makeEpochMap(), makeEpochMap(currEpoch), makeApplicationMap(app), endBlock) + assert.NoError(t, err) assert.Equal(t, 0, len(m.claimsInFlight)) assert.Equal(t, 1, transitions, "matching later event counts as a transition") } @@ -522,8 +522,8 @@ func TestQuorumDifferentOutputSubmittedEventStillSubmitsLocalClaim(t *testing.T) b.On("submitClaimToBlockchain", mock.Anything, app, currEpoch). Return(txHash, nil).Once() - transitions, errs := m.submitClaimsAndUpdateDatabase(context.Background(), makeEpochMap(), makeEpochMap(currEpoch), makeApplicationMap(app), endBlock) - assert.Equal(t, 0, len(errs)) + transitions, err := m.submitClaimsAndUpdateDatabase(context.Background(), makeEpochMap(), makeEpochMap(currEpoch), makeApplicationMap(app), endBlock) + assert.NoError(t, err) assert.Equal(t, txHash, m.claimsInFlight[app.ID].txHash) assert.Equal(t, 1, transitions) } @@ -550,8 +550,8 @@ func TestQuorumForeignMatchingSubmittedEventStillSubmitsLocalClaim(t *testing.T) b.On("submitClaimToBlockchain", mock.Anything, app, currEpoch). Return(txHash, nil).Once() - transitions, errs := m.submitClaimsAndUpdateDatabase(context.Background(), makeEpochMap(), makeEpochMap(currEpoch), makeApplicationMap(app), endBlock) - assert.Equal(t, 0, len(errs)) + transitions, err := m.submitClaimsAndUpdateDatabase(context.Background(), makeEpochMap(), makeEpochMap(currEpoch), makeApplicationMap(app), endBlock) + assert.NoError(t, err) assert.Equal(t, txHash, m.claimsInFlight[app.ID].txHash) assert.Equal(t, 1, transitions) } @@ -577,8 +577,8 @@ func TestQuorumReaderModeRecordsForeignMatchingSubmittedEvent(t *testing.T) { r.On("UpdateEpochWithSubmittedClaim", mock.Anything, app.ID, currEpoch.Index, foreignEvent.Raw.TxHash). 
Return(nil).Once() - transitions, errs := m.submitClaimsAndUpdateDatabase(context.Background(), makeEpochMap(), makeEpochMap(currEpoch), makeApplicationMap(app), endBlock) - assert.Equal(t, 0, len(errs)) + transitions, err := m.submitClaimsAndUpdateDatabase(context.Background(), makeEpochMap(), makeEpochMap(currEpoch), makeApplicationMap(app), endBlock) + assert.NoError(t, err) assert.Equal(t, 0, len(m.claimsInFlight)) assert.Equal(t, 1, transitions, "reader mode must mirror a matching Quorum ClaimSubmitted from any validator") } @@ -617,8 +617,8 @@ func TestQuorumSubmittedEventsIgnoresForeignAdversarialProofAndSubmitsLocalClaim b.On("submitClaimToBlockchain", mock.Anything, app, currEpoch). Return(txHash, nil).Once() - transitions, errs := m.submitClaimsAndUpdateDatabase(context.Background(), makeEpochMap(), makeEpochMap(currEpoch), makeApplicationMap(app), endBlock) - assert.Equal(t, 0, len(errs)) + transitions, err := m.submitClaimsAndUpdateDatabase(context.Background(), makeEpochMap(), makeEpochMap(currEpoch), makeApplicationMap(app), endBlock) + assert.NoError(t, err) assert.Equal(t, txHash, m.claimsInFlight[app.ID].txHash) assert.Equal(t, 1, transitions) } @@ -649,8 +649,8 @@ func TestQuorumSubmittedEventsOwnMismatchSetsDiverged(t *testing.T) { Return(nil).Once() currEpochs := makeEpochMap(currEpoch) - transitions, errs := m.submitClaimsAndUpdateDatabase(context.Background(), makeEpochMap(), currEpochs, makeApplicationMap(app), endBlock) - assert.Equal(t, 1, len(errs)) + transitions, err := m.submitClaimsAndUpdateDatabase(context.Background(), makeEpochMap(), currEpochs, makeApplicationMap(app), endBlock) + assert.Error(t, err) assert.Equal(t, 0, len(currEpochs)) assert.Equal(t, 0, len(m.claimsInFlight)) assert.Equal(t, 0, transitions) @@ -681,8 +681,8 @@ func TestQuorumReaderModeIgnoresNonMatchingSubmittedEvent(t *testing.T) { b.On("findClaimSubmittedEventAndSucc", mock.Anything, app, currEpoch, currEpoch.LastBlock+1, endBlock.Uint64()). 
Return(&iconsensus.IConsensus{}, []*iconsensus.IConsensusClaimSubmitted{foreignEvent}, nil).Once() - transitions, errs := m.submitClaimsAndUpdateDatabase(context.Background(), makeEpochMap(), makeEpochMap(currEpoch), makeApplicationMap(app), endBlock) - assert.Equal(t, 0, len(errs)) + transitions, err := m.submitClaimsAndUpdateDatabase(context.Background(), makeEpochMap(), makeEpochMap(currEpoch), makeApplicationMap(app), endBlock) + assert.NoError(t, err) assert.Equal(t, 0, len(m.claimsInFlight)) assert.Equal(t, 0, transitions) } @@ -717,8 +717,8 @@ func TestSubmitClaimWithAntecessorMismatch(t *testing.T) { r.On("UpdateApplicationStatus", mock.Anything, int64(0), model.ApplicationStatus_Corrupted, mock.Anything). Return(nil).Once() - _, errs := m.submitClaimsAndUpdateDatabase(context.Background(), makeEpochMap(prevEpoch), makeEpochMap(currEpoch), makeApplicationMap(app), endBlock) - assert.Equal(t, 1, len(errs)) + _, err := m.submitClaimsAndUpdateDatabase(context.Background(), makeEpochMap(prevEpoch), makeEpochMap(currEpoch), makeApplicationMap(app), endBlock) + assert.Error(t, err) } // !claimMatchesEvent(currClaim, currEvent) @@ -747,8 +747,8 @@ func TestSubmitClaimWithEventMismatch(t *testing.T) { r.On("UpdateApplicationStatus", mock.Anything, int64(0), model.ApplicationStatus_Diverged, mock.Anything). Return(nil).Once() - _, errs := m.submitClaimsAndUpdateDatabase(context.Background(), makeEpochMap(prevEpoch), makeEpochMap(currEpoch), makeApplicationMap(app), endBlock) - assert.Equal(t, 1, len(errs)) + _, err := m.submitClaimsAndUpdateDatabase(context.Background(), makeEpochMap(prevEpoch), makeEpochMap(currEpoch), makeApplicationMap(app), endBlock) + assert.Error(t, err) } func TestQuorumPreviousSubmittedEventsIgnoresForeignMismatchAndSubmitsCurrentClaim(t *testing.T) { @@ -781,8 +781,8 @@ func TestQuorumPreviousSubmittedEventsIgnoresForeignMismatchAndSubmitsCurrentCla b.On("submitClaimToBlockchain", mock.Anything, app, currEpoch). 
Return(txHash, nil).Once() - transitions, errs := m.submitClaimsAndUpdateDatabase(context.Background(), makeEpochMap(prevEpoch), makeEpochMap(currEpoch), makeApplicationMap(app), endBlock) - assert.Equal(t, 0, len(errs)) + transitions, err := m.submitClaimsAndUpdateDatabase(context.Background(), makeEpochMap(prevEpoch), makeEpochMap(currEpoch), makeApplicationMap(app), endBlock) + assert.NoError(t, err) assert.Equal(t, txHash, m.claimsInFlight[app.ID].txHash) assert.Equal(t, 1, transitions) } @@ -814,8 +814,8 @@ func TestQuorumPreviousSubmittedEventsOwnMismatchSetsDiverged(t *testing.T) { Return(nil).Once() currEpochs := makeEpochMap(currEpoch) - transitions, errs := m.submitClaimsAndUpdateDatabase(context.Background(), makeEpochMap(prevEpoch), currEpochs, makeApplicationMap(app), endBlock) - assert.Equal(t, 1, len(errs)) + transitions, err := m.submitClaimsAndUpdateDatabase(context.Background(), makeEpochMap(prevEpoch), currEpochs, makeApplicationMap(app), endBlock) + assert.Error(t, err) assert.Equal(t, 0, len(currEpochs)) assert.Equal(t, 0, len(m.claimsInFlight)) assert.Equal(t, 0, transitions) @@ -837,8 +837,8 @@ func TestSubmitClaimWithAntecessorOutOfOrder(t *testing.T) { r.On("UpdateApplicationStatus", mock.Anything, int64(0), model.ApplicationStatus_Corrupted, mock.Anything). Return(nil).Once() - _, errs := m.submitClaimsAndUpdateDatabase(context.Background(), makeEpochMap(prevEpoch), makeEpochMap(currEpoch), makeApplicationMap(app), big.NewInt(0)) - assert.Equal(t, 1, len(errs)) + _, err := m.submitClaimsAndUpdateDatabase(context.Background(), makeEpochMap(prevEpoch), makeEpochMap(currEpoch), makeApplicationMap(app), big.NewInt(0)) + assert.Error(t, err) } func TestCheckEpochSequenceConstraintAllowsAcceptedPredecessorWithoutClaimTransactionHash(t *testing.T) { @@ -870,8 +870,8 @@ func TestErrSubmittedMissingEvent(t *testing.T) { r.On("UpdateApplicationStatus", mock.Anything, int64(0), model.ApplicationStatus_Corrupted, mock.Anything). 
Return(nil).Once() - _, errs := m.submitClaimsAndUpdateDatabase(context.Background(), makeEpochMap(prevEpoch), makeEpochMap(currEpoch), makeApplicationMap(app), endBlock) - assert.Equal(t, 1, len(errs)) + _, err := m.submitClaimsAndUpdateDatabase(context.Background(), makeEpochMap(prevEpoch), makeEpochMap(currEpoch), makeApplicationMap(app), endBlock) + assert.Error(t, err) } func TestConsensusAddressChangedOnSubmittedClaims(t *testing.T) { @@ -891,8 +891,8 @@ func TestConsensusAddressChangedOnSubmittedClaims(t *testing.T) { r.On("UpdateApplicationStatus", mock.Anything, int64(0), model.ApplicationStatus_Corrupted, mock.Anything). Return(nil).Once() - _, errs := m.submitClaimsAndUpdateDatabase(context.Background(), makeEpochMap(), makeEpochMap(currEpoch), makeApplicationMap(app), endBlock) - assert.Equal(t, len(errs), 1) + _, err := m.submitClaimsAndUpdateDatabase(context.Background(), makeEpochMap(), makeEpochMap(currEpoch), makeApplicationMap(app), endBlock) + assert.Error(t, err) } func TestCheckConsensusForAddressChangeUsesTickBlock(t *testing.T) { diff --git a/internal/cli/cobra.go b/internal/cli/cobra.go index 10cf8a01a..b2df6b5e4 100644 --- a/internal/cli/cobra.go +++ b/internal/cli/cobra.go @@ -63,10 +63,3 @@ func CheckErr(logger *slog.Logger, err error, args ...any) { logErr(logger, err, args...) cobra.CheckErr(err) } - -func CheckErrs(logger *slog.Logger, errs []error, args ...any) { - for _, err := range errs { - logErr(logger, err, args...) - } - cobra.CheckErr(errors.Join(errs...)) -} diff --git a/internal/evmreader/evmreader.go b/internal/evmreader/evmreader.go index 331efe888..72c63f97b 100644 --- a/internal/evmreader/evmreader.go +++ b/internal/evmreader/evmreader.go @@ -101,13 +101,13 @@ func (r *Service) setApplicationCorrupted(ctx context.Context, app *Application, return appstatus.SetCorruptedf(ctx, r.Logger, r.repository, app, reasonFmt, args...) 
} -func (r *Service) Tick(ctx context.Context) (bool, []error) { +func (r *Service) Tick(ctx context.Context) (bool, error) { blockNumber, err := r.fetchMostRecentHeader(ctx, r.defaultBlock) if err != nil { if errors.Is(err, context.Canceled) { return false, nil } - return false, []error{err} + return false, err } if blockNumber != r.lastBlockNumber.Load() { diff --git a/internal/evmreader/evmreader_test.go b/internal/evmreader/evmreader_test.go index 9f1232b41..edd3d6a96 100644 --- a/internal/evmreader/evmreader_test.go +++ b/internal/evmreader/evmreader_test.go @@ -94,14 +94,14 @@ func (s *EvmReaderSuite) TearDownTest() { // Service tests func (s *EvmReaderSuite) TestItStopsWhenContextIsCanceled() { - errChannel := make(chan []error, 1) + errChannel := make(chan error, 1) go func() { errChannel <- s.evmReader.Serve() }() s.cancel() - errs := <-errChannel - s.Require().Empty(errs, "stopped with an error when canceled") + err := <-errChannel + s.Require().NoError(err, "stopped with an error when canceled") } func newCallNotification(c *mock.Call) <-chan struct{} { @@ -287,10 +287,9 @@ func (s *EvmReaderSuite) TestTickReturnsHeaderFetchErrorWithoutLocalErrorLog() { mock.Anything, ).Return(hdr, headerErr).Once() - _, errs := s.evmReader.Tick(s.ctx) + _, err := s.evmReader.Tick(s.ctx) - s.Require().Len(errs, 1) - s.Require().ErrorIs(errs[0], headerErr) + s.Require().ErrorIs(err, headerErr) s.Require().NotContains(logBuffer.String(), "Error fetching most recent block") s.repository.AssertNumberOfCalls(s.T(), "ListApplications", 0) } diff --git a/internal/evmreader/service.go b/internal/evmreader/service.go index 3ecc162f6..fc27aed1b 100644 --- a/internal/evmreader/service.go +++ b/internal/evmreader/service.go @@ -121,7 +121,7 @@ func (s *Service) Ready() bool { return s.ready.Load() } -func (s *Service) Serve() []error { +func (s *Service) Serve() error { s.alive.Store(true) s.ready.Store(true) defer s.alive.Store(false) diff --git a/internal/jsonrpc/service.go 
b/internal/jsonrpc/service.go index 0ccdb8563..d3d90c9a4 100644 --- a/internal/jsonrpc/service.go +++ b/internal/jsonrpc/service.go @@ -102,22 +102,21 @@ func Create(ctx context.Context, c *CreateInfo) (service.ServiceImpl, error) { return s, nil } -func (s *Service) Stop(_ bool) []error { - var errs []error +func (s *Service) Stop(_ bool) error { s.Logger.Info("Shutting down JSON-RPC HTTP server", "addr", s.server.Addr) ctx, cancel := context.WithTimeout(context.Background(), jsonrpcShutdownTimeout) defer cancel() if err := s.server.Shutdown(ctx); err != nil { - errs = append(errs, err) + return err } - return errs + return nil } -func (s *Service) Serve() []error { +func (s *Service) Serve() error { ctx := s.Supervisor.Context() listener, err := s.listen("tcp", s.server.Addr) if err != nil { - return []error{err} + return err } s.Logger.Info("Listening", "addr", listener.Addr().String()) @@ -143,19 +142,13 @@ func (s *Service) Serve() []error { // loop and wait for it to observe the cancellation before returning. s.Supervisor.Stop(true) <-ctx.Done() - if err != nil { - return []error{err} - } - return nil + return err case <-ctx.Done(): // The framework loop exited first because it handled a shutdown signal // or context cancellation and called Stop(), which should trigger // s.server.Shutdown(). Wait for the HTTP loop to finish so Serve() // returns only after the listener is fully closed. serverErr := <-serverDone - if serverErr != nil { - return []error{serverErr} - } - return nil + return serverErr } } diff --git a/internal/node/node.go b/internal/node/node.go index cb1f94f02..98244f046 100644 --- a/internal/node/node.go +++ b/internal/node/node.go @@ -101,9 +101,9 @@ func createServices( if len(errs) > 0 { for _, svc := range services { - errs := svc.Stop(true) - if errs != nil { - errs = append(errs, errs...) // TODO: replace this by a warn log + stopErr := svc.Stop(true) + if stopErr != nil { + errs = append(errs, stopErr) } } return errors.Join(errs...) 
@@ -121,7 +121,7 @@ type Service struct { Repository repository.Repository } -func (me *Service) Serve() []error { +func (me *Service) Serve() error { ctx := me.Supervisor.Context() <-ctx.Done() return nil diff --git a/internal/prt/service.go b/internal/prt/service.go index f4d4c4cb9..d248be2f3 100644 --- a/internal/prt/service.go +++ b/internal/prt/service.go @@ -125,7 +125,7 @@ func Create(ctx context.Context, c *CreateInfo) (service.ServiceImpl, error) { // Tick executes the Validator main logic of producing claims and/or proofs // for processed epochs of all running applications. -func (s *Service) Tick(ctx context.Context) (bool, []error) { +func (s *Service) Tick(ctx context.Context) (bool, error) { // Check for shutdown before starting work, consistent with the advancer. if ctx.Err() != nil { return false, nil @@ -138,14 +138,14 @@ func (s *Service) Tick(ctx context.Context) (bool, []error) { s.Logger.Warn("Tick interrupted by shutdown", "error", err) return false, nil } - return false, []error{fmt.Errorf("failed to get running applications. %w", err)} + return false, fmt.Errorf("failed to get running applications. %w", err) } // validate each application errs := []error{} for idx := range apps { if ctx.Err() != nil { - return false, errs + return false, errors.Join(errs...) } app := apps[idx] // Foreclosed apps: run the drain path (reconcile accepted epochs, @@ -172,7 +172,7 @@ func (s *Service) Tick(ctx context.Context) (bool, []error) { errs = append(errs, err) } } - return false, errs + return false, errors.Join(errs...) 
} // handleForeclosedApp drains a foreclosed DaveConsensus application's epochs to diff --git a/internal/validator/validator.go b/internal/validator/validator.go index 57bb5bfdb..8997f15cb 100644 --- a/internal/validator/validator.go +++ b/internal/validator/validator.go @@ -69,7 +69,7 @@ func Create(ctx context.Context, c *CreateInfo) (service.ServiceImpl, error) { // Tick executes the Validator main logic of producing claims and/or proofs // for processed epochs of all running applications. -func (s *Service) Tick(ctx context.Context) (bool, []error) { +func (s *Service) Tick(ctx context.Context) (bool, error) { apps, _, err := getAllRunningApplications(ctx, s.repository) if err != nil { // During shutdown the parent context is canceled and every in- @@ -80,7 +80,7 @@ func (s *Service) Tick(ctx context.Context) (bool, []error) { s.Logger.Warn("Tick interrupted by shutdown", "error", err) return false, nil } - return false, []error{fmt.Errorf("failed to get running applications. %w", err)} + return false, fmt.Errorf("failed to get running applications. %w", err) } // validate each application @@ -96,7 +96,7 @@ func (s *Service) Tick(ctx context.Context) (bool, []error) { errs = append(errs, err) } } - return false, errs + return false, errors.Join(errs...) } type ValidatorRepository interface { diff --git a/pkg/service/service.go b/pkg/service/service.go index 4359717ce..b6b5144ec 100644 --- a/pkg/service/service.go +++ b/pkg/service/service.go @@ -70,9 +70,9 @@ var ( type IService interface { Alive() bool Ready() bool - Reload() []error - Stop(bool) []error - Serve() []error + Reload() error + Stop(bool) error + Serve() error String() string } @@ -118,8 +118,8 @@ func InitServiceTemplate(c *ServiceConfigs, s *ServiceTemplate) error { // Default implementation of some abstract methods (except `Serve`). // Remove them to force concrete services to provide implementation for them. 
-func (s *ServiceTemplate) Reload() []error { return nil } -func (s *ServiceTemplate) Stop(bool) []error { return nil } +func (s *ServiceTemplate) Reload() error { return nil } +func (s *ServiceTemplate) Stop(bool) error { return nil } func (s *ServiceTemplate) Alive() bool { return true } func (s *ServiceTemplate) Ready() bool { return true } func (s *ServiceTemplate) String() string { return s.Name } diff --git a/pkg/service/service_test.go b/pkg/service/service_test.go index 3de18f111..7b7ae9d75 100644 --- a/pkg/service/service_test.go +++ b/pkg/service/service_test.go @@ -5,7 +5,6 @@ package service import ( "context" - "errors" "log/slog" "time" ) @@ -16,12 +15,12 @@ type delayedCloseImpl struct { onStopInitChan chan struct{} } -func (s *delayedCloseImpl) Stop(bool) []error { +func (s *delayedCloseImpl) Stop(bool) error { <-s.onStopInitChan // wait signal to initiate stop return nil } -func (s *delayedCloseImpl) Serve() []error { +func (s *delayedCloseImpl) Serve() error { close(s.onServeInitChan) // signal service was initiated <-s.Supervisor.Context().Done() return nil @@ -46,18 +45,18 @@ func (s *ServeSuite) TestServeExitsAfterStopIsComplete() { onServeEndChan := make(chan error) go func() { - err = errors.Join(supervisor.Serve()...) + err = supervisor.Serve() onServeEndChan <- err // signal service ended and provide error close(onServeEndChan) }() - onStopEndChan := make(chan []error) + onStopEndChan := make(chan error) select { case <-svc.onServeInitChan: // wait service to initiate, so can be stopped. // initiate service shutdown through context cancelation go func() { - errs := supervisor.Stop(true) - onStopEndChan <- errs // signal stop ended and provide the errors + err := supervisor.Stop(true) + onStopEndChan <- err // signal stop ended and provide the errors close(onStopEndChan) }() case <-time.After(2 * time.Second): @@ -86,8 +85,8 @@ func (s *ServeSuite) TestServeExitsAfterStopIsComplete() { // Stop() should exit without errors. 
select { - case errs := <-onStopEndChan: - s.Empty(errs) + case err := <-onStopEndChan: + s.NoError(err) case <-time.After(2 * time.Second): s.Fail("Stop() did not exit within 2 seconds after 'OnStop' concluded") } @@ -117,7 +116,7 @@ func (s *ServeSuite) TestServeExitsAfterStopIsCompleteOnContextCancelation() { onServeEndChan := make(chan error) go func() { - err = errors.Join(supervisor.Serve()...) + err = supervisor.Serve() onServeEndChan <- err // signal service ended and provide error close(onServeEndChan) }() diff --git a/pkg/service/supervisor.go b/pkg/service/supervisor.go index da1040c82..a5eeac853 100644 --- a/pkg/service/supervisor.go +++ b/pkg/service/supervisor.go @@ -11,7 +11,6 @@ import ( "net/http" "os" "os/signal" - "slices" "sync/atomic" "syscall" "time" @@ -138,35 +137,34 @@ func (s *ServicesSupervisor) Ready() bool { return true } -func (s *ServicesSupervisor) Reload() []error { +func (s *ServicesSupervisor) Reload() error { if s.stopped.Load() { - return []error{ErrServiceStopped} + return ErrServiceStopped } - var allErrs []error + var err error for _, svc := range s.services { start := time.Now() - errs := svc.Reload() + svcErr := svc.Reload() elapsed := time.Since(start) - if len(errs) > 0 { - allErrs = slices.Concat(allErrs, errs) - + if svcErr != nil { s.Logger.Error("Reload", "service", svc.String(), "duration", elapsed, - "errors", errs) + "error", svcErr) } else { s.Logger.Info("Reload", "service", svc.String(), "duration", elapsed) } - } - return allErrs + err = errors.Join(err, svcErr) + } + return err } -func (s *ServicesSupervisor) Serve() []error { +func (s *ServicesSupervisor) Serve() error { if s.stopped.Load() { return nil } @@ -190,27 +188,19 @@ func (s *ServicesSupervisor) Serve() []error { } }() - svcEndChan := make(chan []error) + svcEndChan := make(chan error) svcCount := 0 for _, svc := range s.services { svcCount++ go func(svc IService) { - var errs []error - - defer func() { svcEndChan <- errs }() - 
s.Logger.Info("Starting service", "service", svc.String()) - for _, err := range svc.Serve() { - if err != nil && !errors.Is(err, context.Canceled) { - errs = append(errs, err) - } - } + err := svc.Serve() switch { - case len(errs) > 0: + case err != nil && !errors.Is(err, context.Canceled): s.Logger.Error("Service failed, stopping other services", "service", svc.String(), - "errors", errs, + "error", err, ) s.Stop(false) case s.stopped.Load(): @@ -222,21 +212,24 @@ func (s *ServicesSupervisor) Serve() []error { "service", svc.String(), ) } + svcEndChan <- err }(svc) } var allErrs []error for range svcCount { - allErrs = slices.Concat(allErrs, <-svcEndChan) + if err := <-svcEndChan; err != nil { + allErrs = append(allErrs, err) + } } go s.Stop(true) <-s.stoppedChan - return allErrs + return errors.Join(allErrs...) } -func (s *ServicesSupervisor) Stop(force bool) []error { +func (s *ServicesSupervisor) Stop(force bool) error { // CAS achieves once-semantics: the second caller returns immediately // (fire-and-forget) rather than blocking like sync.Once. 
This is safe // because the orchestrator calls Cancel() after Stop() and waits for @@ -252,31 +245,31 @@ func (s *ServicesSupervisor) Stop(force bool) []error { signal.Stop(s.sigHangUp) } - var allErrs []error + var err error for i := len(s.services)-1; i >= 0; i-- { svc := s.services[i] start := time.Now() - errs := svc.Stop(force) + svcErr := svc.Stop(force) elapsed := time.Since(start) - if len(errs) > 0 { + if svcErr != nil { s.Logger.Error("Stop", "force", force, "duration", elapsed, - "error", errs) + "error", svcErr) } else { s.Logger.Info("Stop", "force", force, "duration", elapsed) } - allErrs = slices.Concat(allErrs, errs) + err = errors.Join(err, svcErr) } s.cancelContext() close(s.stoppedChan) - return allErrs + return err } /* diff --git a/pkg/service/supervisor_test.go b/pkg/service/supervisor_test.go index c81e80ff7..a5930c6d9 100644 --- a/pkg/service/supervisor_test.go +++ b/pkg/service/supervisor_test.go @@ -4,7 +4,6 @@ package service import ( - "errors" "fmt" "log/slog" "sync" @@ -22,7 +21,7 @@ type blockingChildImpl struct { once sync.Once } -func (c *blockingChildImpl) Serve() []error { +func (c *blockingChildImpl) Serve() error { close(c.started) <-c.Supervisor.Context().Done() c.once.Do(func() { close(c.done) }) @@ -107,10 +106,10 @@ type errorChildImpl struct { started chan struct{} } -func (c *errorChildImpl) Serve() []error { +func (c *errorChildImpl) Serve() error { close(c.started) time.Sleep(10 * time.Millisecond) - return []error{ fmt.Errorf("Oops %s!", c.Name) } + return fmt.Errorf("Oops %s!", c.Name) } func createErrorChild(t *testing.T, cfg *SupervisorConfigs, name string) *errorChildImpl { @@ -136,14 +135,14 @@ type stopAwareChildImpl struct { doneOnce sync.Once } -func (c *stopAwareChildImpl) Serve() []error { +func (c *stopAwareChildImpl) Serve() error { close(c.started) <-c.Supervisor.Context().Done() c.doneOnce.Do(func() { close(c.done) }) return nil } -func (c *stopAwareChildImpl) Stop(bool) []error { +func (c 
*stopAwareChildImpl) Stop(bool) error { c.stopOnce.Do(func() { close(c.stopped) }) return nil } @@ -179,8 +178,7 @@ func (s *SupervisorSuite) TestNodeReturnChildErrors() { done := make(chan error) go func() { - errs := supervisor.Serve() - done <- errors.Join(errs...) + done <- supervisor.Serve() close(done) }() @@ -219,8 +217,7 @@ func (s *SupervisorSuite) TestNodeStopsChildrenWhenOneChildErrors() { done := make(chan error) go func() { - errs := supervisor.Serve() - done <- errors.Join(errs...) + done <- supervisor.Serve() close(done) }() diff --git a/pkg/service/telemetry_test.go b/pkg/service/telemetry_test.go index 8feb1a99f..b3754a1a5 100644 --- a/pkg/service/telemetry_test.go +++ b/pkg/service/telemetry_test.go @@ -104,7 +104,7 @@ type falseLifecycleImpl struct{ ServiceTemplate } func (*falseLifecycleImpl) Alive() bool { return false } func (*falseLifecycleImpl) Ready() bool { return false } -func (*falseLifecycleImpl) Serve() []error { return nil } +func (*falseLifecycleImpl) Serve() error { return nil } func TestCreateDefaultTelemetry_Returns500WhenLifecycleFails(t *testing.T) { svc := &falseLifecycleImpl{} diff --git a/pkg/service/tick.go b/pkg/service/tick.go index 1c586b80d..b908e8348 100644 --- a/pkg/service/tick.go +++ b/pkg/service/tick.go @@ -13,7 +13,7 @@ import ( ) type TickImpl interface { - Tick(ctx context.Context) (bool, []error) + Tick(ctx context.Context) (bool, error) } type TickServiceTemplate struct { @@ -57,14 +57,14 @@ func (s *TickServiceTemplate) tick(ctx context.Context) bool { return false } start := time.Now() - reschedule, errs := s.tickImpl.Tick(ctx) + reschedule, err := s.tickImpl.Tick(ctx) elapsed := time.Since(start) - if len(errs) > 0 { + if err != nil { s.Logger.Error("Tick", "duration", elapsed, "reschedule", reschedule, - "error", errs, + "error", err, ) } else { s.Logger.Debug("Tick", @@ -75,12 +75,12 @@ func (s *TickServiceTemplate) tick(ctx context.Context) bool { return reschedule } -func (s *TickServiceTemplate) 
Stop(bool) []error { +func (s *TickServiceTemplate) Stop(bool) error { s.ticker.Stop() return nil } -func (s *TickServiceTemplate) Serve() []error { +func (s *TickServiceTemplate) Serve() error { ctx := s.Supervisor.context if ctx.Err() != nil { return nil diff --git a/pkg/service/tick_test.go b/pkg/service/tick_test.go index 4b4f521c2..272e7fdca 100644 --- a/pkg/service/tick_test.go +++ b/pkg/service/tick_test.go @@ -21,7 +21,7 @@ type mockImpl struct { onTick func(n int32) bool // called on each Tick with the tick count (1-based) } -func (m *mockImpl) Tick(ctx context.Context) (bool, []error) { +func (m *mockImpl) Tick(ctx context.Context) (bool, error) { n := m.tickCount.Add(1) reschedule := false if m.onTick != nil { diff --git a/test/validator/validator_test.go b/test/validator/validator_test.go index b1c861be2..5e802df7e 100644 --- a/test/validator/validator_test.go +++ b/test/validator/validator_test.go @@ -139,8 +139,8 @@ func (s *ValidatorRepositoryIntegrationSuite) TestItReturnsPristineClaim() { err = s.repository.StoreAdvanceResult(s.ctx, 1, &advanceResult) s.Require().Nil(err) - _, errs := s.validator.Tick(s.ctx) - s.Require().Equal(0, len(errs)) + _, err = s.validator.Tick(s.ctx) + s.Require().NoError(err) updatedEpoch, err := s.repository.GetEpoch(s.ctx, app.IApplicationAddress.String(), epoch.Index) s.Require().Nil(err) @@ -262,8 +262,8 @@ func (s *ValidatorRepositoryIntegrationSuite) TestItReturnsPreviousClaim() { err = s.repository.StoreAdvanceResult(s.ctx, 1, &advanceResult) s.Require().Nil(err) - _, errs := s.validator.Tick(s.ctx) - s.Require().Equal(0, len(errs)) + _, err = s.validator.Tick(s.ctx) + s.Require().NoError(err) updatedEpoch, err := s.repository.GetEpoch(s.ctx, app.IApplicationAddress.String(), secondEpoch.Index) s.Require().Nil(err) @@ -345,8 +345,8 @@ func (s *ValidatorRepositoryIntegrationSuite) TestItReturnsANewClaimAndProofs() err = s.repository.StoreAdvanceResult(s.ctx, 1, &advanceResult) s.Require().Nil(err) - _, errs := 
s.validator.Tick(s.ctx) - s.Require().Equal(0, len(errs)) + _, err = s.validator.Tick(s.ctx) + s.Require().NoError(err) updatedEpoch, err := s.repository.GetEpoch(s.ctx, app.IApplicationAddress.String(), epoch.Index) s.Require().Nil(err) @@ -499,8 +499,8 @@ func (s *ValidatorRepositoryIntegrationSuite) TestItReturnsANewClaimAndProofs() err = s.repository.StoreAdvanceResult(s.ctx, 1, &advanceResult) s.Require().Nil(err) - _, errs := s.validator.Tick(s.ctx) - s.Require().Equal(0, len(errs)) + _, err = s.validator.Tick(s.ctx) + s.Require().NoError(err) updatedSecondEpoch, err := s.repository.GetEpoch( s.ctx, From 7d148998f6602dedc296a3d316c49a234a7d2056 Mon Sep 17 00:00:00 2001 From: Renato Maia <1887792+renatomaia@users.noreply.github.com> Date: Tue, 26 May 2026 09:18:38 -0300 Subject: [PATCH 14/16] refactor(services): support services that expose HTTP interface --- cmd/cartesi-rollups-advancer/root/root.go | 22 ++-- cmd/cartesi-rollups-claimer/root/root.go | 20 +-- cmd/cartesi-rollups-evm-reader/root/root.go | 20 +-- cmd/cartesi-rollups-jsonrpc-api/root/root.go | 20 +-- cmd/cartesi-rollups-node/root/root.go | 4 +- cmd/cartesi-rollups-prt/root/root.go | 20 +-- cmd/cartesi-rollups-validator/root/root.go | 18 +-- internal/advancer/advancer_test.go | 16 +-- internal/advancer/service.go | 53 +++----- internal/claimer/claimer_test.go | 2 +- internal/claimer/fixtures_test.go | 2 +- internal/claimer/service.go | 3 +- .../evmreader/accounts_drive_proved_test.go | 4 +- internal/evmreader/evmreader_test.go | 10 +- internal/evmreader/foreclosure_test.go | 4 +- internal/evmreader/input_scan_units_test.go | 6 +- internal/evmreader/output_test.go | 6 +- internal/evmreader/sealedepochs_test.go | 4 +- internal/evmreader/service.go | 3 +- internal/evmreader/service_config_test.go | 10 +- internal/inspect/hardening_test.go | 118 ++++++++++-------- internal/inspect/inspect.go | 81 +++--------- internal/inspect/inspect_test.go | 26 ++-- internal/jsonrpc/service.go | 100 +++------------ 
internal/jsonrpc/service_test.go | 51 ++++---- internal/jsonrpc/util_test.go | 2 +- internal/node/node.go | 115 ++++++++--------- internal/prt/handle_foreclosed_test.go | 2 +- internal/prt/service.go | 3 +- internal/validator/validator.go | 3 +- internal/validator/validator_test.go | 4 +- pkg/service/http_service.go | 113 +++++++++++++++++ pkg/service/service.go | 54 ++++---- pkg/service/service_test.go | 24 ++-- pkg/service/supervisor.go | 115 ++++------------- pkg/service/supervisor_test.go | 62 ++++----- pkg/service/telemetry.go | 69 ++++++++++ pkg/service/telemetry_test.go | 44 ++++--- pkg/service/tick.go | 6 +- pkg/service/tick_test.go | 16 ++- test/validator/validator_test.go | 2 +- 41 files changed, 629 insertions(+), 628 deletions(-) create mode 100644 pkg/service/http_service.go create mode 100644 pkg/service/telemetry.go diff --git a/cmd/cartesi-rollups-advancer/root/root.go b/cmd/cartesi-rollups-advancer/root/root.go index 48b4fd851..3c9a3cb33 100644 --- a/cmd/cartesi-rollups-advancer/root/root.go +++ b/cmd/cartesi-rollups-advancer/root/root.go @@ -85,15 +85,9 @@ func run(cmd *cobra.Command, args []string) { logger := service.NewLogger(logLevel, logColor).With("service", svcName) createInfo := advancer.CreateInfo{ - SupervisorConfigs: service.SupervisorConfigs{ - Logger: logger, - EnableSignalHandling: true, - TelemetryCreate: true, - TelemetryAddress: cfg.AdvancerTelemetryAddress, - }, TickServiceConfigs: service.TickServiceConfigs{ PollInterval: cfg.AdvancerPollingInterval, - ServiceConfigs: service.ServiceConfigs{ + BaseConfigs: service.BaseConfigs{ Name: svcName, Logger: logger, }, @@ -106,12 +100,18 @@ func run(cmd *cobra.Command, args []string) { cli.CheckErr(logger, err) defer createInfo.Repository.Close() - advancerService, err := advancer.Create(ctx, &createInfo) + services, err := advancer.Create(ctx, &createInfo) cli.CheckErr(logger, err) - createInfo.SupervisorConfigs.Services = []service.ServiceImpl{ advancerService } - supervisor := 
&service.ServicesSupervisor{} - err = service.InitServicesSupervisor(&createInfo.SupervisorConfigs, supervisor) + supCfg := &service.SupervisorConfigs{ + BaseConfigs: service.BaseConfigs{Logger: logger}, + EnableSignalHandling: true, + TelemetryCreate: true, + TelemetryAddress: cfg.AdvancerTelemetryAddress, + Services: services, + } + supervisor := &service.Supervisor{} + err = service.InitServicesSupervisor(supCfg, supervisor) cli.CheckErr(logger, err) cli.CheckErr(logger, supervisor.Serve()) diff --git a/cmd/cartesi-rollups-claimer/root/root.go b/cmd/cartesi-rollups-claimer/root/root.go index 6a2026602..c392e7c41 100644 --- a/cmd/cartesi-rollups-claimer/root/root.go +++ b/cmd/cartesi-rollups-claimer/root/root.go @@ -86,15 +86,9 @@ func run(cmd *cobra.Command, args []string) { logger := service.NewLogger(logLevel, logColor).With("service", svcName) createInfo := claimer.CreateInfo{ - SupervisorConfigs: service.SupervisorConfigs{ - Logger: logger, - EnableSignalHandling: true, - TelemetryCreate: true, - TelemetryAddress: cfg.ClaimerTelemetryAddress, - }, TickServiceConfigs: service.TickServiceConfigs{ PollInterval: cfg.ClaimerPollingInterval, - ServiceConfigs: service.ServiceConfigs{ + BaseConfigs: service.BaseConfigs{ Name: svcName, Logger: logger, }, @@ -121,9 +115,15 @@ func run(cmd *cobra.Command, args []string) { claimerService, err := claimer.Create(ctx, &createInfo) cli.CheckErr(logger, err) - createInfo.SupervisorConfigs.Services = []service.ServiceImpl{ claimerService } - supervisor := &service.ServicesSupervisor{} - err = service.InitServicesSupervisor(&createInfo.SupervisorConfigs, supervisor) + supCfg := &service.SupervisorConfigs{ + BaseConfigs: service.BaseConfigs{Logger: logger}, + EnableSignalHandling: true, + TelemetryCreate: true, + TelemetryAddress: cfg.ClaimerTelemetryAddress, + Services: []service.SupervisedService{ claimerService }, + } + supervisor := &service.Supervisor{} + err = service.InitServicesSupervisor(supCfg, supervisor) 
cli.CheckErr(logger, err) cli.CheckErr(logger, supervisor.Serve()) diff --git a/cmd/cartesi-rollups-evm-reader/root/root.go b/cmd/cartesi-rollups-evm-reader/root/root.go index 75fbcf6c6..3d870f842 100644 --- a/cmd/cartesi-rollups-evm-reader/root/root.go +++ b/cmd/cartesi-rollups-evm-reader/root/root.go @@ -86,14 +86,8 @@ func run(cmd *cobra.Command, args []string) { logger := service.NewLogger(logLevel, logColor).With("service", svcName) createInfo := evmreader.CreateInfo{ - SupervisorConfigs: service.SupervisorConfigs{ - Logger: logger, - EnableSignalHandling: true, - TelemetryCreate: true, - TelemetryAddress: cfg.EvmReaderTelemetryAddress, - }, TickServiceConfigs: service.TickServiceConfigs{ - ServiceConfigs: service.ServiceConfigs{ + BaseConfigs: service.BaseConfigs{ Name: svcName, Logger: logger, }, @@ -122,9 +116,15 @@ func run(cmd *cobra.Command, args []string) { readerService, err := evmreader.Create(ctx, &createInfo) cli.CheckErr(logger, err) - createInfo.SupervisorConfigs.Services = []service.ServiceImpl{ readerService } - supervisor := &service.ServicesSupervisor{} - err = service.InitServicesSupervisor(&createInfo.SupervisorConfigs, supervisor) + supCfg := &service.SupervisorConfigs{ + BaseConfigs: service.BaseConfigs{Logger: logger}, + EnableSignalHandling: true, + TelemetryCreate: true, + TelemetryAddress: cfg.EvmReaderTelemetryAddress, + Services: []service.SupervisedService{ readerService }, + } + supervisor := &service.Supervisor{} + err = service.InitServicesSupervisor(supCfg, supervisor) cli.CheckErr(logger, err) cli.CheckErr(logger, supervisor.Serve()) diff --git a/cmd/cartesi-rollups-jsonrpc-api/root/root.go b/cmd/cartesi-rollups-jsonrpc-api/root/root.go index c47aa4eee..341f952a3 100644 --- a/cmd/cartesi-rollups-jsonrpc-api/root/root.go +++ b/cmd/cartesi-rollups-jsonrpc-api/root/root.go @@ -73,13 +73,7 @@ func run(cmd *cobra.Command, args []string) { logger := service.NewLogger(logLevel, logColor).With("service", svcName) createInfo := 
jsonrpc.CreateInfo{ - SupervisorConfigs: service.SupervisorConfigs{ - Logger: logger, - EnableSignalHandling: true, - TelemetryCreate: true, - TelemetryAddress: cfg.JsonrpcTelemetryAddress, - }, - ServiceConfigs: service.ServiceConfigs{ + BaseConfigs: service.BaseConfigs{ Name: svcName, Logger: logger, }, @@ -94,9 +88,15 @@ func run(cmd *cobra.Command, args []string) { jsonrpcService, err := jsonrpc.Create(ctx, &createInfo) cli.CheckErr(logger, err) - createInfo.SupervisorConfigs.Services = []service.ServiceImpl{ jsonrpcService } - supervisor := &service.ServicesSupervisor{} - err = service.InitServicesSupervisor(&createInfo.SupervisorConfigs, supervisor) + supCfg := &service.SupervisorConfigs{ + BaseConfigs: service.BaseConfigs{Logger: logger}, + EnableSignalHandling: true, + TelemetryCreate: true, + TelemetryAddress: cfg.JsonrpcTelemetryAddress, + Services: []service.SupervisedService{ jsonrpcService }, + } + supervisor := &service.Supervisor{} + err = service.InitServicesSupervisor(supCfg, supervisor) cli.CheckErr(logger, err) cli.CheckErr(logger, supervisor.Serve()) diff --git a/cmd/cartesi-rollups-node/root/root.go b/cmd/cartesi-rollups-node/root/root.go index 723870fca..15b653361 100644 --- a/cmd/cartesi-rollups-node/root/root.go +++ b/cmd/cartesi-rollups-node/root/root.go @@ -158,12 +158,12 @@ func run(cmd *cobra.Command, args []string) { createInfo := node.CreateInfo{ SupervisorConfigs: service.SupervisorConfigs{ - Logger: logger, + BaseConfigs: service.BaseConfigs{Logger: logger}, EnableSignalHandling: true, TelemetryCreate: true, TelemetryAddress: cfg.NodeTelemetryAddress, }, - ServiceConfigs: service.ServiceConfigs{ + BaseConfigs: service.BaseConfigs{ Name: config.ServiceNode, Logger: logger, }, diff --git a/cmd/cartesi-rollups-prt/root/root.go b/cmd/cartesi-rollups-prt/root/root.go index 5b40d0ac7..3f75b5b0c 100644 --- a/cmd/cartesi-rollups-prt/root/root.go +++ b/cmd/cartesi-rollups-prt/root/root.go @@ -74,15 +74,9 @@ func run(cmd *cobra.Command, args 
[]string) { logger := service.NewLogger(logLevel, logColor).With("service", svcName) createInfo := prt.CreateInfo{ - SupervisorConfigs: service.SupervisorConfigs{ - Logger: logger, - EnableSignalHandling: true, - TelemetryCreate: true, - TelemetryAddress: cfg.PrtTelemetryAddress, - }, TickServiceConfigs: service.TickServiceConfigs{ PollInterval: cfg.PrtPollingInterval, - ServiceConfigs: service.ServiceConfigs{ + BaseConfigs: service.BaseConfigs{ Name: svcName, Logger: logger, }, @@ -110,9 +104,15 @@ func run(cmd *cobra.Command, args []string) { prtService, err := prt.Create(ctx, &createInfo) cli.CheckErr(logger, err) - createInfo.SupervisorConfigs.Services = []service.ServiceImpl{ prtService } - supervisor := &service.ServicesSupervisor{} - err = service.InitServicesSupervisor(&createInfo.SupervisorConfigs, supervisor) + supCfg := &service.SupervisorConfigs{ + BaseConfigs: service.BaseConfigs{Logger: logger}, + EnableSignalHandling: true, + TelemetryCreate: true, + TelemetryAddress: cfg.PrtTelemetryAddress, + Services: []service.SupervisedService{ prtService }, + } + supervisor := &service.Supervisor{} + err = service.InitServicesSupervisor(supCfg, supervisor) cli.CheckErr(logger, err) cli.CheckErr(logger, supervisor.Serve()) diff --git a/cmd/cartesi-rollups-validator/root/root.go b/cmd/cartesi-rollups-validator/root/root.go index 76e24d262..986dc55ea 100644 --- a/cmd/cartesi-rollups-validator/root/root.go +++ b/cmd/cartesi-rollups-validator/root/root.go @@ -73,14 +73,9 @@ func run(cmd *cobra.Command, args []string) { logger := service.NewLogger(logLevel, logColor).With("service", svcName) createInfo := validator.CreateInfo{ - SupervisorConfigs: service.SupervisorConfigs{ - EnableSignalHandling: true, - TelemetryCreate: true, - TelemetryAddress: cfg.ValidatorTelemetryAddress, - }, TickServiceConfigs: service.TickServiceConfigs{ PollInterval: cfg.ValidatorPollingInterval, - ServiceConfigs: service.ServiceConfigs{ + BaseConfigs: service.BaseConfigs{ Name: 
config.ServiceValidator, Logger: logger, }, @@ -96,9 +91,14 @@ func run(cmd *cobra.Command, args []string) { validatorService, err := validator.Create(ctx, &createInfo) cli.CheckErr(logger, err) - createInfo.SupervisorConfigs.Services = []service.ServiceImpl{ validatorService } - supervisor := &service.ServicesSupervisor{} - err = service.InitServicesSupervisor(&createInfo.SupervisorConfigs, supervisor) + supCfg := &service.SupervisorConfigs{ + EnableSignalHandling: true, + TelemetryCreate: true, + TelemetryAddress: cfg.ValidatorTelemetryAddress, + Services: []service.SupervisedService{ validatorService }, + } + supervisor := &service.Supervisor{} + err = service.InitServicesSupervisor(supCfg, supervisor) cli.CheckErr(logger, err) cli.CheckErr(logger, supervisor.Serve()) diff --git a/internal/advancer/advancer_test.go b/internal/advancer/advancer_test.go index 88b69bcb8..ab5c90922 100644 --- a/internal/advancer/advancer_test.go +++ b/internal/advancer/advancer_test.go @@ -51,15 +51,15 @@ func newMockAdvancerServiceWithBatchSize( repository: repo, } serviceArgs := &service.TickServiceConfigs{ - ServiceConfigs: service.ServiceConfigs{Name: "advancer"}, + BaseConfigs: service.BaseConfigs{Name: "advancer"}, } err := service.InitTickServiceTemplate(serviceArgs, &s.TickServiceTemplate, s) if err != nil { return nil, err } - supCfg := service.SupervisorConfigs{ Services: []service.ServiceImpl{s} } - supervisor := &service.ServicesSupervisor{} - err = service.InitServicesSupervisor(&supCfg, supervisor) + supCfg := &service.SupervisorConfigs{Services: []service.SupervisedService{s}} + supervisor := &service.Supervisor{} + err = service.InitServicesSupervisor(supCfg, supervisor) if err != nil { return nil, err } @@ -1035,7 +1035,7 @@ func (s *AdvancerSuite) TestRemoveSnapshot() { tmpDir := s.T().TempDir() advancer := &Service{snapshotsDir: tmpDir} - serviceArgs := &service.TickServiceConfigs{ServiceConfigs: service.ServiceConfigs{Name: "advancer"}} + serviceArgs := 
&service.TickServiceConfigs{BaseConfigs: service.BaseConfigs{Name: "advancer"}} require.Nil(service.InitTickServiceTemplate(serviceArgs, &advancer.TickServiceTemplate, advancer)) // Create a snapshot directory @@ -1054,7 +1054,7 @@ func (s *AdvancerSuite) TestRemoveSnapshot() { tmpDir := s.T().TempDir() advancer := &Service{snapshotsDir: tmpDir} - serviceArgs := &service.TickServiceConfigs{ServiceConfigs: service.ServiceConfigs{Name: "advancer"}} + serviceArgs := &service.TickServiceConfigs{BaseConfigs: service.BaseConfigs{Name: "advancer"}} require.Nil(service.InitTickServiceTemplate(serviceArgs, &advancer.TickServiceTemplate, advancer)) snapshotPath := filepath.Join(tmpDir, "myapp_epoch0_input0") @@ -1067,7 +1067,7 @@ func (s *AdvancerSuite) TestRemoveSnapshot() { tmpDir := s.T().TempDir() advancer := &Service{snapshotsDir: tmpDir} - serviceArgs := &service.TickServiceConfigs{ServiceConfigs: service.ServiceConfigs{Name: "advancer"}} + serviceArgs := &service.TickServiceConfigs{BaseConfigs: service.BaseConfigs{Name: "advancer"}} require.Nil(service.InitTickServiceTemplate(serviceArgs, &advancer.TickServiceTemplate, advancer)) // Try to traverse outside snapshotsDir @@ -1082,7 +1082,7 @@ func (s *AdvancerSuite) TestRemoveSnapshot() { tmpDir := s.T().TempDir() advancer := &Service{snapshotsDir: tmpDir} - serviceArgs := &service.TickServiceConfigs{ServiceConfigs: service.ServiceConfigs{Name: "advancer"}} + serviceArgs := &service.TickServiceConfigs{BaseConfigs: service.BaseConfigs{Name: "advancer"}} require.Nil(service.InitTickServiceTemplate(serviceArgs, &advancer.TickServiceTemplate, advancer)) snapshotPath := filepath.Join(tmpDir, "otherapp_epoch0_input0") diff --git a/internal/advancer/service.go b/internal/advancer/service.go index 889076019..693607464 100644 --- a/internal/advancer/service.go +++ b/internal/advancer/service.go @@ -7,7 +7,6 @@ import ( "context" "errors" "fmt" - "net/http" "sync/atomic" "time" @@ -29,7 +28,6 @@ type Service struct { snapshotsDir 
string repository AdvancerRepository machineManager manager.MachineProvider - inspector *inspect.Inspector // cleanedUp ensures HTTP server shutdown and machine manager close run // exactly once, even when Stop() is called multiple times (by the child's @@ -39,14 +37,13 @@ type Service struct { // CreateInfo contains the configuration for creating an advancer service type CreateInfo struct { - service.SupervisorConfigs service.TickServiceConfigs Config config.AdvancerConfig Repository repository.Repository } // Create initializes a new advancer service -func Create(ctx context.Context, c *CreateInfo) (service.ServiceImpl, error) { +func Create(ctx context.Context, c *CreateInfo) ([]service.SupervisedService, error) { var err error if err = ctx.Err(); err != nil { return nil, err // This returns context.Canceled or context.DeadlineExceeded. @@ -78,32 +75,38 @@ func Create(ctx context.Context, c *CreateInfo) (service.ServiceImpl, error) { ) s.machineManager = manager + services := []service.SupervisedService{s} + // Initialize the inspect service if enabled if c.Config.FeatureInspectEnabled { - var admission *service.SemaphoreAdmission - if c.Config.InspectMaxInflight > 0 { - admission = service.NewSemaphoreAdmission(c.Config.InspectMaxInflight) - } inspector, err := inspect.NewInspector(inspect.CreateInfo{ + HTTPServiceConfigs: service.HTTPServiceConfigs{ + BaseConfigs: service.BaseConfigs{ + Name: "inspect", + LogLevel: c.LogLevel, + LogColor: c.LogColor, + }, + HTTPServerOptions: service.DefaultInspectOptions(), + Address: c.Config.InspectAddress, + SafeRequestID: true, + CorsAllowedOrigins: c.Config.InspectCorsAllowedOrigins, + MaxInflight: c.Config.InspectMaxInflight, + ShutdownTimeout: httpShutdownTimeout, + }, Repository: c.Repository, Machines: manager, - Address: c.Config.InspectAddress, - LogLevel: c.LogLevel, - LogPretty: c.LogColor, - Admission: admission, - CORSAllowedOrigins: c.Config.InspectCorsAllowedOrigins, }) if err != nil { return nil, 
fmt.Errorf("failed to create inspect service: %w", err) } - s.inspector = inspector + services = append(services, inspector) } s.snapshotsDir = c.Config.SnapshotsDir service.LogConfig(s.Logger, c.Config) - return s, nil + return services, nil } // Service interface implementation @@ -137,14 +140,6 @@ func (s *Service) Tick(ctx context.Context) (bool, error) { func (s *Service) Stop(b bool) error { var errs []error - if s.inspector != nil { - s.Logger.Info("Shutting down inspect HTTP server") - shutdownCtx, cancel := context.WithTimeout(context.Background(), httpShutdownTimeout) - defer cancel() - if err := s.inspector.Shutdown(shutdownCtx); err != nil { - errs = append(errs, fmt.Errorf("failed to shutdown inspect HTTP server: %w", err)) - } - } if s.machineManager != nil { s.Logger.Info("Closing machine manager") if err := s.machineManager.Close(); err != nil { @@ -156,15 +151,3 @@ func (s *Service) Stop(b bool) error { } return errors.Join(errs...) } - -func (s *Service) Serve() error { - if s.inspector != nil { - go func() { - if err := s.inspector.Serve(); err != nil && !errors.Is(err, http.ErrServerClosed) { - s.Logger.Error("Inspect HTTP server failed — shutting down", "error", err) - s.Stop(true) - } - }() - } - return s.TickServiceTemplate.Serve() -} diff --git a/internal/claimer/claimer_test.go b/internal/claimer/claimer_test.go index 982596473..ede2a9f53 100644 --- a/internal/claimer/claimer_test.go +++ b/internal/claimer/claimer_test.go @@ -37,7 +37,7 @@ func TestTickInterleavesStagesWithPinnedBlockAndReschedulesOnProgress(t *testing defer b.AssertExpectations(t) err := service.InitTickServiceTemplate(&service.TickServiceConfigs{ - ServiceConfigs: service.ServiceConfigs{Name: "claimer-test"}, + BaseConfigs: service.BaseConfigs{Name: "claimer-test"}, PollInterval: time.Hour, }, &m.TickServiceTemplate, m) require.NoError(t, err) diff --git a/internal/claimer/fixtures_test.go b/internal/claimer/fixtures_test.go index ee995fec1..584e630c9 100644 --- 
a/internal/claimer/fixtures_test.go +++ b/internal/claimer/fixtures_test.go @@ -68,7 +68,7 @@ func newServiceMock() (*Service, *claimerRepositoryMock, *claimerBlockchainMock) claimer := &Service{ TickServiceTemplate: service.TickServiceTemplate{ - ServiceTemplate: service.ServiceTemplate{ + BaseTemplate: service.BaseTemplate{ Logger: slog.New(handler), }, }, diff --git a/internal/claimer/service.go b/internal/claimer/service.go index 566731f17..6bcac4204 100644 --- a/internal/claimer/service.go +++ b/internal/claimer/service.go @@ -20,7 +20,6 @@ import ( ) type CreateInfo struct { - service.SupervisorConfigs service.TickServiceConfigs Config config.ClaimerConfig @@ -76,7 +75,7 @@ type PersistentConfig struct { ChainID uint64 } -func Create(ctx context.Context, c *CreateInfo) (service.ServiceImpl, error) { +func Create(ctx context.Context, c *CreateInfo) (service.SupervisedService, error) { var err error if c == nil { diff --git a/internal/evmreader/accounts_drive_proved_test.go b/internal/evmreader/accounts_drive_proved_test.go index faf7a9b9b..a9cdbcce4 100644 --- a/internal/evmreader/accounts_drive_proved_test.go +++ b/internal/evmreader/accounts_drive_proved_test.go @@ -36,8 +36,8 @@ func newPostForeclosureFixture(t *testing.T) ( } require.NoError(t, service.InitTickServiceTemplate( &service.TickServiceConfigs{ - ServiceConfigs: service.ServiceConfigs{ - Name: "evm-reader", + BaseConfigs: service.BaseConfigs{ + Name: "evm-reader", Logger: slog.New(slog.NewTextHandler(os.Stdout, nil)), }, }, diff --git a/internal/evmreader/evmreader_test.go b/internal/evmreader/evmreader_test.go index edd3d6a96..a587e5f3a 100644 --- a/internal/evmreader/evmreader_test.go +++ b/internal/evmreader/evmreader_test.go @@ -67,9 +67,9 @@ func (s *EvmReaderSuite) SetupTest() { s.Require().NoError(err) serviceArgs := &service.TickServiceConfigs{ - ServiceConfigs: service.ServiceConfigs{ - Name: "evm-reader", - LogLevel: logLevel, + BaseConfigs: service.BaseConfigs{ + Name: "evm-reader", + 
LogLevel: logLevel, }, PollInterval: 100 * time.Millisecond, } @@ -79,11 +79,11 @@ func (s *EvmReaderSuite) SetupTest() { s.evmReader.resolver = newApplicationAdapterResolver(s.evmReader.Logger, s.contractFactory) supCfg := service.SupervisorConfigs{ - Services: []service.ServiceImpl{s.evmReader}, + Services: []service.SupervisedService{s.evmReader}, Context: s.ctx, Cancel: s.cancel, } - supervisor := &service.ServicesSupervisor{} + supervisor := &service.Supervisor{} err = service.InitServicesSupervisor(&supCfg, supervisor) s.Require().NoError(err) } diff --git a/internal/evmreader/foreclosure_test.go b/internal/evmreader/foreclosure_test.go index 8375364fd..03ed2e13a 100644 --- a/internal/evmreader/foreclosure_test.go +++ b/internal/evmreader/foreclosure_test.go @@ -46,8 +46,8 @@ func newForeclosureServiceFixture(t *testing.T) ( } require.NoError(t, service.InitTickServiceTemplate( &service.TickServiceConfigs{ - ServiceConfigs: service.ServiceConfigs{ - Name: "evm-reader", + BaseConfigs: service.BaseConfigs{ + Name: "evm-reader", Logger: slog.New(slog.NewTextHandler(os.Stdout, nil)), }, }, diff --git a/internal/evmreader/input_scan_units_test.go b/internal/evmreader/input_scan_units_test.go index c821cef3f..5f35dd712 100644 --- a/internal/evmreader/input_scan_units_test.go +++ b/internal/evmreader/input_scan_units_test.go @@ -19,7 +19,7 @@ func TestBuildIConsensusInputScanUnits_GroupsByInputBoxAndCursor(t *testing.T) { ctx := context.Background() reader := &Service{ TickServiceTemplate: service.TickServiceTemplate{ - ServiceTemplate: service.ServiceTemplate{Logger: testLogger(t)}, + BaseTemplate: service.BaseTemplate{Logger: testLogger(t)}, }, } inputBoxA := common.HexToAddress("0x00000000000000000000000000000000000000a1") @@ -114,7 +114,7 @@ func TestBuildIConsensusInputScanUnits_InitializesBeforeGrouping(t *testing.T) { Return(nil).Once() reader := &Service{ TickServiceTemplate: service.TickServiceTemplate{ - ServiceTemplate: service.ServiceTemplate{Logger: 
testLogger(t)}, + BaseTemplate: service.BaseTemplate{Logger: testLogger(t)}, }, repository: repo, } @@ -135,7 +135,7 @@ func TestBuildIConsensusInputScanUnits_FailedInitializationExcludesOnlyThatApp(t ctx := context.Background() reader := &Service{ TickServiceTemplate: service.TickServiceTemplate{ - ServiceTemplate: service.ServiceTemplate{Logger: testLogger(t)}, + BaseTemplate: service.BaseTemplate{Logger: testLogger(t)}, }, } inputBox := common.HexToAddress("0x00000000000000000000000000000000000000a1") diff --git a/internal/evmreader/output_test.go b/internal/evmreader/output_test.go index 9ec4d8938..bc6054f3e 100644 --- a/internal/evmreader/output_test.go +++ b/internal/evmreader/output_test.go @@ -609,7 +609,7 @@ func (s *EvmReaderSuite) setupOutputMismatchTest() { s.Require().NoError(err) serviceArgs := &service.TickServiceConfigs{ - ServiceConfigs: service.ServiceConfigs{ + BaseConfigs: service.BaseConfigs{ Name: "evm-reader", LogLevel: logLevel, }, @@ -621,11 +621,11 @@ func (s *EvmReaderSuite) setupOutputMismatchTest() { s.evmReader.resolver = newApplicationAdapterResolver(s.evmReader.Logger, s.contractFactory) supCfg := service.SupervisorConfigs{ - Services: []service.ServiceImpl{s.evmReader}, + Services: []service.SupervisedService{s.evmReader}, Context: s.ctx, Cancel: s.cancel, } - supervisor := &service.ServicesSupervisor{} + supervisor := &service.Supervisor{} err = service.InitServicesSupervisor(&supCfg, supervisor) s.Require().NoError(err) diff --git a/internal/evmreader/sealedepochs_test.go b/internal/evmreader/sealedepochs_test.go index 8e62a5034..1bcba04da 100644 --- a/internal/evmreader/sealedepochs_test.go +++ b/internal/evmreader/sealedepochs_test.go @@ -56,8 +56,8 @@ func (s *SealedEpochsSuite) SetupTest() { logLevel, err := config.GetLogLevel() s.Require().NoError(err) - serviceArgs := &service.ServiceConfigs{Name: "evm-reader", LogLevel: logLevel} - err = service.InitServiceTemplate(serviceArgs, &s.evmReader.ServiceTemplate) + serviceArgs := 
&service.BaseConfigs{Name: "evm-reader", LogLevel: logLevel} + err = service.InitServiceTemplate(serviceArgs, &s.evmReader.BaseTemplate) s.Require().NoError(err) } diff --git a/internal/evmreader/service.go b/internal/evmreader/service.go index fc27aed1b..dd4a861da 100644 --- a/internal/evmreader/service.go +++ b/internal/evmreader/service.go @@ -19,7 +19,6 @@ import ( ) type CreateInfo struct { - service.SupervisorConfigs service.TickServiceConfigs Config config.EvmreaderConfig @@ -53,7 +52,7 @@ type PersistentConfig struct { ChainID uint64 } -func Create(ctx context.Context, c *CreateInfo) (service.ServiceImpl, error) { +func Create(ctx context.Context, c *CreateInfo) (service.SupervisedService, error) { var err error if err = ctx.Err(); err != nil { return nil, err // This returns context.Canceled or context.DeadlineExceeded. diff --git a/internal/evmreader/service_config_test.go b/internal/evmreader/service_config_test.go index 2604c3bb7..3d17f8e30 100644 --- a/internal/evmreader/service_config_test.go +++ b/internal/evmreader/service_config_test.go @@ -39,8 +39,8 @@ func TestCreateWithNilEthClient(t *testing.T) { _, err = Create(context.Background(), &CreateInfo{ TickServiceConfigs: service.TickServiceConfigs{ - ServiceConfigs: service.ServiceConfigs{ - Name: "evm-reader", + BaseConfigs: service.BaseConfigs{ + Name: "evm-reader", LogLevel: logLevel, }, }, @@ -80,9 +80,9 @@ func TestCreateAcceptsRequestTimeoutBelowPollingInterval(t *testing.T) { svc, err := Create(context.Background(), &CreateInfo{ TickServiceConfigs: service.TickServiceConfigs{ - ServiceConfigs: service.ServiceConfigs{ - Name: "evm-reader", - LogLevel: logLevel, + BaseConfigs: service.BaseConfigs{ + Name: "evm-reader", + LogLevel: logLevel, }, PollInterval: pollInterval, }, diff --git a/internal/inspect/hardening_test.go b/internal/inspect/hardening_test.go index 63e107465..fd1ce1870 100644 --- a/internal/inspect/hardening_test.go +++ b/internal/inspect/hardening_test.go @@ -48,14 +48,24 @@ 
func newInspectorForTest(t *testing.T, machineErr error) (*Inspector, *Applicati mm := newMockMachines() mm.Map[1] = MockMachine{application: app} - insp, err := NewInspector(CreateInfo{ + svc, err := NewInspector(CreateInfo{ + HTTPServiceConfigs: service.HTTPServiceConfigs{ + BaseConfigs: service.BaseConfigs{ + Name: "inspect", + LogLevel: slog.LevelError, + }, + HTTPServerOptions: service.DefaultInspectOptions(), + Address: "127.0.0.1:0", + SafeRequestID: true, + }, Repository: repo, Machines: hardeningMachines{MachinesMock: mm, err: machineErr}, - Address: "127.0.0.1:0", - LogLevel: slog.LevelError, - LogPretty: false, }) require.NoError(t, err) + + insp := svc.(*Inspector) + require.NotNil(t, insp.Logger) + return insp, app } @@ -121,7 +131,7 @@ func TestInspector_NewWithCreateInfo(t *testing.T) { insp, _ := newInspectorForTest(t, nil) // Package-internal access: the hardened http.Server is unexported and // tests pin its fields directly rather than via a public accessor. - srv := insp.server + srv := insp.Server require.NotNil(t, srv) opts := service.DefaultInspectOptions() @@ -136,7 +146,7 @@ func TestInspector_NewRejectsNilMachines(t *testing.T) { _, err := NewInspector(CreateInfo{ Repository: newMockRepository(), Machines: nil, - Address: "127.0.0.1:0", + HTTPServiceConfigs: service.HTTPServiceConfigs{Address: "127.0.0.1:0"}, }) require.ErrorIs(t, err, ErrInvalidMachines) } @@ -147,7 +157,7 @@ func TestInspector_OversizedPayloadReturns413(t *testing.T) { req := httptest.NewRequest(http.MethodPost, fmt.Sprintf("/inspect/%s", app.Name), body) rr := httptest.NewRecorder() - insp.ServeMux.ServeHTTP(rr, req) + insp.Server.Handler.ServeHTTP(rr, req) require.Equal(t, http.StatusRequestEntityTooLarge, rr.Code) require.Contains(t, rr.Body.String(), "Payload too large") @@ -159,7 +169,7 @@ func TestInspector_ExactBoundaryAccepted(t *testing.T) { req := httptest.NewRequest(http.MethodPost, fmt.Sprintf("/inspect/%s", app.Name), body) rr := httptest.NewRecorder() - 
insp.ServeMux.ServeHTTP(rr, req) + insp.Server.Handler.ServeHTTP(rr, req) require.Equal(t, http.StatusOK, rr.Code, "body at exact limit must be accepted") } @@ -168,7 +178,7 @@ func TestInspector_GETReturns405WithAllowHeader(t *testing.T) { insp, app := newInspectorForTest(t, nil) req := httptest.NewRequest(http.MethodGet, fmt.Sprintf("/inspect/%s", app.Name), nil) rr := httptest.NewRecorder() - insp.ServeMux.ServeHTTP(rr, req) + insp.Server.Handler.ServeHTTP(rr, req) require.Equal(t, http.StatusMethodNotAllowed, rr.Code) require.Equal(t, http.MethodPost, rr.Header().Get("Allow")) @@ -178,7 +188,7 @@ func TestInspector_PUTReturns405WithAllowHeader(t *testing.T) { insp, app := newInspectorForTest(t, nil) req := httptest.NewRequest(http.MethodPut, fmt.Sprintf("/inspect/%s", app.Name), nil) rr := httptest.NewRecorder() - insp.ServeMux.ServeHTTP(rr, req) + insp.Server.Handler.ServeHTTP(rr, req) require.Equal(t, http.StatusMethodNotAllowed, rr.Code) require.Equal(t, http.MethodPost, rr.Header().Get("Allow")) @@ -192,7 +202,7 @@ func TestInspector_InternalErrorBodyIsGeneric(t *testing.T) { fmt.Sprintf("/inspect/%s", app.Name), strings.NewReader("hello")) rr := httptest.NewRecorder() - insp.ServeMux.ServeHTTP(rr, req) + insp.Server.Handler.ServeHTTP(rr, req) require.Equal(t, http.StatusInternalServerError, rr.Code) require.Contains(t, rr.Body.String(), "Internal server error (request_id=") @@ -208,7 +218,7 @@ func TestInspector_InternalErrorIncludesRequestID(t *testing.T) { strings.NewReader("hello")) req.Header.Set("X-Request-ID", "pinned-id-42") rr := httptest.NewRecorder() - insp.ServeMux.ServeHTTP(rr, req) + insp.Server.Handler.ServeHTTP(rr, req) require.Equal(t, http.StatusInternalServerError, rr.Code) require.Contains(t, rr.Body.String(), "request_id=pinned-id-42") @@ -236,7 +246,7 @@ func TestInspector_ChainOrder_RecoverCoversRequestID(t *testing.T) { rr := httptest.NewRecorder() require.NotPanics(t, func() { - insp.ServeMux.ServeHTTP(rr, req) + 
insp.Server.Handler.ServeHTTP(rr, req) }, "panic in handler must be caught by RecoverMiddleware, not propagate to the test") require.Equal(t, http.StatusInternalServerError, rr.Code, @@ -252,7 +262,7 @@ func TestInspector_HappyPathStillWorks(t *testing.T) { fmt.Sprintf("/inspect/%s", app.Name), strings.NewReader("hello")) rr := httptest.NewRecorder() - insp.ServeMux.ServeHTTP(rr, req) + insp.Server.Handler.ServeHTTP(rr, req) require.Equal(t, http.StatusOK, rr.Code) require.Contains(t, rr.Body.String(), `"status":"Accepted"`) @@ -265,7 +275,7 @@ func TestInspector_EmptyDappPathReturns404(t *testing.T) { // does not match the inspect route. req := httptest.NewRequest(http.MethodPost, "/inspect/", strings.NewReader("x")) rr := httptest.NewRecorder() - insp.ServeMux.ServeHTTP(rr, req) + insp.Server.Handler.ServeHTTP(rr, req) require.Equal(t, http.StatusNotFound, rr.Code) } @@ -278,7 +288,7 @@ func TestInspector_EmptyDappPathReturns404(t *testing.T) { func TestInspector_RealServer_PayloadTooLarge(t *testing.T) { insp, app := newInspectorForTest(t, nil) - srv := httptest.NewServer(insp.ServeMux) + srv := httptest.NewServer(insp.Server.Handler) defer srv.Close() body := bytes.NewReader(make([]byte, maxPayloadSize+1)) @@ -301,7 +311,7 @@ func TestInspector_RealServer_PayloadTooLarge(t *testing.T) { // newInspectorWithAdmission is a variant of newInspectorForTest that also // accepts a *service.SemaphoreAdmission to exercise the admission // middleware in the full handler chain. 
-func newInspectorWithAdmission(t *testing.T, admission *service.SemaphoreAdmission) (*Inspector, *Application) { +func newInspectorWithAdmission(t *testing.T, maxInFlight uint64) (*Inspector, *Application) { t.Helper() app := &Application{ @@ -318,41 +328,43 @@ func newInspectorWithAdmission(t *testing.T, admission *service.SemaphoreAdmissi mm := newMockMachines() mm.Map[1] = MockMachine{application: app} - insp, err := NewInspector(CreateInfo{ + svc, err := NewInspector(CreateInfo{ Repository: repo, Machines: mm, - Address: "127.0.0.1:0", - LogLevel: slog.LevelError, - LogPretty: false, - Admission: admission, + HTTPServiceConfigs: service.HTTPServiceConfigs{ + BaseConfigs: service.BaseConfigs{ + LogLevel: slog.LevelError, + }, + Address: "127.0.0.1:0", + MaxInflight: maxInFlight, + }, }) require.NoError(t, err) - return insp, app + + return svc.(*Inspector), app } func TestInspector_AdmissionAccessor(t *testing.T) { - admission := service.NewSemaphoreAdmission(1) - insp, _ := newInspectorWithAdmission(t, admission) - require.Same(t, admission, insp.Admission(), + insp, _ := newInspectorWithAdmission(t, 1) + require.Same(t, insp.Admission, insp.Admission, "Admission() must return the instance passed via CreateInfo") - inspNil, _ := newInspectorWithAdmission(t, nil) - require.Nil(t, inspNil.Admission(), + inspNil, _ := newInspectorWithAdmission(t, 0) + require.Nil(t, inspNil.Admission, "Admission() must return nil when admission control is disabled") } func TestInspector_AdmissionRejectsWhenExhausted(t *testing.T) { // Pre-fill a single-permit admission so every subsequent request // bounces with 503 regardless of payload shape. 
- admission := service.NewSemaphoreAdmission(1) - admission.TryAcquire() // pre-fill the single permit - insp, app := newInspectorWithAdmission(t, admission) + insp, app := newInspectorWithAdmission(t, 1) + insp.Admission.TryAcquire() // pre-fill the single permit req := httptest.NewRequest(http.MethodPost, fmt.Sprintf("/inspect/%s", app.Name), strings.NewReader("hello")) rr := httptest.NewRecorder() - insp.ServeMux.ServeHTTP(rr, req) + insp.Server.Handler.ServeHTTP(rr, req) require.Equal(t, http.StatusServiceUnavailable, rr.Code) retryAfter, err := strconv.Atoi(rr.Header().Get("Retry-After")) @@ -361,36 +373,35 @@ func TestInspector_AdmissionRejectsWhenExhausted(t *testing.T) { require.LessOrEqual(t, retryAfter, 3) require.Contains(t, rr.Body.String(), "service at capacity") require.Equal(t, "nosniff", rr.Header().Get("X-Content-Type-Options")) - require.Equal(t, uint64(1), admission.Rejected()) + require.Equal(t, uint64(1), insp.Admission.Rejected()) } func TestInspector_AdmissionDisabledWhenNil(t *testing.T) { // nil Admission should disable the gate entirely. Any request // should reach the handler. 
- insp, app := newInspectorWithAdmission(t, nil) + insp, app := newInspectorWithAdmission(t, 0) req := httptest.NewRequest(http.MethodPost, fmt.Sprintf("/inspect/%s", app.Name), strings.NewReader("hello")) rr := httptest.NewRecorder() - insp.ServeMux.ServeHTTP(rr, req) + insp.Server.Handler.ServeHTTP(rr, req) require.Equal(t, http.StatusOK, rr.Code) } func TestInspector_AdmissionPermitReleasedAfterRequest(t *testing.T) { - admission := service.NewSemaphoreAdmission(1) - insp, app := newInspectorWithAdmission(t, admission) + insp, app := newInspectorWithAdmission(t, 1) for range 5 { req := httptest.NewRequest(http.MethodPost, fmt.Sprintf("/inspect/%s", app.Name), strings.NewReader("hello")) rr := httptest.NewRecorder() - insp.ServeMux.ServeHTTP(rr, req) + insp.Server.Handler.ServeHTTP(rr, req) require.Equal(t, http.StatusOK, rr.Code, "sequential requests must always succeed at limit=1") } - require.Equal(t, uint64(0), admission.Rejected()) + require.Equal(t, uint64(0), insp.Admission.Rejected()) } // ----------------------------------------------------------------------------- @@ -407,7 +418,7 @@ func TestInspector_PerAppCapacityReturns503(t *testing.T) { fmt.Sprintf("/inspect/%s", app.Name), strings.NewReader("hello")) rr := httptest.NewRecorder() - insp.ServeMux.ServeHTTP(rr, req) + insp.Server.Handler.ServeHTTP(rr, req) require.Equal(t, http.StatusServiceUnavailable, rr.Code) require.Contains(t, rr.Body.String(), "Application inspect at capacity") @@ -431,16 +442,19 @@ func newInspectorWithCORS(t *testing.T, origins string) (*Inspector, *Applicatio mm := newMockMachines() mm.Map[1] = MockMachine{application: app} - insp, err := NewInspector(CreateInfo{ + svc, err := NewInspector(CreateInfo{ Repository: repo, Machines: mm, - Address: "127.0.0.1:0", - LogLevel: slog.LevelError, - LogPretty: false, - CORSAllowedOrigins: origins, + HTTPServiceConfigs: service.HTTPServiceConfigs{ + BaseConfigs: service.BaseConfigs{ + LogLevel: slog.LevelError, + }, + Address: 
"127.0.0.1:0", + CorsAllowedOrigins: origins, + }, }) require.NoError(t, err) - return insp, app + return svc.(*Inspector), app } func TestInspector_CORSDisabledByDefault(t *testing.T) { @@ -451,7 +465,7 @@ func TestInspector_CORSDisabledByDefault(t *testing.T) { strings.NewReader("hello")) req.Header.Set("Origin", "http://evil.com") rr := httptest.NewRecorder() - insp.ServeMux.ServeHTTP(rr, req) + insp.Server.Handler.ServeHTTP(rr, req) require.Empty(t, rr.Header().Get("Access-Control-Allow-Origin")) require.Empty(t, rr.Header().Get("Vary")) @@ -465,7 +479,7 @@ func TestInspector_CORSAllowedOriginEchoed(t *testing.T) { strings.NewReader("hello")) req.Header.Set("Origin", "http://trusted.example.com") rr := httptest.NewRecorder() - insp.ServeMux.ServeHTTP(rr, req) + insp.Server.Handler.ServeHTTP(rr, req) require.Equal(t, "http://trusted.example.com", rr.Header().Get("Access-Control-Allow-Origin")) require.Contains(t, rr.Header().Values("Vary"), "Origin") @@ -479,7 +493,7 @@ func TestInspector_CORSDisallowedOriginNoGrant(t *testing.T) { strings.NewReader("hello")) req.Header.Set("Origin", "http://evil.com") rr := httptest.NewRecorder() - insp.ServeMux.ServeHTTP(rr, req) + insp.Server.Handler.ServeHTTP(rr, req) require.Empty(t, rr.Header().Get("Access-Control-Allow-Origin")) require.Contains(t, rr.Header().Values("Vary"), "Origin") @@ -493,7 +507,7 @@ func TestInspector_CORSPreflightShortCircuits(t *testing.T) { req.Header.Set("Origin", "http://trusted.example.com") req.Header.Set("Access-Control-Request-Method", "POST") rr := httptest.NewRecorder() - insp.ServeMux.ServeHTTP(rr, req) + insp.Server.Handler.ServeHTTP(rr, req) require.Equal(t, http.StatusNoContent, rr.Code) require.Equal(t, "http://trusted.example.com", rr.Header().Get("Access-Control-Allow-Origin")) @@ -514,7 +528,7 @@ func TestInspector_ServeReturnsNilOnGracefulShutdown(t *testing.T) { listener, err := net.Listen("tcp", "127.0.0.1:0") require.NoError(t, err) addr := listener.Addr().String() - 
insp.listen = func(string, string) (net.Listener, error) { return listener, nil } + insp.HTTPServiceTemplate.Listen = func(string, string) (net.Listener, error) { return listener, nil } serveErr := make(chan error, 1) go func() { serveErr <- insp.Serve() }() @@ -537,7 +551,7 @@ func TestInspector_ServeReturnsNilOnGracefulShutdown(t *testing.T) { ctx, cancel := context.WithTimeout(context.Background(), time.Second) defer cancel() - require.NoError(t, insp.Shutdown(ctx)) + require.NoError(t, insp.HTTPServiceTemplate.Server.Shutdown(ctx)) select { case err := <-serveErr: diff --git a/internal/inspect/inspect.go b/internal/inspect/inspect.go index d5c378197..c95b791c7 100644 --- a/internal/inspect/inspect.go +++ b/internal/inspect/inspect.go @@ -9,8 +9,6 @@ import ( "errors" "fmt" "io" - "log/slog" - "net" "net/http" "sync" "time" @@ -48,17 +46,10 @@ type InspectRepository interface { type Inspector struct { IInspectMachines + service.HTTPServiceTemplate repository InspectRepository - Logger *slog.Logger - ServeMux *http.ServeMux - server *http.Server - admission *service.SemaphoreAdmission deadlineWarnedMu sync.Mutex deadlineWarned map[int64]struct{} - // listen opens the HTTP listener. It defaults to net.Listen and is - // overridden in tests so Serve() can be exercised against a pre-bound - // listener whose actual address is known to the test. - listen func(network, address string) (net.Listener, error) } type ReportResponse struct { @@ -74,18 +65,9 @@ type InspectResponse struct { // CreateInfo bundles the parameters for [NewInspector]. type CreateInfo struct { + service.HTTPServiceConfigs Repository InspectRepository Machines IInspectMachines - Address string - LogLevel slog.Level - LogPretty bool - // Admission is an optional HTTP-level concurrency gate. A nil value - // disables admission control; the middleware chain treats nil as a - // pass-through so wiring is uniform regardless of configuration. 
- Admission *service.SemaphoreAdmission - // CORSAllowedOrigins is the raw comma-separated origin allowlist. - // Empty disables CORS entirely. - CORSAllowedOrigins string } // NewInspector constructs an [Inspector] and its backing HTTP server @@ -95,65 +77,30 @@ type CreateInfo struct { // // Use [Inspector.Serve] to run the HTTP server and [Inspector.Shutdown] // to stop it gracefully. -func NewInspector(c CreateInfo) (*Inspector, error) { +func NewInspector(c CreateInfo) (service.SupervisedService, error) { if c.Machines == nil { return nil, ErrInvalidMachines } - logger := service.NewLogger(c.LogLevel, c.LogPretty).With("service", "inspect") inspector := &Inspector{ IInspectMachines: c.Machines, repository: c.Repository, - Logger: logger, deadlineWarned: make(map[int64]struct{}), - ServeMux: http.NewServeMux(), - admission: c.Admission, } - handler := service.NewServiceHandler(inspector, service.HandlerOptions{ - Logger: logger, - Admission: c.Admission, - CORS: service.ParseCORSConfig(logger, c.CORSAllowedOrigins, - []string{"POST", "OPTIONS"}, []string{"Content-Type"}), - }) - inspector.ServeMux.Handle("/inspect/{dapp}", handler) + mux := http.NewServeMux() + mux.Handle("/inspect/{dapp}", inspector) - inspector.server = service.NewHTTPServer(c.Address, inspector.ServeMux, service.DefaultInspectOptions(), logger) - inspector.listen = net.Listen - service.StartupBindWarning(logger, "inspect", c.Address) - - return inspector, nil -} - -// Serve opens the HTTP listener and runs the server. Returns nil on -// graceful shutdown. 
-func (inspect *Inspector) Serve() error { - listener, err := inspect.listen("tcp", inspect.server.Addr) + err := service.InitHTTPServiceTemplate( + &c.HTTPServiceConfigs, + &inspector.HTTPServiceTemplate, + mux, + ) if err != nil { - return err + return nil, err } - inspect.Logger.Info("Listening", "address", listener.Addr().String()) - if err := inspect.server.Serve(listener); !errors.Is(err, http.ErrServerClosed) { - return err - } - return nil -} -// Shutdown gracefully stops the inspect HTTP server, waiting for in-flight -// requests to complete or ctx to expire. Callers should not access the -// underlying *http.Server directly; exposing only Shutdown keeps the API -// surface minimal and prevents misuse (e.g. reaching for ListenAndServe, -// SetKeepAlivesEnabled, or mutating Handler after construction). -func (inspect *Inspector) Shutdown(ctx context.Context) error { - return inspect.server.Shutdown(ctx) -} - -// Admission returns the concurrency gate used by the inspect HTTP surface, -// or nil when admission control is disabled. This accessor gives the -// advancer (or a future metrics hook) a path to reach the inspect -// admission counters without threading the controller separately. 
-func (inspect *Inspector) Admission() *service.SemaphoreAdmission { - return inspect.admission + return inspector, nil } func (inspect *Inspector) ServeHTTP(w http.ResponseWriter, r *http.Request) { @@ -225,7 +172,7 @@ func (inspect *Inspector) ServeHTTP(w http.ResponseWriter, r *http.Request) { } deadline := app.ExecutionParameters.InspectMaxDeadline + inspectResponseHeadroom - if inspect.server != nil && deadline > inspect.server.WriteTimeout { + if inspect.Server != nil && deadline > inspect.Server.WriteTimeout { inspect.warnDeadlineExceedsWriteTimeout(app, deadline) } ctx, cancel := context.WithTimeout(r.Context(), deadline) @@ -296,7 +243,7 @@ func (inspect *Inspector) warnDeadlineExceedsWriteTimeout(app *Application, dead "inspect_max_deadline", app.ExecutionParameters.InspectMaxDeadline, "response_headroom", inspectResponseHeadroom, "effective_deadline", deadline, - "http_write_timeout", inspect.server.WriteTimeout, + "http_write_timeout", inspect.Server.WriteTimeout, ) } diff --git a/internal/inspect/inspect_test.go b/internal/inspect/inspect_test.go index ff099a5fc..9a7f189ae 100644 --- a/internal/inspect/inspect_test.go +++ b/internal/inspect/inspect_test.go @@ -96,7 +96,11 @@ func (s *InspectSuite) TestPostMachineNotReady() { inspect := &Inspector{ repository: repo, IInspectMachines: machines, - Logger: service.NewLogger(slog.LevelDebug, true), + HTTPServiceTemplate: service.HTTPServiceTemplate{ + BaseTemplate: service.BaseTemplate{ + Logger: service.NewLogger(slog.LevelDebug, true), + }, + }, } srv := s.startServer(inspect) @@ -130,9 +134,13 @@ func (s *InspectSuite) TestPostForeclosedMachineUnavailable() { machines := newMockMachines() inspect := &Inspector{ - repository: repo, - IInspectMachines: machines, - Logger: service.NewLogger(slog.LevelDebug, true), + repository: repo, + IInspectMachines: machines, + HTTPServiceTemplate: service.HTTPServiceTemplate{ + BaseTemplate: service.BaseTemplate{ + Logger: service.NewLogger(slog.LevelDebug, true), + }, 
+ }, } srv := s.startServer(inspect) @@ -207,9 +215,13 @@ func (s *InspectSuite) setup() (*Inspector, *Application, common.Hash) { machines := newMockMachines() machines.Map[1] = *m inspect := &Inspector{ - repository: repo, - IInspectMachines: machines, - Logger: service.NewLogger(slog.LevelDebug, true), + repository: repo, + IInspectMachines: machines, + HTTPServiceTemplate: service.HTTPServiceTemplate{ + BaseTemplate: service.BaseTemplate{ + Logger: service.NewLogger(slog.LevelDebug, true), + }, + }, } payload := randomHash() return inspect, m.application, payload diff --git a/internal/jsonrpc/service.go b/internal/jsonrpc/service.go index d3d90c9a4..9d2bc7e79 100644 --- a/internal/jsonrpc/service.go +++ b/internal/jsonrpc/service.go @@ -5,9 +5,7 @@ package jsonrpc import ( "context" - "errors" "fmt" - "net" "net/http" "time" @@ -22,43 +20,37 @@ import ( const jsonrpcShutdownTimeout = 5 * time.Second -// ----------------------------------------------------------------------------- -// Service Implementation -// ----------------------------------------------------------------------------- - // Service implements the IService interface. type Service struct { - service.ServiceTemplate + service.HTTPServiceTemplate repository repository.Repository - server *http.Server - admission *service.SemaphoreAdmission inputABI *abi.ABI outputABI *abi.ABI - // listen opens the HTTP listener. It defaults to net.Listen and is - // overridden in tests so Serve() can be exercised without real sockets. 
- listen func(network, address string) (net.Listener, error) } type CreateInfo struct { - service.SupervisorConfigs - service.ServiceConfigs + service.BaseConfigs Config config.JsonrpcConfig Repository repository.Repository } -func Create(ctx context.Context, c *CreateInfo) (service.ServiceImpl, error) { +func Create(ctx context.Context, c *CreateInfo) (service.SupervisedService, error) { var err error if err = ctx.Err(); err != nil { return nil, err // This returns context.Canceled or context.DeadlineExceeded. } s := &Service{} - - err = service.InitServiceTemplate(&c.ServiceConfigs, &s.ServiceTemplate) - if err != nil { - return nil, err + cfg := &service.HTTPServiceConfigs{ + BaseConfigs: c.BaseConfigs, + HTTPServerOptions: service.DefaultJSONRPCOptions(), + Address: c.Config.JsonrpcApiAddress, + SafeRequestID: true, + CorsAllowedOrigins: c.Config.JsonrpcCorsAllowedOrigins, + MaxInflight: c.Config.JsonrpcMaxInflight, + ShutdownTimeout: jsonrpcShutdownTimeout, } s.repository = c.Repository @@ -76,79 +68,15 @@ func Create(ctx context.Context, c *CreateInfo) (service.ServiceImpl, error) { return nil, err } - if c.Config.JsonrpcMaxInflight > 0 { - s.admission = service.NewSemaphoreAdmission(c.Config.JsonrpcMaxInflight) - } - mux := http.NewServeMux() mux.HandleFunc("/rpc", s.handleRPC) - handler := service.NewServiceHandler(mux, service.HandlerOptions{ - Logger: s.Logger, - Admission: s.admission, - CORS: service.ParseCORSConfig(s.Logger, c.Config.JsonrpcCorsAllowedOrigins, - []string{"POST", "OPTIONS"}, []string{"Content-Type"}), - }) - - s.server = service.NewHTTPServer(c.Config.JsonrpcApiAddress, handler, service.DefaultJSONRPCOptions(), s.Logger) - service.StartupBindWarning(s.Logger, "jsonrpc", c.Config.JsonrpcApiAddress) - - if s.listen == nil { - s.listen = net.Listen + err = service.InitHTTPServiceTemplate(cfg, &s.HTTPServiceTemplate, mux) + if err != nil { + return nil, err } service.LogConfig(s.Logger, c.Config) return s, nil } - -func (s *Service) Stop(_ 
bool) error { - s.Logger.Info("Shutting down JSON-RPC HTTP server", "addr", s.server.Addr) - ctx, cancel := context.WithTimeout(context.Background(), jsonrpcShutdownTimeout) - defer cancel() - if err := s.server.Shutdown(ctx); err != nil { - return err - } - return nil -} - -func (s *Service) Serve() error { - ctx := s.Supervisor.Context() - listener, err := s.listen("tcp", s.server.Addr) - if err != nil { - return err - } - - s.Logger.Info("Listening", "addr", listener.Addr().String()) - - serverDone := make(chan error, 1) - go func() { - // Run the HTTP accept loop in parallel with the framework's lifecycle - // loop below. The lifecycle loop is the component that consumes - // SIGINT/SIGTERM and calls Stop() for graceful shutdown. - err := s.server.Serve(listener) - s.Logger.Info("Stopped listening", "addr", listener.Addr().String(), "err", err) - if errors.Is(err, http.ErrServerClosed) { - serverDone <- nil - return - } - serverDone <- err - }() - - select { - case err := <-serverDone: - // The HTTP loop exited first. This is unexpected unless the listener - // failed or the server was already closed, so cancel the framework - // loop and wait for it to observe the cancellation before returning. - s.Supervisor.Stop(true) - <-ctx.Done() - return err - case <-ctx.Done(): - // The framework loop exited first because it handled a shutdown signal - // or context cancellation and called Stop(), which should trigger - // s.server.Shutdown(). Wait for the HTTP loop to finish so Serve() - // returns only after the listener is fully closed. 
- serverErr := <-serverDone - return serverErr - } -} diff --git a/internal/jsonrpc/service_test.go b/internal/jsonrpc/service_test.go index e4ad62ba9..f0898ece0 100644 --- a/internal/jsonrpc/service_test.go +++ b/internal/jsonrpc/service_test.go @@ -22,7 +22,7 @@ func ensureSentinelRejects(t *testing.T, s *Service) { req := httptest.NewRequest(http.MethodPost, "/rpc", http.NoBody) req.Header.Set("Content-Type", "application/json") rr := httptest.NewRecorder() - s.server.Handler.ServeHTTP(rr, req) + s.Server.Handler.ServeHTTP(rr, req) require.Equal(t, http.StatusServiceUnavailable, rr.Code) retryAfter, err := strconv.Atoi(rr.Header().Get("Retry-After")) @@ -39,15 +39,15 @@ func ensureSentinelRejects(t *testing.T, s *Service) { // this test catches it. func TestJSONRPC_HardenedServerOptions(t *testing.T) { s := newTestService(t, "jsonrpc-server-options") - require.NotNil(t, s.server) + require.NotNil(t, s.Server) opts := service.DefaultJSONRPCOptions() - require.Equal(t, opts.ReadHeaderTimeout, s.server.ReadHeaderTimeout) - require.Equal(t, opts.ReadTimeout, s.server.ReadTimeout) - require.Equal(t, opts.WriteTimeout, s.server.WriteTimeout) - require.Equal(t, opts.IdleTimeout, s.server.IdleTimeout) - require.Equal(t, opts.MaxHeaderBytes, s.server.MaxHeaderBytes) - require.NotNil(t, s.server.ErrorLog) + require.Equal(t, opts.ReadHeaderTimeout, s.Server.ReadHeaderTimeout) + require.Equal(t, opts.ReadTimeout, s.Server.ReadTimeout) + require.Equal(t, opts.WriteTimeout, s.Server.WriteTimeout) + require.Equal(t, opts.IdleTimeout, s.Server.IdleTimeout) + require.Equal(t, opts.MaxHeaderBytes, s.Server.MaxHeaderBytes) + require.NotNil(t, s.Server.ErrorLog) } // TestJSONRPC_RequestIDPropagated verifies the middleware chain echoes a @@ -61,7 +61,7 @@ func TestJSONRPC_RequestIDPropagated(t *testing.T) { req.Header.Set("X-Request-ID", "pinned-xyz") req.Header.Set("Content-Type", "application/json") rr := httptest.NewRecorder() - s.server.Handler.ServeHTTP(rr, req) + 
s.Server.Handler.ServeHTTP(rr, req) // handleRPC will reject the empty body as a bad request, but the // middleware chain still runs and must echo the header. @@ -81,7 +81,7 @@ func TestJSONRPC_OversizedBodyReturns413(t *testing.T) { req := httptest.NewRequest(http.MethodPost, "/rpc", bytes.NewReader(oversized)) req.Header.Set("Content-Type", "application/json") rr := httptest.NewRecorder() - s.server.Handler.ServeHTTP(rr, req) + s.Server.Handler.ServeHTTP(rr, req) require.Equal(t, http.StatusRequestEntityTooLarge, rr.Code) require.Contains(t, rr.Body.String(), "Payload too large") @@ -98,12 +98,12 @@ func TestJSONRPC_AdmissionDisabledWhenZero(t *testing.T) { // and confirm a basic request reaches handleRPC (which rejects // an empty body with 400, not 503). s := newTestServiceWithInflight(t, "jsonrpc-adm-zero", 0) - require.Nil(t, s.admission, "limit=0 must produce nil admission") + require.Nil(t, s.Admission, "limit=0 must produce nil admission") req := httptest.NewRequest(http.MethodPost, "/rpc", http.NoBody) req.Header.Set("Content-Type", "application/json") rr := httptest.NewRecorder() - s.server.Handler.ServeHTTP(rr, req) + s.Server.Handler.ServeHTTP(rr, req) require.NotEqual(t, http.StatusServiceUnavailable, rr.Code, "disabled admission must not return 503") } @@ -112,8 +112,8 @@ func TestJSONRPC_AdmissionWiredWhenPositive(t *testing.T) { // JsonrpcMaxInflight>0 must construct a non-nil SemaphoreAdmission // with the matching limit. s := newTestServiceWithInflight(t, "jsonrpc-adm-wired", 7) - require.NotNil(t, s.admission) - require.Equal(t, uint64(7), s.admission.Limit()) + require.NotNil(t, s.Admission) + require.Equal(t, uint64(7), s.Admission.Limit()) } func TestJSONRPC_AdmissionRejectsWhenExhausted(t *testing.T) { @@ -125,31 +125,30 @@ func TestJSONRPC_AdmissionRejectsWhenExhausted(t *testing.T) { // Swap the admission underlying the server handler for a // pre-filled one to force rejection on every request. 
- s.admission = service.NewSemaphoreAdmission(1) - s.admission.TryAcquire() // pre-fill the single permit - s.server.Handler = rebuildHandlerWithAdmission(s) + s.Admission.TryAcquire() // pre-fill the single permit + s.Server.Handler = rebuildHandlerWithAdmission(s) ensureSentinelRejects(t, s) - require.Equal(t, uint64(1), s.admission.Rejected()) + require.Equal(t, uint64(1), s.Admission.Rejected()) } func TestJSONRPC_AdmissionPermitReleasedAfterRequest(t *testing.T) { // With limit=1 a sequential burst must all succeed: each request // releases its permit on return. s := newTestServiceWithInflight(t, "jsonrpc-adm-release", 1) - require.NotNil(t, s.admission) + require.NotNil(t, s.Admission) for range 5 { req := httptest.NewRequest(http.MethodPost, "/rpc", http.NoBody) req.Header.Set("Content-Type", "application/json") rr := httptest.NewRecorder() - s.server.Handler.ServeHTTP(rr, req) + s.Server.Handler.ServeHTTP(rr, req) // handleRPC returns 400 on empty body; the key assertion is // that we never see 503 because the permit is released each // time the handler returns. 
require.NotEqual(t, http.StatusServiceUnavailable, rr.Code) } - require.Equal(t, uint64(0), s.admission.Rejected()) + require.Equal(t, uint64(0), s.Admission.Rejected()) } // rebuildHandlerWithAdmission rewraps the service's mux with only @@ -163,7 +162,7 @@ func rebuildHandlerWithAdmission(s *Service) http.Handler { mux.HandleFunc("/rpc", s.handleRPC) var handler http.Handler = mux - handler = service.AdmissionMiddleware(s.admission)(handler) + handler = service.AdmissionMiddleware(s.Admission)(handler) return handler } @@ -178,7 +177,7 @@ func TestJSONRPC_CORSDisabledByDefault(t *testing.T) { req.Header.Set("Content-Type", "application/json") req.Header.Set("Origin", "http://evil.com") rr := httptest.NewRecorder() - s.server.Handler.ServeHTTP(rr, req) + s.Server.Handler.ServeHTTP(rr, req) require.Empty(t, rr.Header().Get("Access-Control-Allow-Origin")) require.Empty(t, rr.Header().Get("Vary")) @@ -191,7 +190,7 @@ func TestJSONRPC_CORSAllowedOriginEchoed(t *testing.T) { req.Header.Set("Content-Type", "application/json") req.Header.Set("Origin", "http://trusted.example.com") rr := httptest.NewRecorder() - s.server.Handler.ServeHTTP(rr, req) + s.Server.Handler.ServeHTTP(rr, req) require.Equal(t, "http://trusted.example.com", rr.Header().Get("Access-Control-Allow-Origin")) require.Contains(t, rr.Header().Values("Vary"), "Origin") @@ -204,7 +203,7 @@ func TestJSONRPC_CORSDisallowedOriginNoGrant(t *testing.T) { req.Header.Set("Content-Type", "application/json") req.Header.Set("Origin", "http://evil.com") rr := httptest.NewRecorder() - s.server.Handler.ServeHTTP(rr, req) + s.Server.Handler.ServeHTTP(rr, req) require.Empty(t, rr.Header().Get("Access-Control-Allow-Origin")) require.Contains(t, rr.Header().Values("Vary"), "Origin") @@ -216,7 +215,7 @@ func TestJSONRPC_CORSNoOriginPassthrough(t *testing.T) { req := httptest.NewRequest(http.MethodPost, "/rpc", http.NoBody) req.Header.Set("Content-Type", "application/json") rr := httptest.NewRecorder() - 
s.server.Handler.ServeHTTP(rr, req) + s.Server.Handler.ServeHTTP(rr, req) require.Empty(t, rr.Header().Get("Access-Control-Allow-Origin")) require.NotEqual(t, http.StatusServiceUnavailable, rr.Code) diff --git a/internal/jsonrpc/util_test.go b/internal/jsonrpc/util_test.go index 9e55f88b3..67e4bb1e9 100644 --- a/internal/jsonrpc/util_test.go +++ b/internal/jsonrpc/util_test.go @@ -101,7 +101,7 @@ func newTestServiceFull(t *testing.T, name string, maxInflight uint64, corsOrigi require.NoError(t, err) ci := CreateInfo{ - ServiceConfigs: service.ServiceConfigs{ + BaseConfigs: service.BaseConfigs{ Name: name, LogLevel: logLevel, LogColor: true, diff --git a/internal/node/node.go b/internal/node/node.go index 98244f046..bf85ca156 100644 --- a/internal/node/node.go +++ b/internal/node/node.go @@ -7,6 +7,7 @@ import ( "context" "errors" "fmt" + "slices" "github.com/cartesi/rollups-node/pkg/service" @@ -25,13 +26,13 @@ import ( // serviceResult carries either a successfully created service or an error // back from the goroutines in createServices. type serviceResult struct { - service service.ServiceImpl - err error + services []service.SupervisedService + err error } type CreateInfo struct { service.SupervisorConfigs - service.ServiceConfigs + service.BaseConfigs Config config.NodeConfig @@ -48,7 +49,7 @@ func Create(ctx context.Context, cfg *CreateInfo) (service.IService, error) { return nil, err // This returns context.Canceled or context.DeadlineExceeded. 
} - supervisor := &service.ServicesSupervisor{} + supervisor := &service.Supervisor{} err = createServices(ctx, cfg, supervisor) if err != nil { @@ -61,12 +62,12 @@ func Create(ctx context.Context, cfg *CreateInfo) (service.IService, error) { return supervisor, nil } -type serviceCreator func(context.Context, *CreateInfo) (service.ServiceImpl, error) +type serviceCreator func(context.Context, *CreateInfo) ([]service.SupervisedService, error) func createServices( ctx context.Context, cfg *CreateInfo, - supervisor *service.ServicesSupervisor, + supervisor *service.Supervisor, ) error { creators := []serviceCreator{ newNode, @@ -83,17 +84,17 @@ func createServices( ch := make(chan serviceResult, len(creators)) for _, create := range creators { go func() { - svc, err := create(ctx, cfg) - ch <- serviceResult{service: svc, err: err} + services, err := create(ctx, cfg) + ch <- serviceResult{services: services, err: err} }() } - services := make([]service.ServiceImpl, 0) + services := make([]service.SupervisedService, 0) errs := make([]error, 0) for range len(creators) { result := <-ch - if result.service != nil { - services = append(services, result.service) + if len(result.services) > 0 { + services = slices.Concat(services, result.services) } else { errs = append(errs, result.err) } @@ -115,7 +116,7 @@ func createServices( } type Service struct { - service.ServiceTemplate + service.BaseTemplate Children []service.IService Repository repository.Repository @@ -127,27 +128,27 @@ func (me *Service) Serve() error { return nil } -func newNode(ctx context.Context, c *CreateInfo) (service.ServiceImpl, error) { +func newNode(ctx context.Context, c *CreateInfo) ([]service.SupervisedService, error) { s := &Service{} - err := service.InitServiceTemplate(&c.ServiceConfigs, &s.ServiceTemplate) + err := service.InitServiceTemplate(&c.BaseConfigs, &s.BaseTemplate) if err != nil { return nil, err } - return s, nil + return []service.SupervisedService{s}, nil } // services creation 
-func newEVMReader(ctx context.Context, c *CreateInfo) (service.ServiceImpl, error) { +func newEVMReader(ctx context.Context, c *CreateInfo) ([]service.SupervisedService, error) { readerArgs := evmreader.CreateInfo{ TickServiceConfigs: service.TickServiceConfigs{ - PollInterval: c.Config.EvmReaderPollingInterval, - ServiceConfigs: service.ServiceConfigs{ - Name: config.ServiceEvmReader, - LogLevel: config.ResolveServiceLogLevel(config.ServiceEvmReader, c.Config.LogLevel), - LogColor: c.Config.LogColor, + PollInterval: c.Config.EvmReaderPollingInterval, + BaseConfigs: service.BaseConfigs{ + Name: config.ServiceEvmReader, + LogLevel: config.ResolveServiceLogLevel(config.ServiceEvmReader, c.Config.LogLevel), + LogColor: c.Config.LogColor, }, }, EthClient: c.ReaderClient, @@ -159,38 +160,38 @@ func newEVMReader(ctx context.Context, c *CreateInfo) (service.ServiceImpl, erro if err != nil { return nil, fmt.Errorf("create evm-reader: %w", err) } - return readerService, nil + return []service.SupervisedService{readerService}, nil } -func newAdvancer(ctx context.Context, c *CreateInfo) (service.ServiceImpl, error) { +func newAdvancer(ctx context.Context, c *CreateInfo) ([]service.SupervisedService, error) { advancerArgs := advancer.CreateInfo{ TickServiceConfigs: service.TickServiceConfigs{ - PollInterval: c.Config.AdvancerPollingInterval, - ServiceConfigs: service.ServiceConfigs{ - Name: config.ServiceAdvancer, - LogLevel: config.ResolveServiceLogLevel(config.ServiceAdvancer, c.Config.LogLevel), - LogColor: c.Config.LogColor, + PollInterval: c.Config.AdvancerPollingInterval, + BaseConfigs: service.BaseConfigs{ + Name: config.ServiceAdvancer, + LogLevel: config.ResolveServiceLogLevel(config.ServiceAdvancer, c.Config.LogLevel), + LogColor: c.Config.LogColor, }, }, Repository: c.Repository, Config: *c.Config.ToAdvancerConfig(), } - advancerService, err := advancer.Create(ctx, &advancerArgs) + advancerServices, err := advancer.Create(ctx, &advancerArgs) if err != nil { return 
nil, fmt.Errorf("create advancer: %w", err) } - return advancerService, nil + return advancerServices, nil } -func newValidator(ctx context.Context, c *CreateInfo) (service.ServiceImpl, error) { +func newValidator(ctx context.Context, c *CreateInfo) ([]service.SupervisedService, error) { validatorArgs := validator.CreateInfo{ TickServiceConfigs: service.TickServiceConfigs{ - PollInterval: c.Config.ValidatorPollingInterval, - ServiceConfigs: service.ServiceConfigs{ - Name: config.ServiceValidator, - LogLevel: config.ResolveServiceLogLevel(config.ServiceValidator, c.Config.LogLevel), - LogColor: c.Config.LogColor, + PollInterval: c.Config.ValidatorPollingInterval, + BaseConfigs: service.BaseConfigs{ + Name: config.ServiceValidator, + LogLevel: config.ResolveServiceLogLevel(config.ServiceValidator, c.Config.LogLevel), + LogColor: c.Config.LogColor, }, }, Repository: c.Repository, @@ -201,17 +202,17 @@ func newValidator(ctx context.Context, c *CreateInfo) (service.ServiceImpl, erro if err != nil { return nil, fmt.Errorf("create validator: %w", err) } - return validatorService, nil + return []service.SupervisedService{validatorService}, nil } -func newClaimer(ctx context.Context, c *CreateInfo) (service.ServiceImpl, error) { +func newClaimer(ctx context.Context, c *CreateInfo) ([]service.SupervisedService, error) { claimerArgs := claimer.CreateInfo{ TickServiceConfigs: service.TickServiceConfigs{ - PollInterval: c.Config.ClaimerPollingInterval, - ServiceConfigs: service.ServiceConfigs{ - Name: config.ServiceClaimer, - LogLevel: config.ResolveServiceLogLevel(config.ServiceClaimer, c.Config.LogLevel), - LogColor: c.Config.LogColor, + PollInterval: c.Config.ClaimerPollingInterval, + BaseConfigs: service.BaseConfigs{ + Name: config.ServiceClaimer, + LogLevel: config.ResolveServiceLogLevel(config.ServiceClaimer, c.Config.LogLevel), + LogColor: c.Config.LogColor, }, }, EthConn: c.ClaimerClient, @@ -223,15 +224,15 @@ func newClaimer(ctx context.Context, c *CreateInfo) 
(service.ServiceImpl, error) if err != nil { return nil, fmt.Errorf("create claimer: %w", err) } - return claimerService, nil + return []service.SupervisedService{claimerService}, nil } -func newJsonrpc(ctx context.Context, c *CreateInfo) (service.ServiceImpl, error) { +func newJsonrpc(ctx context.Context, c *CreateInfo) ([]service.SupervisedService, error) { jsonrpcArgs := jsonrpc.CreateInfo{ - ServiceConfigs: service.ServiceConfigs{ - Name: config.ServiceJsonrpc, - LogLevel: config.ResolveServiceLogLevel(config.ServiceJsonrpc, c.Config.LogLevel), - LogColor: c.Config.LogColor, + BaseConfigs: service.BaseConfigs{ + Name: config.ServiceJsonrpc, + LogLevel: config.ResolveServiceLogLevel(config.ServiceJsonrpc, c.Config.LogLevel), + LogColor: c.Config.LogColor, }, Repository: c.Repository, Config: *c.Config.ToJsonrpcConfig(), @@ -241,17 +242,17 @@ func newJsonrpc(ctx context.Context, c *CreateInfo) (service.ServiceImpl, error) if err != nil { return nil, fmt.Errorf("create jsonrpc: %w", err) } - return jsonrpcService, nil + return []service.SupervisedService{jsonrpcService}, nil } -func newPrt(ctx context.Context, c *CreateInfo) (service.ServiceImpl, error) { +func newPrt(ctx context.Context, c *CreateInfo) ([]service.SupervisedService, error) { prtArgs := prt.CreateInfo{ TickServiceConfigs: service.TickServiceConfigs{ - PollInterval: c.Config.PrtPollingInterval, - ServiceConfigs: service.ServiceConfigs{ - Name: config.ServicePrt, - LogLevel: config.ResolveServiceLogLevel(config.ServicePrt, c.Config.LogLevel), - LogColor: c.Config.LogColor, + PollInterval: c.Config.PrtPollingInterval, + BaseConfigs: service.BaseConfigs{ + Name: config.ServicePrt, + LogLevel: config.ResolveServiceLogLevel(config.ServicePrt, c.Config.LogLevel), + LogColor: c.Config.LogColor, }, }, EthClient: c.PrtClient, @@ -263,5 +264,5 @@ func newPrt(ctx context.Context, c *CreateInfo) (service.ServiceImpl, error) { if err != nil { return nil, fmt.Errorf("create prt: %w", err) } - return prtService, 
nil + return []service.SupervisedService{prtService}, nil } diff --git a/internal/prt/handle_foreclosed_test.go b/internal/prt/handle_foreclosed_test.go index 8948e171a..0bdeab8a8 100644 --- a/internal/prt/handle_foreclosed_test.go +++ b/internal/prt/handle_foreclosed_test.go @@ -114,7 +114,7 @@ func newPRTServiceMock() (*Service, *prtRepositoryMock) { repo := &prtRepositoryMock{} s := &Service{ TickServiceTemplate: service.TickServiceTemplate{ - ServiceTemplate: service.ServiceTemplate{ + BaseTemplate: service.BaseTemplate{ Logger: slog.New(slog.NewTextHandler(os.Stdout, &slog.HandlerOptions{Level: slog.LevelDebug})), }, }, diff --git a/internal/prt/service.go b/internal/prt/service.go index d248be2f3..2562a8ce2 100644 --- a/internal/prt/service.go +++ b/internal/prt/service.go @@ -21,7 +21,6 @@ import ( ) type CreateInfo struct { - service.SupervisorConfigs service.TickServiceConfigs Config config.PrtConfig Repository repository.Repository @@ -50,7 +49,7 @@ type PersistentConfig struct { ChainID uint64 } -func Create(ctx context.Context, c *CreateInfo) (service.ServiceImpl, error) { +func Create(ctx context.Context, c *CreateInfo) (service.SupervisedService, error) { var err error if err = ctx.Err(); err != nil { return nil, err // This returns context.Canceled or context.DeadlineExceeded. 
diff --git a/internal/validator/validator.go b/internal/validator/validator.go index 8997f15cb..a080cd832 100644 --- a/internal/validator/validator.go +++ b/internal/validator/validator.go @@ -33,7 +33,6 @@ type Service struct { } type CreateInfo struct { - service.SupervisorConfigs service.TickServiceConfigs Config config.ValidatorConfig @@ -41,7 +40,7 @@ type CreateInfo struct { Repository repository.Repository } -func Create(ctx context.Context, c *CreateInfo) (service.ServiceImpl, error) { +func Create(ctx context.Context, c *CreateInfo) (service.SupervisedService, error) { var err error if err = ctx.Err(); err != nil { return nil, err // This returns context.Canceled or context.DeadlineExceeded. diff --git a/internal/validator/validator_test.go b/internal/validator/validator_test.go index 32d0615f1..80f47e559 100644 --- a/internal/validator/validator_test.go +++ b/internal/validator/validator_test.go @@ -51,8 +51,8 @@ func (s *ValidatorSuite) SetupSubTest() { pristinePostContext: postContext, pristineRootHash: postContext[merkle.TREE_DEPTH], } - serviceArgs := &service.ServiceConfigs{Name: "validator"} - err := service.InitServiceTemplate(serviceArgs, &validator.ServiceTemplate) + serviceArgs := &service.BaseConfigs{Name: "validator"} + err := service.InitServiceTemplate(serviceArgs, &validator.BaseTemplate) s.Require().Nil(err) dummyOutputsMerkleRoot := common.HexToHash("0x0a162946e56158bac0673e6dd3bdfdc1e4a0e7744a120fdb640050c8d7abe1c6") dummyEpochs = []Epoch{ diff --git a/pkg/service/http_service.go b/pkg/service/http_service.go new file mode 100644 index 000000000..0e976486a --- /dev/null +++ b/pkg/service/http_service.go @@ -0,0 +1,113 @@ +// (c) Cartesi and individual authors (see AUTHORS) +// SPDX-License-Identifier: Apache-2.0 (see LICENSE) + +package service + +import ( + "context" + "errors" + "net" + "net/http" + "time" +) + +type HTTPServiceTemplate struct { + BaseTemplate + // TODO: this should not be exported (but is used by tests in other 
packages) + Server *http.Server + // Listen opens the HTTP listener. It defaults to net.Listen and is + // overridden in tests so Serve() can be exercised without real sockets. + Listen func(network, address string) (net.Listener, error) + // Admission is an optional HTTP-level concurrency gate. A nil value + // disables Admission control; the middleware chain treats nil as a + // pass-through so wiring is uniform regardless of configuration. + Admission *SemaphoreAdmission + // Maximum duration for the service to wait for in-flight requests to + // complete and shutdown. + shutdownTimeout time.Duration +} + +type HTTPServiceConfigs struct { + BaseConfigs + HTTPServerOptions + // HTTP address for JSON-RPC's telemetry service. + Address string + // Enforces request IDs are in the charset that cover the ID formats emitted + // by common reverse proxies and tracing systems while remaining safe to log + // and echo in response headers. + SafeRequestID bool + // Comma-separated list of allowed browser origins for the HTTP service. + // If empty, CORS is disabled. Origins are lowercased and validated at + // startup. Example: "http://localhost:3000,https://app.example.com". + CorsAllowedOrigins string + // Maximum number of concurrent in-flight JSON-RPC requests. + // Requests beyond this limit receive HTTP 503 Service Unavailable + // with Retry-After: 1. Set to 0 to disable HTTP-level admission + // control. + MaxInflight uint64 + // Maximum duration for the service to wait for in-flight requests to + // complete and shutdown. 
+ ShutdownTimeout time.Duration +} + +func InitHTTPServiceTemplate( + cfg *HTTPServiceConfigs, + tmpl *HTTPServiceTemplate, + handler http.Handler, +) error { + if cfg == nil || tmpl == nil { + return ErrInvalid + } + + err := InitServiceTemplate(&cfg.BaseConfigs, &tmpl.BaseTemplate) + if err != nil { + return err + } + + if cfg.MaxInflight > 0 { + tmpl.Admission = NewSemaphoreAdmission(cfg.MaxInflight) + handler = AdmissionMiddleware(tmpl.Admission)(handler) + } + if cfg.CorsAllowedOrigins != "" { + corsCfg := ParseCORSConfig( + tmpl.Logger, + cfg.CorsAllowedOrigins, + []string{"POST", "OPTIONS"}, + []string{"Content-Type"}, + ) + handler = CORSMiddleware(corsCfg)(handler) + } + if cfg.SafeRequestID { + handler = RequestIDMiddleware(handler) + } + + handler = RecoverMiddleware(tmpl.Logger)(handler) + + tmpl.shutdownTimeout = cfg.ShutdownTimeout + tmpl.Listen = net.Listen + tmpl.Server = NewHTTPServer(cfg.Address, handler, cfg.HTTPServerOptions, tmpl.Logger) + StartupBindWarning(tmpl.Logger, cfg.Name, cfg.Address) + + return nil +} + +func (s *HTTPServiceTemplate) Stop(bool) error { + s.Logger.Info("Shutting down HTTP service", "addr", s.Server.Addr) + ctx, cancel := context.WithTimeout(context.Background(), s.shutdownTimeout) + defer cancel() + return s.Server.Shutdown(ctx) +} + +// Serve opens the HTTP listener and runs the server. Returns nil on +// graceful shutdown. +func (s *HTTPServiceTemplate) Serve() error { + listener, err := s.Listen("tcp", s.Server.Addr) + if err != nil { + return err + } + s.Logger.Info("Starting HTTP service", "addr", listener.Addr().String()) + if err := s.Server.Serve(listener); !errors.Is(err, http.ErrServerClosed) { + return err + } + return nil +} diff --git a/pkg/service/service.go b/pkg/service/service.go index b6b5144ec..7d17c4127 100644 --- a/pkg/service/service.go +++ b/pkg/service/service.go @@ -6,8 +6,8 @@ // The runtime information is then stored in the Service. 
// // The recommended way to implement a new service is to: -// - embed a [ServiceConfigs] struct into a new CreateInfo struct. -// - embed a [ServiceTemplate] struct into a new Service struct. +// - embed a [BaseConfigs] struct into a new CreateInfo struct. +// - embed a [BaseTemplate] struct into a new Service struct. // - embed a [Create] call into a new Create function. // // To use a service, call its corresponding Create function with a matching CreateInfo and Service, @@ -62,8 +62,8 @@ import ( ) var ( - ErrInvalid = fmt.Errorf("Invalid Argument") // invalid argument - ErrServiceStopped = fmt.Errorf("Service was stopped") + ErrInvalid = fmt.Errorf("invalid argument") // invalid argument + ErrServiceStopped = fmt.Errorf("service was stopped") ) // Public interface with methods to manipulate the service. @@ -76,33 +76,31 @@ type IService interface { String() string } -// Public interface with methods to manipulate the service. -type ServiceImpl interface { +// Service interface with a service supervisor. +type SupervisedService interface { IService - SetSupervisor(*ServicesSupervisor) + SetSupervisor(*Supervisor) } -// Supervisor of multiple services under a single management. -type ServiceTemplate struct { - Name string - Supervisor *ServicesSupervisor - Logger *slog.Logger - LogLevel slog.Level - LogColor bool +// Basic template for single services under a single management. +type BaseTemplate struct { + Name string + Supervisor *Supervisor + Logger *slog.Logger } -// ServiceConfigs stores configuration for the InitServiceTemplate function -type ServiceConfigs struct { - Name string - Logger *slog.Logger - LogLevel slog.Level - LogColor bool +// BaseConfigs stores configuration for the InitServiceTemplate function +type BaseConfigs struct { + Name string + Logger *slog.Logger + LogLevel slog.Level + LogColor bool } // Initialize the 'ServiceTemplate' structure using values from 'ServiceConfigs'. 
// 'impl' must be a reference to the concrete service implementation that // embeds 'ServiceTemplate' -func InitServiceTemplate(c *ServiceConfigs, s *ServiceTemplate) error { +func InitServiceTemplate(c *BaseConfigs, s *BaseTemplate) error { s.Name = c.Name s.Logger = c.Logger @@ -118,13 +116,13 @@ func InitServiceTemplate(c *ServiceConfigs, s *ServiceTemplate) error { // Default implementation of some abstract methods (except `Serve`). // Remove them to force concrete services to provide implementation for them. -func (s *ServiceTemplate) Reload() error { return nil } -func (s *ServiceTemplate) Stop(bool) error { return nil } -func (s *ServiceTemplate) Alive() bool { return true } -func (s *ServiceTemplate) Ready() bool { return true } -func (s *ServiceTemplate) String() string { return s.Name } +func (s *BaseTemplate) Reload() error { return nil } +func (s *BaseTemplate) Stop(bool) error { return nil } +func (s *BaseTemplate) Alive() bool { return true } +func (s *BaseTemplate) Ready() bool { return true } +func (s *BaseTemplate) String() string { return s.Name } -func (s *ServiceTemplate) SetSupervisor(supervisor *ServicesSupervisor) { +func (s *BaseTemplate) SetSupervisor(supervisor *Supervisor) { s.Supervisor = supervisor } @@ -150,6 +148,6 @@ func NewLogger(level slog.Level, color bool) *slog.Logger { return slog.New(handler) } -func NewServiceLogger(c *ServiceConfigs) *slog.Logger { +func NewServiceLogger(c *BaseConfigs) *slog.Logger { return NewLogger(c.LogLevel, c.LogColor).With("service", c.Name) } diff --git a/pkg/service/service_test.go b/pkg/service/service_test.go index 7b7ae9d75..fd4e3beb6 100644 --- a/pkg/service/service_test.go +++ b/pkg/service/service_test.go @@ -10,7 +10,7 @@ import ( ) type delayedCloseImpl struct { - ServiceTemplate + BaseTemplate onServeInitChan chan struct{} onStopInitChan chan struct{} } @@ -33,13 +33,14 @@ func (s *ServeSuite) TestServeExitsAfterStopIsComplete() { } // Create the service with a live context, then cancel 
before Serve(). - err := InitServiceTemplate(&ServiceConfigs{ + err := InitServiceTemplate(&BaseConfigs{ Name: "stopOnChanClose", LogLevel: slog.LevelError, - }, &svc.ServiceTemplate) + }, &svc.BaseTemplate) + s.NoError(err) - supervisorcfg := &SupervisorConfigs{ Services: []ServiceImpl{ svc } } - supervisor := &ServicesSupervisor{} + supervisorcfg := &SupervisorConfigs{Services: []SupervisedService{svc}} + supervisor := &Supervisor{} err = InitServicesSupervisor(supervisorcfg, supervisor) s.NoError(err) @@ -99,18 +100,19 @@ func (s *ServeSuite) TestServeExitsAfterStopIsCompleteOnContextCancelation() { } // Create the service with a live context, then cancel before Serve(). - err := InitServiceTemplate(&ServiceConfigs{ + err := InitServiceTemplate(&BaseConfigs{ Name: "stopOnChanClose", LogLevel: slog.LevelError, - }, &svc.ServiceTemplate) + }, &svc.BaseTemplate) + s.NoError(err) ctx, cancel := context.WithCancel(context.Background()) supervisorcfg := &SupervisorConfigs{ - Services: []ServiceImpl{ svc }, - Context: ctx, - Cancel: cancel, + Services: []SupervisedService{svc}, + Context: ctx, + Cancel: cancel, } - supervisor := &ServicesSupervisor{} + supervisor := &Supervisor{} err = InitServicesSupervisor(supervisorcfg, supervisor) s.NoError(err) diff --git a/pkg/service/supervisor.go b/pkg/service/supervisor.go index a5eeac853..5ae060b8c 100644 --- a/pkg/service/supervisor.go +++ b/pkg/service/supervisor.go @@ -6,9 +6,7 @@ package service import ( "context" "errors" - "fmt" "log/slog" - "net/http" "os" "os/signal" "sync/atomic" @@ -17,11 +15,8 @@ import ( ) type SupervisorConfigs struct { - Name string - Services []ServiceImpl - Logger *slog.Logger - LogLevel slog.Level - LogColor bool + BaseConfigs + Services []SupervisedService Context context.Context Cancel context.CancelFunc EnableSignalHandling bool @@ -30,13 +25,10 @@ type SupervisorConfigs struct { } // Supervisor of multiple services under a single management. 
-type ServicesSupervisor struct { +type Supervisor struct { Name string Logger *slog.Logger - ServeMux *http.ServeMux - services []IService - telemetry *http.Server - telemetryFunc func() error + services []SupervisedService context context.Context cancelContext context.CancelFunc sigHangUp chan os.Signal // SIGHUP to reload @@ -48,7 +40,7 @@ type ServicesSupervisor struct { stoppedChan chan struct{} } -func InitServicesSupervisor(c *SupervisorConfigs, s *ServicesSupervisor) error { +func InitServicesSupervisor(c *SupervisorConfigs, s *Supervisor) error { s.stoppedChan = make(chan struct{}) s.Logger = c.Logger @@ -80,44 +72,38 @@ func InitServicesSupervisor(c *SupervisorConfigs, s *ServicesSupervisor) error { } } - // telemetry - if c.TelemetryCreate { - if s.ServeMux == nil { - s.ServeMux = http.NewServeMux() - } - if c.TelemetryAddress == "" { - c.TelemetryAddress = ":8080" - } - s.telemetry, s.telemetryFunc = s.CreateDefaultTelemetry(c.TelemetryAddress) - go func() { - if err := s.telemetryFunc(); err != nil { - s.Logger.Error("Telemetry HTTP server failed", "error", err) - } - }() - } - - s.services = make([]IService, len(c.Services)) + s.services = make([]SupervisedService, len(c.Services)) for i, svc := range c.Services { s.services[i] = svc svc.SetSupervisor(s) } - if s.telemetry != nil { - s.Logger.Info("Telemetry", "address", s.telemetry.Addr) + // telemetry + if c.TelemetryCreate { + if c.TelemetryAddress == "" { + c.TelemetryAddress = ":8080" + } + telemetry, err := CreateDefaultTelemetry(c.TelemetryAddress) + if err != nil { + return err + } + s.services = append(s.services, telemetry) + telemetry.SetSupervisor(s) + s.Logger.Info("Telemetry", "address", c.TelemetryAddress) } return nil } -func (s *ServicesSupervisor) Context() context.Context { +func (s *Supervisor) Context() context.Context { return s.context } -func (s *ServicesSupervisor) String() string { +func (s *Supervisor) String() string { return s.Name } -func (s *ServicesSupervisor) Alive() 
bool { +func (s *Supervisor) Alive() bool { for _, svc := range s.services { if !svc.Alive() { s.Logger.Info("Service still not alive", "service", svc.String()) @@ -127,7 +113,7 @@ func (s *ServicesSupervisor) Alive() bool { return true } -func (s *ServicesSupervisor) Ready() bool { +func (s *Supervisor) Ready() bool { for _, svc := range s.services { if !svc.Ready() { s.Logger.Info("Service still not ready", "service", svc.String()) @@ -137,7 +123,7 @@ func (s *ServicesSupervisor) Ready() bool { return true } -func (s *ServicesSupervisor) Reload() error { +func (s *Supervisor) Reload() error { if s.stopped.Load() { return ErrServiceStopped } @@ -164,7 +150,7 @@ func (s *ServicesSupervisor) Reload() error { return err } -func (s *ServicesSupervisor) Serve() error { +func (s *Supervisor) Serve() error { if s.stopped.Load() { return nil } @@ -193,7 +179,7 @@ func (s *ServicesSupervisor) Serve() error { svcCount := 0 for _, svc := range s.services { svcCount++ - go func(svc IService) { + go func(svc SupervisedService) { s.Logger.Info("Starting service", "service", svc.String()) err := svc.Serve() switch { @@ -229,7 +215,7 @@ func (s *ServicesSupervisor) Serve() error { return errors.Join(allErrs...) } -func (s *ServicesSupervisor) Stop(force bool) error { +func (s *Supervisor) Stop(force bool) error { // CAS achieves once-semantics: the second caller returns immediately // (fire-and-forget) rather than blocking like sync.Once. This is safe // because the orchestrator calls Cancel() after Stop() and waits for @@ -271,52 +257,3 @@ func (s *ServicesSupervisor) Stop(force bool) error { return err } - -/* - * Service Telemetry - */ - -func (s *ServicesSupervisor) CreateDefaultTelemetry(addr string) (*http.Server, func() error) { - s.ServeMux.Handle("/readyz", http.HandlerFunc(s.ReadyHandler)) - s.ServeMux.Handle("/livez", http.HandlerFunc(s.AliveHandler)) - - // Telemetry deliberately omits RequestIDMiddleware. 
/livez and /readyz are - // hit every few seconds per pod per service by orchestrators like - // Kubernetes; burning a crypto/rand UUID per probe is measurable overhead - // for 1-byte idempotent responses that have nothing to correlate against. - // RecoverMiddleware is kept so panics still become clean 500s. - // A static request ID is set so panic logs show "telemetry" instead of "". - handler := RecoverMiddleware(s.Logger)(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { - w.Header().Set(requestIDHeader, "telemetry") - s.ServeMux.ServeHTTP(w, r) - })) - server := NewHTTPServer(addr, handler, DefaultTelemetryOptions(), s.Logger) - StartupBindWarning(s.Logger, s.Name+"/telemetry", addr) - - return server, func() error { - if err := server.ListenAndServe(); !errors.Is(err, http.ErrServerClosed) { - return err - } - return nil - } -} - -// HTTP handler for `/s.Name/readyz` that exposes the value of Ready() -func (s *ServicesSupervisor) ReadyHandler(w http.ResponseWriter, r *http.Request) { - if !s.Ready() { - http.Error(w, s.Name+": ready check failed", - http.StatusInternalServerError) - } else { - fmt.Fprintf(w, "%s: ready\n", s.Name) - } -} - -// HTTP handler for `/s.Name/livez` that exposes the value of Alive() -func (s *ServicesSupervisor) AliveHandler(w http.ResponseWriter, r *http.Request) { - if !s.Alive() { - http.Error(w, s.Name+": alive check failed", - http.StatusInternalServerError) - } else { - fmt.Fprintf(w, "%s: alive\n", s.Name) - } -} diff --git a/pkg/service/supervisor_test.go b/pkg/service/supervisor_test.go index a5930c6d9..e9fd27aa2 100644 --- a/pkg/service/supervisor_test.go +++ b/pkg/service/supervisor_test.go @@ -15,7 +15,7 @@ import ( ) type blockingChildImpl struct { - ServiceTemplate + BaseTemplate started chan struct{} done chan struct{} once sync.Once @@ -30,7 +30,7 @@ func (c *blockingChildImpl) Serve() error { func createBlockingChild(t *testing.T, cfg *SupervisorConfigs, name string) *blockingChildImpl { t.Helper() - 
childCfg := ServiceConfigs{ + childCfg := BaseConfigs{ Name: name, LogLevel: cfg.LogLevel, } @@ -39,7 +39,7 @@ func createBlockingChild(t *testing.T, cfg *SupervisorConfigs, name string) *blo started: make(chan struct{}), done: make(chan struct{}), } - require.NoError(t, InitServiceTemplate(&childCfg, &child.ServiceTemplate)) + require.NoError(t, InitServiceTemplate(&childCfg, &child.BaseTemplate)) return child } @@ -53,15 +53,17 @@ func TestSupervisor(t *testing.T) { func (s *SupervisorSuite) TestNodeStopCancelsChildContexts() { parentCfg := &SupervisorConfigs{ - Name: "node", - LogLevel: slog.LevelError, + BaseConfigs: BaseConfigs{ + Name: "node", + LogLevel: slog.LevelError, + }, } child1 := createBlockingChild(s.T(), parentCfg, "child-1") child2 := createBlockingChild(s.T(), parentCfg, "child-2") - parentCfg.Services = []ServiceImpl{child1, child2} - supervisor := &ServicesSupervisor{} + parentCfg.Services = []SupervisedService{child1, child2} + supervisor := &Supervisor{} require.NoError(s.T(), InitServicesSupervisor(parentCfg, supervisor)) done := make(chan struct{}) @@ -102,19 +104,19 @@ func (s *SupervisorSuite) TestNodeStopCancelsChildContexts() { } type errorChildImpl struct { - ServiceTemplate + BaseTemplate started chan struct{} } func (c *errorChildImpl) Serve() error { close(c.started) time.Sleep(10 * time.Millisecond) - return fmt.Errorf("Oops %s!", c.Name) + return fmt.Errorf("oops by %s", c.Name) } func createErrorChild(t *testing.T, cfg *SupervisorConfigs, name string) *errorChildImpl { t.Helper() - childCfg := &ServiceConfigs{ + childCfg := &BaseConfigs{ Name: name, LogLevel: cfg.LogLevel, } @@ -122,15 +124,15 @@ func createErrorChild(t *testing.T, cfg *SupervisorConfigs, name string) *errorC child := &errorChildImpl{ started: make(chan struct{}), } - require.NoError(t, InitServiceTemplate(childCfg, &child.ServiceTemplate)) + require.NoError(t, InitServiceTemplate(childCfg, &child.BaseTemplate)) return child } type stopAwareChildImpl struct { - 
ServiceTemplate - started chan struct{} - stopped chan struct{} - done chan struct{} + BaseTemplate + started chan struct{} + stopped chan struct{} + done chan struct{} stopOnce sync.Once doneOnce sync.Once } @@ -149,7 +151,7 @@ func (c *stopAwareChildImpl) Stop(bool) error { func createStopAwareChild(t *testing.T, cfg *SupervisorConfigs, name string) *stopAwareChildImpl { t.Helper() - childCfg := &ServiceConfigs{ + childCfg := &BaseConfigs{ Name: name, LogLevel: cfg.LogLevel, } @@ -159,21 +161,23 @@ func createStopAwareChild(t *testing.T, cfg *SupervisorConfigs, name string) *st stopped: make(chan struct{}), done: make(chan struct{}), } - require.NoError(t, InitServiceTemplate(childCfg, &child.ServiceTemplate)) + require.NoError(t, InitServiceTemplate(childCfg, &child.BaseTemplate)) return child } func (s *SupervisorSuite) TestNodeReturnChildErrors() { parentCfg := &SupervisorConfigs{ - Name: "node", - LogLevel: slog.LevelError, + BaseConfigs: BaseConfigs{ + Name: "node", + LogLevel: slog.LevelError, + }, } child1 := createErrorChild(s.T(), parentCfg, "child-1") child2 := createErrorChild(s.T(), parentCfg, "child-2") - parentCfg.Services = []ServiceImpl{child1, child2} - supervisor := &ServicesSupervisor{} + parentCfg.Services = []SupervisedService{child1, child2} + supervisor := &Supervisor{} require.NoError(s.T(), InitServicesSupervisor(parentCfg, supervisor)) done := make(chan error) @@ -195,8 +199,8 @@ func (s *SupervisorSuite) TestNodeReturnChildErrors() { select { case err := <-done: - s.ErrorContains(err, "Oops child-1!") - s.ErrorContains(err, "Oops child-2!") + s.ErrorContains(err, "oops by child-1") + s.ErrorContains(err, "oops by child-2") case <-time.After(2 * time.Second): s.Fail("node did not exit after child errors") } @@ -204,15 +208,17 @@ func (s *SupervisorSuite) TestNodeReturnChildErrors() { func (s *SupervisorSuite) TestNodeStopsChildrenWhenOneChildErrors() { parentCfg := &SupervisorConfigs{ - Name: "node", - LogLevel: slog.LevelError, + 
BaseConfigs: BaseConfigs{ + Name: "node", + LogLevel: slog.LevelError, + }, } healthyChild := createStopAwareChild(s.T(), parentCfg, "healthy-child") errorChild := createErrorChild(s.T(), parentCfg, "error-child") - parentCfg.Services = []ServiceImpl{healthyChild, errorChild} - supervisor := &ServicesSupervisor{} + parentCfg.Services = []SupervisedService{healthyChild, errorChild} + supervisor := &Supervisor{} require.NoError(s.T(), InitServicesSupervisor(parentCfg, supervisor)) done := make(chan error) @@ -246,7 +252,7 @@ func (s *SupervisorSuite) TestNodeStopsChildrenWhenOneChildErrors() { select { case err := <-done: - s.ErrorContains(err, "Oops error-child!") + s.ErrorContains(err, "oops by error-child") case <-time.After(2 * time.Second): s.Fail("supervisor did not exit after child error") } diff --git a/pkg/service/telemetry.go b/pkg/service/telemetry.go new file mode 100644 index 000000000..354fd3ee7 --- /dev/null +++ b/pkg/service/telemetry.go @@ -0,0 +1,69 @@ +// (c) Cartesi and individual authors (see AUTHORS) +// SPDX-License-Identifier: Apache-2.0 (see LICENSE) + +package service + +import ( + "fmt" + "net/http" + "time" +) + +const shutdownTimeout = 5 * time.Second + +type telemetryService struct { + HTTPServiceTemplate + // TODO: this should not be exported (but is used by tests in other packages) + ServeMux *http.ServeMux +} + +type CreateInfo struct { + SupervisorConfigs + BaseConfigs +} + +func CreateDefaultTelemetry(addr string) (SupervisedService, error) { + s := &telemetryService{} + cfg := &HTTPServiceConfigs{ + BaseConfigs: BaseConfigs{Name: "telemetry"}, + HTTPServerOptions: DefaultTelemetryOptions(), + Address: addr, + ShutdownTimeout: shutdownTimeout, + } + + s.ServeMux = http.NewServeMux() + s.ServeMux.Handle("/readyz", http.HandlerFunc(s.ReadyHandler)) + s.ServeMux.Handle("/livez", http.HandlerFunc(s.AliveHandler)) + + handler := http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + w.Header().Set(requestIDHeader, "telemetry") 
+ s.ServeMux.ServeHTTP(w, r) + }) + + err := InitHTTPServiceTemplate(cfg, &s.HTTPServiceTemplate, handler) + if err != nil { + return nil, err + } + + return s, nil +} + +// HTTP handler for `/s.Name/readyz` that exposes the value of Ready() +func (s *telemetryService) ReadyHandler(w http.ResponseWriter, _ *http.Request) { + if !s.Supervisor.Ready() { + http.Error(w, s.Name+": ready check failed", + http.StatusInternalServerError) + } else { + fmt.Fprintf(w, "%s: ready\n", s.Name) + } +} + +// HTTP handler for `/s.Name/livez` that exposes the value of Alive() +func (s *telemetryService) AliveHandler(w http.ResponseWriter, _ *http.Request) { + if !s.Supervisor.Alive() { + http.Error(w, s.Name+": alive check failed", + http.StatusInternalServerError) + } else { + fmt.Fprintf(w, "%s: alive\n", s.Name) + } +} diff --git a/pkg/service/telemetry_test.go b/pkg/service/telemetry_test.go index b3754a1a5..b3f00d348 100644 --- a/pkg/service/telemetry_test.go +++ b/pkg/service/telemetry_test.go @@ -14,22 +14,16 @@ import ( // newTelemetryTestService returns a *Service ready to have CreateDefaultTelemetry // called on it. It wires a ServeMux, a mockImpl, and a discard logger. 
-func newTelemetryTestService(t *testing.T) *ServicesSupervisor { - cfg := &SupervisorConfigs{ - Name: "test", - Logger: discardLogger(), - TelemetryCreate: true, - TelemetryAddress: ":0", - } - supervisor := &ServicesSupervisor{} - err := InitServicesSupervisor(cfg, supervisor) +func newTelemetryTestService(t *testing.T) *telemetryService { + svc, err := CreateDefaultTelemetry("localhost:0") + svc.SetSupervisor(&Supervisor{}) require.NoError(t, err) - return supervisor + return svc.(*telemetryService) } func TestCreateDefaultTelemetry_Hardened(t *testing.T) { s := newTelemetryTestService(t) - srv := s.telemetry + srv := s.Server opts := DefaultTelemetryOptions() require.Equal(t, opts.ReadHeaderTimeout, srv.ReadHeaderTimeout) @@ -42,7 +36,7 @@ func TestCreateDefaultTelemetry_Hardened(t *testing.T) { func TestCreateDefaultTelemetry_HandlersWired(t *testing.T) { s := newTelemetryTestService(t) - srv := s.telemetry + srv := s.Server // /readyz: mockImpl.Ready() is true, so expect 200. rr := httptest.NewRecorder() @@ -64,7 +58,7 @@ func TestCreateDefaultTelemetry_HandlersWired(t *testing.T) { // greppable without the cost of crypto/rand per probe. 
func TestCreateDefaultTelemetry_StaticRequestID(t *testing.T) { s := newTelemetryTestService(t) - srv := s.telemetry + srv := s.Server for _, path := range []string{"/livez", "/readyz"} { rr := httptest.NewRecorder() @@ -85,12 +79,12 @@ func TestCreateDefaultTelemetry_StaticRequestID(t *testing.T) { func TestCreateDefaultTelemetry_PanicRecovered(t *testing.T) { s := newTelemetryTestService(t) + srv := s.Server + s.ServeMux.Handle("/boom", http.HandlerFunc(func(_ http.ResponseWriter, _ *http.Request) { panic("kaboom") })) - srv := s.telemetry - rr := httptest.NewRecorder() srv.Handler.ServeHTTP(rr, httptest.NewRequest(http.MethodGet, "/boom", nil)) @@ -100,7 +94,7 @@ func TestCreateDefaultTelemetry_PanicRecovered(t *testing.T) { require.NotContains(t, rr.Body.String(), "kaboom") } -type falseLifecycleImpl struct{ ServiceTemplate } +type falseLifecycleImpl struct{ BaseTemplate } func (*falseLifecycleImpl) Alive() bool { return false } func (*falseLifecycleImpl) Ready() bool { return false } @@ -108,23 +102,27 @@ func (*falseLifecycleImpl) Serve() error { return nil } func TestCreateDefaultTelemetry_Returns500WhenLifecycleFails(t *testing.T) { svc := &falseLifecycleImpl{} - err := InitServiceTemplate(&ServiceConfigs{}, &svc.ServiceTemplate) + err := InitServiceTemplate(&BaseConfigs{}, &svc.BaseTemplate) require.NoError(t, err) cfg := &SupervisorConfigs{ - Name: "test", - Logger: discardLogger(), + BaseConfigs: BaseConfigs{ + Name: "test", + Logger: discardLogger(), + }, TelemetryCreate: true, - TelemetryAddress: ":0", - Services: []ServiceImpl{ svc }, + TelemetryAddress: "localhost:0", + Services: []SupervisedService{svc}, } - supervisor := &ServicesSupervisor{} + supervisor := &Supervisor{} err = InitServicesSupervisor(cfg, supervisor) require.NoError(t, err) + srv := supervisor.services[len(supervisor.services)-1].(*telemetryService) + for _, path := range []string{"/readyz", "/livez"} { rr := httptest.NewRecorder() - supervisor.telemetry.Handler.ServeHTTP(rr, 
httptest.NewRequest(http.MethodGet, path, nil)) + srv.Server.Handler.ServeHTTP(rr, httptest.NewRequest(http.MethodGet, path, nil)) require.Equal(t, http.StatusInternalServerError, rr.Code, "path=%s", path) } } diff --git a/pkg/service/tick.go b/pkg/service/tick.go index b908e8348..9d560f928 100644 --- a/pkg/service/tick.go +++ b/pkg/service/tick.go @@ -17,13 +17,13 @@ type TickImpl interface { } type TickServiceTemplate struct { - ServiceTemplate + BaseTemplate tickImpl TickImpl ticker *time.Ticker } type TickServiceConfigs struct { - ServiceConfigs + BaseConfigs PollInterval time.Duration } @@ -36,7 +36,7 @@ func InitTickServiceTemplate( return ErrInvalid } - err := InitServiceTemplate(&cfg.ServiceConfigs, &tmpl.ServiceTemplate) + err := InitServiceTemplate(&cfg.BaseConfigs, &tmpl.BaseTemplate) if err != nil { return err } diff --git a/pkg/service/tick_test.go b/pkg/service/tick_test.go index 272e7fdca..12bf01920 100644 --- a/pkg/service/tick_test.go +++ b/pkg/service/tick_test.go @@ -21,7 +21,7 @@ type mockImpl struct { onTick func(n int32) bool // called on each Tick with the tick count (1-based) } -func (m *mockImpl) Tick(ctx context.Context) (bool, error) { +func (m *mockImpl) Tick(context.Context) (bool, error) { n := m.tickCount.Add(1) reschedule := false if m.onTick != nil { @@ -41,7 +41,7 @@ func createTestService( t.Helper() err := InitTickServiceTemplate(&TickServiceConfigs{ - ServiceConfigs: ServiceConfigs{ + BaseConfigs: BaseConfigs{ Name: "test", LogLevel: slog.LevelError, }, @@ -49,11 +49,11 @@ func createTestService( }, &impl.TickServiceTemplate, impl) require.NoError(t, err) - supervisor := &ServicesSupervisor{} + supervisor := &Supervisor{} err = InitServicesSupervisor( &SupervisorConfigs{ - Name: "supervisor", - Services: []ServiceImpl{ impl }, + BaseConfigs: BaseConfigs{Name: "supervisor"}, + Services: []SupervisedService{impl}, }, supervisor, ) @@ -99,8 +99,7 @@ func (s *ServeSuite) TestDisabledReschedulePreservesExistingBehavior() { func (s 
*ServeSuite) TestRescheduleTriggersImmediateRetick() { // When SignalReschedule() is called from Tick(), Serve() should call // Tick() again immediately without waiting for the timer. - var impl *mockImpl - impl = &mockImpl{ + impl := &mockImpl{ onTick: func(n int32) bool { // Signal reschedule on ticks 1 and 2 (the initial tick // and the first rescheduled tick). Stop on tick 3. @@ -131,8 +130,7 @@ func (s *ServeSuite) TestRescheduleTriggersImmediateRetick() { func (s *ServeSuite) TestContextCancellationExitsPromptly() { // When context is cancelled with a reschedule signal pending, // Serve() should exit promptly. - var impl *mockImpl - impl = &mockImpl{ + impl := &mockImpl{ onTick: func(_ int32) bool { return true }, diff --git a/test/validator/validator_test.go b/test/validator/validator_test.go index 5e802df7e..95beb5279 100644 --- a/test/validator/validator_test.go +++ b/test/validator/validator_test.go @@ -63,7 +63,7 @@ func (s *ValidatorRepositoryIntegrationSuite) SetupSubTest() { serviceArgs := validator.CreateInfo{ TickServiceConfigs: service.TickServiceConfigs{ - ServiceConfigs: service.ServiceConfigs{ + BaseConfigs: service.BaseConfigs{ Name: "validator", LogLevel: slog.LevelDebug, }, From 6752beab9e3284c3aa0fa4e8d71c23df08673669 Mon Sep 17 00:00:00 2001 From: Renato Maia <1887792+renatomaia@users.noreply.github.com> Date: Thu, 25 Jun 2026 16:01:04 -0300 Subject: [PATCH 15/16] refactor(services): discard the 'force' flag to operation that stop services --- internal/advancer/advancer.go | 2 +- internal/advancer/advancer_test.go | 3 +- internal/advancer/service.go | 4 +-- internal/evmreader/service_config_test.go | 2 +- internal/node/node.go | 2 +- pkg/service/http_service.go | 2 +- pkg/service/service.go | 16 +++++----- pkg/service/service_test.go | 10 +++--- pkg/service/supervisor.go | 38 ++++++++++------------- pkg/service/supervisor_test.go | 4 +-- pkg/service/tick.go | 2 +- pkg/service/tick_test.go | 10 +++--- 12 files changed, 43 insertions(+), 52 
deletions(-) diff --git a/internal/advancer/advancer.go b/internal/advancer/advancer.go index 1593cc81d..a4049b6b9 100644 --- a/internal/advancer/advancer.go +++ b/internal/advancer/advancer.go @@ -311,7 +311,7 @@ func (s *Service) processInputs(ctx context.Context, app *Application, inputs [] "epoch", input.EpochIndex, "index", input.Index, "error", err) - s.Supervisor.Stop(true) // triggers graceful shutdown of all services + s.Supervisor.Stop() // shutdown all services return err } diff --git a/internal/advancer/advancer_test.go b/internal/advancer/advancer_test.go index ab5c90922..593815cbb 100644 --- a/internal/advancer/advancer_test.go +++ b/internal/advancer/advancer_test.go @@ -118,8 +118,7 @@ func (s *AdvancerSuite) TestServiceInterface() { require.Contains(tickErr.Error(), "list epochs error") // Stop must be called last to cleanly shut down the service. - // It should complete without returning any errors. - require.NoError(advancer.Supervisor.Stop(false)) + advancer.Supervisor.Stop() }) } diff --git a/internal/advancer/service.go b/internal/advancer/service.go index 693607464..fc245ed99 100644 --- a/internal/advancer/service.go +++ b/internal/advancer/service.go @@ -138,7 +138,7 @@ func (s *Service) Tick(ctx context.Context) (bool, error) { return hadWork, err } -func (s *Service) Stop(b bool) error { +func (s *Service) Stop() error { var errs []error if s.machineManager != nil { s.Logger.Info("Closing machine manager") @@ -146,7 +146,7 @@ func (s *Service) Stop(b bool) error { errs = append(errs, fmt.Errorf("failed to close machine manager: %w", err)) } } - if err := s.TickServiceTemplate.Stop(b); err != nil { + if err := s.TickServiceTemplate.Stop(); err != nil { errs = append(errs, err) } return errors.Join(errs...) 
diff --git a/internal/evmreader/service_config_test.go b/internal/evmreader/service_config_test.go index 3d17f8e30..2cd175edd 100644 --- a/internal/evmreader/service_config_test.go +++ b/internal/evmreader/service_config_test.go @@ -97,7 +97,7 @@ func TestCreateAcceptsRequestTimeoutBelowPollingInterval(t *testing.T) { Repository: repo, }) require.NoError(t, err) - defer svc.Stop(false) + defer svc.Stop() repo.AssertExpectations(t) } diff --git a/internal/node/node.go b/internal/node/node.go index bf85ca156..7b1234083 100644 --- a/internal/node/node.go +++ b/internal/node/node.go @@ -102,7 +102,7 @@ func createServices( if len(errs) > 0 { for _, svc := range services { - stopErr := svc.Stop(true) + stopErr := svc.Stop() if stopErr != nil { errs = append(errs, stopErr) } diff --git a/pkg/service/http_service.go b/pkg/service/http_service.go index 0e976486a..e43627db8 100644 --- a/pkg/service/http_service.go +++ b/pkg/service/http_service.go @@ -91,7 +91,7 @@ func InitHTTPServiceTemplate( return nil } -func (s *HTTPServiceTemplate) Stop(bool) error { +func (s *HTTPServiceTemplate) Stop() error { s.Logger.Info("Shutting down HTTP service", "addr", s.Server.Addr) ctx, cancel := context.WithTimeout(context.Background(), s.shutdownTimeout) defer cancel() diff --git a/pkg/service/service.go b/pkg/service/service.go index 7d17c4127..a181da010 100644 --- a/pkg/service/service.go +++ b/pkg/service/service.go @@ -66,19 +66,19 @@ var ( ErrServiceStopped = fmt.Errorf("service was stopped") ) -// Public interface with methods to manipulate the service. -type IService interface { +// Basic methods to manipulate a service. +type BaseService interface { Alive() bool Ready() bool Reload() error - Stop(bool) error Serve() error String() string } // Service interface with a service supervisor. 
type SupervisedService interface { - IService + BaseService + Stop() error SetSupervisor(*Supervisor) } @@ -116,10 +116,10 @@ func InitServiceTemplate(c *BaseConfigs, s *BaseTemplate) error { // Default implementation of some abstract methods (except `Serve`). // Remove them to force concrete services to provide implementation for them. -func (s *BaseTemplate) Reload() error { return nil } -func (s *BaseTemplate) Stop(bool) error { return nil } -func (s *BaseTemplate) Alive() bool { return true } -func (s *BaseTemplate) Ready() bool { return true } +func (s *BaseTemplate) Reload() error { return nil } +func (s *BaseTemplate) Stop() error { return nil } +func (s *BaseTemplate) Alive() bool { return true } +func (s *BaseTemplate) Ready() bool { return true } func (s *BaseTemplate) String() string { return s.Name } func (s *BaseTemplate) SetSupervisor(supervisor *Supervisor) { diff --git a/pkg/service/service_test.go b/pkg/service/service_test.go index fd4e3beb6..24d59bd9b 100644 --- a/pkg/service/service_test.go +++ b/pkg/service/service_test.go @@ -15,7 +15,7 @@ type delayedCloseImpl struct { onStopInitChan chan struct{} } -func (s *delayedCloseImpl) Stop(bool) error { +func (s *delayedCloseImpl) Stop() error { <-s.onStopInitChan // wait signal to initiate stop return nil } @@ -51,13 +51,12 @@ func (s *ServeSuite) TestServeExitsAfterStopIsComplete() { close(onServeEndChan) }() - onStopEndChan := make(chan error) + onStopEndChan := make(chan struct{}) select { case <-svc.onServeInitChan: // wait service to initiate, so can be stopped. // initiate service shutdown through context cancelation go func() { - err := supervisor.Stop(true) - onStopEndChan <- err // signal stop ended and provide the errors + supervisor.Stop() close(onStopEndChan) }() case <-time.After(2 * time.Second): @@ -86,8 +85,7 @@ func (s *ServeSuite) TestServeExitsAfterStopIsComplete() { // Stop() should exit without errors. 
select { - case err := <-onStopEndChan: - s.NoError(err) + case <-onStopEndChan: case <-time.After(2 * time.Second): s.Fail("Stop() did not exit within 2 seconds after 'OnStop' concluded") } diff --git a/pkg/service/supervisor.go b/pkg/service/supervisor.go index 5ae060b8c..ba68f2956 100644 --- a/pkg/service/supervisor.go +++ b/pkg/service/supervisor.go @@ -14,6 +14,11 @@ import ( "time" ) +type IService interface { + BaseService + Stop() +} + type SupervisorConfigs struct { BaseConfigs Services []SupervisedService @@ -164,13 +169,12 @@ func (s *Supervisor) Serve() error { if err != nil { s.Logger.Error("Service failed to restart", "error", err) } + continue case <-s.sigShutdown: - s.Stop(false) // Graceful shutdown; errors are logged by Stop. - return case <-s.context.Done(): - s.Stop(true) // Stop logs errors internally. - return } + s.Stop() + return } }() @@ -188,7 +192,7 @@ func (s *Supervisor) Serve() error { "service", svc.String(), "error", err, ) - s.Stop(false) + s.Stop() case s.stopped.Load(): s.Logger.Info("Service stopped", "service", svc.String(), @@ -209,19 +213,19 @@ func (s *Supervisor) Serve() error { } } - go s.Stop(true) + go s.Stop() <-s.stoppedChan return errors.Join(allErrs...) } -func (s *Supervisor) Stop(force bool) error { +func (s *Supervisor) Stop() { // CAS achieves once-semantics: the second caller returns immediately // (fire-and-forget) rather than blocking like sync.Once. This is safe // because the orchestrator calls Cancel() after Stop() and waits for // the Serve goroutine to exit. 
if !s.stopped.CompareAndSwap(false, true) { - return nil // already stopped + return // already stopped } if s.sigShutdown != nil { @@ -231,29 +235,19 @@ func (s *Supervisor) Stop(force bool) error { signal.Stop(s.sigHangUp) } - var err error for i := len(s.services)-1; i >= 0; i-- { svc := s.services[i] start := time.Now() - svcErr := svc.Stop(force) + err := svc.Stop() elapsed := time.Since(start) - if svcErr != nil { - s.Logger.Error("Stop", - "force", force, - "duration", elapsed, - "error", svcErr) + if err != nil { + s.Logger.Error("Stop", "duration", elapsed, "error", err) } else { - s.Logger.Info("Stop", - "force", force, - "duration", elapsed) + s.Logger.Info("Stop", "duration", elapsed) } - - err = errors.Join(err, svcErr) } s.cancelContext() close(s.stoppedChan) - - return err } diff --git a/pkg/service/supervisor_test.go b/pkg/service/supervisor_test.go index e9fd27aa2..3afc30b9d 100644 --- a/pkg/service/supervisor_test.go +++ b/pkg/service/supervisor_test.go @@ -83,7 +83,7 @@ func (s *SupervisorSuite) TestNodeStopCancelsChildContexts() { s.Fail("child-2 did not start") } - supervisor.Stop(false) + supervisor.Stop() select { case <-child1.done: @@ -144,7 +144,7 @@ func (c *stopAwareChildImpl) Serve() error { return nil } -func (c *stopAwareChildImpl) Stop(bool) error { +func (c *stopAwareChildImpl) Stop() error { c.stopOnce.Do(func() { close(c.stopped) }) return nil } diff --git a/pkg/service/tick.go b/pkg/service/tick.go index 9d560f928..0b7620b11 100644 --- a/pkg/service/tick.go +++ b/pkg/service/tick.go @@ -75,7 +75,7 @@ func (s *TickServiceTemplate) tick(ctx context.Context) bool { return reschedule } -func (s *TickServiceTemplate) Stop(bool) error { +func (s *TickServiceTemplate) Stop() error { s.ticker.Stop() return nil } diff --git a/pkg/service/tick_test.go b/pkg/service/tick_test.go index 12bf01920..2bf3fc653 100644 --- a/pkg/service/tick_test.go +++ b/pkg/service/tick_test.go @@ -85,7 +85,7 @@ func (s *ServeSuite) 
TestDisabledReschedulePreservesExistingBehavior() { // Let a few timer ticks fire. time.Sleep(90 * time.Millisecond) - svc.Stop(true) + svc.Stop() <-done // The initial tick + ~3-4 timer ticks at 20ms intervals over 90ms. @@ -108,7 +108,7 @@ func (s *ServeSuite) TestRescheduleTriggersImmediateRetick() { } svc := createTestService(s.T(), impl, 10 * time.Minute) - defer svc.Stop(true) + defer svc.Stop() done := make(chan struct{}) go func() { @@ -119,7 +119,7 @@ func (s *ServeSuite) TestRescheduleTriggersImmediateRetick() { // Wait briefly. With a 10-minute poll interval, the only way to get // 3 ticks quickly is via SignalReschedule. time.Sleep(100 * time.Millisecond) - svc.Stop(true) + svc.Stop() <-done ticks := impl.tickCount.Load() @@ -146,7 +146,7 @@ func (s *ServeSuite) TestContextCancellationExitsPromptly() { // Let the initial tick fire and signal reschedule. time.Sleep(20 * time.Millisecond) - svc.Stop(true) + svc.Stop() // Serve() should exit promptly. select { @@ -162,7 +162,7 @@ func (s *ServeSuite) TestServeExitsOnContextCancelledBeforeFirstTick() { // Create the service with a live context, then cancel before Serve(). 
svc := createTestService(s.T(), impl, 10 * time.Minute) - svc.Stop(true) + svc.Stop() errs := svc.Serve() s.Empty(errs) From 426dd48e511294b2d04c8fae13708fde6281e2b0 Mon Sep 17 00:00:00 2001 From: Renato Maia <1887792+renatomaia@users.noreply.github.com> Date: Thu, 25 Jun 2026 17:16:06 -0300 Subject: [PATCH 16/16] style: format code using standard Go style --- cmd/cartesi-rollups-advancer/root/root.go | 6 ++--- cmd/cartesi-rollups-claimer/root/root.go | 8 +++---- cmd/cartesi-rollups-evm-reader/root/root.go | 6 ++--- cmd/cartesi-rollups-jsonrpc-api/root/root.go | 6 ++--- cmd/cartesi-rollups-node/root/root.go | 4 ++-- cmd/cartesi-rollups-prt/root/root.go | 8 +++---- cmd/cartesi-rollups-validator/root/root.go | 8 +++---- internal/advancer/service.go | 4 ++-- internal/claimer/claimer.go | 2 +- internal/claimer/claimer_test.go | 4 ++-- internal/claimer/service_test.go | 2 +- internal/inspect/hardening_test.go | 10 ++++---- internal/inspect/inspect_test.go | 8 +++---- internal/jsonrpc/service.go | 2 +- internal/node/node.go | 12 +++++----- pkg/service/service.go | 8 +++---- pkg/service/service_test.go | 24 ++++++++++---------- pkg/service/supervisor.go | 2 +- pkg/service/telemetry_test.go | 4 ++-- pkg/service/tick.go | 6 +++-- pkg/service/tick_test.go | 8 +++---- 21 files changed, 72 insertions(+), 70 deletions(-) diff --git a/cmd/cartesi-rollups-advancer/root/root.go b/cmd/cartesi-rollups-advancer/root/root.go index 3c9a3cb33..01ff0e135 100644 --- a/cmd/cartesi-rollups-advancer/root/root.go +++ b/cmd/cartesi-rollups-advancer/root/root.go @@ -86,10 +86,10 @@ func run(cmd *cobra.Command, args []string) { createInfo := advancer.CreateInfo{ TickServiceConfigs: service.TickServiceConfigs{ - PollInterval: cfg.AdvancerPollingInterval, + PollInterval: cfg.AdvancerPollingInterval, BaseConfigs: service.BaseConfigs{ - Name: svcName, - Logger: logger, + Name: svcName, + Logger: logger, }, }, Config: *cfg, diff --git a/cmd/cartesi-rollups-claimer/root/root.go 
b/cmd/cartesi-rollups-claimer/root/root.go index c392e7c41..e8cc33679 100644 --- a/cmd/cartesi-rollups-claimer/root/root.go +++ b/cmd/cartesi-rollups-claimer/root/root.go @@ -87,10 +87,10 @@ func run(cmd *cobra.Command, args []string) { createInfo := claimer.CreateInfo{ TickServiceConfigs: service.TickServiceConfigs{ - PollInterval: cfg.ClaimerPollingInterval, + PollInterval: cfg.ClaimerPollingInterval, BaseConfigs: service.BaseConfigs{ - Name: svcName, - Logger: logger, + Name: svcName, + Logger: logger, }, }, Config: *cfg, @@ -120,7 +120,7 @@ func run(cmd *cobra.Command, args []string) { EnableSignalHandling: true, TelemetryCreate: true, TelemetryAddress: cfg.ClaimerTelemetryAddress, - Services: []service.SupervisedService{ claimerService }, + Services: []service.SupervisedService{claimerService}, } supervisor := &service.Supervisor{} err = service.InitServicesSupervisor(supCfg, supervisor) diff --git a/cmd/cartesi-rollups-evm-reader/root/root.go b/cmd/cartesi-rollups-evm-reader/root/root.go index 3d870f842..c8ab11a86 100644 --- a/cmd/cartesi-rollups-evm-reader/root/root.go +++ b/cmd/cartesi-rollups-evm-reader/root/root.go @@ -88,8 +88,8 @@ func run(cmd *cobra.Command, args []string) { createInfo := evmreader.CreateInfo{ TickServiceConfigs: service.TickServiceConfigs{ BaseConfigs: service.BaseConfigs{ - Name: svcName, - Logger: logger, + Name: svcName, + Logger: logger, }, }, Config: *cfg, @@ -121,7 +121,7 @@ func run(cmd *cobra.Command, args []string) { EnableSignalHandling: true, TelemetryCreate: true, TelemetryAddress: cfg.EvmReaderTelemetryAddress, - Services: []service.SupervisedService{ readerService }, + Services: []service.SupervisedService{readerService}, } supervisor := &service.Supervisor{} err = service.InitServicesSupervisor(supCfg, supervisor) diff --git a/cmd/cartesi-rollups-jsonrpc-api/root/root.go b/cmd/cartesi-rollups-jsonrpc-api/root/root.go index 341f952a3..f8f0b59a9 100644 --- a/cmd/cartesi-rollups-jsonrpc-api/root/root.go +++ 
b/cmd/cartesi-rollups-jsonrpc-api/root/root.go @@ -74,8 +74,8 @@ func run(cmd *cobra.Command, args []string) { createInfo := jsonrpc.CreateInfo{ BaseConfigs: service.BaseConfigs{ - Name: svcName, - Logger: logger, + Name: svcName, + Logger: logger, }, Config: *cfg, } @@ -93,7 +93,7 @@ func run(cmd *cobra.Command, args []string) { EnableSignalHandling: true, TelemetryCreate: true, TelemetryAddress: cfg.JsonrpcTelemetryAddress, - Services: []service.SupervisedService{ jsonrpcService }, + Services: []service.SupervisedService{jsonrpcService}, } supervisor := &service.Supervisor{} err = service.InitServicesSupervisor(supCfg, supervisor) diff --git a/cmd/cartesi-rollups-node/root/root.go b/cmd/cartesi-rollups-node/root/root.go index 15b653361..348af96db 100644 --- a/cmd/cartesi-rollups-node/root/root.go +++ b/cmd/cartesi-rollups-node/root/root.go @@ -164,8 +164,8 @@ func run(cmd *cobra.Command, args []string) { TelemetryAddress: cfg.NodeTelemetryAddress, }, BaseConfigs: service.BaseConfigs{ - Name: config.ServiceNode, - Logger: logger, + Name: config.ServiceNode, + Logger: logger, }, Config: *cfg, } diff --git a/cmd/cartesi-rollups-prt/root/root.go b/cmd/cartesi-rollups-prt/root/root.go index 3f75b5b0c..da798252d 100644 --- a/cmd/cartesi-rollups-prt/root/root.go +++ b/cmd/cartesi-rollups-prt/root/root.go @@ -75,10 +75,10 @@ func run(cmd *cobra.Command, args []string) { createInfo := prt.CreateInfo{ TickServiceConfigs: service.TickServiceConfigs{ - PollInterval: cfg.PrtPollingInterval, + PollInterval: cfg.PrtPollingInterval, BaseConfigs: service.BaseConfigs{ - Name: svcName, - Logger: logger, + Name: svcName, + Logger: logger, }, }, Config: *cfg, @@ -109,7 +109,7 @@ func run(cmd *cobra.Command, args []string) { EnableSignalHandling: true, TelemetryCreate: true, TelemetryAddress: cfg.PrtTelemetryAddress, - Services: []service.SupervisedService{ prtService }, + Services: []service.SupervisedService{prtService}, } supervisor := &service.Supervisor{} err = 
service.InitServicesSupervisor(supCfg, supervisor) diff --git a/cmd/cartesi-rollups-validator/root/root.go b/cmd/cartesi-rollups-validator/root/root.go index 986dc55ea..222c134b5 100644 --- a/cmd/cartesi-rollups-validator/root/root.go +++ b/cmd/cartesi-rollups-validator/root/root.go @@ -74,10 +74,10 @@ func run(cmd *cobra.Command, args []string) { createInfo := validator.CreateInfo{ TickServiceConfigs: service.TickServiceConfigs{ - PollInterval: cfg.ValidatorPollingInterval, + PollInterval: cfg.ValidatorPollingInterval, BaseConfigs: service.BaseConfigs{ - Name: config.ServiceValidator, - Logger: logger, + Name: config.ServiceValidator, + Logger: logger, }, }, Config: *cfg, @@ -95,7 +95,7 @@ func run(cmd *cobra.Command, args []string) { EnableSignalHandling: true, TelemetryCreate: true, TelemetryAddress: cfg.ValidatorTelemetryAddress, - Services: []service.SupervisedService{ validatorService }, + Services: []service.SupervisedService{validatorService}, } supervisor := &service.Supervisor{} err = service.InitServicesSupervisor(supCfg, supervisor) diff --git a/internal/advancer/service.go b/internal/advancer/service.go index fc245ed99..5749dc5cc 100644 --- a/internal/advancer/service.go +++ b/internal/advancer/service.go @@ -93,8 +93,8 @@ func Create(ctx context.Context, c *CreateInfo) ([]service.SupervisedService, er MaxInflight: c.Config.InspectMaxInflight, ShutdownTimeout: httpShutdownTimeout, }, - Repository: c.Repository, - Machines: manager, + Repository: c.Repository, + Machines: manager, }) if err != nil { return nil, fmt.Errorf("failed to create inspect service: %w", err) diff --git a/internal/claimer/claimer.go b/internal/claimer/claimer.go index 2ba877d33..d0d064a9f 100644 --- a/internal/claimer/claimer.go +++ b/internal/claimer/claimer.go @@ -102,7 +102,7 @@ func (s *Service) Tick(ctx context.Context) (bool, error) { if errStaged != nil { if errors.Is(errStaged, context.Canceled) { s.Logger.Warn("Tick interrupted by shutdown", "stage", 
"SelectClaimsToAcceptPerApp", "error", errStaged) - return false, nil // TODO:[maia] should we discard the potential database update errors from calls above? + return false, nil // TODO:[maia] should we discard the potential database update errors from calls above? } return false, errors.Join(err, errStaged) } diff --git a/internal/claimer/claimer_test.go b/internal/claimer/claimer_test.go index ede2a9f53..d15c535c6 100644 --- a/internal/claimer/claimer_test.go +++ b/internal/claimer/claimer_test.go @@ -37,8 +37,8 @@ func TestTickInterleavesStagesWithPinnedBlockAndReschedulesOnProgress(t *testing defer b.AssertExpectations(t) err := service.InitTickServiceTemplate(&service.TickServiceConfigs{ - BaseConfigs: service.BaseConfigs{Name: "claimer-test"}, - PollInterval: time.Hour, + BaseConfigs: service.BaseConfigs{Name: "claimer-test"}, + PollInterval: time.Hour, }, &m.TickServiceTemplate, m) require.NoError(t, err) diff --git a/internal/claimer/service_test.go b/internal/claimer/service_test.go index bc71dec6f..bc50775ff 100644 --- a/internal/claimer/service_test.go +++ b/internal/claimer/service_test.go @@ -45,7 +45,7 @@ func TestCreateUsesPersistedDefaultBlock(t *testing.T) { }) require.NoError(t, err) - impl := s.(*Service) // expose struct API for whitebox testing. + impl := s.(*Service) // expose struct API for whitebox testing. 
blockchain, ok := impl.blockchain.(*claimerBlockchain) require.True(t, ok) diff --git a/internal/inspect/hardening_test.go b/internal/inspect/hardening_test.go index fd1ce1870..84b885809 100644 --- a/internal/inspect/hardening_test.go +++ b/internal/inspect/hardening_test.go @@ -144,8 +144,8 @@ func TestInspector_NewWithCreateInfo(t *testing.T) { func TestInspector_NewRejectsNilMachines(t *testing.T) { _, err := NewInspector(CreateInfo{ - Repository: newMockRepository(), - Machines: nil, + Repository: newMockRepository(), + Machines: nil, HTTPServiceConfigs: service.HTTPServiceConfigs{Address: "127.0.0.1:0"}, }) require.ErrorIs(t, err, ErrInvalidMachines) @@ -335,7 +335,7 @@ func newInspectorWithAdmission(t *testing.T, maxInFlight uint64) (*Inspector, *A BaseConfigs: service.BaseConfigs{ LogLevel: slog.LevelError, }, - Address: "127.0.0.1:0", + Address: "127.0.0.1:0", MaxInflight: maxInFlight, }, }) @@ -443,8 +443,8 @@ func newInspectorWithCORS(t *testing.T, origins string) (*Inspector, *Applicatio mm.Map[1] = MockMachine{application: app} svc, err := NewInspector(CreateInfo{ - Repository: repo, - Machines: mm, + Repository: repo, + Machines: mm, HTTPServiceConfigs: service.HTTPServiceConfigs{ BaseConfigs: service.BaseConfigs{ LogLevel: slog.LevelError, diff --git a/internal/inspect/inspect_test.go b/internal/inspect/inspect_test.go index 9a7f189ae..ad3e6c52e 100644 --- a/internal/inspect/inspect_test.go +++ b/internal/inspect/inspect_test.go @@ -134,8 +134,8 @@ func (s *InspectSuite) TestPostForeclosedMachineUnavailable() { machines := newMockMachines() inspect := &Inspector{ - repository: repo, - IInspectMachines: machines, + repository: repo, + IInspectMachines: machines, HTTPServiceTemplate: service.HTTPServiceTemplate{ BaseTemplate: service.BaseTemplate{ Logger: service.NewLogger(slog.LevelDebug, true), @@ -215,8 +215,8 @@ func (s *InspectSuite) setup() (*Inspector, *Application, common.Hash) { machines := newMockMachines() machines.Map[1] = *m inspect := 
&Inspector{ - repository: repo, - IInspectMachines: machines, + repository: repo, + IInspectMachines: machines, HTTPServiceTemplate: service.HTTPServiceTemplate{ BaseTemplate: service.BaseTemplate{ Logger: service.NewLogger(slog.LevelDebug, true), diff --git a/internal/jsonrpc/service.go b/internal/jsonrpc/service.go index 9d2bc7e79..31801ad5a 100644 --- a/internal/jsonrpc/service.go +++ b/internal/jsonrpc/service.go @@ -44,7 +44,7 @@ func Create(ctx context.Context, c *CreateInfo) (service.SupervisedService, erro s := &Service{} cfg := &service.HTTPServiceConfigs{ - BaseConfigs: c.BaseConfigs, + BaseConfigs: c.BaseConfigs, HTTPServerOptions: service.DefaultJSONRPCOptions(), Address: c.Config.JsonrpcApiAddress, SafeRequestID: true, diff --git a/internal/node/node.go b/internal/node/node.go index 7b1234083..249205d88 100644 --- a/internal/node/node.go +++ b/internal/node/node.go @@ -145,7 +145,7 @@ func newEVMReader(ctx context.Context, c *CreateInfo) ([]service.SupervisedServi readerArgs := evmreader.CreateInfo{ TickServiceConfigs: service.TickServiceConfigs{ PollInterval: c.Config.EvmReaderPollingInterval, - BaseConfigs: service.BaseConfigs{ + BaseConfigs: service.BaseConfigs{ Name: config.ServiceEvmReader, LogLevel: config.ResolveServiceLogLevel(config.ServiceEvmReader, c.Config.LogLevel), LogColor: c.Config.LogColor, @@ -167,7 +167,7 @@ func newAdvancer(ctx context.Context, c *CreateInfo) ([]service.SupervisedServic advancerArgs := advancer.CreateInfo{ TickServiceConfigs: service.TickServiceConfigs{ PollInterval: c.Config.AdvancerPollingInterval, - BaseConfigs: service.BaseConfigs{ + BaseConfigs: service.BaseConfigs{ Name: config.ServiceAdvancer, LogLevel: config.ResolveServiceLogLevel(config.ServiceAdvancer, c.Config.LogLevel), LogColor: c.Config.LogColor, @@ -188,7 +188,7 @@ func newValidator(ctx context.Context, c *CreateInfo) ([]service.SupervisedServi validatorArgs := validator.CreateInfo{ TickServiceConfigs: service.TickServiceConfigs{ PollInterval: 
c.Config.ValidatorPollingInterval, - BaseConfigs: service.BaseConfigs{ + BaseConfigs: service.BaseConfigs{ Name: config.ServiceValidator, LogLevel: config.ResolveServiceLogLevel(config.ServiceValidator, c.Config.LogLevel), LogColor: c.Config.LogColor, @@ -209,7 +209,7 @@ func newClaimer(ctx context.Context, c *CreateInfo) ([]service.SupervisedService claimerArgs := claimer.CreateInfo{ TickServiceConfigs: service.TickServiceConfigs{ PollInterval: c.Config.ClaimerPollingInterval, - BaseConfigs: service.BaseConfigs{ + BaseConfigs: service.BaseConfigs{ Name: config.ServiceClaimer, LogLevel: config.ResolveServiceLogLevel(config.ServiceClaimer, c.Config.LogLevel), LogColor: c.Config.LogColor, @@ -230,7 +230,7 @@ func newClaimer(ctx context.Context, c *CreateInfo) ([]service.SupervisedService func newJsonrpc(ctx context.Context, c *CreateInfo) ([]service.SupervisedService, error) { jsonrpcArgs := jsonrpc.CreateInfo{ BaseConfigs: service.BaseConfigs{ - Name: config.ServiceJsonrpc, + Name: config.ServiceJsonrpc, LogLevel: config.ResolveServiceLogLevel(config.ServiceJsonrpc, c.Config.LogLevel), LogColor: c.Config.LogColor, }, @@ -249,7 +249,7 @@ func newPrt(ctx context.Context, c *CreateInfo) ([]service.SupervisedService, er prtArgs := prt.CreateInfo{ TickServiceConfigs: service.TickServiceConfigs{ PollInterval: c.Config.PrtPollingInterval, - BaseConfigs: service.BaseConfigs{ + BaseConfigs: service.BaseConfigs{ Name: config.ServicePrt, LogLevel: config.ResolveServiceLogLevel(config.ServicePrt, c.Config.LogLevel), LogColor: c.Config.LogColor, diff --git a/pkg/service/service.go b/pkg/service/service.go index a181da010..24ebcd021 100644 --- a/pkg/service/service.go +++ b/pkg/service/service.go @@ -116,10 +116,10 @@ func InitServiceTemplate(c *BaseConfigs, s *BaseTemplate) error { // Default implementation of some abstract methods (except `Serve`). // Remove them to force concrete services to provide implementation for them. 
-func (s *BaseTemplate) Reload() error { return nil } -func (s *BaseTemplate) Stop() error { return nil } -func (s *BaseTemplate) Alive() bool { return true } -func (s *BaseTemplate) Ready() bool { return true } +func (s *BaseTemplate) Reload() error { return nil } +func (s *BaseTemplate) Stop() error { return nil } +func (s *BaseTemplate) Alive() bool { return true } +func (s *BaseTemplate) Ready() bool { return true } func (s *BaseTemplate) String() string { return s.Name } func (s *BaseTemplate) SetSupervisor(supervisor *Supervisor) { diff --git a/pkg/service/service_test.go b/pkg/service/service_test.go index 24d59bd9b..e1175550e 100644 --- a/pkg/service/service_test.go +++ b/pkg/service/service_test.go @@ -12,16 +12,16 @@ import ( type delayedCloseImpl struct { BaseTemplate onServeInitChan chan struct{} - onStopInitChan chan struct{} + onStopInitChan chan struct{} } func (s *delayedCloseImpl) Stop() error { - <-s.onStopInitChan // wait signal to initiate stop + <-s.onStopInitChan // wait signal to initiate stop return nil } func (s *delayedCloseImpl) Serve() error { - close(s.onServeInitChan) // signal service was initiated + close(s.onServeInitChan) // signal service was initiated <-s.Supervisor.Context().Done() return nil } @@ -29,7 +29,7 @@ func (s *delayedCloseImpl) Serve() error { func (s *ServeSuite) TestServeExitsAfterStopIsComplete() { svc := &delayedCloseImpl{ onServeInitChan: make(chan struct{}), - onStopInitChan: make(chan struct{}), + onStopInitChan: make(chan struct{}), } // Create the service with a live context, then cancel before Serve(). 
@@ -47,13 +47,13 @@ func (s *ServeSuite) TestServeExitsAfterStopIsComplete() { onServeEndChan := make(chan error) go func() { err = supervisor.Serve() - onServeEndChan <- err // signal service ended and provide error + onServeEndChan <- err // signal service ended and provide error close(onServeEndChan) }() onStopEndChan := make(chan struct{}) select { - case <-svc.onServeInitChan: // wait service to initiate, so can be stopped. + case <-svc.onServeInitChan: // wait service to initiate, so can be stopped. // initiate service shutdown through context cancelation go func() { supervisor.Stop() @@ -73,7 +73,7 @@ func (s *ServeSuite) TestServeExitsAfterStopIsComplete() { // OK } - close(svc.onStopInitChan) // signal that stop shall initiate and eventually complete + close(svc.onStopInitChan) // signal that stop shall initiate and eventually complete // Serve() should exit without errors. select { @@ -94,7 +94,7 @@ func (s *ServeSuite) TestServeExitsAfterStopIsComplete() { func (s *ServeSuite) TestServeExitsAfterStopIsCompleteOnContextCancelation() { svc := &delayedCloseImpl{ onServeInitChan: make(chan struct{}), - onStopInitChan: make(chan struct{}), + onStopInitChan: make(chan struct{}), } // Create the service with a live context, then cancel before Serve(). @@ -117,13 +117,13 @@ func (s *ServeSuite) TestServeExitsAfterStopIsCompleteOnContextCancelation() { onServeEndChan := make(chan error) go func() { err = supervisor.Serve() - onServeEndChan <- err // signal service ended and provide error + onServeEndChan <- err // signal service ended and provide error close(onServeEndChan) }() select { - case <-svc.onServeInitChan: // wait service to initiate, so can be stopped. - cancel() // initiate service shutdown through context cancelation + case <-svc.onServeInitChan: // wait service to initiate, so can be stopped. 
+ cancel() // initiate service shutdown through context cancelation case <-time.After(2 * time.Second): s.Fail("Serve() did not start within 2 seconds") } @@ -136,7 +136,7 @@ func (s *ServeSuite) TestServeExitsAfterStopIsCompleteOnContextCancelation() { // OK } - close(svc.onStopInitChan) // signal that stop shall initiate and eventually complete + close(svc.onStopInitChan) // signal that stop shall initiate and eventually complete // Serve() should exit without errors. select { diff --git a/pkg/service/supervisor.go b/pkg/service/supervisor.go index ba68f2956..550efc956 100644 --- a/pkg/service/supervisor.go +++ b/pkg/service/supervisor.go @@ -235,7 +235,7 @@ func (s *Supervisor) Stop() { signal.Stop(s.sigHangUp) } - for i := len(s.services)-1; i >= 0; i-- { + for i := len(s.services) - 1; i >= 0; i-- { svc := s.services[i] start := time.Now() err := svc.Stop() diff --git a/pkg/service/telemetry_test.go b/pkg/service/telemetry_test.go index b3f00d348..8fe62b643 100644 --- a/pkg/service/telemetry_test.go +++ b/pkg/service/telemetry_test.go @@ -96,8 +96,8 @@ func TestCreateDefaultTelemetry_PanicRecovered(t *testing.T) { type falseLifecycleImpl struct{ BaseTemplate } -func (*falseLifecycleImpl) Alive() bool { return false } -func (*falseLifecycleImpl) Ready() bool { return false } +func (*falseLifecycleImpl) Alive() bool { return false } +func (*falseLifecycleImpl) Ready() bool { return false } func (*falseLifecycleImpl) Serve() error { return nil } func TestCreateDefaultTelemetry_Returns500WhenLifecycleFails(t *testing.T) { diff --git a/pkg/service/tick.go b/pkg/service/tick.go index 0b7620b11..5e3b445b5 100644 --- a/pkg/service/tick.go +++ b/pkg/service/tick.go @@ -85,13 +85,15 @@ func (s *TickServiceTemplate) Serve() error { if ctx.Err() != nil { return nil } - for s.tick(ctx) {} + for s.tick(ctx) { + } for { select { case <-ctx.Done(): return nil case <-s.ticker.C: - for s.tick(ctx) {} + for s.tick(ctx) { + } } } } diff --git a/pkg/service/tick_test.go 
b/pkg/service/tick_test.go index 2bf3fc653..b7929805c 100644 --- a/pkg/service/tick_test.go +++ b/pkg/service/tick_test.go @@ -75,7 +75,7 @@ func (s *ServeSuite) TestDisabledReschedulePreservesExistingBehavior() { // Serve() should tick only on timer fires. impl := &mockImpl{} - svc := createTestService(s.T(), impl, 20 * time.Millisecond) + svc := createTestService(s.T(), impl, 20*time.Millisecond) done := make(chan struct{}) go func() { @@ -107,7 +107,7 @@ func (s *ServeSuite) TestRescheduleTriggersImmediateRetick() { }, } - svc := createTestService(s.T(), impl, 10 * time.Minute) + svc := createTestService(s.T(), impl, 10*time.Minute) defer svc.Stop() done := make(chan struct{}) @@ -136,7 +136,7 @@ func (s *ServeSuite) TestContextCancellationExitsPromptly() { }, } - svc := createTestService(s.T(), impl, 10 * time.Minute) + svc := createTestService(s.T(), impl, 10*time.Minute) done := make(chan struct{}) go func() { @@ -161,7 +161,7 @@ func (s *ServeSuite) TestServeExitsOnContextCancelledBeforeFirstTick() { impl := &mockImpl{} // Create the service with a live context, then cancel before Serve(). - svc := createTestService(s.T(), impl, 10 * time.Minute) + svc := createTestService(s.T(), impl, 10*time.Minute) svc.Stop() errs := svc.Serve()