From feb7b8fffb4187347671f5d1f1dbd353883cfcfc Mon Sep 17 00:00:00 2001 From: TianDi101 Date: Mon, 15 Jun 2026 09:42:57 +0000 Subject: [PATCH 1/8] umbp: Phase 0 prep for IMasterMetadataStore refactor Preparatory type changes ahead of the IMasterMetadataStore refactor. No behavior change; the rest of the refactor becomes purely structural. 0a. Hoist NodeTierKey (from GlobalBlockIndex) and NodeMatch (from ExternalKvBlockIndex, with MatchedHashCount) to types.h. Temporary `using` aliases left in both classes so existing callers compile; removed in Phase 5. 0b. Add ClientRegistration and HeartbeatResult (with nested enum Status { APPLIED, SEQ_GAP, UNKNOWN }) to types.h. Additive; these are the input/result vocabulary the Phase 1 interface uses. 0c. Migrate boundary-crossing timestamps from steady_clock to system_clock (hazard #7): ClientRecord {last_heartbeat,registered_at}, BlockMetrics {created_at,last_accessed_at}, EvictionCandidate last_accessed_at, BlockEntry lease/access atomics + GrantLease/ BatchLookupForRouteGet durations, Router lease_duration_, and master_server NowNs(). Process-local duration/timeout clocks (rpc latency timer, ssd read-lease, peer allocator deadlines) intentionally stay steady_clock. Fixes the one audit fallout in test_route_put_strategy.cpp that constructed the migrated fields. Full UMBP C++ unit suite (local + distributed) green. 
Co-Authored-By: Claude Opus 4.8 --- .../distributed/master/client_registry.cpp | 6 +- .../distributed/master/global_block_index.cpp | 8 +-- src/umbp/distributed/master/master_server.cpp | 2 +- .../master/external_kv_block_index.h | 15 +---- .../distributed/master/global_block_index.h | 30 ++++----- .../include/umbp/distributed/routing/router.h | 4 +- src/umbp/include/umbp/distributed/types.h | 67 +++++++++++++++++-- .../distributed/test_route_put_strategy.cpp | 4 +- 8 files changed, 90 insertions(+), 46 deletions(-) diff --git a/src/umbp/distributed/master/client_registry.cpp b/src/umbp/distributed/master/client_registry.cpp index 8f06394be..b9e1b96fb 100644 --- a/src/umbp/distributed/master/client_registry.cpp +++ b/src/umbp/distributed/master/client_registry.cpp @@ -53,7 +53,7 @@ bool ClientRegistry::RegisterClient(const std::string& node_id, const std::strin const std::vector& engine_desc_bytes, const std::vector& tags) { std::unique_lock lock(mutex_); - const auto now = std::chrono::steady_clock::now(); + const auto now = std::chrono::system_clock::now(); auto it = clients_.find(node_id); if (it != clients_.end()) { @@ -132,7 +132,7 @@ ClientStatus ClientRegistry::Heartbeat(const std::string& node_id, } auto& record = it->second; - record.last_heartbeat = std::chrono::steady_clock::now(); + record.last_heartbeat = std::chrono::system_clock::now(); record.status = ClientStatus::ALIVE; record.tier_capacities = tier_capacities; @@ -235,7 +235,7 @@ void ClientRegistry::ReaperLoop() { } void ClientRegistry::ReapExpiredClients() { - const auto now = std::chrono::steady_clock::now(); + const auto now = std::chrono::system_clock::now(); const auto expiry = ExpiryDuration(); std::vector dead_nodes; diff --git a/src/umbp/distributed/master/global_block_index.cpp b/src/umbp/distributed/master/global_block_index.cpp index 87744a0b6..24114dcaa 100644 --- a/src/umbp/distributed/master/global_block_index.cpp +++ b/src/umbp/distributed/master/global_block_index.cpp @@ -93,7 
+93,7 @@ size_t GlobalBlockIndex::ApplyEvents(const std::string& node_id, if (events.empty()) return 0; std::unique_lock lock(mutex_); size_t mutated = 0; - const auto now = std::chrono::steady_clock::now(); + const auto now = std::chrono::system_clock::now(); for (const auto& ev : events) { if (ev.kind == KvEvent::Kind::CLEAR_AT_TIER) { @@ -148,7 +148,7 @@ size_t GlobalBlockIndex::ApplyEvents(const std::string& node_id, void GlobalBlockIndex::ReplaceNodeLocations(const std::string& node_id, const std::vector& adds) { std::unique_lock lock(mutex_); - const auto now = std::chrono::steady_clock::now(); + const auto now = std::chrono::system_clock::now(); // O(N_node + |adds|) via the reverse index. auto rev_it = node_to_keys_.find(node_id); @@ -198,7 +198,7 @@ void GlobalBlockIndex::RecordAccess(const std::string& key) { } void GlobalBlockIndex::GrantLease(const std::string& key, - std::chrono::steady_clock::duration duration) { + std::chrono::system_clock::duration duration) { std::shared_lock lock(mutex_); auto it = entries_.find(key); if (it != entries_.end()) it->second.GrantLease(duration); @@ -234,7 +234,7 @@ std::optional GlobalBlockIndex::GetMetrics(const std::string& key) std::vector> GlobalBlockIndex::BatchLookupForRouteGet( const std::vector& keys, const std::unordered_set& exclude_nodes, - std::chrono::steady_clock::duration lease_duration) { + std::chrono::system_clock::duration lease_duration) { std::vector> out(keys.size()); if (keys.empty()) return out; std::shared_lock lock(mutex_); diff --git a/src/umbp/distributed/master/master_server.cpp b/src/umbp/distributed/master/master_server.cpp index 9c17b894a..2e3fac410 100644 --- a/src/umbp/distributed/master/master_server.cpp +++ b/src/umbp/distributed/master/master_server.cpp @@ -73,7 +73,7 @@ uint32_t HitQueryMaxBatch() { uint64_t NowNs() { return static_cast(std::chrono::duration_cast( - std::chrono::steady_clock::now().time_since_epoch()) + std::chrono::system_clock::now().time_since_epoch()) 
.count()); } diff --git a/src/umbp/include/umbp/distributed/master/external_kv_block_index.h b/src/umbp/include/umbp/distributed/master/external_kv_block_index.h index fa0161e0f..b573098af 100644 --- a/src/umbp/include/umbp/distributed/master/external_kv_block_index.h +++ b/src/umbp/include/umbp/distributed/master/external_kv_block_index.h @@ -52,18 +52,9 @@ class ExternalKvBlockIndex { size_t UnregisterByNodeAtTier(const std::string& node_id, TierType tier); size_t UnregisterByNode(const std::string& node_id); - struct NodeMatch { - std::string node_id; - std::map> hashes_by_tier; - - size_t MatchedHashCount() const { - std::unordered_set seen; - for (const auto& [tier, hashes] : hashes_by_tier) { - for (const auto& h : hashes) seen.insert(h); - } - return seen.size(); - } - }; + // Hoisted to umbp/distributed/types.h; alias kept temporarily so existing + // callers (e.g. ExternalKvBlockIndex::NodeMatch) compile. Removed in Phase 5. + using NodeMatch = mori::umbp::NodeMatch; std::vector Match(const std::vector& hashes) const; size_t GetKvCount(const std::string& node_id) const; diff --git a/src/umbp/include/umbp/distributed/master/global_block_index.h b/src/umbp/include/umbp/distributed/master/global_block_index.h index c634e2045..2897dcf61 100644 --- a/src/umbp/include/umbp/distributed/master/global_block_index.h +++ b/src/umbp/include/umbp/distributed/master/global_block_index.h @@ -44,32 +44,32 @@ struct BlockEntry { std::atomic last_accessed_rep{0}; std::atomic atomic_access_count{0}; - void GrantLease(std::chrono::steady_clock::duration duration) { - auto expiry = std::chrono::steady_clock::now() + duration; + void GrantLease(std::chrono::system_clock::duration duration) { + auto expiry = std::chrono::system_clock::now() + duration; lease_expiry_rep.store(expiry.time_since_epoch().count(), std::memory_order_release); } bool IsLeased() const { - auto now_rep = std::chrono::steady_clock::now().time_since_epoch().count(); + auto now_rep = 
std::chrono::system_clock::now().time_since_epoch().count(); return lease_expiry_rep.load(std::memory_order_acquire) > now_rep; } void RecordAccessAtomic() { - last_accessed_rep.store(std::chrono::steady_clock::now().time_since_epoch().count(), + last_accessed_rep.store(std::chrono::system_clock::now().time_since_epoch().count(), std::memory_order_release); atomic_access_count.fetch_add(1, std::memory_order_relaxed); } - std::chrono::steady_clock::time_point GetLastAccessed() const { + std::chrono::system_clock::time_point GetLastAccessed() const { auto rep = last_accessed_rep.load(std::memory_order_acquire); - return std::chrono::steady_clock::time_point(std::chrono::steady_clock::duration(rep)); + return std::chrono::system_clock::time_point(std::chrono::system_clock::duration(rep)); } }; struct EvictionCandidate { std::string key; Location location; - std::chrono::steady_clock::time_point last_accessed_at; + std::chrono::system_clock::time_point last_accessed_at; uint64_t size; }; @@ -104,13 +104,13 @@ class GlobalBlockIndex { void RecordAccess(const std::string& key); // Grant a time-limited lease to protect a key from eviction. - void GrantLease(const std::string& key, std::chrono::steady_clock::duration duration); + void GrantLease(const std::string& key, std::chrono::system_clock::duration duration); // Batched Lookup + filter + (on non-empty result) RecordAccess + GrantLease, // under a single shared_lock. 
std::vector> BatchLookupForRouteGet( const std::vector& keys, const std::unordered_set& exclude_nodes, - std::chrono::steady_clock::duration lease_duration); + std::chrono::system_clock::duration lease_duration); // --- Queries --- @@ -126,15 +126,9 @@ class GlobalBlockIndex { // --- Eviction --- - struct NodeTierKey { - std::string node_id; - TierType tier; - bool operator<(const NodeTierKey& o) const { - if (node_id != o.node_id) return node_id < o.node_id; - return tier < o.tier; - } - bool operator==(const NodeTierKey& o) const { return node_id == o.node_id && tier == o.tier; } - }; + // Hoisted to umbp/distributed/types.h; alias kept temporarily so existing + // callers (e.g. GlobalBlockIndex::NodeTierKey) compile. Removed in Phase 5. + using NodeTierKey = mori::umbp::NodeTierKey; std::vector FindEvictionCandidates( const std::set& overloaded_node_tiers) const; diff --git a/src/umbp/include/umbp/distributed/routing/router.h b/src/umbp/include/umbp/distributed/routing/router.h index 7b359d530..9ae5ea4c4 100644 --- a/src/umbp/include/umbp/distributed/routing/router.h +++ b/src/umbp/include/umbp/distributed/routing/router.h @@ -75,14 +75,14 @@ class Router { const std::vector& keys, const std::string& node_id, const std::unordered_set& exclude_nodes); - void SetLeaseDuration(std::chrono::steady_clock::duration d) { lease_duration_ = d; } + void SetLeaseDuration(std::chrono::system_clock::duration d) { lease_duration_ = d; } private: GlobalBlockIndex& index_; ClientRegistry& registry_; std::unique_ptr get_strategy_; std::unique_ptr put_strategy_; - std::chrono::steady_clock::duration lease_duration_{std::chrono::seconds{10}}; + std::chrono::system_clock::duration lease_duration_{std::chrono::seconds{10}}; }; } // namespace mori::umbp diff --git a/src/umbp/include/umbp/distributed/types.h b/src/umbp/include/umbp/distributed/types.h index e79b5f58d..7b981f6c0 100644 --- a/src/umbp/include/umbp/distributed/types.h +++ b/src/umbp/include/umbp/distributed/types.h @@ 
-26,6 +26,8 @@ #include #include #include +#include +#include #include namespace mori::umbp { @@ -61,6 +63,38 @@ struct Location { } }; +// Identifies one (node, tier) capacity bucket — the granularity at which +// eviction budgets and overload are tracked. Hoisted from +// GlobalBlockIndex because it is part of the master metadata store contract. +struct NodeTierKey { + std::string node_id; + TierType tier; + bool operator<(const NodeTierKey& o) const { + if (node_id != o.node_id) return node_id < o.node_id; + return tier < o.tier; + } + bool operator==(const NodeTierKey& o) const { return node_id == o.node_id && tier == o.tier; } +}; + +// One node's external-KV match result, grouped by tier. A single hash may +// appear in MORE THAN ONE tier bucket when a node holds multiple physical +// copies (e.g. HBM + DRAM mirror). std::map iterates in sorted TierType +// order, so the first non-empty bucket is the fastest available tier. +// Hoisted from ExternalKvBlockIndex because it is part of the master +// metadata store contract. 
+struct NodeMatch { + std::string node_id; + std::map> hashes_by_tier; + + size_t MatchedHashCount() const { + std::unordered_set seen; + for (const auto& [tier, hashes] : hashes_by_tier) { + for (const auto& h : hashes) seen.insert(h); + } + return seen.size(); + } +}; + enum class ClientStatus : int { UNKNOWN = 0, ALIVE = 1, @@ -68,8 +102,8 @@ enum class ClientStatus : int { }; struct BlockMetrics { - std::chrono::steady_clock::time_point created_at; - std::chrono::steady_clock::time_point last_accessed_at; + std::chrono::system_clock::time_point created_at; + std::chrono::system_clock::time_point last_accessed_at; uint64_t access_count = 0; }; @@ -127,8 +161,8 @@ struct ClientRecord { std::string node_id; std::string node_address; ClientStatus status = ClientStatus::UNKNOWN; - std::chrono::steady_clock::time_point last_heartbeat; - std::chrono::steady_clock::time_point registered_at; + std::chrono::system_clock::time_point last_heartbeat; + std::chrono::system_clock::time_point registered_at; std::map tier_capacities; std::string peer_address; @@ -142,6 +176,31 @@ struct ClientRecord { std::vector tags; }; +// Input to IMasterMetadataStore::RegisterClient. Deliberately omits +// last_heartbeat / registered_at / status / last_applied_seq: those are owned +// by the store and derived from the `now` argument the caller passes alongside +// this struct. Keeping them off the input removes the "did the caller bother to +// set these?" ambiguity. +struct ClientRegistration { + std::string node_id; + std::string node_address; + std::map tier_capacities; + std::string peer_address; + std::vector engine_desc_bytes; + std::vector tags; +}; + +// Result of IMasterMetadataStore::ApplyHeartbeat. APPLIED = events accepted, +// registry updated, acked_seq advanced to the request's seq. SEQ_GAP = peer's +// seq is not last_applied_seq + 1; caller responds with a full-sync request and +// acked_seq echoes the previously applied seq so the peer reships. 
UNKNOWN = no +// record for node_id (peer must re-register). +struct HeartbeatResult { + enum Status { APPLIED, SEQ_GAP, UNKNOWN }; + Status status; + uint64_t acked_seq; // meaningful for APPLIED and SEQ_GAP +}; + // Helpers for logging inline const char* TierTypeName(TierType t) { switch (t) { diff --git a/tests/cpp/umbp/distributed/test_route_put_strategy.cpp b/tests/cpp/umbp/distributed/test_route_put_strategy.cpp index 62d10014f..ddd7d4b44 100644 --- a/tests/cpp/umbp/distributed/test_route_put_strategy.cpp +++ b/tests/cpp/umbp/distributed/test_route_put_strategy.cpp @@ -38,8 +38,8 @@ ClientRecord MakeClient(const std::string& node_id, const std::string& addr, rec.node_address = addr; rec.peer_address = addr; rec.status = ClientStatus::ALIVE; - rec.last_heartbeat = std::chrono::steady_clock::now(); - rec.registered_at = std::chrono::steady_clock::now(); + rec.last_heartbeat = std::chrono::system_clock::now(); + rec.registered_at = std::chrono::system_clock::now(); rec.tier_capacities = std::move(caps); return rec; } From b3d2010a62ed617a7ad6f589209b472418311a6a Mon Sep 17 00:00:00 2001 From: TianDi101 Date: Mon, 15 Jun 2026 10:09:54 +0000 Subject: [PATCH 2/8] umbp: add IMasterMetadataStore interface (refactor phase 1) Land the abstract IMasterMetadataStore interface header consolidating the four master-side state holders (GlobalBlockIndex, ClientRegistry, ExternalKvBlockIndex, ExternalKvHitIndex) behind one contract. No consumers wired yet. - master_metadata_store.h: lifted from the draft, depends only on types.h. Adds the two hit-count methods the draft dropped (GetExternalKvHitCounts, GarbageCollectHits) so the live GetExternalKvHitCounts RPC path is preserved, and adds a `now` parameter to MatchExternalKv so count_as_hit=true can stamp last_seen. - Hoist EvictionCandidate from global_block_index.h into types.h (part of the store contract; mirrors the phase 0 NodeTierKey/NodeMatch hoist) so the interface depends only on types.h. 
- Compile/instantiation gates: self-compile TU, GMock MockMasterMetadataStore (reused in phase 3), and a signature-completeness test exercising every method through IMasterMetadataStore&. Co-Authored-By: Claude Opus 4.8 --- .../distributed/master/global_block_index.h | 8 +- .../master/master_metadata_store.h | 463 ++++++++++++++++++ src/umbp/include/umbp/distributed/types.h | 10 + src/umbp/tests/CMakeLists.txt | 19 + .../master_metadata_store_self_compile.cpp | 27 + src/umbp/tests/mock_master_metadata_store.h | 124 +++++ .../test_master_metadata_store_interface.cpp | 132 +++++ 7 files changed, 777 insertions(+), 6 deletions(-) create mode 100644 src/umbp/include/umbp/distributed/master/master_metadata_store.h create mode 100644 src/umbp/tests/master_metadata_store_self_compile.cpp create mode 100644 src/umbp/tests/mock_master_metadata_store.h create mode 100644 src/umbp/tests/test_master_metadata_store_interface.cpp diff --git a/src/umbp/include/umbp/distributed/master/global_block_index.h b/src/umbp/include/umbp/distributed/master/global_block_index.h index 2897dcf61..ba28a37aa 100644 --- a/src/umbp/include/umbp/distributed/master/global_block_index.h +++ b/src/umbp/include/umbp/distributed/master/global_block_index.h @@ -66,12 +66,8 @@ struct BlockEntry { } }; -struct EvictionCandidate { - std::string key; - Location location; - std::chrono::system_clock::time_point last_accessed_at; - uint64_t size; -}; +// EvictionCandidate hoisted to umbp/distributed/types.h since it is part of the +// IMasterMetadataStore contract; visible here via the types.h include above. // Master-side projection of every peer's owned-key set. 
In the // master-as-advisor design this index is *only* mutated through the diff --git a/src/umbp/include/umbp/distributed/master/master_metadata_store.h b/src/umbp/include/umbp/distributed/master/master_metadata_store.h new file mode 100644 index 000000000..ef0cff208 --- /dev/null +++ b/src/umbp/include/umbp/distributed/master/master_metadata_store.h @@ -0,0 +1,463 @@ +// Copyright © Advanced Micro Devices, Inc. All rights reserved. +// +// MIT License +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in all +// copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +// SOFTWARE. + +// Master-side metadata storage interface for mori::umbp. +// +// Goal: make the master stateless so multiple master replicas can serve +// traffic concurrently (HA). 
Today GlobalBlockIndex, ClientRegistry, +// ExternalKvBlockIndex, and ExternalKvHitIndex hold their state in +// in-process unordered_maps guarded by std::shared_mutex — that's a +// split-brain hazard the moment you run more than one master process, +// because each replica's view of locations, liveness, leases, LRU, and +// hit counts drifts independently. The fix is to pull ALL durable AND +// volatile master state out behind one abstract interface — +// IMasterMetadataStore — and ship it to a shared backend. +// +// Concrete target backend is Redis (Cluster); a SQL backend is NOT a +// target. The read path does one store round-trip per RouteGet +// (lookup + lease + access in one Lua script) which is fine against +// Redis but ruinous against an OLTP database. If a SQL backend is +// ever needed it'd be for cold archival, not the hot path served by +// this interface. +// +// ===================================================================== +// Why one interface and not three (BlockLocations / ClientRecords / +// ExternalKv) per the earlier sketch: +// ===================================================================== +// - Every cross-store write today touches 2 or 3 of those stores: +// * UnregisterClient → registry + block index + external_kv +// * ReapExpiredClients → registry + block index + external_kv +// * Heartbeat → registry seq-CAS + block index events +// * RegisterExternalKvBlocks → registry alive-check + external_kv +// * RegisterClient re-registration → registry read TTL + write +// - Splitting forces a portable cross-store transaction abstraction. +// In-memory implements it trivially (shared_mutex). Redis does +// not: cross-keyspace MULTI/EXEC or Lua only works if all three +// namespaces hash to the same slot, which leaks "must run on the +// same cluster shard" up into the API. +// - Collapsing to one interface lets each implementation choose its +// own atomicity primitive instead of inventing a portable +// transaction abstraction. 
+// - Hot-path reads stay easy to identify because they're grouped in +// the read section below; internally an in-memory impl can still +// keep separate sub-maps for code organization. The interface just +// stops pretending they're independently swappable. +// +// ===================================================================== +// What does NOT move behind this interface: +// ===================================================================== +// - EvictionManager policy (watermark math, victim grouping, EvictKey +// RPC dispatch). The store returns LRU-ordered candidates limited +// by a byte budget; the manager decides how to spend that budget +// and ships the RPCs. Stateless under HA — each tick's decision is +// a pure function of the store's snapshot, so concurrent eviction +// passes on different replicas converge instead of fighting. +// - Reaper loop scheduling (timer + cv). Only the per-pass DB action +// moves down, via ExpireStaleClients. The schedule is per-replica +// with no shared state; the ExpireStaleClients call is idempotent +// so multiple replicas can safely run reaper passes concurrently. +// - Hit-index GC loop scheduling (timer + cv). Only the per-pass +// action moves down, via GarbageCollectHits. Same per-replica / +// idempotent reasoning as the reaper. +// +// In particular, lease_expiry / last_accessed_at / access_count and the +// per-hash hit counts DO move into the store — see hazards #4 and #7. +// +// ===================================================================== +// Critical design decisions / hazards (carried forward from review): +// ===================================================================== +// 1. Heartbeat is a CAS, not Get-then-Update. +// The current in-memory ClientRegistry::Heartbeat reads +// last_applied_seq, decides whether to accept, and writes the new +// seq + caps + last_heartbeat under one unique_lock. 
If this is +// split into Get() then UpdateHeartbeatState() across an external +// backend, two concurrent heartbeats for the same node can both +// observe seq N, both decide "in order," and one silently +// corrupts last_applied_seq. ApplyHeartbeat MUST take seq as the +// CAS value and atomically check `seq == last_applied_seq + 1` +// inside the implementation (one Lua script on Redis; +// shared_mutex unique_lock for the in-memory impl). See +// HeartbeatResult below. +// +// 2. RegisterClient must accept TTL-stale ALIVE rows. +// The current RegisterClient (client_registry.cpp:85-91) allows +// re-registration when `(now - last_heartbeat > ExpiryDuration())` +// even if status==ALIVE (i.e. the reaper hasn't flipped it yet). +// A naive InsertIfNotAlive would reject. The new RegisterClient +// therefore takes a `stale_after` duration so the implementation +// enforces this in the same atomic step. +// +// 3. EXPIRED rows are KEPT, not erased. +// Today's ReapExpiredClients erases rows from the map. +// ExpireStaleClients flips status ALIVE → EXPIRED and keeps the +// row so a re-registration can replace an EXPIRED record cleanly. +// Behavioral change vs. today; consumers must filter status when +// counting (use AliveClientCount, not "size of GetClient over all +// ids", which doesn't exist anyway). +// +// 4. lease_expiry, last_accessed_at, access_count are IN the store. +// Earlier sketches kept them in a process-local LeaseAccessTracker +// to avoid per-RouteGet store writes; that's only safe with a +// single master process. Under HA-stateless masters, two replicas +// can hold conflicting in-memory leases for the same key — +// replica A's lease doesn't block replica B's eviction loop — +// and LRU views diverge per replica. Both are correctness bugs, +// not stale-state inconveniences. 
So the tracker is gone: +// RouteGet hits the store via LookupBlockForRouteGet which +// atomically reads locations, sets lease_expiry, and bumps +// last_accessed_at / access_count in one Lua script. The +// BatchLookupBlockForRouteGet variant amortizes the cost over +// the prefix-match path the router already uses. +// +// 5. BatchLookupBlockForRouteGet exists because router.cpp:139 +// issues N Lookup() calls in a hot loop. N round-trips against +// Redis is unacceptable; one batched Lua call is required. +// +// 6. Sync vs async. Methods are synchronous; concurrency comes from +// the gRPC handler threads. Cleaner than future-returning every +// method. +// +// 7. Persisted timestamps are system_clock, not steady_clock. +// steady_clock has a per-process arbitrary epoch — its values +// are not meaningful to a different process or after master +// restart, so they cannot live in a Redis row. Every timestamp +// that crosses the IMasterMetadataStore boundary — registration +// times, heartbeat timestamps, lease_expiry, last_accessed_at, +// reaper cutoff, and the hit-count last_seen — is therefore +// system_clock::time_point. No steady_clock anywhere below this +// interface. Assumes NTP-disciplined clocks across master +// replicas: a backward wall-clock jump effectively grants longer +// leases until the clock recovers. +// +// 8. `last_acked_seq` is gone from the heartbeat path. +// The current ClientRegistry::Heartbeat takes a `last_acked_seq` +// parameter (client_registry.cpp:138) but its body ignores it — +// the master gap-checks against its own last_applied_seq. The new +// interface drops the parameter rather than passing through a +// value no implementation will read. The peer wire still carries +// last_acked_seq for the peer's own ack-on-progress logic; the +// master_server adapter simply doesn't forward it down here. 
+// +// NodeTierKey, NodeMatch, ClientRegistration, HeartbeatResult, and +// EvictionCandidate are defined in umbp/distributed/types.h — they are +// part of this store's contract. + +#pragma once + +#include +#include +#include +#include +#include +#include +#include +#include + +#include "umbp/distributed/types.h" + +namespace mori::umbp { + +// ===================================================================== +// IMasterMetadataStore — single durable-state interface for the master. +// ===================================================================== +// +// All methods are thread-safe. The master's gRPC handler thread pool +// calls into a single shared instance from many threads concurrently — +// implementations must provide their own synchronization (shared_mutex +// for in-memory, single-script Lua atomicity for Redis). Callers do +// not add external locking around these calls. +// +// Every write method below is atomic in isolation. Methods that span +// what used to be multiple stores (UnregisterClient, ApplyHeartbeat, +// ExpireStaleClients, RegisterExternalKvIfAlive, and the hit-counting +// branch of MatchExternalKv) are atomic across those former +// boundaries — that is the whole reason for the merge. +// +// TODO(atomicity-contract): pin down the required isolation level +// before the Redis backend is written. The default position is +// "atomic with respect to other writes on this interface; readers +// may observe pre-state until commit" — i.e. single-script Lua for +// Redis, shared_mutex unique_lock for in-memory. Document the chosen +// level here once it's settled and add a conformance test that +// exercises a concurrent reader against an in-flight cross-store +// write. +// +// Expected implementations: +// - InMemoryMasterMetadataStore: one std::shared_mutex over the +// internal sub-maps. Mostly a mechanical lift of the current +// classes; keep them as private helpers. Used for single-master +// deployments and unit tests. 
The per-hash hit counts live in +// process memory and are lost on restart, exactly as in the current +// ExternalKvHitIndex. +// - RedisMasterMetadataStore: cross-keyspace ops via Lua scripts. +// All key namespaces (node:, block:, extkv:, hit:, lru:, lease:) +// must share a hash tag (e.g. `{umbp:}:node:`) +// so they hash to the same slot on Redis Cluster — that's what +// makes cross-namespace Lua atomic, and what makes the hit counts +// crash-durable. +class IMasterMetadataStore { + public: + virtual ~IMasterMetadataStore() = default; + + // =================================================================== + // Cross-store write operations — each call is atomic. + // =================================================================== + + // CAS-style registration. Inserts a fresh ALIVE record. + // - Returns true on new registration. + // - Returns true and replaces the record if an EXPIRED record for + // the same node_id exists. + // - Returns true and replaces the record if an ALIVE record exists + // whose last_heartbeat is older than `stale_after` — handles the + // "reaper hasn't run yet but the record is TTL-stale" case that + // today's RegisterClient permits (see hazard #2 in header + // preamble). + // - Returns false if an ALIVE non-stale record already exists. + // `now` is supplied by the caller so tests can inject time. Uses + // system_clock because the value is persisted in the backend; see + // hazard #7 in header preamble. The store sets last_heartbeat and + // registered_at to `now`, status to ALIVE, and last_applied_seq to 0 + // — the caller does not (and cannot) populate those fields. + // In production, callers derive `stale_after` from + // ClientRegistryConfig::ExpiryDuration() (heartbeat_ttl × + // max_missed_heartbeats); tests inject their own value. 
+ virtual bool RegisterClient(const ClientRegistration& registration, + std::chrono::system_clock::time_point now, + std::chrono::system_clock::duration stale_after) = 0; + + // Drop the client from the client store AND drop every block location + // belonging to it AND drop every external-kv entry belonging to it. + // Idempotent on missing clients. + virtual void UnregisterClient(const std::string& node_id) = 0; + + // Heartbeat ingestion. Atomically: + // 1. Looks up the client record; returns UNKNOWN if absent (no + // other state touched). + // 2. If !is_full_sync and seq != last_applied_seq + 1, returns + // SEQ_GAP with acked_seq = last_applied_seq. SEQ_GAP still + // bumps last_heartbeat and sets status←ALIVE so the reaper + // doesn't kill a node that's heartbeating but mid-recovery; + // caps and last_applied_seq are NOT touched on SEQ_GAP. The + // seq check is the CAS that keeps the gap check race-free + // under concurrent heartbeats — DO NOT implement as a + // separate Get() then UpdateHeartbeatState(); two in-flight + // heartbeats can both observe the same seq and corrupt + // last_applied_seq (see hazard #1 in header preamble). + // 3. On APPLIED, updates caps, last_heartbeat, last_applied_seq, + // status←ALIVE. + // 4. Applies events to block locations: + // * is_full_sync=true → replace every location for node_id + // with the ADDs in events; REMOVE entries ignored. + // * is_full_sync=false → ADD with existing (node,tier) + // overwrites size; REMOVE for unknown (key,node,tier) is + // a silent no-op. + // Returns APPLIED with acked_seq = seq on success. + // + // TODO(payload-sizing): the `events` vector is bounded only by what + // the peer chooses to ship in one heartbeat batch. A full_sync from + // a peer with millions of keys produces a single Lua script of that + // size, which blocks every other Redis client (Redis is + // single-threaded). 
The contract should be: implementations MAY + // chunk internally for !is_full_sync, but is_full_sync MUST apply + // atomically (a half-applied ReplaceNodeLocations would leave the + // index in a torn state). That in turn forces an upper bound on + // peer-side full_sync batch size — decide and document the cap + // (e.g. 100k events) here, and have the peer fragment larger + // resyncs into multiple full_sync calls or shift to a snapshot- + // then-delta protocol before the Redis backend ships. + virtual HeartbeatResult ApplyHeartbeat(const std::string& node_id, uint64_t seq, + std::chrono::system_clock::time_point now, + const std::map& caps, + const std::vector& events, bool is_full_sync) = 0; + + // Reaper pass. Atomically: + // - Flips status ALIVE → EXPIRED for every record with + // last_heartbeat < cutoff. EXPIRED records are KEPT in the store, + // not erased — see hazard #3 in header preamble. + // - Drops every block location belonging to those clients. + // - Drops every external-kv entry belonging to those clients. + // Returns the affected node_ids for logging. + virtual std::vector ExpireStaleClients( + std::chrono::system_clock::time_point cutoff) = 0; + + // =================================================================== + // External-KV writes — alive-check + mutation atomic together. + // =================================================================== + + // Add `tier` to the tier-set of every (node_id, hash). Idempotent: + // re-registering at the same tier is a no-op; registering at a new + // tier adds a bucket without touching existing tiers. + // Returns true if the alive-check passed and the writes were applied + // (even if every write was a no-op because the entries already existed). + // Returns false if node_id was not ALIVE and nothing was written, so + // the caller can meter the reject without the impl having to log it. 
+ virtual bool RegisterExternalKvIfAlive(const std::string& node_id, + const std::vector& hashes, TierType tier) = 0; + + // Remove `tier` from the tier-set of every (node_id, hash). Other + // tiers for the same hash untouched. (node,hash) entry dropped when + // its tier-set becomes empty. Does NOT check liveness — peers may + // ship unregister during teardown after status has flipped. + virtual void UnregisterExternalKv(const std::string& node_id, + const std::vector& hashes, TierType tier) = 0; + + // Remove `tier` from every hash registered by `node_id` (whole-tier + // wipe — admin path, not heartbeat). + virtual void UnregisterExternalKvByTier(const std::string& node_id, TierType tier) = 0; + + // Drop every per-hash hit-count entry whose last_seen < cutoff. + // Returns the number of entries dropped. Replaces + // ExternalKvHitIndex::GarbageCollect; the cutoff is a system_clock + // time_point (not a uint64_t ns) because last_seen now crosses the + // store boundary — hazard #7. Called by the master's hit-index GC + // loop on each tick with cutoff = system_clock::now() - max_age. + virtual std::size_t GarbageCollectHits(std::chrono::system_clock::time_point cutoff) = 0; + + // =================================================================== + // Reads. None require cross-store atomicity; implementations SHOULD + // make each one a single backend round-trip. + // =================================================================== + + // --- Block locations --- + + // Plain location lookup. Returns every location for `key` without + // granting a lease or recording an access. Pure read — no side + // effects on lease_expiry, last_accessed_at, or access_count. + virtual std::vector LookupBlock(const std::string& key) const = 0; + + // RouteGet primitive. 
Atomically reads every location for `key`, + // filters out locations whose node_id is in `exclude_nodes`, + // and — only if at least one location survives the filter — sets + // lease_expiry to now + lease_duration and bumps last_accessed_at + // to now / access_count by 1. Returns the filtered locations, or + // empty if the key has no locations or all were excluded. + // Filtering inside the store (not post-hoc in the caller) is + // required so that fully-excluded keys do not receive a lease or + // an access bump — granting those would perturb LRU ordering and + // extend eviction protection for keys the caller explicitly chose + // to skip. Splitting this into separate Lookup / GrantLease / + // RecordAccess methods would be three round trips per RouteGet + // AND would not be atomic across master replicas — see hazard #4. + virtual std::vector LookupBlockForRouteGet( + const std::string& key, const std::unordered_set& exclude_nodes, + std::chrono::system_clock::time_point now, + std::chrono::system_clock::duration lease_duration) = 0; + + // Vectorized RouteGet primitive. Same per-key semantics as + // LookupBlockForRouteGet (leases granted and access recorded only + // for keys that have at least one non-excluded location). Result + // parallel to `keys`; absent or fully-excluded keys yield empty + // inner vectors. One round trip for the whole batch — + // router.cpp:139 today issues N Lookup() calls in a hot loop and + // this is the single-RTT replacement (see hazard #5). + virtual std::vector> BatchLookupBlockForRouteGet( + const std::vector& keys, const std::unordered_set& exclude_nodes, + std::chrono::system_clock::time_point now, + std::chrono::system_clock::duration lease_duration) = 0; + + // Batched existence — pure read, no lease grant, no access record. + // Used by BatchRoutePut for dedup (router.cpp:112): a writer + // landing on an existing key isn't a "read" and must not extend + // the lease or perturb LRU ordering. One round trip. 
+ virtual std::vector BatchExistsBlock(const std::vector& keys) const = 0; + + // LRU-prefix eviction enumeration. For each (node, tier) in + // `bytes_to_free`, walks the store's per-bucket LRU order + // (oldest last_accessed_at first), filters out keys whose + // lease_expiry > now, and accumulates rows until the cumulative + // `location.size` reaches that bucket's budget. Result map is + // keyed by the same NodeTierKeys as the input; absent buckets had + // no eligible candidates. Taking the whole budget map in one call + // lets a single Lua script fan out over every overloaded bucket + // in one round trip — important when dozens of (node, tier) pairs + // are over watermark. + // + // No EraseBlock on this interface — peers ship REMOVEs on their + // next heartbeat after EvictKey executes, so the only mutation + // channels for block locations are ApplyHeartbeat / + // UnregisterClient / ExpireStaleClients. + // + // How the LRU order is produced is an implementation detail: the + // in-memory backend does a full entries_ scan + sort per tick (an + // eviction tick is seconds, not a hot path), while the Redis + // backend maintains a per-(node, tier) ZSET keyed by + // last_accessed_at refreshed on every LookupBlockForRouteGet. The + // contract is only "return LRU-ordered candidates within the byte + // budget," not the index mechanism. + virtual std::map> EnumerateLruForEviction( + const std::map& bytes_to_free, + std::chrono::system_clock::time_point now) const = 0; + + // --- Client records --- + + // Returns the record regardless of status (ALIVE or EXPIRED). Caller + // filters when needed. + virtual std::optional GetClient(const std::string& node_id) const = 0; + + // Hot-path liveness check. Exists as its own method so a Redis backend + // can answer it with a single status field read instead of fetching the + // whole ClientRecord just to filter on status. 
+ virtual bool IsClientAlive(const std::string& node_id) const = 0; + + // Single-node peer-address lookup. Exists as its own method so a + // Redis backend can answer with a single HGET on the node hash + // instead of fetching the whole ClientRecord just to read + // peer_address. The legacy router linear-scans GetAliveClients() + // per RouteGet for the same value; GetPeerAddress replaces that + // with one read. Returns std::nullopt for unknown node_id; + // EXPIRED records still surface their peer_address. + virtual std::optional GetPeerAddress(const std::string& node_id) const = 0; + + // ALIVE only — does not include EXPIRED records. + virtual std::vector ListAliveClients() const = 0; + virtual std::size_t AliveClientCount() const = 0; + + virtual std::vector GetClientTags(const std::string& node_id) const = 0; + + // --- External KV --- + + // Returns matches grouped by node WITHOUT peer_address. Callers + // that need peer addresses join with ListAliveClients() (snapshot + // once per response) or GetPeerAddress(node_id) per-node; + // embedding peer_address in NodeMatch would force every + // implementation to read from two namespaces on every match. + // When `count_as_hit` is true, atomically increments the per-hash + // hit counter for every matched hash AND stamps that hash's + // last_seen = `now`, all in one lock acquisition (lookup + + // increment + stamp). When false, pure read — no hit counts + // touched and `now` is ignored. `now` is system_clock because + // last_seen is persisted / feeds GarbageCollectHits (hazard #7). + virtual std::vector MatchExternalKv(const std::vector& hashes, + bool count_as_hit, + std::chrono::system_clock::time_point now) = 0; + + // Sparse per-hash hit-count read. Returns an entry for each requested + // hash that has a recorded count (hashes with no recorded hits may be + // omitted). Replaces ExternalKvHitIndex::Lookup; backs the live + // GetExternalKvHitCounts RPC. Pure read. 
+ virtual std::vector GetExternalKvHitCounts( + const std::vector& hashes) const = 0; + + virtual std::size_t GetExternalKvCount(const std::string& node_id) const = 0; +}; + +} // namespace mori::umbp diff --git a/src/umbp/include/umbp/distributed/types.h b/src/umbp/include/umbp/distributed/types.h index 7b981f6c0..b767575de 100644 --- a/src/umbp/include/umbp/distributed/types.h +++ b/src/umbp/include/umbp/distributed/types.h @@ -107,6 +107,16 @@ struct BlockMetrics { uint64_t access_count = 0; }; +// One eviction-eligible (key, location) row returned by the master metadata +// store's LRU enumeration. Hoisted from GlobalBlockIndex because it is part of +// the IMasterMetadataStore contract (EnumerateLruForEviction returns these). +struct EvictionCandidate { + std::string key; + Location location; + std::chrono::system_clock::time_point last_accessed_at; + uint64_t size; +}; + // Structured form of one (buffer_index, page_index) slot. Used by the // peer DRAM/HBM allocator to describe which page slot a write should // land in, and by ResolveKey responses to tell readers where to RDMA diff --git a/src/umbp/tests/CMakeLists.txt b/src/umbp/tests/CMakeLists.txt index 13bb792b3..8a04425a0 100644 --- a/src/umbp/tests/CMakeLists.txt +++ b/src/umbp/tests/CMakeLists.txt @@ -21,6 +21,25 @@ enable_testing() include(GoogleTest) +# --------------------------------------------------------------------------- +# test_master_metadata_store_interface — Phase 1 compile/instantiation gate for +# IMasterMetadataStore. Includes an isolated self-compile TU (proves the header +# is self-contained) and a GMock mock instantiation/signature-completeness test. 
+# --------------------------------------------------------------------------- +add_executable( + test_master_metadata_store_interface test_master_metadata_store_interface.cpp + master_metadata_store_self_compile.cpp) + +target_include_directories(test_master_metadata_store_interface + PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}) + +target_link_libraries(test_master_metadata_store_interface + PRIVATE umbp_common GTest::gmock GTest::gtest_main) + +target_compile_features(test_master_metadata_store_interface PRIVATE cxx_std_17) + +gtest_discover_tests(test_master_metadata_store_interface) + # --------------------------------------------------------------------------- # test_external_kv_block_index # --------------------------------------------------------------------------- diff --git a/src/umbp/tests/master_metadata_store_self_compile.cpp b/src/umbp/tests/master_metadata_store_self_compile.cpp new file mode 100644 index 000000000..e8e016ce8 --- /dev/null +++ b/src/umbp/tests/master_metadata_store_self_compile.cpp @@ -0,0 +1,27 @@ +// Copyright © Advanced Micro Devices, Inc. All rights reserved. +// +// MIT License +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in all +// copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +// SOFTWARE. + +// Phase 1 "header self-compiles" gate: this translation unit includes ONLY the +// interface header (which in turn includes types.h). If it compiles, the header +// is self-contained — no missing includes or forward declarations. Deliberately +// has no other includes and no symbols of its own. +#include "umbp/distributed/master/master_metadata_store.h" diff --git a/src/umbp/tests/mock_master_metadata_store.h b/src/umbp/tests/mock_master_metadata_store.h new file mode 100644 index 000000000..8810c243c --- /dev/null +++ b/src/umbp/tests/mock_master_metadata_store.h @@ -0,0 +1,124 @@ +// Copyright © Advanced Micro Devices, Inc. All rights reserved. +// +// MIT License +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in all +// copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +// SOFTWARE. + +// GMock mock for IMasterMetadataStore. +// +// Phase 1 use: instantiation gate. If this type compiles and instantiates, +// every pure-virtual on the interface is overridden with a well-typed +// signature — proving the contract has no orphaned/ill-typed methods. +// +// Reused in Phase 3 (consumer-integration) to assert that each rewired +// consumer (Router / EvictionManager / UMBPMasterServiceImpl handlers) calls +// the right store method with correctly-translated arguments. +#pragma once + +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +#include "umbp/distributed/master/master_metadata_store.h" +#include "umbp/distributed/types.h" + +namespace mori::umbp { + +class MockMasterMetadataStore : public IMasterMetadataStore { + public: + // Aliases for types whose commas would otherwise break the MOCK_METHOD macro + // parser (it splits the argument list on top-level commas). 
+ using CapsMap = std::map; + using BudgetMap = std::map; + using LruResult = std::map>; + using LocationBatch = std::vector>; + + // --- Cross-store writes --- + MOCK_METHOD(bool, RegisterClient, + (const ClientRegistration& registration, std::chrono::system_clock::time_point now, + std::chrono::system_clock::duration stale_after), + (override)); + MOCK_METHOD(void, UnregisterClient, (const std::string& node_id), (override)); + MOCK_METHOD(HeartbeatResult, ApplyHeartbeat, + (const std::string& node_id, uint64_t seq, std::chrono::system_clock::time_point now, + const CapsMap& caps, (const std::vector&)events, bool is_full_sync), + (override)); + MOCK_METHOD(std::vector, ExpireStaleClients, + (std::chrono::system_clock::time_point cutoff), (override)); + + // --- External-KV writes --- + MOCK_METHOD(bool, RegisterExternalKvIfAlive, + (const std::string& node_id, (const std::vector&)hashes, TierType tier), + (override)); + MOCK_METHOD(void, UnregisterExternalKv, + (const std::string& node_id, (const std::vector&)hashes, TierType tier), + (override)); + MOCK_METHOD(void, UnregisterExternalKvByTier, (const std::string& node_id, TierType tier), + (override)); + MOCK_METHOD(std::size_t, GarbageCollectHits, (std::chrono::system_clock::time_point cutoff), + (override)); + + // --- Block reads --- + MOCK_METHOD(std::vector, LookupBlock, (const std::string& key), (const, override)); + MOCK_METHOD(std::vector, LookupBlockForRouteGet, + (const std::string& key, (const std::unordered_set&)exclude_nodes, + std::chrono::system_clock::time_point now, + std::chrono::system_clock::duration lease_duration), + (override)); + MOCK_METHOD(LocationBatch, BatchLookupBlockForRouteGet, + ((const std::vector&)keys, + (const std::unordered_set&)exclude_nodes, + std::chrono::system_clock::time_point now, + std::chrono::system_clock::duration lease_duration), + (override)); + MOCK_METHOD(std::vector, BatchExistsBlock, ((const std::vector&)keys), + (const, override)); + MOCK_METHOD(LruResult, 
EnumerateLruForEviction, + (const BudgetMap& bytes_to_free, std::chrono::system_clock::time_point now), + (const, override)); + + // --- Client reads --- + MOCK_METHOD(std::optional, GetClient, (const std::string& node_id), + (const, override)); + MOCK_METHOD(bool, IsClientAlive, (const std::string& node_id), (const, override)); + MOCK_METHOD(std::optional, GetPeerAddress, (const std::string& node_id), + (const, override)); + MOCK_METHOD(std::vector, ListAliveClients, (), (const, override)); + MOCK_METHOD(std::size_t, AliveClientCount, (), (const, override)); + MOCK_METHOD(std::vector, GetClientTags, (const std::string& node_id), + (const, override)); + + // --- External-KV reads --- + MOCK_METHOD(std::vector, MatchExternalKv, + ((const std::vector&)hashes, bool count_as_hit, + std::chrono::system_clock::time_point now), + (override)); + MOCK_METHOD(std::vector, GetExternalKvHitCounts, + ((const std::vector&)hashes), (const, override)); + MOCK_METHOD(std::size_t, GetExternalKvCount, (const std::string& node_id), (const, override)); +}; + +} // namespace mori::umbp diff --git a/src/umbp/tests/test_master_metadata_store_interface.cpp b/src/umbp/tests/test_master_metadata_store_interface.cpp new file mode 100644 index 000000000..28c83ccdc --- /dev/null +++ b/src/umbp/tests/test_master_metadata_store_interface.cpp @@ -0,0 +1,132 @@ +// Copyright © Advanced Micro Devices, Inc. All rights reserved. 
+// +// MIT License +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in all +// copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +// SOFTWARE. + +// Phase 1 compile/instantiation gate for IMasterMetadataStore. +// +// The interface is abstract with no implementation yet, so there is no runtime +// behavior to exercise. The bar for Phase 1 is that the contract is well-formed +// and instantiable: +// 1. MockMasterMetadataStore overrides every pure-virtual (a missing or +// ill-typed override makes the mock abstract → fails to instantiate). +// 2. A MockMasterMetadataStore is usable through an IMasterMetadataStore&, +// proving the override set is complete. +// Behavioral assertions arrive with InMemoryMasterMetadataStore in Phase 2. 
+ +#include + +#include + +#include "mock_master_metadata_store.h" +#include "umbp/distributed/master/master_metadata_store.h" + +namespace mori::umbp { +namespace { + +// Instantiation gate: if the interface had an orphaned or ill-typed pure +// virtual, MockMasterMetadataStore would stay abstract and this would not +// compile. +TEST(MasterMetadataStoreInterface, MockIsInstantiableThroughInterface) { + MockMasterMetadataStore mock; + IMasterMetadataStore& store = mock; + (void)store; + SUCCEED(); +} + +// Signature-completeness spot check: name every interface method once through +// the base-class pointer with a default ON_CALL, mirroring the §1b delta table +// plus the two added hit-count methods (GetExternalKvHitCounts, +// GarbageCollectHits) and the `now` parameter on MatchExternalKv. This guards +// against silently dropping the live GetExternalKvHitCounts RPC path. +TEST(MasterMetadataStoreInterface, EveryMethodIsCallableThroughInterface) { + using ::testing::_; + using ::testing::NiceMock; + using ::testing::Return; + using namespace std::chrono_literals; + + // NiceMock: these are default-action calls, not behavior under test, so the + // "uninteresting call" warnings would just be noise. 
+ NiceMock mock; + const auto now = std::chrono::system_clock::now(); + + ON_CALL(mock, RegisterClient(_, _, _)).WillByDefault(Return(true)); + ON_CALL(mock, ApplyHeartbeat(_, _, _, _, _, _)) + .WillByDefault(Return(HeartbeatResult{HeartbeatResult::APPLIED, 0})); + ON_CALL(mock, ExpireStaleClients(_)).WillByDefault(Return(std::vector{})); + ON_CALL(mock, RegisterExternalKvIfAlive(_, _, _)).WillByDefault(Return(true)); + ON_CALL(mock, GarbageCollectHits(_)).WillByDefault(Return(0)); + ON_CALL(mock, LookupBlock(_)).WillByDefault(Return(std::vector{})); + ON_CALL(mock, LookupBlockForRouteGet(_, _, _, _)).WillByDefault(Return(std::vector{})); + ON_CALL(mock, BatchLookupBlockForRouteGet(_, _, _, _)) + .WillByDefault(Return(std::vector>{})); + ON_CALL(mock, BatchExistsBlock(_)).WillByDefault(Return(std::vector{})); + ON_CALL(mock, EnumerateLruForEviction(_, _)) + .WillByDefault(Return(std::map>{})); + ON_CALL(mock, GetClient(_)).WillByDefault(Return(std::nullopt)); + ON_CALL(mock, IsClientAlive(_)).WillByDefault(Return(false)); + ON_CALL(mock, GetPeerAddress(_)).WillByDefault(Return(std::nullopt)); + ON_CALL(mock, ListAliveClients()).WillByDefault(Return(std::vector{})); + ON_CALL(mock, AliveClientCount()).WillByDefault(Return(0)); + ON_CALL(mock, GetClientTags(_)).WillByDefault(Return(std::vector{})); + ON_CALL(mock, MatchExternalKv(_, _, _)).WillByDefault(Return(std::vector{})); + ON_CALL(mock, GetExternalKvHitCounts(_)) + .WillByDefault(Return(std::vector{})); + ON_CALL(mock, GetExternalKvCount(_)).WillByDefault(Return(0)); + + IMasterMetadataStore& store = mock; + + // Cross-store writes. + ClientRegistration reg; + reg.node_id = "node-a"; + EXPECT_TRUE(store.RegisterClient(reg, now, 30s)); + store.UnregisterClient("node-a"); + EXPECT_EQ(store.ApplyHeartbeat("node-a", 1, now, {}, {}, false).status, HeartbeatResult::APPLIED); + EXPECT_TRUE(store.ExpireStaleClients(now).empty()); + + // External-KV writes. 
+ EXPECT_TRUE(store.RegisterExternalKvIfAlive("node-a", {"h0"}, TierType::HBM)); + store.UnregisterExternalKv("node-a", {"h0"}, TierType::HBM); + store.UnregisterExternalKvByTier("node-a", TierType::HBM); + EXPECT_EQ(store.GarbageCollectHits(now), 0u); + + // Block reads. + EXPECT_TRUE(store.LookupBlock("k0").empty()); + EXPECT_TRUE(store.LookupBlockForRouteGet("k0", {}, now, 5s).empty()); + EXPECT_TRUE(store.BatchLookupBlockForRouteGet({"k0"}, {}, now, 5s).empty()); + EXPECT_TRUE(store.BatchExistsBlock({"k0"}).empty()); + EXPECT_TRUE(store.EnumerateLruForEviction({}, now).empty()); + + // Client reads. + EXPECT_FALSE(store.GetClient("node-a").has_value()); + EXPECT_FALSE(store.IsClientAlive("node-a")); + EXPECT_FALSE(store.GetPeerAddress("node-a").has_value()); + EXPECT_TRUE(store.ListAliveClients().empty()); + EXPECT_EQ(store.AliveClientCount(), 0u); + EXPECT_TRUE(store.GetClientTags("node-a").empty()); + + // External-KV reads, incl. the two added hit-count methods + `now` param. + EXPECT_TRUE(store.MatchExternalKv({"h0"}, /*count_as_hit=*/true, now).empty()); + EXPECT_TRUE(store.GetExternalKvHitCounts({"h0"}).empty()); + EXPECT_EQ(store.GetExternalKvCount("node-a"), 0u); +} + +} // namespace +} // namespace mori::umbp From f75bfe0ade2531c35087cc54a256ea8f1dd5708a Mon Sep 17 00:00:00 2001 From: TianDi101 Date: Mon, 15 Jun 2026 12:19:41 +0000 Subject: [PATCH 3/8] umbp: add InMemoryMasterMetadataStore (Phase 2) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Implement IMasterMetadataStore in-process by folding the four former state holders (GlobalBlockIndex, ClientRegistry, ExternalKvBlockIndex, ExternalKvHitIndex) behind a single std::shared_mutex. - Block locations + LRU/lease (per-entry atomics mutated under a shared lock, keeping the RouteGet hot path concurrent); lease/access timestamps are now caller-supplied system_clock (hazard #7). 
- ApplyHeartbeat single-seq CAS; SEQ_GAP keeps liveness but not caps/seq (hazard #1). ExpireStaleClients flips ALIVE->EXPIRED and keeps the row, cascading block + external-KV cleanup atomically (hazard #3). - RegisterExternalKvIfAlive fuses the alive-check with the write (TOCTOU fix). - MatchExternalKv(count_as_hit=true) is the one formerly-shared path that takes the unique lock; hit last_seen is system_clock and feeds GarbageCollectHits. - EnumerateLruForEviction = Option A (full scan + sort + greedy byte budget, no maintained index) so tie-timestamps never drop candidates. Adds the §6a behavioral suite (31 cases) written against IMasterMetadataStore& so it is reused for the Redis backend; all green, concurrency cases clean under ThreadSanitizer. Co-Authored-By: Claude Opus 4.8 --- src/umbp/CMakeLists.txt | 1 + .../in_memory_master_metadata_store.cpp | 628 ++++++++++++++++++ .../master/in_memory_master_metadata_store.h | 186 ++++++ src/umbp/tests/CMakeLists.txt | 15 + .../test_in_memory_master_metadata_store.cpp | 616 +++++++++++++++++ 5 files changed, 1446 insertions(+) create mode 100644 src/umbp/distributed/master/in_memory_master_metadata_store.cpp create mode 100644 src/umbp/include/umbp/distributed/master/in_memory_master_metadata_store.h create mode 100644 src/umbp/tests/test_in_memory_master_metadata_store.cpp diff --git a/src/umbp/CMakeLists.txt b/src/umbp/CMakeLists.txt index 08edd1c43..84087e006 100644 --- a/src/umbp/CMakeLists.txt +++ b/src/umbp/CMakeLists.txt @@ -312,6 +312,7 @@ add_library( distributed/master/external_kv_block_index.cpp distributed/master/client_registry.cpp distributed/master/external_kv_hit_index.cpp + distributed/master/in_memory_master_metadata_store.cpp distributed/master/master_server.cpp distributed/master/master_client.cpp distributed/master/rpc_latency_timer.cpp diff --git a/src/umbp/distributed/master/in_memory_master_metadata_store.cpp b/src/umbp/distributed/master/in_memory_master_metadata_store.cpp new file mode 100644 
index 000000000..a5d90aa4c --- /dev/null +++ b/src/umbp/distributed/master/in_memory_master_metadata_store.cpp @@ -0,0 +1,628 @@ +// Copyright © Advanced Micro Devices, Inc. All rights reserved. +// +// MIT License +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in all +// copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +// SOFTWARE. +#include "umbp/distributed/master/in_memory_master_metadata_store.h" + +#include +#include +#include +#include +#include + +#include "mori/utils/mori_log.hpp" + +namespace mori::umbp { + +namespace { + +// Locate (or insert) the location for (node_id, tier) within a location list. +// Caller MUST hold the unique lock. Returns a pointer into `locations` that's +// stable until the next mutation. 
+// The returned pair is {location, inserted}; `inserted` is true only when a new
+// zero-sized slot had to be appended for (node_id, tier).
+std::pair<Location*, bool> FindOrInsertLocation(std::vector<Location>& locations,
+                                                const std::string& node_id, TierType tier) {
+  for (auto& loc : locations) {
+    if (loc.node_id == node_id && loc.tier == tier) return {&loc, false};
+  }
+  locations.push_back(Location{node_id, /*size=*/0, tier});
+  return {&locations.back(), true};
+}
+
+// True when `locations` still holds at least one location owned by `node_id`,
+// on any tier.
+bool HasLocationForNode(const std::vector<Location>& locations, const std::string& node_id) {
+  return std::any_of(locations.begin(), locations.end(),
+                     [&](const Location& loc) { return loc.node_id == node_id; });
+}
+
+}  // namespace
+
+// =====================================================================
+// Locked helpers
+// =====================================================================
+
+// Applies one node's delta events (ADD / REMOVE / CLEAR_AT_TIER) to entries_
+// and keeps node_to_keys_ in step.
+//
+// Returns the number of locations actually added or removed; a duplicate ADD is
+// logged and counts as zero. Caller MUST hold the unique lock.
+size_t InMemoryMasterMetadataStore::ApplyEventsLocked(const std::string& node_id,
+                                                      const std::vector<KvEvent>& events,
+                                                      std::chrono::system_clock::time_point now) {
+  size_t mutated = 0;
+  for (const auto& ev : events) {
+    if (ev.kind == KvEvent::Kind::CLEAR_AT_TIER) {
+      // Drop every location this node holds on ev.tier, across all keys.
+      for (auto it = entries_.begin(); it != entries_.end();) {
+        auto& locs = it->second.locations;
+        const size_t before = locs.size();
+        locs.erase(std::remove_if(locs.begin(), locs.end(),
+                                  [&](const Location& l) {
+                                    return l.node_id == node_id && l.tier == ev.tier;
+                                  }),
+                   locs.end());
+        const size_t removed = before - locs.size();
+        mutated += removed;
+        // The reverse-index link goes away only once the node has no location
+        // left for this key on any other tier.
+        if (removed != 0 && !HasLocationForNode(it->second.locations, node_id)) {
+          auto rev_it = node_to_keys_.find(node_id);
+          if (rev_it != node_to_keys_.end()) {
+            rev_it->second.erase(it->first);
+            if (rev_it->second.empty()) node_to_keys_.erase(rev_it);
+          }
+        }
+        if (locs.empty()) {
+          it = entries_.erase(it);
+        } else {
+          ++it;
+        }
+      }
+    } else if (ev.kind == KvEvent::Kind::ADD) {
+      auto& entry = entries_[ev.key];
+      if (entry.locations.empty()) {
+        // Freshly created entry: initialise its metrics and access state.
+        entry.metrics.created_at = now;
+        entry.metrics.last_accessed_at = now;
+        entry.metrics.access_count = 0;
+        entry.last_accessed_rep.store(now.time_since_epoch().count(), std::memory_order_release);
+        entry.atomic_access_count.store(0, std::memory_order_relaxed);
+      }
+      auto [loc, inserted] = FindOrInsertLocation(entry.locations, node_id, ev.tier);
+      // Idempotent; must run on duplicate ADDs too.
+      node_to_keys_[node_id].insert(ev.key);
+      if (!inserted) {
+        MORI_UMBP_WARN(
+            "[MetadataStore] duplicate ADD for key='{}' node={} tier={} old_size={} "
+            "new_size={}; keeping existing location",
+            ev.key, node_id, TierTypeName(ev.tier), loc->size, ev.size);
+      } else {
+        loc->size = ev.size;
+        ++mutated;
+      }
+    } else {  // REMOVE
+      auto it = entries_.find(ev.key);
+      if (it == entries_.end()) continue;
+      auto& locs = it->second.locations;
+      const size_t before = locs.size();
+      locs.erase(std::remove_if(
+                     locs.begin(), locs.end(),
+                     [&](const Location& l) { return l.node_id == node_id && l.tier == ev.tier; }),
+                 locs.end());
+      if (locs.size() != before) {
+        ++mutated;
+        if (!HasLocationForNode(it->second.locations, node_id)) {
+          auto rev_it = node_to_keys_.find(node_id);
+          if (rev_it != node_to_keys_.end()) {
+            rev_it->second.erase(ev.key);
+            if (rev_it->second.empty()) node_to_keys_.erase(rev_it);
+          }
+        }
+        if (locs.empty()) entries_.erase(it);
+      }
+    }
+  }
+  return mutated;
+}
+
+// Full-sync path: forgets every location `node_id` currently owns, then
+// installs the ADD events in `adds` (other event kinds are ignored).
+// Caller MUST hold the unique lock.
+void InMemoryMasterMetadataStore::ReplaceNodeLocationsLocked(
+    const std::string& node_id, const std::vector<KvEvent>& adds,
+    std::chrono::system_clock::time_point now) {
+  // O(N_node + |adds|) via the reverse index.
+  auto rev_it = node_to_keys_.find(node_id);
+  if (rev_it != node_to_keys_.end()) {
+    auto old_keys = std::move(rev_it->second);
+    node_to_keys_.erase(rev_it);
+    for (const auto& key : old_keys) {
+      auto eit = entries_.find(key);
+      if (eit == entries_.end()) continue;
+      auto& locs = eit->second.locations;
+      locs.erase(std::remove_if(locs.begin(), locs.end(),
+                                [&](const Location& l) { return l.node_id == node_id; }),
+                 locs.end());
+      if (locs.empty()) entries_.erase(eit);
+    }
+  }
+
+  for (const auto& ev : adds) {
+    if (ev.kind != KvEvent::Kind::ADD) continue;
+    auto& entry = entries_[ev.key];
+    if (entry.locations.empty()) {
+      entry.metrics.created_at = now;
+      entry.metrics.last_accessed_at = now;
+      entry.metrics.access_count = 0;
+      entry.last_accessed_rep.store(now.time_since_epoch().count(), std::memory_order_release);
+      entry.atomic_access_count.store(0, std::memory_order_relaxed);
+    }
+    auto [loc, inserted] = FindOrInsertLocation(entry.locations, node_id, ev.tier);
+    (void)inserted;
+    // Unlike the delta path, a repeated (key, tier) here overwrites the size.
+    loc->size = ev.size;
+    node_to_keys_[node_id].insert(ev.key);
+  }
+}
+
+// Removes every block location owned by `node_id` and erases entries that end
+// up with no location. Caller MUST hold the unique lock.
+//
+// Walks the reverse index instead of all of entries_: every code path in this
+// file that adds a location for a node (ADD in ApplyEventsLocked and
+// ReplaceNodeLocationsLocked) also records the key in node_to_keys_, so the
+// index covers every key the node can own.
+void InMemoryMasterMetadataStore::RemoveBlocksByNodeLocked(const std::string& node_id) {
+  auto rev_it = node_to_keys_.find(node_id);
+  if (rev_it == node_to_keys_.end()) return;
+
+  // Detach the key set first so erasing from entries_ cannot touch it.
+  auto owned_keys = std::move(rev_it->second);
+  node_to_keys_.erase(rev_it);
+
+  for (const auto& key : owned_keys) {
+    auto entry_it = entries_.find(key);
+    if (entry_it == entries_.end()) continue;
+    auto& locs = entry_it->second.locations;
+    locs.erase(std::remove_if(locs.begin(), locs.end(),
+                              [&](const Location& l) { return l.node_id == node_id; }),
+               locs.end());
+    if (locs.empty()) entries_.erase(entry_it);
+  }
+}
+
+// Removes `node_id` from every external-KV hash and erases hashes that no node
+// holds any more. Caller MUST hold the unique lock.
+void InMemoryMasterMetadataStore::RemoveExternalKvByNodeLocked(const std::string& node_id) {
+  auto it = external_kv_entries_.begin();
+  while (it != external_kv_entries_.end()) {
+    it->second.erase(node_id);
+    if (it->second.empty()) {
+      it = external_kv_entries_.erase(it);
+    } else {
+      ++it;
+    }
+  }
+}
+
+// True when a record for `node_id` exists and its status is ALIVE.
+// Caller MUST hold mutex_ (shared or unique).
+bool InMemoryMasterMetadataStore::IsClientAliveLocked(const std::string& node_id) const {
+  auto it = clients_.find(node_id);
+  return it != clients_.end() && it->second.status == ClientStatus::ALIVE;
+}
+
+// =====================================================================
+// Cross-store writes
+// =====================================================================
+
+// Registers (or re-registers) a client as ALIVE with last_applied_seq = 0.
+//
+// Returns false, leaving the existing record untouched, when the node is
+// already ALIVE and its last heartbeat is no older than `stale_after`. A record
+// that is EXPIRED, or ALIVE but stale, is overwritten.
+//
+// NOTE(review): re-registration replaces the client record only; block and
+// external-KV locations left by the previous incarnation are not purged here.
+// Presumably the node's first full-sync heartbeat replaces them — confirm.
+bool InMemoryMasterMetadataStore::RegisterClient(const ClientRegistration& registration,
+                                                 std::chrono::system_clock::time_point now,
+                                                 std::chrono::system_clock::duration stale_after) {
+  std::unique_lock<std::shared_mutex> lock(mutex_);
+
+  auto it = clients_.find(registration.node_id);
+  if (it != clients_.end()) {
+    const bool is_stale = (now - it->second.last_heartbeat > stale_after) ||
+                          (it->second.status == ClientStatus::EXPIRED);
+    if (it->second.status == ClientStatus::ALIVE && !is_stale) {
+      MORI_UMBP_WARN("[MetadataStore] Rejecting re-registration for alive node: {}",
+                     registration.node_id);
+      return false;
+    }
+    MORI_UMBP_INFO("[MetadataStore] Re-registering stale/expired node: {}", registration.node_id);
+  }
+
+  ClientRecord record;
+  record.node_id = registration.node_id;
+  record.node_address = registration.node_address;
+  record.status = ClientStatus::ALIVE;
+  record.last_heartbeat = now;
+  record.registered_at = now;
+  record.tier_capacities = registration.tier_capacities;
+  record.peer_address = registration.peer_address;
+  record.engine_desc_bytes = registration.engine_desc_bytes;
+  record.last_applied_seq = 0;
+  record.tags = registration.tags;
+
+  clients_[registration.node_id] = std::move(record);
+
+  // Comma-joined tag list, used only for the log line below.
+  std::string tags_str;
+  for (const auto& t : registration.tags) {
+    if (!tags_str.empty()) tags_str += ',';
+    tags_str += t;
+  }
+  MORI_UMBP_INFO("[MetadataStore] Registered node: {} at {} (peer={}) tags=[{}]",
+                 registration.node_id, registration.node_address, registration.peer_address,
+                 tags_str);
+  return true;
+}
+
+// Erases the client record and cascades to its block locations and external-KV
+// registrations. Unknown node ids are a no-op.
+void InMemoryMasterMetadataStore::UnregisterClient(const std::string& node_id) {
+  std::unique_lock<std::shared_mutex> lock(mutex_);
+  auto it = clients_.find(node_id);
+  if (it == clients_.end()) return;
+  clients_.erase(it);
+  RemoveBlocksByNodeLocked(node_id);
+  RemoveExternalKvByNodeLocked(node_id);
+  MORI_UMBP_INFO("[MetadataStore] Unregistered node: {}", node_id);
+}
+
+// Applies one heartbeat under a single exclusive lock.
+//
+// Returns:
+//   UNKNOWN - no record for `node_id`; nothing is changed.
+//   SEQ_GAP - delta heartbeat whose `seq` is not last_applied_seq + 1; only
+//             liveness is refreshed and the last applied seq is echoed back.
+//   APPLIED - caps and seq are stored and `events` applied (full_sync replaces
+//             the node's locations wholesale, a delta applies incrementally).
+//
+// NOTE(review): both the SEQ_GAP and APPLIED paths set status = ALIVE without
+// looking at the previous status, so an in-sequence delta from an EXPIRED
+// record (whose locations ExpireStaleClients already dropped) revives it
+// without a full sync — confirm this is intended.
+HeartbeatResult InMemoryMasterMetadataStore::ApplyHeartbeat(
+    const std::string& node_id, uint64_t seq, std::chrono::system_clock::time_point now,
+    const std::map<TierType, TierCapacity>& caps, const std::vector<KvEvent>& events,
+    bool is_full_sync) {
+  std::unique_lock<std::shared_mutex> lock(mutex_);
+
+  auto it = clients_.find(node_id);
+  if (it == clients_.end()) {
+    MORI_UMBP_WARN("[MetadataStore] Heartbeat from unknown node: {}", node_id);
+    return HeartbeatResult{HeartbeatResult::UNKNOWN, 0};
+  }
+  auto& record = it->second;
+
+  // Gap check (CAS) on the delta path only — full_sync replaces wholesale and
+  // re-baselines last_applied_seq.
+  if (!is_full_sync && seq != record.last_applied_seq + 1) {
+    // SEQ_GAP: keep the node alive (it IS heartbeating, just mid-recovery) but
+    // do NOT advance caps or last_applied_seq. See hazard #1.
+    MORI_UMBP_WARN(
+        "[MetadataStore] Heartbeat seq gap from {}: got {}, expected {} — requesting full sync",
+        node_id, seq, record.last_applied_seq + 1);
+    record.last_heartbeat = now;
+    record.status = ClientStatus::ALIVE;
+    return HeartbeatResult{HeartbeatResult::SEQ_GAP, record.last_applied_seq};
+  }
+
+  record.last_heartbeat = now;
+  record.status = ClientStatus::ALIVE;
+  record.tier_capacities = caps;
+  record.last_applied_seq = seq;
+
+  if (is_full_sync) {
+    ReplaceNodeLocationsLocked(node_id, events, now);
+  } else {
+    ApplyEventsLocked(node_id, events, now);
+  }
+  return HeartbeatResult{HeartbeatResult::APPLIED, seq};
+}
+
+// Flips every ALIVE client whose last heartbeat is older than `cutoff` to
+// EXPIRED, drops its block and external-KV locations, and returns the ids that
+// were flipped by this call.
+std::vector<std::string> InMemoryMasterMetadataStore::ExpireStaleClients(
+    std::chrono::system_clock::time_point cutoff) {
+  std::unique_lock<std::shared_mutex> lock(mutex_);
+  std::vector<std::string> dead_nodes;
+
+  for (auto& [node_id, record] : clients_) {
+    // Only ALIVE rows can transition to EXPIRED; an already-EXPIRED row is left
+    // alone so re-ticking the reaper is idempotent (its locations are already
+    // gone). EXPIRED rows are KEPT, not erased — see hazard #3.
+    if (record.status == ClientStatus::ALIVE && record.last_heartbeat < cutoff) {
+      MORI_UMBP_WARN("[MetadataStore] Expiring stale client: {}", node_id);
+      record.status = ClientStatus::EXPIRED;
+      dead_nodes.push_back(node_id);
+    }
+  }
+
+  for (const auto& dead_id : dead_nodes) {
+    RemoveBlocksByNodeLocked(dead_id);
+    RemoveExternalKvByNodeLocked(dead_id);
+  }
+  return dead_nodes;
+}
+
+// =====================================================================
+// External-KV writes
+// =====================================================================
+
+// Records that `node_id` holds each hash on `tier`. The liveness check and the
+// insert happen under the same exclusive lock; returns false (and registers
+// nothing) when the node is not ALIVE.
+bool InMemoryMasterMetadataStore::RegisterExternalKvIfAlive(const std::string& node_id,
+                                                            const std::vector<std::string>& hashes,
+                                                            TierType tier) {
+  std::unique_lock<std::shared_mutex> lock(mutex_);
+  if (!IsClientAliveLocked(node_id)) return false;
+  for (const auto& hash : hashes) {
+    external_kv_entries_[hash][node_id].insert(tier);
+  }
+  return true;
+}
+
+// Removes (node_id, tier) from each listed hash, pruning node and hash rows
+// that become empty. Hashes or nodes that are not present are skipped.
+void InMemoryMasterMetadataStore::UnregisterExternalKv(const std::string& node_id,
+                                                       const std::vector<std::string>& hashes,
+                                                       TierType tier) {
+  std::unique_lock<std::shared_mutex> lock(mutex_);
+  for (const auto& hash : hashes) {
+    auto it = external_kv_entries_.find(hash);
+    if (it == external_kv_entries_.end()) continue;
+    auto node_it = it->second.find(node_id);
+    if (node_it == it->second.end()) continue;
+    node_it->second.erase(tier);
+    if (node_it->second.empty()) it->second.erase(node_it);
+    if (it->second.empty()) external_kv_entries_.erase(it);
+  }
+}
+
+// Removes (node_id, tier) from every hash in the index, pruning node and hash
+// rows that become empty. This is a full scan of external_kv_entries_.
+void InMemoryMasterMetadataStore::UnregisterExternalKvByTier(const std::string& node_id,
+                                                             TierType tier) {
+  std::unique_lock<std::shared_mutex> lock(mutex_);
+  auto it = external_kv_entries_.begin();
+  while (it != external_kv_entries_.end()) {
+    auto node_it = it->second.find(node_id);
+    if (node_it != it->second.end()) {
+      node_it->second.erase(tier);
+      if (node_it->second.empty()) it->second.erase(node_it);
+    }
+    if (it->second.empty()) {
+      it = external_kv_entries_.erase(it);
+    } else {
+      ++it;
+    }
+  }
+}
+
+// Drops every hit-count entry whose last_seen is older than `cutoff` and
+// returns how many were dropped.
+std::size_t InMemoryMasterMetadataStore::GarbageCollectHits(
+    std::chrono::system_clock::time_point cutoff) {
+  std::unique_lock<std::shared_mutex> lock(mutex_);
+  std::size_t dropped = 0;
+  auto it = external_kv_hits_.begin();
+  while (it != external_kv_hits_.end()) {
+    if (it->second.last_seen < cutoff) {
+      it = external_kv_hits_.erase(it);
+      ++dropped;
+    } else {
+      ++it;
+    }
+  }
+  return dropped;
+}
+
+// =====================================================================
+// Block reads
+// =====================================================================
+
+// Returns a copy of all locations recorded for `key` (empty when unknown).
+// Does not touch access or lease state.
+std::vector<Location> InMemoryMasterMetadataStore::LookupBlock(const std::string& key) const {
+  std::shared_lock<std::shared_mutex> lock(mutex_);
+  auto it = entries_.find(key);
+  if (it == entries_.end()) return {};
+  return it->second.locations;
+}
+
+// Returns the locations of `key` that are not on an excluded node. When at
+// least one location survives the filter, the entry's access is recorded and a
+// lease of `lease_duration` starting at `now` is granted. Both updates go
+// through the entry's atomics, which is why a shared lock is enough here.
+std::vector<Location> InMemoryMasterMetadataStore::LookupBlockForRouteGet(
+    const std::string& key, const std::unordered_set<std::string>& exclude_nodes,
+    std::chrono::system_clock::time_point now, std::chrono::system_clock::duration lease_duration) {
+  std::shared_lock<std::shared_mutex> lock(mutex_);
+  auto it = entries_.find(key);
+  if (it == entries_.end()) return {};
+
+  std::vector<Location> out;
+  for (const auto& loc : it->second.locations) {
+    if (!exclude_nodes.empty() && exclude_nodes.count(loc.node_id)) continue;
+    out.push_back(loc);
+  }
+  // Nothing routable: leave access/lease state untouched.
+  if (out.empty()) return out;
+  it->second.RecordAccessAtomic(now);
+  it->second.GrantLease(now, lease_duration);
+  return out;
+}
+
+// Batch form of LookupBlockForRouteGet: out[i] corresponds to keys[i] and is
+// empty for unknown keys or keys with no non-excluded location. The whole
+// batch runs under one shared-lock acquisition.
+std::vector<std::vector<Location>> InMemoryMasterMetadataStore::BatchLookupBlockForRouteGet(
+    const std::vector<std::string>& keys, const std::unordered_set<std::string>& exclude_nodes,
+    std::chrono::system_clock::time_point now, std::chrono::system_clock::duration lease_duration) {
+  std::vector<std::vector<Location>> out(keys.size());
+  if (keys.empty()) return out;
+  std::shared_lock<std::shared_mutex> lock(mutex_);
+  for (size_t i = 0; i < keys.size(); ++i) {
+    auto it = entries_.find(keys[i]);
+    if (it == entries_.end()) continue;
+    auto& locs = out[i];
+    for (const auto& loc : it->second.locations) {
+      if (!exclude_nodes.empty() && exclude_nodes.count(loc.node_id)) continue;
+      locs.push_back(loc);
+    }
+    if (locs.empty()) continue;
+    it->second.RecordAccessAtomic(now);
+    it->second.GrantLease(now, lease_duration);
+  }
+  return out;
+}
+
+// results[i] is true when keys[i] has an entry with at least one location.
+std::vector<bool> InMemoryMasterMetadataStore::BatchExistsBlock(
+    const std::vector<std::string>& keys) const {
+  std::vector<bool> results(keys.size(), false);
+  if (keys.empty()) return results;
+  std::shared_lock<std::shared_mutex> lock(mutex_);
+  for (size_t i = 0; i < keys.size(); ++i) {
+    auto it = entries_.find(keys[i]);
+    results[i] = (it != entries_.end()) && !it->second.locations.empty();
+  }
+  return results;
+}
+
+// For each (node, tier) in `bytes_to_free`, returns the least-recently-accessed
+// non-leased locations, oldest first, taken greedily until their sizes reach
+// the requested byte budget. The candidate that crosses the budget is
+// included. Buckets with no candidate are omitted from the result.
+std::map<NodeTierKey, std::vector<EvictionCandidate>>
+InMemoryMasterMetadataStore::EnumerateLruForEviction(
+    const std::map<NodeTierKey, uint64_t>& bytes_to_free,
+    std::chrono::system_clock::time_point now) const {
+  std::map<NodeTierKey, std::vector<EvictionCandidate>> result;
+  if (bytes_to_free.empty()) return result;
+
+  std::shared_lock<std::shared_mutex> lock(mutex_);
+
+  // 1. Full scan: collect non-leased candidates whose (node, tier) is a budget
+  //    key. No maintained LRU index — the scan reads entries_ directly, so it
+  //    is always consistent and tie-timestamp candidates are never dropped
+  //    (§2d, Option A).
+  std::map<NodeTierKey, std::vector<EvictionCandidate>> buckets;
+  for (const auto& [key, entry] : entries_) {
+    if (entry.IsLeased(now)) continue;
+    const auto last_accessed = entry.GetLastAccessed();
+    for (const auto& loc : entry.locations) {
+      NodeTierKey ntk{loc.node_id, loc.tier};
+      if (bytes_to_free.find(ntk) == bytes_to_free.end()) continue;
+      EvictionCandidate c;
+      c.key = key;
+      c.location = loc;
+      c.last_accessed_at = last_accessed;
+      c.size = loc.size;
+      buckets[ntk].push_back(std::move(c));
+    }
+  }
+
+  // 2 + 3. Sort each bucket oldest-first, then greedily take until the byte
+  // budget for that bucket is met.
+  for (auto& [ntk, candidates] : buckets) {
+    std::sort(candidates.begin(), candidates.end(),
+              [](const EvictionCandidate& a, const EvictionCandidate& b) {
+                return a.last_accessed_at < b.last_accessed_at;
+              });
+    const uint64_t budget = bytes_to_free.at(ntk);
+    uint64_t freed = 0;
+    std::vector<EvictionCandidate> selected;
+    for (auto& c : candidates) {
+      if (freed >= budget) break;
+      freed += c.size;
+      selected.push_back(std::move(c));
+    }
+    if (!selected.empty()) result[ntk] = std::move(selected);
+  }
+  return result;
+}
+
+// =====================================================================
+// Client reads
+// =====================================================================
+
+// Returns a copy of the client record, or nullopt when the node is unknown.
+// EXPIRED records are returned as well.
+std::optional<ClientRecord> InMemoryMasterMetadataStore::GetClient(
+    const std::string& node_id) const {
+  std::shared_lock<std::shared_mutex> lock(mutex_);
+  auto it = clients_.find(node_id);
+  if (it == clients_.end()) return std::nullopt;
+  return it->second;
+}
+
+// True when the node has a record whose status is ALIVE.
+bool InMemoryMasterMetadataStore::IsClientAlive(const std::string& node_id) const {
+  std::shared_lock<std::shared_mutex> lock(mutex_);
+  return IsClientAliveLocked(node_id);
+}
+
+// Returns the stored peer address, or nullopt when the node is unknown. The
+// record's status is not consulted.
+std::optional<std::string> InMemoryMasterMetadataStore::GetPeerAddress(
+    const std::string& node_id) const {
+  std::shared_lock<std::shared_mutex> lock(mutex_);
+  auto it = clients_.find(node_id);
+  if (it == clients_.end()) return std::nullopt;
+  return it->second.peer_address;
+}
+
+// Returns copies of all records whose status is ALIVE.
+std::vector<ClientRecord> InMemoryMasterMetadataStore::ListAliveClients() const {
+  std::shared_lock<std::shared_mutex> lock(mutex_);
+  std::vector<ClientRecord> result;
+  for (const auto& [id, record] : clients_) {
+    if (record.status == ClientStatus::ALIVE) result.push_back(record);
+  }
+  return result;
+}
+
+// Number of records whose status is ALIVE.
+std::size_t InMemoryMasterMetadataStore::AliveClientCount() const {
+  std::shared_lock<std::shared_mutex> lock(mutex_);
+  std::size_t count = 0;
+  for (const auto& [id, record] : clients_) {
+    if (record.status == ClientStatus::ALIVE) ++count;
+  }
+  return count;
+}
+
+// Returns the node's tags, or an empty list when the node is unknown.
+std::vector<std::string> InMemoryMasterMetadataStore::GetClientTags(
+    const std::string& node_id) const {
+  std::shared_lock<std::shared_mutex> lock(mutex_);
+  auto it = clients_.find(node_id);
+  if (it == clients_.end()) return {};
+  return it->second.tags;
+}
+
+// =====================================================================
+// External-KV reads
+// =====================================================================
+
+// Groups the requested hashes by the nodes (and tiers) that hold them and
+// returns one NodeMatch per node. With `count_as_hit`, each distinct matched
+// hash also has its hit counter incremented once and its last_seen advanced to
+// `now` (never moved backwards).
+std::vector<NodeMatch> InMemoryMasterMetadataStore::MatchExternalKv(
+    const std::vector<std::string>& hashes, bool count_as_hit,
+    std::chrono::system_clock::time_point now) {
+  // count_as_hit mutates external_kv_hits_, so take the exclusive lock in that
+  // case; a pure read stays shared. This is the one formerly-shared path that
+  // becomes exclusive under the single mutex (§2a), but it's one acquisition
+  // per RPC, not per hash.
+  //
+  // The per-node map is typed off NodeMatch so it can be moved straight into
+  // the result below.
+  std::unordered_map<std::string, decltype(NodeMatch::hashes_by_tier)> acc;
+
+  auto match_into = [&]() {
+    for (const auto& hash : hashes) {
+      auto it = external_kv_entries_.find(hash);
+      if (it == external_kv_entries_.end()) continue;
+      for (const auto& [node_id, tiers] : it->second) {
+        auto& by_tier = acc[node_id];
+        for (TierType tier : tiers) by_tier[tier].push_back(hash);
+      }
+    }
+  };
+
+  if (count_as_hit) {
+    std::unique_lock<std::shared_mutex> lock(mutex_);
+    match_into();
+    // Increment each unique matched hash once and stamp last_seen = now.
+    std::unordered_set<std::string> matched;
+    for (const auto& [node_id, by_tier] : acc) {
+      for (const auto& [tier, hs] : by_tier) {
+        for (const auto& h : hs) matched.insert(h);
+      }
+    }
+    for (const auto& h : matched) {
+      auto& entry = external_kv_hits_[h];
+      ++entry.count;
+      if (entry.last_seen < now) entry.last_seen = now;
+    }
+  } else {
+    std::shared_lock<std::shared_mutex> lock(mutex_);
+    match_into();
+  }
+
+  // The lock is released by now; `acc` is local, so the result is built
+  // outside the critical section.
+  std::vector<NodeMatch> result;
+  result.reserve(acc.size());
+  for (auto& [node_id, by_tier] : acc) {
+    NodeMatch m;
+    m.node_id = node_id;
+    m.hashes_by_tier = std::move(by_tier);
+    result.push_back(std::move(m));
+  }
+  return result;
+}
+
+// Returns {hash, count} for each distinct requested hash that has a hit
+// record, in first-occurrence order. Hashes without a record are omitted.
+std::vector<ExternalKvHitCountEntry> InMemoryMasterMetadataStore::GetExternalKvHitCounts(
+    const std::vector<std::string>& hashes) const {
+  std::shared_lock<std::shared_mutex> lock(mutex_);
+  std::vector<ExternalKvHitCountEntry> out;
+  std::unordered_set<std::string> seen;
+  seen.reserve(hashes.size());
+  for (const auto& hash : hashes) {
+    if (!seen.insert(hash).second) continue;
+    auto it = external_kv_hits_.find(hash);
+    if (it == external_kv_hits_.end()) continue;
+    out.push_back(ExternalKvHitCountEntry{hash, it->second.count});
+  }
+  return out;
+}
+
+// Number of distinct hashes `node_id` is registered for, on any tier. This is
+// a full scan of external_kv_entries_.
+std::size_t InMemoryMasterMetadataStore::GetExternalKvCount(const std::string& node_id) const {
+  std::shared_lock<std::shared_mutex> lock(mutex_);
+  std::size_t count = 0;
+  for (const auto& [hash, nodes] : external_kv_entries_) {
+    (void)hash;
+    if (nodes.count(node_id)) ++count;
+  }
+  return count;
+}
+
+}  // namespace mori::umbp
diff --git a/src/umbp/include/umbp/distributed/master/in_memory_master_metadata_store.h b/src/umbp/include/umbp/distributed/master/in_memory_master_metadata_store.h
new file mode 100644
index 000000000..1b26d8474
--- /dev/null
+++ b/src/umbp/include/umbp/distributed/master/in_memory_master_metadata_store.h
@@ -0,0 +1,186 @@
+// Copyright © Advanced Micro Devices, Inc. All rights reserved.
+// +// MIT License +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in all +// copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +// SOFTWARE. + +// In-process implementation of IMasterMetadataStore. +// +// This is the single-master / unit-test backend: it folds the four former +// state holders (GlobalBlockIndex, ClientRegistry, ExternalKvBlockIndex, +// ExternalKvHitIndex) into one class behind one std::shared_mutex. The logic +// is a near-verbatim lift of those classes; what changes is the locking +// (four independent lock domains collapse to one) and that every timestamp +// crossing the interface boundary is now caller-supplied system_clock — see +// the hazards in master_metadata_store.h. +// +// Per-hash hit counts live in process memory and are lost on restart, exactly +// as the old ExternalKvHitIndex did; crash-durability is a Redis-backend +// concern only. 
+#pragma once
+
+// NOTE(review): the standard-header names and template arguments in this file
+// were reconstructed from how the declarations are used — confirm against the
+// upstream header.
+#include <atomic>
+#include <chrono>
+#include <cstddef>
+#include <cstdint>
+#include <map>
+#include <optional>
+#include <set>
+#include <shared_mutex>
+#include <string>
+#include <unordered_map>
+#include <unordered_set>
+#include <vector>
+
+#include "umbp/distributed/master/master_metadata_store.h"
+#include "umbp/distributed/types.h"
+
+namespace mori::umbp {
+
+// In-process IMasterMetadataStore: block locations, client records,
+// external-KV locations and per-hash hit counts, all guarded by one
+// std::shared_mutex. Non-copyable.
+class InMemoryMasterMetadataStore : public IMasterMetadataStore {
+ public:
+  InMemoryMasterMetadataStore() = default;
+  ~InMemoryMasterMetadataStore() override = default;
+
+  InMemoryMasterMetadataStore(const InMemoryMasterMetadataStore&) = delete;
+  InMemoryMasterMetadataStore& operator=(const InMemoryMasterMetadataStore&) = delete;
+
+  // --- Cross-store writes ---
+  bool RegisterClient(const ClientRegistration& registration,
+                      std::chrono::system_clock::time_point now,
+                      std::chrono::system_clock::duration stale_after) override;
+  void UnregisterClient(const std::string& node_id) override;
+  HeartbeatResult ApplyHeartbeat(const std::string& node_id, uint64_t seq,
+                                 std::chrono::system_clock::time_point now,
+                                 const std::map<TierType, TierCapacity>& caps,
+                                 const std::vector<KvEvent>& events, bool is_full_sync) override;
+  std::vector<std::string> ExpireStaleClients(
+      std::chrono::system_clock::time_point cutoff) override;
+
+  // --- External-KV writes ---
+  bool RegisterExternalKvIfAlive(const std::string& node_id,
+                                 const std::vector<std::string>& hashes,
+                                 TierType tier) override;
+  void UnregisterExternalKv(const std::string& node_id, const std::vector<std::string>& hashes,
+                            TierType tier) override;
+  void UnregisterExternalKvByTier(const std::string& node_id, TierType tier) override;
+  std::size_t GarbageCollectHits(std::chrono::system_clock::time_point cutoff) override;
+
+  // --- Block reads ---
+  std::vector<Location> LookupBlock(const std::string& key) const override;
+  std::vector<Location> LookupBlockForRouteGet(
+      const std::string& key, const std::unordered_set<std::string>& exclude_nodes,
+      std::chrono::system_clock::time_point now,
+      std::chrono::system_clock::duration lease_duration) override;
+  std::vector<std::vector<Location>> BatchLookupBlockForRouteGet(
+      const std::vector<std::string>& keys, const std::unordered_set<std::string>& exclude_nodes,
+      std::chrono::system_clock::time_point now,
+      std::chrono::system_clock::duration lease_duration) override;
+  std::vector<bool> BatchExistsBlock(const std::vector<std::string>& keys) const override;
+  std::map<NodeTierKey, std::vector<EvictionCandidate>> EnumerateLruForEviction(
+      const std::map<NodeTierKey, uint64_t>& bytes_to_free,
+      std::chrono::system_clock::time_point now) const override;
+
+  // --- Client reads ---
+  std::optional<ClientRecord> GetClient(const std::string& node_id) const override;
+  bool IsClientAlive(const std::string& node_id) const override;
+  std::optional<std::string> GetPeerAddress(const std::string& node_id) const override;
+  std::vector<ClientRecord> ListAliveClients() const override;
+  std::size_t AliveClientCount() const override;
+  std::vector<std::string> GetClientTags(const std::string& node_id) const override;
+
+  // --- External-KV reads ---
+  std::vector<NodeMatch> MatchExternalKv(const std::vector<std::string>& hashes,
+                                         bool count_as_hit,
+                                         std::chrono::system_clock::time_point now) override;
+  std::vector<ExternalKvHitCountEntry> GetExternalKvHitCounts(
+      const std::vector<std::string>& hashes) const override;
+  std::size_t GetExternalKvCount(const std::string& node_id) const override;
+
+ private:
+  // One block's locations + LRU/lease metadata. Lifted from
+  // GlobalBlockIndex::BlockEntry, but the lease/access mutators now take a
+  // caller-supplied `now` (system_clock) instead of reading the clock
+  // internally — the value crosses the store boundary (hazard #7). The
+  // lease/access state stays in atomics so the RouteGet path can mutate it
+  // under a shared lock, exactly as today (§2a).
+  struct BlockEntry {
+    std::vector<Location> locations;
+    BlockMetrics metrics;
+
+    // Time points are stored as raw system_clock tick counts.
+    // NOTE(review): atomic value types assumed to be system_clock::rep and
+    // uint64_t — confirm.
+    std::atomic<std::chrono::system_clock::rep> lease_expiry_rep{0};
+    std::atomic<std::chrono::system_clock::rep> last_accessed_rep{0};
+    std::atomic<uint64_t> atomic_access_count{0};
+
+    // Sets the lease to expire at now + duration, overwriting any previous
+    // expiry (including a later one).
+    void GrantLease(std::chrono::system_clock::time_point now,
+                    std::chrono::system_clock::duration duration) {
+      auto expiry = now + duration;
+      lease_expiry_rep.store(expiry.time_since_epoch().count(), std::memory_order_release);
+    }
+
+    // True while the stored expiry is strictly after `now`.
+    bool IsLeased(std::chrono::system_clock::time_point now) const {
+      return lease_expiry_rep.load(std::memory_order_acquire) > now.time_since_epoch().count();
+    }
+
+    // Stamps last-accessed = now and bumps the access counter.
+    void RecordAccessAtomic(std::chrono::system_clock::time_point now) {
+      last_accessed_rep.store(now.time_since_epoch().count(), std::memory_order_release);
+      atomic_access_count.fetch_add(1, std::memory_order_relaxed);
+    }
+
+    // Rebuilds the last-accessed time point from its stored tick count.
+    std::chrono::system_clock::time_point GetLastAccessed() const {
+      auto rep = last_accessed_rep.load(std::memory_order_acquire);
+      return std::chrono::system_clock::time_point(std::chrono::system_clock::duration(rep));
+    }
+  };
+
+  // Per-hash cumulative hit counter (lifted from ExternalKvHitIndex, collapsed
+  // from 256 atomic shards to a single map under mutex_). last_seen is
+  // system_clock now that it crosses the boundary and feeds GarbageCollectHits.
+  struct HitEntry {
+    uint64_t count = 0;
+    std::chrono::system_clock::time_point last_seen;
+  };
+
+  // --- Locked helpers (caller MUST hold the unique lock) ---
+  size_t ApplyEventsLocked(const std::string& node_id, const std::vector<KvEvent>& events,
+                           std::chrono::system_clock::time_point now);
+  void ReplaceNodeLocationsLocked(const std::string& node_id, const std::vector<KvEvent>& adds,
+                                  std::chrono::system_clock::time_point now);
+  void RemoveBlocksByNodeLocked(const std::string& node_id);
+  void RemoveExternalKvByNodeLocked(const std::string& node_id);
+  // Read-only, so this one is also called with only the shared lock held.
+  bool IsClientAliveLocked(const std::string& node_id) const;
+
+  mutable std::shared_mutex mutex_;
+
+  // Block locations (from GlobalBlockIndex).
+  std::unordered_map<std::string, BlockEntry> entries_;
+  // Reverse index node_id -> keys that have a location on that node, so
+  // node-scoped replacement/removal can avoid scanning all of entries_.
+  std::unordered_map<std::string, std::unordered_set<std::string>> node_to_keys_;
+
+  // Client records (from ClientRegistry).
+  std::unordered_map<std::string, ClientRecord> clients_;
+
+  // External-KV locations (from ExternalKvBlockIndex): hash -> node -> tier-set.
+  // Keyed hash-first so MatchExternalKv (the hot RPC path) stays O(1) per hash.
+  // NOTE(review): the tier-set container is assumed to be std::set — confirm.
+  std::unordered_map<std::string, std::unordered_map<std::string, std::set<TierType>>>
+      external_kv_entries_;
+
+  // Per-hash hit counts (from ExternalKvHitIndex).
+  std::unordered_map<std::string, HitEntry> external_kv_hits_;
+};
+
+}  // namespace mori::umbp
diff --git a/src/umbp/tests/CMakeLists.txt b/src/umbp/tests/CMakeLists.txt index 8a04425a0..4c7d3ee57 100644 --- a/src/umbp/tests/CMakeLists.txt +++ b/src/umbp/tests/CMakeLists.txt @@ -40,6 +40,21 @@ target_compile_features(test_master_metadata_store_interface PRIVATE cxx_std_17) gtest_discover_tests(test_master_metadata_store_interface) +# --------------------------------------------------------------------------- +# test_in_memory_master_metadata_store — Phase 2 behavioral suite for the +# InMemoryMasterMetadataStore implementation of IMasterMetadataStore (§6a). +# Written against IMasterMetadataStore& so it is reused for the Redis backend.
+# --------------------------------------------------------------------------- +add_executable(test_in_memory_master_metadata_store + test_in_memory_master_metadata_store.cpp) + +target_link_libraries(test_in_memory_master_metadata_store + PRIVATE umbp_common GTest::gtest_main) + +target_compile_features(test_in_memory_master_metadata_store PRIVATE cxx_std_17) + +gtest_discover_tests(test_in_memory_master_metadata_store) + # --------------------------------------------------------------------------- # test_external_kv_block_index # --------------------------------------------------------------------------- diff --git a/src/umbp/tests/test_in_memory_master_metadata_store.cpp b/src/umbp/tests/test_in_memory_master_metadata_store.cpp new file mode 100644 index 000000000..166c1084b --- /dev/null +++ b/src/umbp/tests/test_in_memory_master_metadata_store.cpp @@ -0,0 +1,616 @@ +// Copyright © Advanced Micro Devices, Inc. All rights reserved. +// +// MIT License +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in all +// copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +// SOFTWARE. + +// Phase 2 behavioral suite for InMemoryMasterMetadataStore (§6a). Written +// against IMasterMetadataStore& so the same cases validate the Redis backend +// later. Tests use injected system_clock times (no real-time sleeps) so they +// are deterministic in CI. +// +// State that the interface does not expose directly — lease_expiry and +// last_accessed_at on block entries — is observed through EnumerateLruForEviction: +// a leased entry is filtered out, and LRU ordering reflects last_accessed_at. + +#include + +#include +#include +#include +#include +#include +#include +#include + +#include "umbp/distributed/master/in_memory_master_metadata_store.h" +#include "umbp/distributed/types.h" + +namespace mori::umbp { +namespace { + +using namespace std::chrono_literals; +using Clock = std::chrono::system_clock; + +// Fixed, NTP-plausible base instant so offsets read cleanly. +const Clock::time_point kT0 = Clock::time_point(std::chrono::hours(24 * 365 * 50)); + +std::map Caps(uint64_t total = 1000, uint64_t available = 1000) { + return {{TierType::HBM, TierCapacity{total, available}}}; +} + +ClientRegistration MakeReg(const std::string& node_id) { + ClientRegistration reg; + reg.node_id = node_id; + reg.node_address = "addr:" + node_id; + reg.peer_address = "peer:" + node_id; + reg.tier_capacities = Caps(); + reg.tags = {"role=test"}; + return reg; +} + +KvEvent Add(const std::string& key, TierType tier, uint64_t size) { + return KvEvent{KvEvent::Kind::ADD, key, tier, size}; +} +KvEvent Remove(const std::string& key, TierType tier) { + return KvEvent{KvEvent::Kind::REMOVE, key, tier, 0}; +} + +// Register `node` ALIVE at `now`. 
+void RegisterAlive(IMasterMetadataStore& store, const std::string& node, + Clock::time_point now = kT0) { + ASSERT_TRUE(store.RegisterClient(MakeReg(node), now, 30s)); +} + +// Apply a delta heartbeat carrying `events` at sequence `seq`. +HeartbeatResult Beat(IMasterMetadataStore& store, const std::string& node, uint64_t seq, + std::vector events, Clock::time_point now) { + return store.ApplyHeartbeat(node, seq, now, Caps(), events, /*is_full_sync=*/false); +} + +// --------------------------------------------------------------------------- +// RegisterClient +// --------------------------------------------------------------------------- + +TEST(InMemoryStore, RegisterNewClient) { + InMemoryMasterMetadataStore store; + EXPECT_TRUE(store.RegisterClient(MakeReg("n1"), kT0, 30s)); + EXPECT_TRUE(store.IsClientAlive("n1")); + EXPECT_EQ(store.AliveClientCount(), 1u); + + auto rec = store.GetClient("n1"); + ASSERT_TRUE(rec.has_value()); + EXPECT_EQ(rec->status, ClientStatus::ALIVE); + EXPECT_EQ(rec->last_applied_seq, 0u); + EXPECT_EQ(rec->peer_address, "peer:n1"); + EXPECT_EQ(rec->last_heartbeat, kT0); + EXPECT_EQ(rec->registered_at, kT0); +} + +TEST(InMemoryStore, RejectReRegisterNonStaleAlive) { + InMemoryMasterMetadataStore store; + ASSERT_TRUE(store.RegisterClient(MakeReg("n1"), kT0, 30s)); + // Still well within stale_after window. + EXPECT_FALSE(store.RegisterClient(MakeReg("n1"), kT0 + 5s, 30s)); +} + +TEST(InMemoryStore, AcceptReRegisterStaleAlive) { + InMemoryMasterMetadataStore store; + ASSERT_TRUE(store.RegisterClient(MakeReg("n1"), kT0, 30s)); + // last_heartbeat is kT0; now - last_heartbeat > stale_after → re-register OK + // even though the reaper has not flipped the status yet (hazard #2). 
+ EXPECT_TRUE(store.RegisterClient(MakeReg("n1"), kT0 + 31s, 30s)); + EXPECT_TRUE(store.IsClientAlive("n1")); +} + +TEST(InMemoryStore, AcceptReRegisterExpired) { + InMemoryMasterMetadataStore store; + ASSERT_TRUE(store.RegisterClient(MakeReg("n1"), kT0, 30s)); + ASSERT_EQ(store.ExpireStaleClients(kT0 + 1s).size(), 1u); + EXPECT_FALSE(store.IsClientAlive("n1")); + // Re-register an EXPIRED record at the same instant: accepted, back to ALIVE. + EXPECT_TRUE(store.RegisterClient(MakeReg("n1"), kT0 + 2s, 30s)); + EXPECT_TRUE(store.IsClientAlive("n1")); +} + +// --------------------------------------------------------------------------- +// UnregisterClient — cascade to block locations AND external KV +// --------------------------------------------------------------------------- + +TEST(InMemoryStore, UnregisterClientCascades) { + InMemoryMasterMetadataStore store; + RegisterAlive(store, "n1"); + ASSERT_EQ(Beat(store, "n1", 1, {Add("k1", TierType::HBM, 10)}, kT0).status, + HeartbeatResult::APPLIED); + ASSERT_TRUE(store.RegisterExternalKvIfAlive("n1", {"h1", "h2"}, TierType::HBM)); + + ASSERT_FALSE(store.LookupBlock("k1").empty()); + ASSERT_EQ(store.GetExternalKvCount("n1"), 2u); + + store.UnregisterClient("n1"); + + EXPECT_FALSE(store.GetClient("n1").has_value()); + EXPECT_TRUE(store.LookupBlock("k1").empty()); + EXPECT_EQ(store.GetExternalKvCount("n1"), 0u); + EXPECT_TRUE(store.MatchExternalKv({"h1", "h2"}, false, kT0).empty()); +} + +TEST(InMemoryStore, UnregisterUnknownIsNoOp) { + InMemoryMasterMetadataStore store; + store.UnregisterClient("ghost"); // must not crash + EXPECT_EQ(store.AliveClientCount(), 0u); +} + +// --------------------------------------------------------------------------- +// ApplyHeartbeat +// --------------------------------------------------------------------------- + +TEST(InMemoryStore, HeartbeatUnknownNode) { + InMemoryMasterMetadataStore store; + auto r = Beat(store, "ghost", 1, {}, kT0); + EXPECT_EQ(r.status, HeartbeatResult::UNKNOWN); 
+} + +TEST(InMemoryStore, HeartbeatCasSequence) { + InMemoryMasterMetadataStore store; + RegisterAlive(store, "n1"); + EXPECT_EQ(Beat(store, "n1", 1, {Add("k1", TierType::HBM, 10)}, kT0).status, + HeartbeatResult::APPLIED); + EXPECT_EQ(Beat(store, "n1", 2, {Add("k2", TierType::HBM, 20)}, kT0).status, + HeartbeatResult::APPLIED); + // Out-of-order seq → SEQ_GAP, acked echoes last applied (2). + auto gap = Beat(store, "n1", 4, {Add("k3", TierType::HBM, 30)}, kT0); + EXPECT_EQ(gap.status, HeartbeatResult::SEQ_GAP); + EXPECT_EQ(gap.acked_seq, 2u); + // k3 must not have been applied. + EXPECT_TRUE(store.LookupBlock("k3").empty()); +} + +TEST(InMemoryStore, SeqGapKeepsLivenessNotCapsOrSeq) { + InMemoryMasterMetadataStore store; + RegisterAlive(store, "n1"); + ASSERT_EQ(Beat(store, "n1", 1, {}, kT0).status, HeartbeatResult::APPLIED); + + // A gap heartbeat at a later time with different caps. + std::map new_caps = {{TierType::HBM, TierCapacity{9999, 9999}}}; + auto gap = store.ApplyHeartbeat("n1", 5, kT0 + 10s, new_caps, {}, /*is_full_sync=*/false); + ASSERT_EQ(gap.status, HeartbeatResult::SEQ_GAP); + + auto rec = store.GetClient("n1"); + ASSERT_TRUE(rec.has_value()); + EXPECT_EQ(rec->status, ClientStatus::ALIVE); // kept alive + EXPECT_EQ(rec->last_heartbeat, kT0 + 10s); // last_heartbeat bumped + EXPECT_EQ(rec->last_applied_seq, 1u); // seq NOT advanced + EXPECT_EQ(rec->tier_capacities.at(TierType::HBM).total_bytes, 1000u); // caps NOT replaced +} + +TEST(InMemoryStore, HeartbeatDeltaAddRemove) { + InMemoryMasterMetadataStore store; + RegisterAlive(store, "n1"); + ASSERT_EQ(Beat(store, "n1", 1, {Add("k1", TierType::HBM, 10)}, kT0).status, + HeartbeatResult::APPLIED); + ASSERT_EQ(store.LookupBlock("k1").size(), 1u); + + ASSERT_EQ(Beat(store, "n1", 2, {Remove("k1", TierType::HBM)}, kT0).status, + HeartbeatResult::APPLIED); + EXPECT_TRUE(store.LookupBlock("k1").empty()); +} + +TEST(InMemoryStore, HeartbeatFullSyncReplaces) { + InMemoryMasterMetadataStore store; + 
RegisterAlive(store, "n1"); + ASSERT_EQ(Beat(store, "n1", 1, {Add("k1", TierType::HBM, 10), Add("k2", TierType::HBM, 20)}, kT0) + .status, + HeartbeatResult::APPLIED); + + // full_sync wipes prior locations and installs only the ADDs carried here. + auto r = store.ApplyHeartbeat("n1", 7, kT0, Caps(), {Add("k3", TierType::HBM, 30)}, + /*is_full_sync=*/true); + EXPECT_EQ(r.status, HeartbeatResult::APPLIED); + EXPECT_EQ(r.acked_seq, 7u); + + EXPECT_TRUE(store.LookupBlock("k1").empty()); + EXPECT_TRUE(store.LookupBlock("k2").empty()); + EXPECT_EQ(store.LookupBlock("k3").size(), 1u); + + auto rec = store.GetClient("n1"); + ASSERT_TRUE(rec.has_value()); + EXPECT_EQ(rec->last_applied_seq, 7u); // full_sync re-baselines the seq +} + +// --------------------------------------------------------------------------- +// ExpireStaleClients — flip to EXPIRED, keep row, cascade, idempotent +// --------------------------------------------------------------------------- + +TEST(InMemoryStore, ExpireStaleFlipsKeepsRowAndCascades) { + InMemoryMasterMetadataStore store; + RegisterAlive(store, "n1", kT0); + RegisterAlive(store, "n2", kT0 + 20s); // fresher + ASSERT_EQ(Beat(store, "n1", 1, {Add("k1", TierType::HBM, 10)}, kT0).status, + HeartbeatResult::APPLIED); + ASSERT_TRUE(store.RegisterExternalKvIfAlive("n1", {"h1"}, TierType::HBM)); + + // Cutoff after n1's heartbeat but before n2's. + auto dead = store.ExpireStaleClients(kT0 + 10s); + ASSERT_EQ(dead.size(), 1u); + EXPECT_EQ(dead[0], "n1"); + + // Row KEPT but EXPIRED (hazard #3). + auto rec = store.GetClient("n1"); + ASSERT_TRUE(rec.has_value()); + EXPECT_EQ(rec->status, ClientStatus::EXPIRED); + EXPECT_FALSE(store.IsClientAlive("n1")); + + // Cascade dropped its blocks and external KV. + EXPECT_TRUE(store.LookupBlock("k1").empty()); + EXPECT_EQ(store.GetExternalKvCount("n1"), 0u); + + // n2 untouched. 
+ EXPECT_TRUE(store.IsClientAlive("n2")); +} + +TEST(InMemoryStore, ExpireStaleIsIdempotent) { + InMemoryMasterMetadataStore store; + RegisterAlive(store, "n1", kT0); + ASSERT_EQ(store.ExpireStaleClients(kT0 + 10s).size(), 1u); + // Re-tick: already EXPIRED, nothing new to report. + EXPECT_TRUE(store.ExpireStaleClients(kT0 + 10s).empty()); +} + +TEST(InMemoryStore, ExpiredRowExcludedFromAliveAccounting) { + InMemoryMasterMetadataStore store; + RegisterAlive(store, "n1", kT0); + RegisterAlive(store, "n2", kT0); + ASSERT_EQ(store.AliveClientCount(), 2u); + ASSERT_EQ(store.ExpireStaleClients(kT0 + 10s).size(), 2u); + + EXPECT_EQ(store.AliveClientCount(), 0u); // not 2, even though rows remain + EXPECT_TRUE(store.ListAliveClients().empty()); + EXPECT_TRUE(store.GetClient("n1").has_value()); // row still present +} + +// --------------------------------------------------------------------------- +// Block reads — lease/access observed via EnumerateLruForEviction +// --------------------------------------------------------------------------- + +// Helper: budget large enough to take everything in one bucket. +std::map Budget(const std::string& node, TierType tier, uint64_t bytes) { + return {{NodeTierKey{node, tier}, bytes}}; +} + +TEST(InMemoryStore, LookupBlockHasNoLeaseOrAccessSideEffects) { + InMemoryMasterMetadataStore store; + RegisterAlive(store, "n1"); + ASSERT_EQ(Beat(store, "n1", 1, {Add("k1", TierType::HBM, 10)}, kT0).status, + HeartbeatResult::APPLIED); + + // Plain read twice. + EXPECT_EQ(store.LookupBlock("k1").size(), 1u); + EXPECT_EQ(store.LookupBlock("k1").size(), 1u); + + // Not leased → still an eviction candidate at kT0. 
+ auto cands = store.EnumerateLruForEviction(Budget("n1", TierType::HBM, 1000), kT0); + ASSERT_EQ(cands.size(), 1u); + EXPECT_EQ(cands.begin()->second.size(), 1u); +} + +TEST(InMemoryStore, LookupBlockForRouteGetGrantsLeaseAndAccess) { + InMemoryMasterMetadataStore store; + RegisterAlive(store, "n1"); + ASSERT_EQ(Beat(store, "n1", 1, {Add("k1", TierType::HBM, 10)}, kT0).status, + HeartbeatResult::APPLIED); + + auto locs = store.LookupBlockForRouteGet("k1", {}, kT0, 60s); + ASSERT_EQ(locs.size(), 1u); + + // Leased until kT0+60s → filtered out of eviction at kT0+10s. + EXPECT_TRUE(store.EnumerateLruForEviction(Budget("n1", TierType::HBM, 1000), kT0 + 10s).empty()); + // After lease expiry it is a candidate again. + EXPECT_FALSE(store.EnumerateLruForEviction(Budget("n1", TierType::HBM, 1000), kT0 + 61s).empty()); +} + +TEST(InMemoryStore, RouteGetExcludeNodesNoLeaseWhenFullyExcluded) { + InMemoryMasterMetadataStore store; + RegisterAlive(store, "n1"); + ASSERT_EQ(Beat(store, "n1", 1, {Add("k1", TierType::HBM, 10)}, kT0).status, + HeartbeatResult::APPLIED); + + std::unordered_set exclude = {"n1"}; + auto locs = store.LookupBlockForRouteGet("k1", exclude, kT0, 60s); + EXPECT_TRUE(locs.empty()); // every location excluded + + // No lease granted (hazard #4) → still an eviction candidate immediately. 
+ EXPECT_FALSE(store.EnumerateLruForEviction(Budget("n1", TierType::HBM, 1000), kT0).empty()); +} + +TEST(InMemoryStore, BatchLookupForRouteGetParallelToKeys) { + InMemoryMasterMetadataStore store; + RegisterAlive(store, "n1"); + ASSERT_EQ(Beat(store, "n1", 1, {Add("k1", TierType::HBM, 10), Add("k3", TierType::HBM, 30)}, kT0) + .status, + HeartbeatResult::APPLIED); + + auto out = store.BatchLookupBlockForRouteGet({"k1", "missing", "k3"}, {}, kT0, 60s); + ASSERT_EQ(out.size(), 3u); + EXPECT_EQ(out[0].size(), 1u); + EXPECT_TRUE(out[1].empty()); // missing key + EXPECT_EQ(out[2].size(), 1u); +} + +TEST(InMemoryStore, BatchExistsBlockNoSideEffects) { + InMemoryMasterMetadataStore store; + RegisterAlive(store, "n1"); + ASSERT_EQ(Beat(store, "n1", 1, {Add("k1", TierType::HBM, 10)}, kT0).status, + HeartbeatResult::APPLIED); + + auto exists = store.BatchExistsBlock({"k1", "missing"}); + ASSERT_EQ(exists.size(), 2u); + EXPECT_TRUE(exists[0]); + EXPECT_FALSE(exists[1]); + + // No lease granted by an existence check. + EXPECT_FALSE(store.EnumerateLruForEviction(Budget("n1", TierType::HBM, 1000), kT0).empty()); +} + +// --------------------------------------------------------------------------- +// EnumerateLruForEviction +// --------------------------------------------------------------------------- + +TEST(InMemoryStore, EvictionLruOrderAndBudget) { + InMemoryMasterMetadataStore store; + RegisterAlive(store, "n1"); + // Three keys, each 100 bytes, accessed at increasing times so LRU order is + // k_old < k_mid < k_new. + ASSERT_EQ(Beat(store, "n1", 1, {Add("k_old", TierType::HBM, 100)}, kT0).status, + HeartbeatResult::APPLIED); + ASSERT_EQ(Beat(store, "n1", 2, {Add("k_mid", TierType::HBM, 100)}, kT0 + 1s).status, + HeartbeatResult::APPLIED); + ASSERT_EQ(Beat(store, "n1", 3, {Add("k_new", TierType::HBM, 100)}, kT0 + 2s).status, + HeartbeatResult::APPLIED); + + // Budget 150 bytes → should take the two oldest (200 bytes ≥ 150 after second). 
+ auto cands = store.EnumerateLruForEviction(Budget("n1", TierType::HBM, 150), kT0 + 10s); + ASSERT_EQ(cands.size(), 1u); + auto& bucket = cands.at(NodeTierKey{"n1", TierType::HBM}); + ASSERT_EQ(bucket.size(), 2u); + EXPECT_EQ(bucket[0].key, "k_old"); // oldest first + EXPECT_EQ(bucket[1].key, "k_mid"); +} + +TEST(InMemoryStore, EvictionSkipsLeased) { + InMemoryMasterMetadataStore store; + RegisterAlive(store, "n1"); + ASSERT_EQ(Beat(store, "n1", 1, {Add("k1", TierType::HBM, 100)}, kT0).status, + HeartbeatResult::APPLIED); + // Lease k1 well past the enumeration time. + store.LookupBlockForRouteGet("k1", {}, kT0, 1h); + EXPECT_TRUE(store.EnumerateLruForEviction(Budget("n1", TierType::HBM, 1000), kT0 + 1s).empty()); +} + +TEST(InMemoryStore, EvictionTieTimestampsAllSurvive) { + // §2d correctness claim: many candidates sharing one identical last_accessed_at + // (the common case, since a batch RouteGet stamps one `now` across all keys) + // must all be enumerable — none dropped by tie collisions. + InMemoryMasterMetadataStore store; + RegisterAlive(store, "n1"); + std::vector adds; + for (int i = 0; i < 50; ++i) { + adds.push_back(Add("k" + std::to_string(i), TierType::HBM, 10)); + } + // All keys created (and thus last_accessed) at the identical instant kT0. + ASSERT_EQ(Beat(store, "n1", 1, adds, kT0).status, HeartbeatResult::APPLIED); + + // Huge budget → take everything; all 50 tied-timestamp candidates must appear. + auto cands = store.EnumerateLruForEviction(Budget("n1", TierType::HBM, 100000), kT0 + 10s); + ASSERT_EQ(cands.size(), 1u); + EXPECT_EQ(cands.at(NodeTierKey{"n1", TierType::HBM}).size(), 50u); +} + +TEST(InMemoryStore, EvictionOnlyBudgetedBuckets) { + InMemoryMasterMetadataStore store; + RegisterAlive(store, "n1"); + ASSERT_EQ(Beat(store, "n1", 1, {Add("kh", TierType::HBM, 10), Add("kd", TierType::DRAM, 10)}, kT0) + .status, + HeartbeatResult::APPLIED); + // Only ask about the HBM bucket. 
+ auto cands = store.EnumerateLruForEviction(Budget("n1", TierType::HBM, 1000), kT0 + 1s); + ASSERT_EQ(cands.size(), 1u); + EXPECT_EQ(cands.begin()->first.tier, TierType::HBM); +} + +// --------------------------------------------------------------------------- +// External KV +// --------------------------------------------------------------------------- + +TEST(InMemoryStore, RegisterExternalKvAliveGate) { + InMemoryMasterMetadataStore store; + // Dead/unknown node → rejected, nothing written. + EXPECT_FALSE(store.RegisterExternalKvIfAlive("ghost", {"h1"}, TierType::HBM)); + EXPECT_TRUE(store.MatchExternalKv({"h1"}, false, kT0).empty()); + + RegisterAlive(store, "n1"); + EXPECT_TRUE(store.RegisterExternalKvIfAlive("n1", {"h1"}, TierType::HBM)); + EXPECT_EQ(store.MatchExternalKv({"h1"}, false, kT0).size(), 1u); +} + +TEST(InMemoryStore, UnregisterExternalKvAndByTier) { + InMemoryMasterMetadataStore store; + RegisterAlive(store, "n1"); + ASSERT_TRUE(store.RegisterExternalKvIfAlive("n1", {"h1"}, TierType::HBM)); + ASSERT_TRUE(store.RegisterExternalKvIfAlive("n1", {"h1"}, TierType::DRAM)); + + // Remove only the HBM tier; DRAM remains. + store.UnregisterExternalKv("n1", {"h1"}, TierType::HBM); + auto m = store.MatchExternalKv({"h1"}, false, kT0); + ASSERT_EQ(m.size(), 1u); + EXPECT_EQ(m[0].hashes_by_tier.count(TierType::HBM), 0u); + EXPECT_EQ(m[0].hashes_by_tier.count(TierType::DRAM), 1u); + + // Whole-tier wipe of DRAM → entry gone. + store.UnregisterExternalKvByTier("n1", TierType::DRAM); + EXPECT_TRUE(store.MatchExternalKv({"h1"}, false, kT0).empty()); +} + +TEST(InMemoryStore, MatchCountsHitsWhenRequested) { + InMemoryMasterMetadataStore store; + RegisterAlive(store, "n1"); + ASSERT_TRUE(store.RegisterExternalKvIfAlive("n1", {"h1", "h2"}, TierType::HBM)); + + // count_as_hit=false: pure read, hit map untouched. 
+ store.MatchExternalKv({"h1", "h2"}, /*count_as_hit=*/false, kT0); + EXPECT_TRUE(store.GetExternalKvHitCounts({"h1", "h2"}).empty()); + + // count_as_hit=true: increments accumulate across calls. + store.MatchExternalKv({"h1", "h2"}, /*count_as_hit=*/true, kT0); + store.MatchExternalKv({"h1"}, /*count_as_hit=*/true, kT0 + 1s); + + auto counts = store.GetExternalKvHitCounts({"h1", "h2"}); + std::map by_hash; + for (const auto& e : counts) by_hash[e.hash] = e.hit_count_total; + EXPECT_EQ(by_hash["h1"], 2u); + EXPECT_EQ(by_hash["h2"], 1u); +} + +TEST(InMemoryStore, MatchedHashCountAcrossTiers) { + // Preserves the NodeMatch::MatchedHashCount coverage from + // test_external_kv_block_index.cpp:57 — one hash mirrored across two tiers + // counts once. + InMemoryMasterMetadataStore store; + RegisterAlive(store, "n1"); + ASSERT_TRUE(store.RegisterExternalKvIfAlive("n1", {"h1"}, TierType::HBM)); + ASSERT_TRUE(store.RegisterExternalKvIfAlive("n1", {"h1"}, TierType::DRAM)); + + auto m = store.MatchExternalKv({"h1"}, false, kT0); + ASSERT_EQ(m.size(), 1u); + EXPECT_EQ(m[0].hashes_by_tier.size(), 2u); // appears in two tier buckets + EXPECT_EQ(m[0].MatchedHashCount(), 1u); // but is one unique hash +} + +TEST(InMemoryStore, GetExternalKvHitCountsDedupesAndSkipsMissing) { + InMemoryMasterMetadataStore store; + RegisterAlive(store, "n1"); + ASSERT_TRUE(store.RegisterExternalKvIfAlive("n1", {"h1"}, TierType::HBM)); + store.MatchExternalKv({"h1"}, true, kT0); + + auto counts = store.GetExternalKvHitCounts({"missing", "h1", "h1"}); + ASSERT_EQ(counts.size(), 1u); + EXPECT_EQ(counts[0].hash, "h1"); + EXPECT_EQ(counts[0].hit_count_total, 1u); +} + +TEST(InMemoryStore, GarbageCollectHitsByLastSeen) { + InMemoryMasterMetadataStore store; + RegisterAlive(store, "n1"); + ASSERT_TRUE(store.RegisterExternalKvIfAlive("n1", {"old", "fresh"}, TierType::HBM)); + store.MatchExternalKv({"old"}, true, kT0); + store.MatchExternalKv({"fresh"}, true, kT0 + 100s); + + // Drop entries last seen 
before kT0+50s → only "old" goes. + EXPECT_EQ(store.GarbageCollectHits(kT0 + 50s), 1u); + + auto counts = store.GetExternalKvHitCounts({"old", "fresh"}); + ASSERT_EQ(counts.size(), 1u); + EXPECT_EQ(counts[0].hash, "fresh"); +} + +// --------------------------------------------------------------------------- +// Concurrency +// --------------------------------------------------------------------------- + +TEST(InMemoryStore, ConcurrentHeartbeatCasExactlyOneApplied) { + InMemoryMasterMetadataStore store; + RegisterAlive(store, "n1"); + + std::atomic<int> applied{0}; + std::atomic<int> gap{0}; + std::atomic<bool> start{false}; + std::vector<std::thread> threads; + for (int t = 0; t < 2; ++t) { + threads.emplace_back([&] { + while (!start.load(std::memory_order_acquire)) std::this_thread::yield(); + // Both race to apply seq=1 (last_applied starts at 0). + auto r = store.ApplyHeartbeat("n1", 1, kT0, Caps(), {}, /*is_full_sync=*/false); + if (r.status == HeartbeatResult::APPLIED) { + applied.fetch_add(1); + } else if (r.status == HeartbeatResult::SEQ_GAP) { + gap.fetch_add(1); + } + }); + } + start.store(true, std::memory_order_release); + for (auto& th : threads) th.join(); + + EXPECT_EQ(applied.load(), 1); + EXPECT_EQ(gap.load(), 1); + EXPECT_EQ(store.GetClient("n1")->last_applied_seq, 1u); +} + +// ThreadSanitizer safety net for collapsing four lock domains into one: a mixed +// read/write workload across the shared/unique split must be race-free. +TEST(InMemoryStore, MixedWorkloadIsRaceFree) { + InMemoryMasterMetadataStore store; + RegisterAlive(store, "n1"); + for (int i = 0; i < 100; ++i) { + store.ApplyHeartbeat("n1", i + 1, kT0, Caps(), + {Add("k" + std::to_string(i), TierType::HBM, 10)}, + /*is_full_sync=*/false); + } + ASSERT_TRUE(store.RegisterExternalKvIfAlive("n1", {"h1", "h2", "h3"}, TierType::HBM)); + + std::atomic<bool> start{false}; + std::vector<std::thread> threads; + + // RouteGet readers (shared-lock path with atomic lease/access mutation).
+ for (int r = 0; r < 4; ++r) { + threads.emplace_back([&] { + while (!start.load(std::memory_order_acquire)) std::this_thread::yield(); + for (int i = 0; i < 500; ++i) { + store.BatchLookupBlockForRouteGet({"k1", "k50", "k99"}, {}, kT0 + std::chrono::seconds(i), + 30s); + store.BatchExistsBlock({"k1", "k2"}); + } + }); + } + // Hit writers (the formerly-shared path that becomes exclusive). + threads.emplace_back([&] { + while (!start.load(std::memory_order_acquire)) std::this_thread::yield(); + for (int i = 0; i < 500; ++i) { + store.MatchExternalKv({"h1", "h2", "h3"}, /*count_as_hit=*/true, + kT0 + std::chrono::seconds(i)); + } + }); + // Eviction-enumeration reader. + threads.emplace_back([&] { + while (!start.load(std::memory_order_acquire)) std::this_thread::yield(); + for (int i = 0; i < 500; ++i) { + store.EnumerateLruForEviction(Budget("n1", TierType::HBM, 50), kT0 + std::chrono::seconds(i)); + } + }); + + start.store(true, std::memory_order_release); + for (auto& th : threads) th.join(); + + // After the storm, hit counts reflect exactly the 500 hit-writer iterations. + auto counts = store.GetExternalKvHitCounts({"h1"}); + ASSERT_EQ(counts.size(), 1u); + EXPECT_EQ(counts[0].hit_count_total, 500u); +} + +} // namespace +} // namespace mori::umbp From 86293fc6372786d6ed9923286bec610c65278a55 Mon Sep 17 00:00:00 2001 From: TianDi101 Date: Mon, 15 Jun 2026 13:29:16 +0000 Subject: [PATCH 4/8] umbp: wire consumers to IMasterMetadataStore (Phase 3) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Replace the direct GlobalBlockIndex / ClientRegistry / ExternalKvBlockIndex / ExternalKvHitIndex references in every master-side consumer with a single IMasterMetadataStore&. The four old classes are still present, so this isolates rewire regressions from the Phase 4 deletion. - Router: holds one store_ instead of index_ + registry_. 
RoutePut / BatchRoutePut read ListAliveClients() and BatchExistsBlock(); BatchRouteGet now passes an explicit system_clock::now() into BatchLookupBlockForRouteGet (the old GlobalBlockIndex read the clock internally — the timestamp now crosses the store boundary, hazard #7). - EvictionManager: constructor takes the store. RunOnce passes its existing per-(node,tier) byte budget (already computed down to the low watermark) straight into EnumerateLruForEviction and drops its own std::sort + greedy budget walk, since the store returns candidates LRU-ordered and budget-trimmed. The standalone overloaded-set is gone — the budget map's keys already identify the overloaded buckets. - MasterServer: the four state members collapse to one unique_ptr (InMemoryMasterMetadataStore), declared before router_/service_ so it outlives their references. gRPC handlers rewired to store_ calls. The Heartbeat handler flattens EventBundle[] into per-bundle single-seq ApplyHeartbeat calls and short-circuits on SEQ_GAP (§3e). The client-expiry reaper moves out of ClientRegistry into MasterServer (per-tick store_->ExpireStaleClients(cutoff)); the hit-index GC tick becomes store_->GarbageCollectHits(now - max_age) — both cutoffs on the system_clock basis so they compare against last_heartbeat / last_seen. - Add UnregisterExternalKvByNode to the interface, in-memory impl, and mock: it backs the live RevokeAllExternalKvBlocksForNode RPC (whole-node external-KV wipe that leaves the client record + block locations intact), which the §1b draft had dropped. Tests: - test_router_dedup migrated to construct InMemoryMasterMetadataStore instead of GlobalBlockIndex + ClientRegistry. - Store suite extended with the previously-uncovered methods: UnregisterExternalKvByNode (distinct from UnregisterClient), GetPeerAddress (ALIVE + EXPIRED + unknown), GetClientTags, and ListAliveClients content; the interface "every method callable" gate now names UnregisterExternalKvByNode. 
- InMemoryMasterMetadataStore behavioral suite green (36 cases) via the standalone g++ build. Co-Authored-By: Claude Opus 4.8 --- .../distributed/master/eviction_manager.cpp | 65 ++--- .../in_memory_master_metadata_store.cpp | 5 + src/umbp/distributed/master/master_server.cpp | 258 +++++++++++------- src/umbp/distributed/routing/router.cpp | 19 +- .../distributed/master/eviction_manager.h | 8 +- .../master/in_memory_master_metadata_store.h | 1 + .../master/master_metadata_store.h | 6 + .../umbp/distributed/master/master_server.h | 27 +- .../include/umbp/distributed/routing/router.h | 9 +- src/umbp/tests/CMakeLists.txt | 2 +- src/umbp/tests/mock_master_metadata_store.h | 1 + .../test_in_memory_master_metadata_store.cpp | 77 ++++++ .../test_master_metadata_store_interface.cpp | 1 + src/umbp/tests/test_router_dedup.cpp | 63 +++-- 14 files changed, 361 insertions(+), 181 deletions(-) diff --git a/src/umbp/distributed/master/eviction_manager.cpp b/src/umbp/distributed/master/eviction_manager.cpp index 0a994cd0d..d877da34f 100644 --- a/src/umbp/distributed/master/eviction_manager.cpp +++ b/src/umbp/distributed/master/eviction_manager.cpp @@ -21,23 +21,23 @@ // SOFTWARE. 
#include "umbp/distributed/master/eviction_manager.h" -#include +#include #include -#include +#include #include #include #include #include #include "mori/utils/mori_log.hpp" -#include "umbp/distributed/master/client_registry.h" -#include "umbp/distributed/master/global_block_index.h" +#include "umbp/distributed/master/master_metadata_store.h" +#include "umbp/distributed/types.h" namespace mori::umbp { -EvictionManager::EvictionManager(GlobalBlockIndex& index, ClientRegistry& registry, - const EvictionConfig& config, EvictKeyDispatcher* dispatcher) - : index_(index), registry_(registry), config_(config), dispatcher_(dispatcher) {} +EvictionManager::EvictionManager(IMasterMetadataStore& store, const EvictionConfig& config, + EvictKeyDispatcher* dispatcher) + : store_(store), config_(config), dispatcher_(dispatcher) {} EvictionManager::~EvictionManager() { Stop(); } @@ -74,11 +74,15 @@ void EvictionManager::EvictionLoop() { // This function picks victims and dispatches EvictKey to each peer via the // dispatcher; master state itself is left untouched here. void EvictionManager::RunOnce() { - auto clients = registry_.GetAliveClients(); + auto clients = store_.ListAliveClients(); - using NodeTierKey = GlobalBlockIndex::NodeTierKey; - std::set overloaded_node_tiers; - std::unordered_map> bytes_to_free; + // Per-(node, tier) byte budget down to the LOW watermark. This is the same + // computation the manager did before the store refactor; the only change is + // that the budget map is now passed straight into EnumerateLruForEviction + // (keyed by NodeTierKey) instead of being consumed locally after a separate + // FindEvictionCandidates call. The budget map's keys also identify the + // overloaded buckets, so the old standalone overloaded-set is gone. 
+ std::map bytes_to_free; for (const auto& client : clients) { for (const auto& [tier, cap] : client.tier_capacities) { @@ -93,44 +97,35 @@ void EvictionManager::RunOnce() { if (usage >= config_.high_watermark) { auto target_used = static_cast(static_cast(cap.total_bytes) * config_.low_watermark); - auto to_free = static_cast(used) - static_cast(target_used); - if (to_free > 0) { - overloaded_node_tiers.insert({client.node_id, tier}); - bytes_to_free[client.node_id][tier] += to_free; + if (used > target_used) { + bytes_to_free[{client.node_id, tier}] += used - target_used; } } } } - if (overloaded_node_tiers.empty()) return; - MORI_UMBP_INFO("[EvictionManager] {} overloaded node-tiers detected", - overloaded_node_tiers.size()); + if (bytes_to_free.empty()) return; + MORI_UMBP_INFO("[EvictionManager] {} overloaded node-tiers detected", bytes_to_free.size()); - auto candidates = index_.FindEvictionCandidates(overloaded_node_tiers); - if (candidates.empty()) { + // The store returns candidates already LRU-ordered (oldest first) and already + // trimmed to each bucket's byte budget, so the manager no longer sorts or + // runs its own greedy budget walk. + auto candidates_by_bucket = + store_.EnumerateLruForEviction(bytes_to_free, std::chrono::system_clock::now()); + if (candidates_by_bucket.empty()) { MORI_UMBP_DEBUG("[EvictionManager] No eviction candidates found"); return; } - // Sort by oldest-access first (LRU). Depth-aware tiebreaking went away - // along with master's per-key depth field — peers don't ship depth in - // KvEvent — so a pure LRU sort is what we get. - std::sort(candidates.begin(), candidates.end(), - [](const EvictionCandidate& a, const EvictionCandidate& b) { - return a.last_accessed_at < b.last_accessed_at; - }); - // Group selected victims by node so the eventual EvictKey RPC takes a // single keys[] per peer instead of N round trips. 
std::unordered_map> per_node_keys; size_t selected = 0; - for (const auto& c : candidates) { - auto& tier_budget = bytes_to_free[c.location.node_id]; - auto it = tier_budget.find(c.location.tier); - if (it == tier_budget.end() || it->second <= 0) continue; - per_node_keys[c.location.node_id].push_back(c.key); - it->second -= static_cast(c.size); - ++selected; + for (const auto& [bucket, candidates] : candidates_by_bucket) { + for (const auto& c : candidates) { + per_node_keys[c.location.node_id].push_back(c.key); + ++selected; + } } if (selected == 0) return; diff --git a/src/umbp/distributed/master/in_memory_master_metadata_store.cpp b/src/umbp/distributed/master/in_memory_master_metadata_store.cpp index a5d90aa4c..af978ba4c 100644 --- a/src/umbp/distributed/master/in_memory_master_metadata_store.cpp +++ b/src/umbp/distributed/master/in_memory_master_metadata_store.cpp @@ -367,6 +367,11 @@ void InMemoryMasterMetadataStore::UnregisterExternalKvByTier(const std::string& } } +void InMemoryMasterMetadataStore::UnregisterExternalKvByNode(const std::string& node_id) { + std::unique_lock lock(mutex_); + RemoveExternalKvByNodeLocked(node_id); +} + std::size_t InMemoryMasterMetadataStore::GarbageCollectHits( std::chrono::system_clock::time_point cutoff) { std::unique_lock lock(mutex_); diff --git a/src/umbp/distributed/master/master_server.cpp b/src/umbp/distributed/master/master_server.cpp index 2e3fac410..e65023e98 100644 --- a/src/umbp/distributed/master/master_server.cpp +++ b/src/umbp/distributed/master/master_server.cpp @@ -33,8 +33,8 @@ #include "mori/utils/mori_log.hpp" #include "umbp.grpc.pb.h" #include "umbp/common/env_time.h" -#include "umbp/distributed/master/external_kv_block_index.h" -#include "umbp/distributed/master/external_kv_hit_index.h" +#include "umbp/distributed/master/in_memory_master_metadata_store.h" +#include "umbp/distributed/master/master_metadata_store.h" #include "umbp/distributed/master/master_metrics.h" #include 
"umbp/distributed/routing/router.h" #include "umbp_peer.grpc.pb.h" @@ -71,16 +71,6 @@ uint32_t HitQueryMaxBatch() { return v; } -uint64_t NowNs() { - return static_cast(std::chrono::duration_cast( - std::chrono::system_clock::now().time_since_epoch()) - .count()); -} - -uint64_t ToNs(std::chrono::seconds value) { - return static_cast(std::chrono::duration_cast(value).count()); -} - KvEvent FromProtoEvent(const ::umbp::KvEvent& pe) { KvEvent ev; switch (pe.kind()) { @@ -187,17 +177,9 @@ MasterServerConfig MasterServerConfig::FromEnvironment() { // --------------------------------------------------------------------------- class MasterServer::UMBPMasterServiceImpl final : public ::umbp::UMBPMaster::Service { public: - UMBPMasterServiceImpl(ClientRegistry& registry, GlobalBlockIndex& index, - ExternalKvBlockIndex& external_kv_index, - ExternalKvHitIndex& external_kv_hit_index, Router& router, + UMBPMasterServiceImpl(IMasterMetadataStore& store, Router& router, const ClientRegistryConfig& config, mori::metrics::MetricsServer* metrics) - : registry_(registry), - index_(index), - external_kv_index_(external_kv_index), - external_kv_hit_index_(external_kv_hit_index), - router_(router), - config_(config), - metrics_(metrics) {} + : store_(store), router_(router), config_(config), metrics_(metrics) {} // -------- Client lifecycle -------- @@ -213,13 +195,21 @@ class MasterServer::UMBPMasterServiceImpl final : public ::umbp::UMBPMaster::Ser } const auto& engine_desc_str = request->engine_desc(); - std::vector engine_desc_bytes(engine_desc_str.begin(), engine_desc_str.end()); - - std::vector tags(request->tags().begin(), request->tags().end()); + ClientRegistration registration; + registration.node_id = request->node_id(); + registration.node_address = request->node_address(); + registration.tier_capacities = caps; + registration.peer_address = request->peer_address(); + registration.engine_desc_bytes.assign(engine_desc_str.begin(), engine_desc_str.end()); + 
registration.tags.assign(request->tags().begin(), request->tags().end()); + + // stale_after mirrors ClientRegistry::ExpiryDuration() (heartbeat_ttl × + // max_missed_heartbeats) so a TTL-stale ALIVE record the reaper hasn't yet + // flipped can still be re-registered (hazard #2). + const auto stale_after = config_.heartbeat_ttl * config_.max_missed_heartbeats; const bool registered = - registry_.RegisterClient(request->node_id(), request->node_address(), caps, - request->peer_address(), engine_desc_bytes, tags); + store_.RegisterClient(registration, std::chrono::system_clock::now(), stale_after); if (!registered) { return grpc::Status(grpc::StatusCode::ALREADY_EXISTS, "node is already alive and cannot be re-registered"); @@ -238,7 +228,7 @@ class MasterServer::UMBPMasterServiceImpl final : public ::umbp::UMBPMaster::Ser grpc::Status UnregisterClient(grpc::ServerContext* /*ctx*/, const ::umbp::UnregisterClientRequest* request, ::umbp::UnregisterClientResponse* /*response*/) override { - registry_.UnregisterClient(request->node_id()); + store_.UnregisterClient(request->node_id()); UpdateClientCountMetric(); return grpc::Status::OK; } @@ -265,13 +255,65 @@ class MasterServer::UMBPMasterServiceImpl final : public ::umbp::UMBPMaster::Ser bundles.push_back(std::move(bundle)); } + // §3e heartbeat adapter: the wire protocol ships EventBundle[] (each with + // its own seq) but IMasterMetadataStore::ApplyHeartbeat takes one seq at a + // time. Translate here — the store sees one seq per call, which is exactly + // what the seq-CAS (hazard #1) requires. 
+ const auto now = std::chrono::system_clock::now(); uint64_t acked_seq = 0; bool request_full_sync = false; - auto status = - registry_.Heartbeat(request->node_id(), caps, bundles, request->is_full_sync(), - request->delta_seq_baseline(), &acked_seq, &request_full_sync); + ClientStatus client_status = ClientStatus::ALIVE; + + if (request->is_full_sync()) { + // Full sync replaces this node's locations wholesale and re-baselines + // last_applied_seq to delta_seq_baseline. Flatten every bundle's events + // into one ApplyHeartbeat call (ReplaceNodeLocations keeps only ADDs). + std::vector events; + for (const auto& bundle : bundles) { + for (const auto& ev : bundle.events) events.push_back(ev); + } + auto result = store_.ApplyHeartbeat(request->node_id(), request->delta_seq_baseline(), now, + caps, events, /*is_full_sync=*/true); + if (result.status == HeartbeatResult::UNKNOWN) { + client_status = ClientStatus::UNKNOWN; + } else { + acked_seq = result.acked_seq; + } + } else if (bundles.empty()) { + // Keepalive heartbeat: no events, but liveness must still refresh so the + // reaper doesn't expire an idle-but-alive node. seq=0 deterministically + // hits the SEQ_GAP branch, which bumps last_heartbeat + status←ALIVE + // without advancing last_applied_seq; the gap is ignored (no full-sync + // request) because there is nothing to recover. + auto result = store_.ApplyHeartbeat(request->node_id(), /*seq=*/0, now, caps, + /*events=*/{}, /*is_full_sync=*/false); + if (result.status == HeartbeatResult::UNKNOWN) { + client_status = ClientStatus::UNKNOWN; + } else { + acked_seq = result.acked_seq; + } + } else { + // Delta path: apply bundles in ascending-seq order. A real forward gap + // short-circuits the loop and requests a full sync, leaving earlier + // bundles applied (Risk item 2 — application is per-bundle, not atomic + // across the batch). 
+ for (const auto& bundle : bundles) { + auto result = store_.ApplyHeartbeat(request->node_id(), bundle.seq, now, caps, + bundle.events, /*is_full_sync=*/false); + if (result.status == HeartbeatResult::UNKNOWN) { + client_status = ClientStatus::UNKNOWN; + break; + } + if (result.status == HeartbeatResult::SEQ_GAP) { + acked_seq = result.acked_seq; + request_full_sync = true; + break; + } + acked_seq = result.acked_seq; + } + } - response->set_status(static_cast<::umbp::ClientStatus>(status)); + response->set_status(static_cast<::umbp::ClientStatus>(client_status)); response->set_acked_seq(acked_seq); response->set_request_full_sync(request_full_sync); @@ -279,7 +321,7 @@ class MasterServer::UMBPMasterServiceImpl final : public ::umbp::UMBPMaster::Ser if (metrics_ != nullptr && request->tier_kv_counts_size() > 0) { mori::metrics::MetricsServer::Labels base = {{"node", request->node_id()}}; - for (const auto& tag : registry_.GetClientTags(request->node_id())) { + for (const auto& tag : store_.GetClientTags(request->node_id())) { const auto sep = tag.find('='); if (sep != std::string::npos) { base.push_back({tag.substr(0, sep), tag.substr(sep + 1)}); @@ -430,7 +472,7 @@ class MasterServer::UMBPMasterServiceImpl final : public ::umbp::UMBPMaster::Ser grpc::Status BatchLookup(grpc::ServerContext* /*ctx*/, const ::umbp::BatchLookupRequest* request, ::umbp::BatchLookupResponse* response) override { std::vector keys(request->keys().begin(), request->keys().end()); - auto found = index_.BatchLookupExists(keys); + auto found = store_.BatchExistsBlock(keys); for (bool b : found) response->add_found(b); return grpc::Status::OK; } @@ -448,7 +490,11 @@ class MasterServer::UMBPMasterServiceImpl final : public ::umbp::UMBPMaster::Ser } const TierType tier = static_cast(request->tier()); - if (!registry_.IsClientAlive(request->node_id())) { + std::vector hashes(request->hashes().begin(), request->hashes().end()); + // Alive-check + write are fused into one atomic store call (closes 
the + // TOCTOU gap between the old IsClientAlive check and Register — §2b.5). + const bool applied = store_.RegisterExternalKvIfAlive(request->node_id(), hashes, tier); + if (!applied) { MORI_UMBP_WARN("[Server] ReportExternalKvBlocks rejected: node not alive: {}", request->node_id()); if (metrics_) { @@ -461,14 +507,12 @@ class MasterServer::UMBPMasterServiceImpl final : public ::umbp::UMBPMaster::Ser return grpc::Status::OK; } - std::vector hashes(request->hashes().begin(), request->hashes().end()); - const size_t mutated = external_kv_index_.Register(request->node_id(), hashes, tier); if (metrics_) { const mori::metrics::MetricsServer::Labels labels = {{"node", request->node_id()}, {"tier", TierTypeName(tier)}}; metrics_->addCounter(MORI_UMBP_METRIC_EXT_KV_REPORT_BLOCKS_TOTAL, MORI_UMBP_METRIC_EXT_KV_REPORT_BLOCKS_TOTAL_HELP, labels, - static_cast(mutated)); + static_cast(hashes.size())); metrics_->addCounter( MORI_UMBP_METRIC_EXT_KV_REPORT_TOTAL, MORI_UMBP_METRIC_EXT_KV_REPORT_TOTAL_HELP, {{"node", request->node_id()}, {"tier", TierTypeName(tier)}, {"result", "ok"}}); @@ -488,13 +532,13 @@ class MasterServer::UMBPMasterServiceImpl final : public ::umbp::UMBPMaster::Ser const TierType tier = static_cast(request->tier()); std::vector hashes(request->hashes().begin(), request->hashes().end()); - const size_t mutated = external_kv_index_.Unregister(request->node_id(), hashes, tier); + store_.UnregisterExternalKv(request->node_id(), hashes, tier); if (metrics_) { const mori::metrics::MetricsServer::Labels labels = {{"node", request->node_id()}, {"tier", TierTypeName(tier)}}; metrics_->addCounter(MORI_UMBP_METRIC_EXT_KV_REVOKE_BLOCKS_TOTAL, MORI_UMBP_METRIC_EXT_KV_REVOKE_BLOCKS_TOTAL_HELP, labels, - static_cast(mutated)); + static_cast(hashes.size())); metrics_->addCounter( MORI_UMBP_METRIC_EXT_KV_REVOKE_TOTAL, MORI_UMBP_METRIC_EXT_KV_REVOKE_TOTAL_HELP, {{"node", request->node_id()}, {"tier", TierTypeName(tier)}, {"result", "ok"}}); @@ -510,13 +554,12 @@ class 
MasterServer::UMBPMasterServiceImpl final : public ::umbp::UMBPMaster::Ser } const TierType tier = static_cast(request->tier()); - const size_t mutated = external_kv_index_.UnregisterByNodeAtTier(request->node_id(), tier); + // Whole-tier wipe. UnregisterExternalKvByTier returns void (the store + // interface does not surface a mutated-block count), so the per-block + // counter is no longer emitted for this admin path; the revoke_total + // counter still records the operation. + store_.UnregisterExternalKvByTier(request->node_id(), tier); if (metrics_) { - const mori::metrics::MetricsServer::Labels labels = {{"node", request->node_id()}, - {"tier", TierTypeName(tier)}}; - metrics_->addCounter(MORI_UMBP_METRIC_EXT_KV_REVOKE_BLOCKS_TOTAL, - MORI_UMBP_METRIC_EXT_KV_REVOKE_BLOCKS_TOTAL_HELP, labels, - static_cast(mutated)); metrics_->addCounter( MORI_UMBP_METRIC_EXT_KV_REVOKE_TOTAL, MORI_UMBP_METRIC_EXT_KV_REVOKE_TOTAL_HELP, {{"node", request->node_id()}, {"tier", TierTypeName(tier)}, {"result", "ok"}}); @@ -531,13 +574,10 @@ class MasterServer::UMBPMasterServiceImpl final : public ::umbp::UMBPMaster::Ser return grpc::Status(grpc::StatusCode::INVALID_ARGUMENT, "node_id must not be empty"); } - const size_t mutated = external_kv_index_.UnregisterByNode(request->node_id()); + // All-tier wipe for one node (full-sync recovery path). Returns void, so + // the per-block counter is dropped here as in the per-tier wipe above. 
+ store_.UnregisterExternalKvByNode(request->node_id()); if (metrics_) { - const mori::metrics::MetricsServer::Labels labels = {{"node", request->node_id()}, - {"tier", "ALL"}}; - metrics_->addCounter(MORI_UMBP_METRIC_EXT_KV_REVOKE_BLOCKS_TOTAL, - MORI_UMBP_METRIC_EXT_KV_REVOKE_BLOCKS_TOTAL_HELP, labels, - static_cast(mutated)); metrics_->addCounter(MORI_UMBP_METRIC_EXT_KV_REVOKE_TOTAL, MORI_UMBP_METRIC_EXT_KV_REVOKE_TOTAL_HELP, {{"node", request->node_id()}, {"tier", "ALL"}, {"result", "ok"}}); @@ -549,10 +589,15 @@ class MasterServer::UMBPMasterServiceImpl final : public ::umbp::UMBPMaster::Ser const ::umbp::MatchExternalKvRequest* request, ::umbp::MatchExternalKvResponse* response) override { std::vector hashes(request->hashes().begin(), request->hashes().end()); - auto matches = external_kv_index_.Match(hashes); + // The store fuses the match with the hit-count increment + last_seen stamp + // (when count_as_hit) into one lock acquisition; the handler no longer + // makes a separate IncrementHits call. `now` crosses the store boundary + // (system_clock) and feeds GarbageCollectHits. 
+ auto matches = + store_.MatchExternalKv(hashes, request->count_as_hit(), std::chrono::system_clock::now()); std::unordered_map peer_map; - for (const auto& record : registry_.GetAliveClients()) { + for (const auto& record : store_.ListAliveClients()) { peer_map[record.node_id] = record.peer_address; } for (auto& m : matches) { @@ -567,21 +612,6 @@ class MasterServer::UMBPMasterServiceImpl final : public ::umbp::UMBPMaster::Ser } } - if (request->count_as_hit() && !matches.empty()) { - std::unordered_set matched_hashes; - for (const auto& m : matches) { - for (const auto& [tier, hashes_in_tier] : m.hashes_by_tier) { - for (const auto& hash : hashes_in_tier) matched_hashes.insert(hash); - } - } - if (!matched_hashes.empty()) { - std::vector unique_matched; - unique_matched.reserve(matched_hashes.size()); - for (const auto& hash : matched_hashes) unique_matched.push_back(hash); - external_kv_hit_index_.IncrementHits(unique_matched, NowNs()); - } - } - size_t total_matched = 0; for (const auto& m : matches) total_matched += m.MatchedHashCount(); if (metrics_) { @@ -608,13 +638,11 @@ class MasterServer::UMBPMasterServiceImpl final : public ::umbp::UMBPMaster::Ser } std::vector hashes(request->hashes().begin(), request->hashes().end()); - std::vector> entries; - entries.reserve(hashes.size()); - external_kv_hit_index_.Lookup(hashes, &entries); - for (const auto& [hash, total] : entries) { + auto entries = store_.GetExternalKvHitCounts(hashes); + for (const auto& e : entries) { auto* entry = response->add_entries(); - entry->set_hash(hash); - entry->set_hit_count_total(total); + entry->set_hash(e.hash); + entry->set_hit_count_total(e.hit_count_total); } return grpc::Status::OK; } @@ -625,7 +653,7 @@ class MasterServer::UMBPMasterServiceImpl final : public ::umbp::UMBPMaster::Ser if (!metrics_) return grpc::Status::OK; mori::metrics::MetricsServer::Labels base = {{"node", request->node_id()}}; - for (const auto& tag : registry_.GetClientTags(request->node_id())) { + for 
(const auto& tag : store_.GetClientTags(request->node_id())) { const auto sep = tag.find('='); if (sep != std::string::npos) { base.push_back({tag.substr(0, sep), tag.substr(sep + 1)}); @@ -663,7 +691,7 @@ class MasterServer::UMBPMasterServiceImpl final : public ::umbp::UMBPMaster::Ser void UpdateClientCountMetric() { if (!metrics_) return; metrics_->setGauge(MORI_UMBP_METRIC_CLIENT_COUNT, MORI_UMBP_METRIC_CLIENT_COUNT_HELP, - static_cast(registry_.GetAliveClients().size())); + static_cast(store_.AliveClientCount())); } void UpdateClientCapacityMetrics(const std::string& node_id, @@ -691,10 +719,7 @@ class MasterServer::UMBPMasterServiceImpl final : public ::umbp::UMBPMaster::Ser } } - ClientRegistry& registry_; - GlobalBlockIndex& index_; - ExternalKvBlockIndex& external_kv_index_; - ExternalKvHitIndex& external_kv_hit_index_; + IMasterMetadataStore& store_; Router& router_; ClientRegistryConfig config_; mori::metrics::MetricsServer* metrics_ = nullptr; @@ -705,17 +730,13 @@ class MasterServer::UMBPMasterServiceImpl final : public ::umbp::UMBPMaster::Ser // --------------------------------------------------------------------------- MasterServer::MasterServer(MasterServerConfig config) : config_(std::move(config)), - index_(), - external_kv_index_(), - external_kv_hit_index_(), - registry_(config_.registry_config, index_, &external_kv_index_), - router_(index_, registry_, std::move(config_.get_strategy), std::move(config_.put_strategy)), - service_(std::make_unique(registry_, index_, external_kv_index_, - external_kv_hit_index_, router_, - config_.registry_config, nullptr)), + store_(std::make_unique()), + router_(*store_, std::move(config_.get_strategy), std::move(config_.put_strategy)), + service_(std::make_unique(*store_, router_, config_.registry_config, + nullptr)), peer_stub_pool_(std::make_unique()), - eviction_manager_(std::make_unique( - index_, registry_, config_.eviction_config, peer_stub_pool_.get())) { + eviction_manager_(std::make_unique(*store_, 
config_.eviction_config, + peer_stub_pool_.get())) { router_.SetLeaseDuration(config_.eviction_config.lease_duration); } @@ -733,7 +754,7 @@ void MasterServer::Run() { MORI_UMBP_INFO("[Master] Metrics server listening on port {}", config_.metrics_port); } - registry_.StartReaper(); + StartReaper(); eviction_manager_->Start(); StartHitIndexGc(); @@ -758,7 +779,7 @@ void MasterServer::Shutdown() { MORI_UMBP_INFO("[Master] Shutting down"); server_->Shutdown(deadline); } - registry_.StopReaper(); + StopReaper(); StopHitIndexGc(); } @@ -779,7 +800,7 @@ void MasterServer::StopHitIndexGc() { } void MasterServer::HitIndexGcLoop() { - const uint64_t ttl_ns = ToNs(HitIndexTtl()); + const auto ttl = HitIndexTtl(); while (hit_index_gc_running_) { { std::unique_lock lock(hit_index_gc_cv_mutex_); @@ -788,14 +809,55 @@ void MasterServer::HitIndexGcLoop() { } if (!hit_index_gc_running_) break; - const uint64_t now_ns = NowNs(); - const uint64_t cutoff_ns = now_ns > ttl_ns ? now_ns - ttl_ns : 0; - if (cutoff_ns == 0) continue; - const size_t dropped = external_kv_hit_index_.GarbageCollect(cutoff_ns); + // cutoff is a system_clock time_point now (hazard #7) so it's comparable + // to the last_seen the store stamps in MatchExternalKv(count_as_hit=true). + const auto cutoff = std::chrono::system_clock::now() - ttl; + const size_t dropped = store_->GarbageCollectHits(cutoff); if (dropped > 0) { MORI_UMBP_DEBUG("[Master] External KV hit index GC dropped {} entries", dropped); } } } +// --------------------------------------------------------------------------- +// Client-expiry reaper. Lifted from ClientRegistry — only the schedule lives +// here; the per-tick action is one store_->ExpireStaleClients(cutoff) call. 
+// --------------------------------------------------------------------------- +void MasterServer::StartReaper() { + bool expected = false; + if (!reaper_running_.compare_exchange_strong(expected, true)) return; + reaper_thread_ = std::thread(&MasterServer::ReaperLoop, this); + const auto expiry = + config_.registry_config.heartbeat_ttl * config_.registry_config.max_missed_heartbeats; + MORI_UMBP_INFO("[Reaper] Started (interval={}s, expiry={}s)", + config_.registry_config.reaper_interval.count(), expiry.count()); +} + +void MasterServer::StopReaper() { + bool expected = true; + if (!reaper_running_.compare_exchange_strong(expected, false)) return; + reaper_cv_.notify_one(); + if (reaper_thread_.joinable()) reaper_thread_.join(); + MORI_UMBP_INFO("[Reaper] Stopped"); +} + +void MasterServer::ReaperLoop() { + const auto expiry = + config_.registry_config.heartbeat_ttl * config_.registry_config.max_missed_heartbeats; + while (reaper_running_) { + { + std::unique_lock lock(reaper_cv_mutex_); + reaper_cv_.wait_for(lock, config_.registry_config.reaper_interval, + [this] { return !reaper_running_.load(); }); + } + if (!reaper_running_) break; + + const auto cutoff = std::chrono::system_clock::now() - expiry; + auto expired = store_->ExpireStaleClients(cutoff); + for (const auto& node_id : expired) { + MORI_UMBP_WARN("[Reaper] Expired client: {}", node_id); + } + } +} + } // namespace mori::umbp diff --git a/src/umbp/distributed/routing/router.cpp b/src/umbp/distributed/routing/router.cpp index 26c14693f..c9f85fcb3 100644 --- a/src/umbp/distributed/routing/router.cpp +++ b/src/umbp/distributed/routing/router.cpp @@ -28,10 +28,9 @@ namespace mori::umbp { -Router::Router(GlobalBlockIndex& index, ClientRegistry& registry, - std::unique_ptr get_strategy, +Router::Router(IMasterMetadataStore& store, std::unique_ptr get_strategy, std::unique_ptr put_strategy) - : index_(index), registry_(registry) { + : store_(store) { // Default to tier-priority (HBM > DRAM > SSD): with the 
SSD cold tier live, a // random pick could route a key that also has a DRAM/HBM copy to the slow // SSD. Callers can still inject RandomRouteGetStrategy (or any other) via @@ -55,7 +54,7 @@ std::optional Router::RoutePut( // Master-side dedup lives only in BatchRoutePut (single RoutePut // proto carries no already_exists; PoolClient::Put wraps BatchPut). (void)key; - auto candidates = registry_.GetAliveClients(); + auto candidates = store_.ListAliveClients(); if (candidates.empty()) { MORI_UMBP_DEBUG("[Router] RoutePut from={}: no alive clients", node_id); return std::nullopt; @@ -77,8 +76,8 @@ std::vector> Router::BatchRoutePut( // Single shared_lock for the whole batch: dedup mask + alive snapshot. // Two entries picking the same (node, tier) is fine — peer will sort // out ENOSPC at AllocateSlot. - auto exists_mask = index_.BatchLookupExists(keys); - auto candidates = registry_.GetAliveClients(); + auto exists_mask = store_.BatchExistsBlock(keys); + auto candidates = store_.ListAliveClients(); for (size_t i = 0; i < keys.size(); ++i) { if (i < exists_mask.size() && exists_mask[i]) { results[i] = RoutePutResult{.outcome = RoutePutOutcome::kAlreadyExists}; @@ -98,11 +97,15 @@ std::vector> Router::BatchRouteGet( // Snapshot peer addresses once for the whole batch. Master assumes // the snapshot is stable for the duration of one BatchRouteGet. std::unordered_map node_to_peer; - for (const auto& client : registry_.GetAliveClients()) { + for (const auto& client : store_.ListAliveClients()) { node_to_peer[client.node_id] = client.peer_address; } - auto all_locs = index_.BatchLookupForRouteGet(keys, exclude_nodes, lease_duration_); + // Unlike the old GlobalBlockIndex::BatchLookupForRouteGet (which read the + // clock internally), BatchLookupBlockForRouteGet takes an explicit `now` + // the router supplies — the timestamp now crosses the store boundary. 
+ auto all_locs = store_.BatchLookupBlockForRouteGet( + keys, exclude_nodes, std::chrono::system_clock::now(), lease_duration_); for (size_t i = 0; i < keys.size(); ++i) { auto& locations = all_locs[i]; if (locations.empty()) { diff --git a/src/umbp/include/umbp/distributed/master/eviction_manager.h b/src/umbp/include/umbp/distributed/master/eviction_manager.h index a0b1f1163..096985be9 100644 --- a/src/umbp/include/umbp/distributed/master/eviction_manager.h +++ b/src/umbp/include/umbp/distributed/master/eviction_manager.h @@ -32,8 +32,7 @@ namespace mori::umbp { -class GlobalBlockIndex; -class ClientRegistry; +class IMasterMetadataStore; // Fire-and-forget callback for shipping EvictKey RPCs to a peer. The // EvictionManager calls this once per (node_id, peer_address) group of @@ -57,7 +56,7 @@ class EvictionManager { // logs intent but does not ship EvictKey RPCs (useful for routing- // only tests). Master-server-side construction passes a concrete // MasterPeerStubPool here. - EvictionManager(GlobalBlockIndex& index, ClientRegistry& registry, const EvictionConfig& config, + EvictionManager(IMasterMetadataStore& store, const EvictionConfig& config, EvictKeyDispatcher* dispatcher = nullptr); ~EvictionManager(); @@ -71,8 +70,7 @@ class EvictionManager { void EvictionLoop(); void RunOnce(); - GlobalBlockIndex& index_; - ClientRegistry& registry_; + IMasterMetadataStore& store_; EvictionConfig config_; EvictKeyDispatcher* dispatcher_; std::thread thread_; diff --git a/src/umbp/include/umbp/distributed/master/in_memory_master_metadata_store.h b/src/umbp/include/umbp/distributed/master/in_memory_master_metadata_store.h index 1b26d8474..f0f12bf54 100644 --- a/src/umbp/include/umbp/distributed/master/in_memory_master_metadata_store.h +++ b/src/umbp/include/umbp/distributed/master/in_memory_master_metadata_store.h @@ -79,6 +79,7 @@ class InMemoryMasterMetadataStore : public IMasterMetadataStore { void UnregisterExternalKv(const std::string& node_id, const std::vector& 
hashes, TierType tier) override; void UnregisterExternalKvByTier(const std::string& node_id, TierType tier) override; + void UnregisterExternalKvByNode(const std::string& node_id) override; std::size_t GarbageCollectHits(std::chrono::system_clock::time_point cutoff) override; // --- Block reads --- diff --git a/src/umbp/include/umbp/distributed/master/master_metadata_store.h b/src/umbp/include/umbp/distributed/master/master_metadata_store.h index ef0cff208..32a7e1dce 100644 --- a/src/umbp/include/umbp/distributed/master/master_metadata_store.h +++ b/src/umbp/include/umbp/distributed/master/master_metadata_store.h @@ -324,6 +324,12 @@ class IMasterMetadataStore { // wipe — admin path, not heartbeat). virtual void UnregisterExternalKvByTier(const std::string& node_id, TierType tier) = 0; + // Drop every external-kv entry (all tiers) belonging to `node_id` without + // touching the client record. Backs the live RevokeAllExternalKvBlocksForNode + // RPC, which a peer issues to wipe its external-KV registration before a + // full re-sync. Does NOT check liveness. Idempotent on unknown nodes. + virtual void UnregisterExternalKvByNode(const std::string& node_id) = 0; + // Drop every per-hash hit-count entry whose last_seen < cutoff. // Returns the number of entries dropped. 
Replaces // ExternalKvHitIndex::GarbageCollect; the cutoff is a system_clock diff --git a/src/umbp/include/umbp/distributed/master/master_server.h b/src/umbp/include/umbp/distributed/master/master_server.h index 79eecd3ac..1874a300d 100644 --- a/src/umbp/include/umbp/distributed/master/master_server.h +++ b/src/umbp/include/umbp/distributed/master/master_server.h @@ -34,11 +34,9 @@ #include "mori/metrics/prometheus_metrics_server.hpp" #include "umbp/distributed/config.h" -#include "umbp/distributed/master/client_registry.h" #include "umbp/distributed/master/eviction_manager.h" -#include "umbp/distributed/master/external_kv_block_index.h" -#include "umbp/distributed/master/external_kv_hit_index.h" -#include "umbp/distributed/master/global_block_index.h" +#include "umbp/distributed/master/in_memory_master_metadata_store.h" +#include "umbp/distributed/master/master_metadata_store.h" #include "umbp/distributed/routing/route_get_strategy.h" #include "umbp/distributed/routing/route_put_strategy.h" #include "umbp/distributed/routing/router.h" @@ -63,10 +61,10 @@ class MasterServer { private: MasterServerConfig config_; - GlobalBlockIndex index_; - ExternalKvBlockIndex external_kv_index_; - ExternalKvHitIndex external_kv_hit_index_; - ClientRegistry registry_; + // Single owner of all master metadata state (block locations, client + // records, external-KV locations, hit counts). Declared before router_ and + // service_ so it outlives the references they hold. + std::unique_ptr store_; Router router_; std::unique_ptr metrics_server_; @@ -93,6 +91,19 @@ class MasterServer { std::atomic hit_index_gc_running_{false}; std::mutex hit_index_gc_cv_mutex_; std::condition_variable hit_index_gc_cv_; + + // Client-expiry reaper. 
Formerly owned by ClientRegistry; the schedule + // (timer + cv) lives here now and the per-tick action is a single + // store_->ExpireStaleClients(cutoff) call, where cutoff is on the + // system_clock basis so it's comparable to the records' last_heartbeat. + void StartReaper(); + void StopReaper(); + void ReaperLoop(); + + std::thread reaper_thread_; + std::atomic reaper_running_{false}; + std::mutex reaper_cv_mutex_; + std::condition_variable reaper_cv_; }; } // namespace mori::umbp diff --git a/src/umbp/include/umbp/distributed/routing/router.h b/src/umbp/include/umbp/distributed/routing/router.h index 9ae5ea4c4..2d89431c9 100644 --- a/src/umbp/include/umbp/distributed/routing/router.h +++ b/src/umbp/include/umbp/distributed/routing/router.h @@ -29,8 +29,7 @@ #include #include -#include "umbp/distributed/master/client_registry.h" -#include "umbp/distributed/master/global_block_index.h" +#include "umbp/distributed/master/master_metadata_store.h" #include "umbp/distributed/routing/route_get_strategy.h" #include "umbp/distributed/routing/route_put_strategy.h" @@ -45,8 +44,7 @@ struct RouteGetResolution { class Router { public: - Router(GlobalBlockIndex& index, ClientRegistry& registry, - std::unique_ptr get_strategy = nullptr, + Router(IMasterMetadataStore& store, std::unique_ptr get_strategy = nullptr, std::unique_ptr put_strategy = nullptr); ~Router() = default; @@ -78,8 +76,7 @@ class Router { void SetLeaseDuration(std::chrono::system_clock::duration d) { lease_duration_ = d; } private: - GlobalBlockIndex& index_; - ClientRegistry& registry_; + IMasterMetadataStore& store_; std::unique_ptr get_strategy_; std::unique_ptr put_strategy_; std::chrono::system_clock::duration lease_duration_{std::chrono::seconds{10}}; diff --git a/src/umbp/tests/CMakeLists.txt b/src/umbp/tests/CMakeLists.txt index 4c7d3ee57..e5ed6fe98 100644 --- a/src/umbp/tests/CMakeLists.txt +++ b/src/umbp/tests/CMakeLists.txt @@ -131,7 +131,7 @@ 
target_compile_features(test_global_block_index_events PRIVATE cxx_std_17) gtest_discover_tests(test_global_block_index_events) # --------------------------------------------------------------------------- -# test_router_dedup — master-side BatchRoutePut dedup via GlobalBlockIndex +# test_router_dedup — master-side BatchRoutePut dedup via IMasterMetadataStore # --------------------------------------------------------------------------- add_executable(test_router_dedup test_router_dedup.cpp) diff --git a/src/umbp/tests/mock_master_metadata_store.h b/src/umbp/tests/mock_master_metadata_store.h index 8810c243c..ae62ea4d7 100644 --- a/src/umbp/tests/mock_master_metadata_store.h +++ b/src/umbp/tests/mock_master_metadata_store.h @@ -78,6 +78,7 @@ class MockMasterMetadataStore : public IMasterMetadataStore { (override)); MOCK_METHOD(void, UnregisterExternalKvByTier, (const std::string& node_id, TierType tier), (override)); + MOCK_METHOD(void, UnregisterExternalKvByNode, (const std::string& node_id), (override)); MOCK_METHOD(std::size_t, GarbageCollectHits, (std::chrono::system_clock::time_point cutoff), (override)); diff --git a/src/umbp/tests/test_in_memory_master_metadata_store.cpp b/src/umbp/tests/test_in_memory_master_metadata_store.cpp index 166c1084b..a8ca2f6a2 100644 --- a/src/umbp/tests/test_in_memory_master_metadata_store.cpp +++ b/src/umbp/tests/test_in_memory_master_metadata_store.cpp @@ -529,6 +529,83 @@ TEST(InMemoryStore, GarbageCollectHitsByLastSeen) { EXPECT_EQ(counts[0].hash, "fresh"); } +TEST(InMemoryStore, UnregisterExternalKvByNodeWipesAllTiersOnly) { + // Whole-node external-KV wipe (backs RevokeAllExternalKvBlocksForNode). Unlike + // UnregisterClient, it must NOT touch the client record or block locations. 
+ InMemoryMasterMetadataStore store; + RegisterAlive(store, "n1"); + ASSERT_EQ(Beat(store, "n1", 1, {Add("k1", TierType::HBM, 10)}, kT0).status, + HeartbeatResult::APPLIED); + ASSERT_TRUE(store.RegisterExternalKvIfAlive("n1", {"h1", "h2"}, TierType::HBM)); + ASSERT_TRUE(store.RegisterExternalKvIfAlive("n1", {"h1"}, TierType::DRAM)); + ASSERT_EQ(store.GetExternalKvCount("n1"), 2u); + + store.UnregisterExternalKvByNode("n1"); + + // External KV gone across every tier. + EXPECT_EQ(store.GetExternalKvCount("n1"), 0u); + EXPECT_TRUE(store.MatchExternalKv({"h1", "h2"}, false, kT0).empty()); + + // Client record and block locations untouched (distinguishes from UnregisterClient). + EXPECT_TRUE(store.IsClientAlive("n1")); + EXPECT_EQ(store.LookupBlock("k1").size(), 1u); +} + +TEST(InMemoryStore, UnregisterExternalKvByNodeUnknownIsNoOp) { + InMemoryMasterMetadataStore store; + store.UnregisterExternalKvByNode("ghost"); // must not crash + EXPECT_EQ(store.GetExternalKvCount("ghost"), 0u); +} + +// --------------------------------------------------------------------------- +// Client reads — GetPeerAddress, GetClientTags, ListAliveClients content +// --------------------------------------------------------------------------- + +TEST(InMemoryStore, GetPeerAddressAliveExpiredAndUnknown) { + InMemoryMasterMetadataStore store; + RegisterAlive(store, "n1"); + + // ALIVE → peer surfaced (MakeReg sets peer:). + auto alive = store.GetPeerAddress("n1"); + ASSERT_TRUE(alive.has_value()); + EXPECT_EQ(*alive, "peer:n1"); + + // EXPIRED rows still surface their peer_address (contract: the row is kept). + ASSERT_EQ(store.ExpireStaleClients(kT0 + 10s).size(), 1u); + auto expired = store.GetPeerAddress("n1"); + ASSERT_TRUE(expired.has_value()); + EXPECT_EQ(*expired, "peer:n1"); + + // Unknown node → nullopt. 
+ EXPECT_FALSE(store.GetPeerAddress("ghost").has_value()); +} + +TEST(InMemoryStore, GetClientTagsReturnsRegisteredTagsAndEmptyForUnknown) { + InMemoryMasterMetadataStore store; + RegisterAlive(store, "n1"); // MakeReg sets tags = {"role=test"} + + auto tags = store.GetClientTags("n1"); + ASSERT_EQ(tags.size(), 1u); + EXPECT_EQ(tags[0], "role=test"); + + EXPECT_TRUE(store.GetClientTags("ghost").empty()); +} + +TEST(InMemoryStore, ListAliveClientsReturnsAliveRecordsExcludingExpired) { + InMemoryMasterMetadataStore store; + RegisterAlive(store, "n1", kT0); + RegisterAlive(store, "n2", kT0 + 20s); // fresher, survives the cutoff below + + // Expire only n1. + ASSERT_EQ(store.ExpireStaleClients(kT0 + 10s).size(), 1u); + + auto alive = store.ListAliveClients(); + ASSERT_EQ(alive.size(), 1u); // n1 excluded even though its row still exists + EXPECT_EQ(alive[0].node_id, "n2"); + EXPECT_EQ(alive[0].status, ClientStatus::ALIVE); + EXPECT_EQ(alive[0].peer_address, "peer:n2"); +} + // --------------------------------------------------------------------------- // Concurrency // --------------------------------------------------------------------------- diff --git a/src/umbp/tests/test_master_metadata_store_interface.cpp b/src/umbp/tests/test_master_metadata_store_interface.cpp index 28c83ccdc..ed82a3d2a 100644 --- a/src/umbp/tests/test_master_metadata_store_interface.cpp +++ b/src/umbp/tests/test_master_metadata_store_interface.cpp @@ -105,6 +105,7 @@ TEST(MasterMetadataStoreInterface, EveryMethodIsCallableThroughInterface) { EXPECT_TRUE(store.RegisterExternalKvIfAlive("node-a", {"h0"}, TierType::HBM)); store.UnregisterExternalKv("node-a", {"h0"}, TierType::HBM); store.UnregisterExternalKvByTier("node-a", TierType::HBM); + store.UnregisterExternalKvByNode("node-a"); EXPECT_EQ(store.GarbageCollectHits(now), 0u); // Block reads. 
diff --git a/src/umbp/tests/test_router_dedup.cpp b/src/umbp/tests/test_router_dedup.cpp index 721547839..6de571fd1 100644 --- a/src/umbp/tests/test_router_dedup.cpp +++ b/src/umbp/tests/test_router_dedup.cpp @@ -24,14 +24,14 @@ // with already_exists=true and bypass node selection. #include +#include #include #include #include #include #include -#include "umbp/distributed/master/client_registry.h" -#include "umbp/distributed/master/global_block_index.h" +#include "umbp/distributed/master/in_memory_master_metadata_store.h" #include "umbp/distributed/routing/router.h" #include "umbp/distributed/types.h" @@ -47,19 +47,40 @@ std::map MakeDramCaps(uint64_t total = 8 * kGB) { return caps; } +ClientRegistration MakeRegistration(const std::string& node_id, const std::string& node_address, + const std::string& peer_address) { + ClientRegistration reg; + reg.node_id = node_id; + reg.node_address = node_address; + reg.tier_capacities = MakeDramCaps(); + reg.peer_address = peer_address; + return reg; +} + +// Register `node_id` ALIVE and apply one ADD event for `key` so it has a block +// location in the store. Under the merged store a location can only be created +// through an ApplyHeartbeat from a registered (alive) node — locations no +// longer exist independently of a client record the way the old +// GlobalBlockIndex allowed. +void RegisterWithKey(InMemoryMasterMetadataStore& store, const std::string& node_id, + const std::string& key, std::chrono::system_clock::time_point now) { + ASSERT_TRUE(store.RegisterClient(MakeRegistration(node_id, node_id + ":1", node_id + ":peer"), + now, std::chrono::seconds{30})); + auto hb = store.ApplyHeartbeat(node_id, /*seq=*/1, now, MakeDramCaps(), + {KvEvent{KvEvent::Kind::ADD, key, TierType::DRAM, 4096}}, + /*is_full_sync=*/false); + ASSERT_EQ(hb.status, HeartbeatResult::APPLIED); +} + } // namespace // Indexed keys are marked already_exists; unknown keys still routed. 
TEST(RouterDedup, BatchRoutePutMarksAlreadyExistsForIndexedKey) { - GlobalBlockIndex index; - ClientRegistry registry(ClientRegistryConfig{}, index); - Router router(index, registry); + const auto now = std::chrono::system_clock::now(); + InMemoryMasterMetadataStore store; + Router router(store); - ASSERT_TRUE(registry.RegisterClient("node-a", "node-a:1", MakeDramCaps(), - /*peer_address=*/"node-a:peer")); - ASSERT_EQ( - index.ApplyEvents("node-a", {KvEvent{KvEvent::Kind::ADD, "key-X", TierType::DRAM, 4096}}), - 1u); + RegisterWithKey(store, "node-a", "key-X", now); std::vector keys{"key-X", "key-Y"}; std::vector sizes{4096, 4096}; @@ -77,20 +98,22 @@ TEST(RouterDedup, BatchRoutePutMarksAlreadyExistsForIndexedKey) { EXPECT_EQ(results[1]->node_id, "node-a"); } -// already_exists wins over no-alive-client: caller drops Put even if -// registry is empty (some other node owns the key). -TEST(RouterDedup, BatchRoutePutAlreadyExistsBypassesNoAliveClient) { - GlobalBlockIndex index; - ClientRegistry registry(ClientRegistryConfig{}, index); - Router router(index, registry); +// already_exists wins over an unroutable Put: an existing key is marked +// kAlreadyExists even when no node can accept the write. In the old design +// "no node" meant an empty registry while a foreign node owned the key; under +// the merged store a location can't outlive its alive owner, so the +// unroutable condition is expressed by excluding the only candidate node. The +// property under test is unchanged: dedup wins over node selection. 
+TEST(RouterDedup, BatchRoutePutAlreadyExistsBypassesUnroutablePut) { + const auto now = std::chrono::system_clock::now(); + InMemoryMasterMetadataStore store; + Router router(store); - ASSERT_EQ( - index.ApplyEvents("node-a", {KvEvent{KvEvent::Kind::ADD, "key-X", TierType::DRAM, 4096}}), - 1u); + RegisterWithKey(store, "node-a", "key-X", now); std::vector keys{"key-X", "key-Y"}; std::vector sizes{4096, 4096}; - std::unordered_set excludes; + std::unordered_set excludes{"node-a"}; // no routable target left auto results = router.BatchRoutePut(keys, "requester", sizes, excludes); ASSERT_EQ(results.size(), 2u); From c8f5dab2b47a99c6bf00c697e7165fa3fe4a6174 Mon Sep 17 00:00:00 2001 From: TianDi101 Date: Mon, 15 Jun 2026 14:39:30 +0000 Subject: [PATCH 5/8] fix heartbeat bug in client main --- src/umbp/distributed/bin/client_main.cpp | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/src/umbp/distributed/bin/client_main.cpp b/src/umbp/distributed/bin/client_main.cpp index c9ee7286e..7673441a6 100644 --- a/src/umbp/distributed/bin/client_main.cpp +++ b/src/umbp/distributed/bin/client_main.cpp @@ -99,6 +99,11 @@ int main(int argc, char** argv) { return 1; } + // This demo drives MasterClient directly (it does not go through PoolClient, + // which is what normally honors config.auto_heartbeat on Init). Start the + // heartbeat thread so the master's reaper does not expire us. + client.StartHeartbeat(); + constexpr auto kOperationInterval = std::chrono::seconds(3); uint64_t iteration = 0; From 00652fafd48844d5d5b1382d867bf675150b469c Mon Sep 17 00:00:00 2001 From: TianDi101 Date: Mon, 15 Jun 2026 14:58:12 +0000 Subject: [PATCH 6/8] test(umbp): consolidate C++ tests under tests/cpp/umbp UMBP C++ tests were split across two trees with two CMake wirings and a duplicate GoogleTest FetchContent: the standard tests/cpp/umbp and a second suite under src/umbp/tests (built unconditionally, with its own gtest fetch). 
Move the src/umbp/tests suite into tests/cpp/umbp/distributed/ (master/peer/ index/store/router logic = distributed layer), merge its target definitions into distributed/CMakeLists.txt (keeping gtest_discover_tests registration), and drop the src/umbp/tests subdirectory and its duplicate googletest fetch. The whole UMBP suite now builds under the single BUILD_TESTS + BUILD_UMBP gate. Co-Authored-By: Claude Opus 4.8 --- src/umbp/CMakeLists.txt | 4 +- src/umbp/tests/CMakeLists.txt | 235 ----- tests/cpp/umbp/distributed/CMakeLists.txt | 141 +++ .../master_metadata_store_self_compile.cpp | 27 + .../distributed/mock_master_metadata_store.h | 125 +++ .../umbp/distributed/test_client_registry.cpp | 289 ++++++ .../test_client_registry_external_kv.cpp | 55 ++ .../test_external_kv_block_index.cpp | 103 ++ .../test_external_kv_hit_index.cpp | 116 +++ .../test_global_block_index_events.cpp | 505 ++++++++++ .../test_in_memory_master_metadata_store.cpp | 693 ++++++++++++++ .../test_master_metadata_store_interface.cpp | 133 +++ .../distributed/test_peer_dram_allocator.cpp | 899 ++++++++++++++++++ .../distributed/test_peer_ssd_eviction.cpp | 406 ++++++++ .../distributed/test_peer_ssd_manager.cpp | 233 +++++ .../distributed/test_peer_ssd_read_rpc.cpp | 258 +++++ .../umbp/distributed/test_router_dedup.cpp | 126 +++ .../distributed/test_ssd_copy_pipeline.cpp | 341 +++++++ .../test_ssd_read_lease_gating.cpp | 97 ++ .../umbp/distributed/test_ssd_reliability.cpp | 345 +++++++ .../test_tier_priority_route_get.cpp | 112 +++ 21 files changed, 5007 insertions(+), 236 deletions(-) delete mode 100644 src/umbp/tests/CMakeLists.txt create mode 100644 tests/cpp/umbp/distributed/master_metadata_store_self_compile.cpp create mode 100644 tests/cpp/umbp/distributed/mock_master_metadata_store.h create mode 100644 tests/cpp/umbp/distributed/test_client_registry.cpp create mode 100644 tests/cpp/umbp/distributed/test_client_registry_external_kv.cpp create mode 100644 
tests/cpp/umbp/distributed/test_external_kv_block_index.cpp create mode 100644 tests/cpp/umbp/distributed/test_external_kv_hit_index.cpp create mode 100644 tests/cpp/umbp/distributed/test_global_block_index_events.cpp create mode 100644 tests/cpp/umbp/distributed/test_in_memory_master_metadata_store.cpp create mode 100644 tests/cpp/umbp/distributed/test_master_metadata_store_interface.cpp create mode 100644 tests/cpp/umbp/distributed/test_peer_dram_allocator.cpp create mode 100644 tests/cpp/umbp/distributed/test_peer_ssd_eviction.cpp create mode 100644 tests/cpp/umbp/distributed/test_peer_ssd_manager.cpp create mode 100644 tests/cpp/umbp/distributed/test_peer_ssd_read_rpc.cpp create mode 100644 tests/cpp/umbp/distributed/test_router_dedup.cpp create mode 100644 tests/cpp/umbp/distributed/test_ssd_copy_pipeline.cpp create mode 100644 tests/cpp/umbp/distributed/test_ssd_read_lease_gating.cpp create mode 100644 tests/cpp/umbp/distributed/test_ssd_reliability.cpp create mode 100644 tests/cpp/umbp/distributed/test_tier_priority_route_get.cpp diff --git a/src/umbp/CMakeLists.txt b/src/umbp/CMakeLists.txt index 84087e006..e680050c9 100644 --- a/src/umbp/CMakeLists.txt +++ b/src/umbp/CMakeLists.txt @@ -383,4 +383,6 @@ endif() target_link_libraries(umbp_client PRIVATE umbp_common ${_PROTOBUF_LIBS} ${_GRPCPP_LIB}) -add_subdirectory(tests) +# UMBP unit tests live under tests/cpp/umbp (built when BUILD_TESTS=ON); they +# were previously duplicated here under src/umbp/tests with a second GoogleTest +# fetch. 
diff --git a/src/umbp/tests/CMakeLists.txt b/src/umbp/tests/CMakeLists.txt deleted file mode 100644 index e5ed6fe98..000000000 --- a/src/umbp/tests/CMakeLists.txt +++ /dev/null @@ -1,235 +0,0 @@ -# --------------------------------------------------------------------------- -# UMBP unit tests — requires GTest and umbp_common -# --------------------------------------------------------------------------- -cmake_minimum_required(VERSION 3.14) - -include(FetchContent) - -FetchContent_Declare( - googletest - GIT_REPOSITORY https://github.com/google/googletest.git - GIT_TAG v1.14.0) - -# Prevent GoogleTest from overriding compiler/linker options when built as a -# subproject. -set(gtest_force_shared_crt - ON - CACHE BOOL "" FORCE) -FetchContent_MakeAvailable(googletest) - -enable_testing() - -include(GoogleTest) - -# --------------------------------------------------------------------------- -# test_master_metadata_store_interface — Phase 1 compile/instantiation gate for -# IMasterMetadataStore. Includes an isolated self-compile TU (proves the header -# is self-contained) and a GMock mock instantiation/signature-completeness test. -# --------------------------------------------------------------------------- -add_executable( - test_master_metadata_store_interface test_master_metadata_store_interface.cpp - master_metadata_store_self_compile.cpp) - -target_include_directories(test_master_metadata_store_interface - PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}) - -target_link_libraries(test_master_metadata_store_interface - PRIVATE umbp_common GTest::gmock GTest::gtest_main) - -target_compile_features(test_master_metadata_store_interface PRIVATE cxx_std_17) - -gtest_discover_tests(test_master_metadata_store_interface) - -# --------------------------------------------------------------------------- -# test_in_memory_master_metadata_store — Phase 2 behavioral suite for the -# InMemoryMasterMetadataStore implementation of IMasterMetadataStore (§6a). 
-# Written against IMasterMetadataStore& so it is reused for the Redis backend. -# --------------------------------------------------------------------------- -add_executable(test_in_memory_master_metadata_store - test_in_memory_master_metadata_store.cpp) - -target_link_libraries(test_in_memory_master_metadata_store - PRIVATE umbp_common GTest::gtest_main) - -target_compile_features(test_in_memory_master_metadata_store PRIVATE cxx_std_17) - -gtest_discover_tests(test_in_memory_master_metadata_store) - -# --------------------------------------------------------------------------- -# test_external_kv_block_index -# --------------------------------------------------------------------------- -add_executable(test_external_kv_block_index test_external_kv_block_index.cpp) - -target_link_libraries(test_external_kv_block_index PRIVATE umbp_common - GTest::gtest_main) - -target_compile_features(test_external_kv_block_index PRIVATE cxx_std_17) - -gtest_discover_tests(test_external_kv_block_index) - -# --------------------------------------------------------------------------- -# test_client_registry — membership ledger: register/re-register, capacity -# round-trip, heartbeat status, and the silent-node reaper. 
-# --------------------------------------------------------------------------- -add_executable(test_client_registry test_client_registry.cpp) - -target_link_libraries(test_client_registry PRIVATE umbp_common - GTest::gtest_main) - -target_compile_features(test_client_registry PRIVATE cxx_std_17) - -gtest_discover_tests(test_client_registry) - -# --------------------------------------------------------------------------- -# test_client_registry_external_kv -# --------------------------------------------------------------------------- -add_executable(test_client_registry_external_kv - test_client_registry_external_kv.cpp) - -target_link_libraries(test_client_registry_external_kv - PRIVATE umbp_common GTest::gtest_main) - -target_compile_features(test_client_registry_external_kv PRIVATE cxx_std_17) - -gtest_discover_tests(test_client_registry_external_kv) - -# --------------------------------------------------------------------------- -# test_external_kv_hit_index -# --------------------------------------------------------------------------- -add_executable(test_external_kv_hit_index test_external_kv_hit_index.cpp) - -target_link_libraries(test_external_kv_hit_index PRIVATE umbp_common - GTest::gtest_main) - -target_compile_features(test_external_kv_hit_index PRIVATE cxx_std_17) - -gtest_discover_tests(test_external_kv_hit_index) - -# --------------------------------------------------------------------------- -# test_peer_dram_allocator -# --------------------------------------------------------------------------- -add_executable(test_peer_dram_allocator test_peer_dram_allocator.cpp) - -target_link_libraries(test_peer_dram_allocator PRIVATE umbp_common - GTest::gtest_main) - -target_compile_features(test_peer_dram_allocator PRIVATE cxx_std_17) - -gtest_discover_tests(test_peer_dram_allocator) - -# --------------------------------------------------------------------------- -# test_global_block_index_events -# 
--------------------------------------------------------------------------- -add_executable(test_global_block_index_events - test_global_block_index_events.cpp) - -target_link_libraries(test_global_block_index_events PRIVATE umbp_common - GTest::gtest_main) - -target_compile_features(test_global_block_index_events PRIVATE cxx_std_17) - -gtest_discover_tests(test_global_block_index_events) - -# --------------------------------------------------------------------------- -# test_router_dedup — master-side BatchRoutePut dedup via IMasterMetadataStore -# --------------------------------------------------------------------------- -add_executable(test_router_dedup test_router_dedup.cpp) - -target_link_libraries(test_router_dedup PRIVATE umbp_common GTest::gtest_main) - -target_compile_features(test_router_dedup PRIVATE cxx_std_17) - -gtest_discover_tests(test_router_dedup) - -# --------------------------------------------------------------------------- -# test_peer_ssd_manager — SSD tier ownership + owned-location source -# --------------------------------------------------------------------------- -add_executable(test_peer_ssd_manager test_peer_ssd_manager.cpp) - -target_link_libraries(test_peer_ssd_manager PRIVATE umbp_common - GTest::gtest_main) - -target_compile_features(test_peer_ssd_manager PRIVATE cxx_std_17) - -gtest_discover_tests(test_peer_ssd_manager) - -# --------------------------------------------------------------------------- -# test_peer_ssd_eviction — LRU + watermark eviction + in-flight guard -# --------------------------------------------------------------------------- -add_executable(test_peer_ssd_eviction test_peer_ssd_eviction.cpp) - -target_link_libraries(test_peer_ssd_eviction PRIVATE umbp_common - GTest::gtest_main) - -target_compile_features(test_peer_ssd_eviction PRIVATE cxx_std_17) - -gtest_discover_tests(test_peer_ssd_eviction) - -# --------------------------------------------------------------------------- -# test_ssd_copy_pipeline — 
DramCopyPin + async copy-on-commit pipeline -# --------------------------------------------------------------------------- -add_executable(test_ssd_copy_pipeline test_ssd_copy_pipeline.cpp) - -target_link_libraries(test_ssd_copy_pipeline PRIVATE umbp_common - GTest::gtest_main) - -target_compile_features(test_ssd_copy_pipeline PRIVATE cxx_std_17) - -gtest_discover_tests(test_ssd_copy_pipeline) - -# --------------------------------------------------------------------------- -# test_tier_priority_route_get — RouteGet tier-priority strategy -# --------------------------------------------------------------------------- -add_executable(test_tier_priority_route_get test_tier_priority_route_get.cpp) - -target_link_libraries(test_tier_priority_route_get PRIVATE umbp_common - GTest::gtest_main) - -target_compile_features(test_tier_priority_route_get PRIVATE cxx_std_17) - -gtest_discover_tests(test_tier_priority_route_get) - -# --------------------------------------------------------------------------- -# test_peer_ssd_read_rpc — SSD read RPC over a gRPC loopback (status -# distinctions: OK / NOT_FOUND / NO_SLOT / SIZE_TOO_LARGE + slot lifecycle). -# Needs the gRPC peer service + generated stubs (umbp_core) and protobuf/gRPC. -# --------------------------------------------------------------------------- -add_executable(test_peer_ssd_read_rpc test_peer_ssd_read_rpc.cpp) -if(CMAKE_CXX_COMPILER_ID MATCHES "GNU|Clang") - target_link_options(test_peer_ssd_read_rpc PRIVATE -Wl,--no-as-needed) -endif() - -target_link_libraries( - test_peer_ssd_read_rpc PRIVATE umbp_core umbp_common ${_PROTOBUF_LIB} - ${_GRPCPP_LIB} GTest::gtest_main) - -target_compile_features(test_peer_ssd_read_rpc PRIVATE cxx_std_17) - -gtest_discover_tests(test_peer_ssd_read_rpc) - -# --------------------------------------------------------------------------- -# test_ssd_read_lease_gating — pure reader-side lease gating decision logic -# (ssd_read_lease.h). Header-only under test (no gRPC / RDMA deps). 
-# --------------------------------------------------------------------------- -add_executable(test_ssd_read_lease_gating test_ssd_read_lease_gating.cpp) - -target_link_libraries(test_ssd_read_lease_gating PRIVATE umbp_common - GTest::gtest_main) - -target_compile_features(test_ssd_read_lease_gating PRIVATE cxx_std_17) - -gtest_discover_tests(test_ssd_read_lease_gating) - -# --------------------------------------------------------------------------- -# test_ssd_reliability — cross-component reliability: owned-source DRAM+SSD -# merge, SSD evict -> REMOVE -> master index convergence, tier-priority over the -# real index, crash-restart discard, and observability counters. -# --------------------------------------------------------------------------- -add_executable(test_ssd_reliability test_ssd_reliability.cpp) - -target_link_libraries(test_ssd_reliability PRIVATE umbp_common - GTest::gtest_main) - -target_compile_features(test_ssd_reliability PRIVATE cxx_std_17) - -gtest_discover_tests(test_ssd_reliability) diff --git a/tests/cpp/umbp/distributed/CMakeLists.txt b/tests/cpp/umbp/distributed/CMakeLists.txt index 6e116512a..be147569c 100644 --- a/tests/cpp/umbp/distributed/CMakeLists.txt +++ b/tests/cpp/umbp/distributed/CMakeLists.txt @@ -22,6 +22,10 @@ else() find_library(_TEST_GRPCPP_LIB NAMES grpc++ REQUIRED) endif() +# Enables gtest_discover_tests() for the metadata-store / external-KV / SSD-tier +# suite migrated below from src/umbp/tests. +include(GoogleTest) + # MasterClient lifecycle: destructor budget, heartbeat thread shutdown, etc. add_executable(test_umbp_master_client_lifecycle test_master_client_lifecycle.cpp) @@ -172,3 +176,140 @@ target_link_libraries( bench_umbp_pool_client_batch_get PRIVATE umbp_core umbp_common ${_TEST_PROTOBUF_LIB} ${_TEST_GRPCPP_LIB} gtest_main) + +# =========================================================================== +# Metadata-store / external-KV / SSD-tier suite (migrated from src/umbp/tests). 
+# These link umbp_common (pure logic) except test_peer_ssd_read_rpc, which +# exercises the gRPC peer service and links umbp_core + protobuf/gRPC. Test +# cases are registered individually via gtest_discover_tests. +# =========================================================================== + +# test_master_metadata_store_interface — Phase 1 compile/instantiation gate for +# IMasterMetadataStore. Includes an isolated self-compile TU (proves the header +# is self-contained) and a GMock mock instantiation/signature-completeness test. +add_executable( + test_master_metadata_store_interface test_master_metadata_store_interface.cpp + master_metadata_store_self_compile.cpp) +target_include_directories(test_master_metadata_store_interface + PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}) +target_link_libraries(test_master_metadata_store_interface + PRIVATE umbp_common GTest::gmock GTest::gtest_main) +target_compile_features(test_master_metadata_store_interface PRIVATE cxx_std_17) +gtest_discover_tests(test_master_metadata_store_interface) + +# test_in_memory_master_metadata_store — Phase 2 behavioral suite for the +# InMemoryMasterMetadataStore implementation of IMasterMetadataStore (§6a). 
+add_executable(test_in_memory_master_metadata_store + test_in_memory_master_metadata_store.cpp) +target_link_libraries(test_in_memory_master_metadata_store + PRIVATE umbp_common GTest::gtest_main) +target_compile_features(test_in_memory_master_metadata_store PRIVATE cxx_std_17) +gtest_discover_tests(test_in_memory_master_metadata_store) + +# test_external_kv_block_index +add_executable(test_external_kv_block_index test_external_kv_block_index.cpp) +target_link_libraries(test_external_kv_block_index PRIVATE umbp_common + GTest::gtest_main) +target_compile_features(test_external_kv_block_index PRIVATE cxx_std_17) +gtest_discover_tests(test_external_kv_block_index) + +# test_client_registry — membership ledger: register/re-register, capacity +# round-trip, heartbeat status, and the silent-node reaper. +add_executable(test_client_registry test_client_registry.cpp) +target_link_libraries(test_client_registry PRIVATE umbp_common + GTest::gtest_main) +target_compile_features(test_client_registry PRIVATE cxx_std_17) +gtest_discover_tests(test_client_registry) + +# test_client_registry_external_kv +add_executable(test_client_registry_external_kv + test_client_registry_external_kv.cpp) +target_link_libraries(test_client_registry_external_kv + PRIVATE umbp_common GTest::gtest_main) +target_compile_features(test_client_registry_external_kv PRIVATE cxx_std_17) +gtest_discover_tests(test_client_registry_external_kv) + +# test_external_kv_hit_index +add_executable(test_external_kv_hit_index test_external_kv_hit_index.cpp) +target_link_libraries(test_external_kv_hit_index PRIVATE umbp_common + GTest::gtest_main) +target_compile_features(test_external_kv_hit_index PRIVATE cxx_std_17) +gtest_discover_tests(test_external_kv_hit_index) + +# test_peer_dram_allocator +add_executable(test_peer_dram_allocator test_peer_dram_allocator.cpp) +target_link_libraries(test_peer_dram_allocator PRIVATE umbp_common + GTest::gtest_main) +target_compile_features(test_peer_dram_allocator PRIVATE 
cxx_std_17) +gtest_discover_tests(test_peer_dram_allocator) + +# test_global_block_index_events +add_executable(test_global_block_index_events + test_global_block_index_events.cpp) +target_link_libraries(test_global_block_index_events PRIVATE umbp_common + GTest::gtest_main) +target_compile_features(test_global_block_index_events PRIVATE cxx_std_17) +gtest_discover_tests(test_global_block_index_events) + +# test_router_dedup — master-side BatchRoutePut dedup via IMasterMetadataStore +add_executable(test_router_dedup test_router_dedup.cpp) +target_link_libraries(test_router_dedup PRIVATE umbp_common GTest::gtest_main) +target_compile_features(test_router_dedup PRIVATE cxx_std_17) +gtest_discover_tests(test_router_dedup) + +# test_peer_ssd_manager — SSD tier ownership + owned-location source +add_executable(test_peer_ssd_manager test_peer_ssd_manager.cpp) +target_link_libraries(test_peer_ssd_manager PRIVATE umbp_common + GTest::gtest_main) +target_compile_features(test_peer_ssd_manager PRIVATE cxx_std_17) +gtest_discover_tests(test_peer_ssd_manager) + +# test_peer_ssd_eviction — LRU + watermark eviction + in-flight guard +add_executable(test_peer_ssd_eviction test_peer_ssd_eviction.cpp) +target_link_libraries(test_peer_ssd_eviction PRIVATE umbp_common + GTest::gtest_main) +target_compile_features(test_peer_ssd_eviction PRIVATE cxx_std_17) +gtest_discover_tests(test_peer_ssd_eviction) + +# test_ssd_copy_pipeline — DramCopyPin + async copy-on-commit pipeline +add_executable(test_ssd_copy_pipeline test_ssd_copy_pipeline.cpp) +target_link_libraries(test_ssd_copy_pipeline PRIVATE umbp_common + GTest::gtest_main) +target_compile_features(test_ssd_copy_pipeline PRIVATE cxx_std_17) +gtest_discover_tests(test_ssd_copy_pipeline) + +# test_tier_priority_route_get — RouteGet tier-priority strategy +add_executable(test_tier_priority_route_get test_tier_priority_route_get.cpp) +target_link_libraries(test_tier_priority_route_get PRIVATE umbp_common + GTest::gtest_main) 
+target_compile_features(test_tier_priority_route_get PRIVATE cxx_std_17) +gtest_discover_tests(test_tier_priority_route_get) + +# test_peer_ssd_read_rpc — SSD read RPC over a gRPC loopback. Needs the gRPC +# peer service + generated stubs (umbp_core) and protobuf/gRPC. +add_executable(test_peer_ssd_read_rpc test_peer_ssd_read_rpc.cpp) +if(CMAKE_CXX_COMPILER_ID MATCHES "GNU|Clang") + target_link_options(test_peer_ssd_read_rpc PRIVATE -Wl,--no-as-needed) +endif() +target_link_libraries( + test_peer_ssd_read_rpc PRIVATE umbp_core umbp_common ${_TEST_PROTOBUF_LIB} + ${_TEST_GRPCPP_LIB} GTest::gtest_main) +target_compile_features(test_peer_ssd_read_rpc PRIVATE cxx_std_17) +gtest_discover_tests(test_peer_ssd_read_rpc) + +# test_ssd_read_lease_gating — pure reader-side lease gating decision logic +# (ssd_read_lease.h). Header-only under test (no gRPC / RDMA deps). +add_executable(test_ssd_read_lease_gating test_ssd_read_lease_gating.cpp) +target_link_libraries(test_ssd_read_lease_gating PRIVATE umbp_common + GTest::gtest_main) +target_compile_features(test_ssd_read_lease_gating PRIVATE cxx_std_17) +gtest_discover_tests(test_ssd_read_lease_gating) + +# test_ssd_reliability — cross-component reliability: owned-source DRAM+SSD +# merge, SSD evict -> REMOVE -> master index convergence, tier-priority over the +# real index, crash-restart discard, and observability counters. +add_executable(test_ssd_reliability test_ssd_reliability.cpp) +target_link_libraries(test_ssd_reliability PRIVATE umbp_common + GTest::gtest_main) +target_compile_features(test_ssd_reliability PRIVATE cxx_std_17) +gtest_discover_tests(test_ssd_reliability) diff --git a/tests/cpp/umbp/distributed/master_metadata_store_self_compile.cpp b/tests/cpp/umbp/distributed/master_metadata_store_self_compile.cpp new file mode 100644 index 000000000..e8e016ce8 --- /dev/null +++ b/tests/cpp/umbp/distributed/master_metadata_store_self_compile.cpp @@ -0,0 +1,27 @@ +// Copyright © Advanced Micro Devices, Inc. 
All rights reserved. +// +// MIT License +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in all +// copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +// SOFTWARE. + +// Phase 1 "header self-compiles" gate: this translation unit includes ONLY the +// interface header (which in turn includes types.h). If it compiles, the header +// is self-contained — no missing includes or forward declarations. Deliberately +// has no other includes and no symbols of its own. +#include "umbp/distributed/master/master_metadata_store.h" diff --git a/tests/cpp/umbp/distributed/mock_master_metadata_store.h b/tests/cpp/umbp/distributed/mock_master_metadata_store.h new file mode 100644 index 000000000..ae62ea4d7 --- /dev/null +++ b/tests/cpp/umbp/distributed/mock_master_metadata_store.h @@ -0,0 +1,125 @@ +// Copyright © Advanced Micro Devices, Inc. All rights reserved. 
+// +// MIT License +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in all +// copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +// SOFTWARE. + +// GMock mock for IMasterMetadataStore. +// +// Phase 1 use: instantiation gate. If this type compiles and instantiates, +// every pure-virtual on the interface is overridden with a well-typed +// signature — proving the contract has no orphaned/ill-typed methods. +// +// Reused in Phase 3 (consumer-integration) to assert that each rewired +// consumer (Router / EvictionManager / UMBPMasterServiceImpl handlers) calls +// the right store method with correctly-translated arguments. 
+#pragma once + +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +#include "umbp/distributed/master/master_metadata_store.h" +#include "umbp/distributed/types.h" + +namespace mori::umbp { + +class MockMasterMetadataStore : public IMasterMetadataStore { + public: + // Aliases for types whose commas would otherwise break the MOCK_METHOD macro + // parser (it splits the argument list on top-level commas). + using CapsMap = std::map; + using BudgetMap = std::map; + using LruResult = std::map>; + using LocationBatch = std::vector>; + + // --- Cross-store writes --- + MOCK_METHOD(bool, RegisterClient, + (const ClientRegistration& registration, std::chrono::system_clock::time_point now, + std::chrono::system_clock::duration stale_after), + (override)); + MOCK_METHOD(void, UnregisterClient, (const std::string& node_id), (override)); + MOCK_METHOD(HeartbeatResult, ApplyHeartbeat, + (const std::string& node_id, uint64_t seq, std::chrono::system_clock::time_point now, + const CapsMap& caps, (const std::vector&)events, bool is_full_sync), + (override)); + MOCK_METHOD(std::vector, ExpireStaleClients, + (std::chrono::system_clock::time_point cutoff), (override)); + + // --- External-KV writes --- + MOCK_METHOD(bool, RegisterExternalKvIfAlive, + (const std::string& node_id, (const std::vector&)hashes, TierType tier), + (override)); + MOCK_METHOD(void, UnregisterExternalKv, + (const std::string& node_id, (const std::vector&)hashes, TierType tier), + (override)); + MOCK_METHOD(void, UnregisterExternalKvByTier, (const std::string& node_id, TierType tier), + (override)); + MOCK_METHOD(void, UnregisterExternalKvByNode, (const std::string& node_id), (override)); + MOCK_METHOD(std::size_t, GarbageCollectHits, (std::chrono::system_clock::time_point cutoff), + (override)); + + // --- Block reads --- + MOCK_METHOD(std::vector, LookupBlock, (const std::string& key), (const, override)); + MOCK_METHOD(std::vector, LookupBlockForRouteGet, + (const 
std::string& key, (const std::unordered_set&)exclude_nodes, + std::chrono::system_clock::time_point now, + std::chrono::system_clock::duration lease_duration), + (override)); + MOCK_METHOD(LocationBatch, BatchLookupBlockForRouteGet, + ((const std::vector&)keys, + (const std::unordered_set&)exclude_nodes, + std::chrono::system_clock::time_point now, + std::chrono::system_clock::duration lease_duration), + (override)); + MOCK_METHOD(std::vector, BatchExistsBlock, ((const std::vector&)keys), + (const, override)); + MOCK_METHOD(LruResult, EnumerateLruForEviction, + (const BudgetMap& bytes_to_free, std::chrono::system_clock::time_point now), + (const, override)); + + // --- Client reads --- + MOCK_METHOD(std::optional, GetClient, (const std::string& node_id), + (const, override)); + MOCK_METHOD(bool, IsClientAlive, (const std::string& node_id), (const, override)); + MOCK_METHOD(std::optional, GetPeerAddress, (const std::string& node_id), + (const, override)); + MOCK_METHOD(std::vector, ListAliveClients, (), (const, override)); + MOCK_METHOD(std::size_t, AliveClientCount, (), (const, override)); + MOCK_METHOD(std::vector, GetClientTags, (const std::string& node_id), + (const, override)); + + // --- External-KV reads --- + MOCK_METHOD(std::vector, MatchExternalKv, + ((const std::vector&)hashes, bool count_as_hit, + std::chrono::system_clock::time_point now), + (override)); + MOCK_METHOD(std::vector, GetExternalKvHitCounts, + ((const std::vector&)hashes), (const, override)); + MOCK_METHOD(std::size_t, GetExternalKvCount, (const std::string& node_id), (const, override)); +}; + +} // namespace mori::umbp diff --git a/tests/cpp/umbp/distributed/test_client_registry.cpp b/tests/cpp/umbp/distributed/test_client_registry.cpp new file mode 100644 index 000000000..adb363a30 --- /dev/null +++ b/tests/cpp/umbp/distributed/test_client_registry.cpp @@ -0,0 +1,289 @@ +// Copyright © Advanced Micro Devices, Inc. All rights reserved. 
+// +// MIT License +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in all +// copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +// SOFTWARE. +// +// Membership-ledger unit tests for ClientRegistry: registration / re- +// registration semantics, capacity round-trip, heartbeat status, and the +// background reaper that expires silent nodes. These exercise the registry +// in isolation (no GlobalBlockIndex / RPC), complementing the index- and +// external-kv-focused suites. In the master-as-advisor design the registry +// stores only membership + the capacities a peer last reported, so the +// assertions below check reported values verbatim rather than any allocator- +// derived view. 
+#include + +#include +#include +#include +#include +#include + +#include "umbp/distributed/master/client_registry.h" +#include "umbp/distributed/types.h" + +namespace mori::umbp { +namespace { + +std::map Caps(uint64_t total, uint64_t available) { + return {{TierType::HBM, TierCapacity{total, available}}}; +} + +const ClientRecord* FindClient(const std::vector& clients, const std::string& id) { + for (const auto& c : clients) { + if (c.node_id == id) return &c; + } + return nullptr; +} + +// Drive the current 7-arg Heartbeat with no events — the membership-keepalive +// path the reaper cares about. +ClientStatus Beat(ClientRegistry& registry, const std::string& node_id, + const std::map& caps) { + uint64_t acked = 0; + bool need_full = false; + return registry.Heartbeat(node_id, caps, /*bundles=*/{}, /*is_full_sync=*/false, + /*delta_seq_baseline=*/0, &acked, &need_full); +} + +template +bool WaitUntil(Predicate&& predicate, std::chrono::milliseconds timeout, + std::chrono::milliseconds poll = std::chrono::milliseconds(100)) { + const auto deadline = std::chrono::steady_clock::now() + timeout; + while (std::chrono::steady_clock::now() < deadline) { + if (predicate()) return true; + std::this_thread::sleep_for(poll); + } + return predicate(); +} + +// heartbeat_ttl * max_missed_heartbeats == 1s, so a node ages out ~1s after +// its last heartbeat. reaper_interval keeps the sweep responsive. 
+ClientRegistryConfig FastExpiryConfig() { + ClientRegistryConfig config; + config.heartbeat_ttl = std::chrono::seconds(1); + config.max_missed_heartbeats = 1; + config.reaper_interval = std::chrono::seconds(1); + return config; +} + +} // namespace + +// --- Registration / membership ---------------------------------------------- + +TEST(ClientRegistryTest, RegisterSingle) { + ClientRegistry registry(ClientRegistryConfig{}); + EXPECT_TRUE(registry.RegisterClient("node-1", "127.0.0.1:8080", Caps(80, 64))); + EXPECT_EQ(registry.ClientCount(), 1u); + EXPECT_TRUE(registry.IsClientAlive("node-1")); +} + +TEST(ClientRegistryTest, RegisterMultiple) { + ClientRegistry registry(ClientRegistryConfig{}); + EXPECT_TRUE(registry.RegisterClient("c1", "127.0.0.1:1001", Caps(100, 90))); + EXPECT_TRUE(registry.RegisterClient("c2", "127.0.0.1:1002", Caps(110, 80))); + EXPECT_TRUE(registry.RegisterClient("c3", "127.0.0.1:1003", Caps(120, 70))); + + EXPECT_EQ(registry.ClientCount(), 3u); + EXPECT_TRUE(registry.IsClientAlive("c1")); + EXPECT_TRUE(registry.IsClientAlive("c2")); + EXPECT_TRUE(registry.IsClientAlive("c3")); +} + +TEST(ClientRegistryTest, GetAliveClientsReportsMembershipAndCapacities) { + ClientRegistry registry(ClientRegistryConfig{}); + EXPECT_TRUE(registry.RegisterClient("c1", "host-a:8080", Caps(80, 64))); + EXPECT_TRUE(registry.RegisterClient("c2", "host-b:8080", Caps(96, 32))); + + const auto clients = registry.GetAliveClients(); + ASSERT_EQ(clients.size(), 2u); + + const ClientRecord* c1 = FindClient(clients, "c1"); + const ClientRecord* c2 = FindClient(clients, "c2"); + ASSERT_NE(c1, nullptr); + ASSERT_NE(c2, nullptr); + + EXPECT_EQ(c1->node_address, "host-a:8080"); + EXPECT_EQ(c2->node_address, "host-b:8080"); + EXPECT_EQ(c1->status, ClientStatus::ALIVE); + EXPECT_EQ(c2->status, ClientStatus::ALIVE); + + // Master stores the peer-reported capacities verbatim. 
+ ASSERT_TRUE(c1->tier_capacities.count(TierType::HBM) > 0); + ASSERT_TRUE(c2->tier_capacities.count(TierType::HBM) > 0); + EXPECT_EQ(c1->tier_capacities.at(TierType::HBM).total_bytes, 80u); + EXPECT_EQ(c1->tier_capacities.at(TierType::HBM).available_bytes, 64u); + EXPECT_EQ(c2->tier_capacities.at(TierType::HBM).available_bytes, 32u); +} + +TEST(ClientRegistryTest, ReRegisterAliveRejected) { + ClientRegistry registry(ClientRegistryConfig{}); + EXPECT_TRUE(registry.RegisterClient("c1", "addr-1", Caps(80, 64))); + // A live node may not silently take over its own id with a new address. + EXPECT_FALSE(registry.RegisterClient("c1", "addr-2", Caps(80, 32))); + + EXPECT_EQ(registry.ClientCount(), 1u); + const auto clients = registry.GetAliveClients(); + ASSERT_EQ(clients.size(), 1u); + EXPECT_EQ(clients[0].node_address, "addr-1"); // original record untouched +} + +TEST(ClientRegistryTest, ReRegisterExpiredAllowed) { + // No reaper here: the aged-out branch in RegisterClient (now - last_heartbeat + // > expiry) must accept the re-registration on its own. 
+ ClientRegistry registry(FastExpiryConfig()); + EXPECT_TRUE(registry.RegisterClient("c1", "addr-1", Caps(80, 64))); + + const bool reregistered = + WaitUntil([&registry] { return registry.RegisterClient("c1", "addr-2", Caps(80, 32)); }, + std::chrono::seconds(5)); + EXPECT_TRUE(reregistered); + + EXPECT_EQ(registry.ClientCount(), 1u); + const auto clients = registry.GetAliveClients(); + ASSERT_EQ(clients.size(), 1u); + EXPECT_EQ(clients[0].node_address, "addr-2"); // new address wins + EXPECT_EQ(clients[0].status, ClientStatus::ALIVE); +} + +// --- Unregister -------------------------------------------------------------- + +TEST(ClientRegistryTest, UnregisterExisting) { + ClientRegistry registry(ClientRegistryConfig{}); + EXPECT_TRUE(registry.RegisterClient("c1", "addr", Caps(80, 64))); + + registry.UnregisterClient("c1"); + EXPECT_EQ(registry.ClientCount(), 0u); + EXPECT_FALSE(registry.IsClientAlive("c1")); +} + +TEST(ClientRegistryTest, UnregisterUnknownIsNoop) { + ClientRegistry registry(ClientRegistryConfig{}); + EXPECT_TRUE(registry.RegisterClient("c1", "addr", Caps(80, 64))); + + registry.UnregisterClient("nonexistent"); + EXPECT_EQ(registry.ClientCount(), 1u); + EXPECT_TRUE(registry.IsClientAlive("c1")); +} + +TEST(ClientRegistryTest, UnregisterTwiceIsSafe) { + ClientRegistry registry(ClientRegistryConfig{}); + EXPECT_TRUE(registry.RegisterClient("c1", "addr", Caps(80, 64))); + + registry.UnregisterClient("c1"); + registry.UnregisterClient("c1"); + EXPECT_EQ(registry.ClientCount(), 0u); +} + +// --- Heartbeat --------------------------------------------------------------- + +TEST(ClientRegistryTest, HeartbeatAliveReplacesCapacities) { + ClientRegistry registry(ClientRegistryConfig{}); + EXPECT_TRUE(registry.RegisterClient("c1", "addr", Caps(80, 64))); + + EXPECT_EQ(Beat(registry, "c1", Caps(80, 16)), ClientStatus::ALIVE); + EXPECT_TRUE(registry.IsClientAlive("c1")); + + const auto clients = registry.GetAliveClients(); + ASSERT_EQ(clients.size(), 1u); +
 ASSERT_TRUE(clients[0].tier_capacities.count(TierType::HBM) > 0); + // The most recent heartbeat's capacities replace the stored values. + EXPECT_EQ(clients[0].tier_capacities.at(TierType::HBM).available_bytes, 16u); +} + +TEST(ClientRegistryTest, HeartbeatUnknownReturnsUnknown) { + ClientRegistry registry(ClientRegistryConfig{}); + EXPECT_EQ(Beat(registry, "nonexistent", Caps(80, 48)), ClientStatus::UNKNOWN); +} + +// --- Reaper ------------------------------------------------------------------ + +TEST(ClientRegistryTest, ReaperExpiresIdleClient) { + ClientRegistry registry(FastExpiryConfig()); + EXPECT_TRUE(registry.RegisterClient("c1", "addr", Caps(80, 64))); + registry.StartReaper(); + + const bool reaped = + WaitUntil([&registry] { return registry.ClientCount() == 0; }, std::chrono::seconds(6)); + + registry.StopReaper(); + EXPECT_TRUE(reaped); + EXPECT_FALSE(registry.IsClientAlive("c1")); +} + +TEST(ClientRegistryTest, ReaperKeepsClientAliveWithHeartbeats) { + ClientRegistry registry(FastExpiryConfig()); + EXPECT_TRUE(registry.RegisterClient("c1", "addr", Caps(80, 64))); + registry.StartReaper(); + + const auto start = std::chrono::steady_clock::now(); + while (std::chrono::steady_clock::now() - start < std::chrono::seconds(3)) { + EXPECT_EQ(Beat(registry, "c1", Caps(80, 48)), ClientStatus::ALIVE); + std::this_thread::sleep_for(std::chrono::milliseconds(300)); + } + + registry.StopReaper(); + EXPECT_EQ(registry.ClientCount(), 1u); + EXPECT_TRUE(registry.IsClientAlive("c1")); +} + +TEST(ClientRegistryTest, ReaperSelectiveExpiry) { + ClientRegistry registry(FastExpiryConfig()); + EXPECT_TRUE(registry.RegisterClient("c1", "addr-1", Caps(80, 64))); + EXPECT_TRUE(registry.RegisterClient("c2", "addr-2", Caps(80, 64))); + registry.StartReaper(); + + // Keep c1 fed; let c2 go silent. c2 must be reaped while c1 survives.
+ const bool reached = WaitUntil( + [&registry] { + Beat(registry, "c1", Caps(80, 48)); + return registry.IsClientAlive("c1") && !registry.IsClientAlive("c2"); + }, + std::chrono::seconds(6), std::chrono::milliseconds(200)); + + registry.StopReaper(); + EXPECT_TRUE(reached); + EXPECT_TRUE(registry.IsClientAlive("c1")); + EXPECT_FALSE(registry.IsClientAlive("c2")); +} + +TEST(ClientRegistryTest, StopReaperWhenNeverStarted) { + ClientRegistry registry(ClientRegistryConfig{}); + registry.StopReaper(); // must not hang or crash + SUCCEED(); +} + +TEST(ClientRegistryTest, StartStopReaperMultiple) { + ClientRegistry registry(ClientRegistryConfig{}); + registry.StartReaper(); + registry.StopReaper(); + registry.StartReaper(); + registry.StopReaper(); + SUCCEED(); +} + +TEST(ClientRegistryTest, DestructorStopsRunningReaper) { + ClientRegistry registry(ClientRegistryConfig{}); + registry.StartReaper(); + EXPECT_TRUE(registry.RegisterClient("c1", "addr", Caps(80, 64))); + // Falling out of scope must join the reaper thread cleanly. +} + +} // namespace mori::umbp diff --git a/tests/cpp/umbp/distributed/test_client_registry_external_kv.cpp b/tests/cpp/umbp/distributed/test_client_registry_external_kv.cpp new file mode 100644 index 000000000..e232fe58a --- /dev/null +++ b/tests/cpp/umbp/distributed/test_client_registry_external_kv.cpp @@ -0,0 +1,55 @@ +// Copyright © Advanced Micro Devices, Inc. All rights reserved.
+// +// MIT License +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in all +// copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +// SOFTWARE. 
+#include + +#include "umbp/distributed/master/client_registry.h" +#include "umbp/distributed/master/external_kv_block_index.h" +#include "umbp/distributed/master/global_block_index.h" + +namespace mori::umbp { + +TEST(ClientRegistryExternalKv, UnregisterClientClearsBothIndices) { + GlobalBlockIndex global_index; + ExternalKvBlockIndex external_index; + ClientRegistry registry(ClientRegistryConfig{}, global_index, &external_index); + + ASSERT_TRUE(registry.RegisterClient("node-A", "127.0.0.1:9000", {}, "127.0.0.1:9001")); + ASSERT_EQ(global_index.ApplyEvents("node-A", + {KvEvent{KvEvent::Kind::ADD, "owned", TierType::DRAM, 128}}), + 1u); + ASSERT_EQ(external_index.Register("node-A", {"external"}, TierType::DRAM), 1u); + + registry.UnregisterClient("node-A"); + + EXPECT_TRUE(global_index.Lookup("owned").empty()); + EXPECT_TRUE(external_index.Match({"external"}).empty()); +} + +TEST(ClientRegistryExternalKv, UnregisterWithoutExternalIndexDoesNotCrash) { + GlobalBlockIndex global_index; + ClientRegistry registry(ClientRegistryConfig{}, global_index); + + ASSERT_TRUE(registry.RegisterClient("node-A", "127.0.0.1:9000", {}, "127.0.0.1:9001")); + EXPECT_NO_THROW(registry.UnregisterClient("node-A")); +} + +} // namespace mori::umbp diff --git a/tests/cpp/umbp/distributed/test_external_kv_block_index.cpp b/tests/cpp/umbp/distributed/test_external_kv_block_index.cpp new file mode 100644 index 000000000..18ee654d5 --- /dev/null +++ b/tests/cpp/umbp/distributed/test_external_kv_block_index.cpp @@ -0,0 +1,103 @@ +// Copyright © Advanced Micro Devices, Inc. All rights reserved. 
+// +// MIT License +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in all +// copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +// SOFTWARE. 
+#include + +#include +#include +#include + +#include "umbp/distributed/master/external_kv_block_index.h" + +namespace mori::umbp { +namespace { + +const ExternalKvBlockIndex::NodeMatch* FindMatch( + const std::vector& matches, const std::string& node_id) { + for (const auto& match : matches) { + if (match.node_id == node_id) return &match; + } + return nullptr; +} + +std::vector Sorted(std::vector values) { + std::sort(values.begin(), values.end()); + return values; +} + +} // namespace + +TEST(ExternalKvBlockIndex, RegisterIsAdditiveAcrossTiersAndCountsMutations) { + ExternalKvBlockIndex index; + + EXPECT_EQ(index.Register("node-A", {"h1"}, TierType::HBM), 1u); + EXPECT_EQ(index.Register("node-A", {"h1"}, TierType::DRAM), 1u); + EXPECT_EQ(index.Register("node-A", {"h1"}, TierType::DRAM), 0u); + + auto matches = index.Match({"h1"}); + ASSERT_EQ(matches.size(), 1u); + EXPECT_EQ(matches[0].MatchedHashCount(), 1u); + EXPECT_EQ(matches[0].hashes_by_tier.at(TierType::HBM), std::vector({"h1"})); + EXPECT_EQ(matches[0].hashes_by_tier.at(TierType::DRAM), std::vector({"h1"})); + EXPECT_EQ(index.GetKvCount("node-A"), 1u); +} + +TEST(ExternalKvBlockIndex, UnregisterRemovesOnlyRequestedTier) { + ExternalKvBlockIndex index; + ASSERT_EQ(index.Register("node-A", {"h1", "h2"}, TierType::HBM), 2u); + ASSERT_EQ(index.Register("node-A", {"h1"}, TierType::DRAM), 1u); + + EXPECT_EQ(index.Unregister("node-A", {"h1", "missing"}, TierType::HBM), 1u); + EXPECT_EQ(index.Unregister("node-A", {"h1"}, TierType::HBM), 0u); + + auto matches = index.Match({"h1", "h2"}); + ASSERT_EQ(matches.size(), 1u); + const auto& match = matches[0]; + EXPECT_EQ(match.hashes_by_tier.at(TierType::DRAM), std::vector({"h1"})); + EXPECT_EQ(match.hashes_by_tier.at(TierType::HBM), std::vector({"h2"})); + EXPECT_EQ(index.GetKvCount("node-A"), 2u); +} + +TEST(ExternalKvBlockIndex, BulkUnregisterByTierAndNode) { + ExternalKvBlockIndex index; + ASSERT_EQ(index.Register("node-A", {"h1", "h2", "h3"}, TierType::DRAM), 3u); 
+ ASSERT_EQ(index.Register("node-A", {"h1", "h2"}, TierType::SSD), 2u); + ASSERT_EQ(index.Register("node-B", {"h1"}, TierType::SSD), 1u); + + EXPECT_EQ(index.UnregisterByNodeAtTier("node-A", TierType::SSD), 2u); + auto matches = index.Match({"h1", "h2", "h3"}); + ASSERT_EQ(matches.size(), 2u); + const auto* node_a = FindMatch(matches, "node-A"); + ASSERT_NE(node_a, nullptr); + ASSERT_EQ(node_a->hashes_by_tier.size(), 1u); + EXPECT_EQ(Sorted(node_a->hashes_by_tier.at(TierType::DRAM)), + (std::vector{"h1", "h2", "h3"})); + const auto* node_b = FindMatch(matches, "node-B"); + ASSERT_NE(node_b, nullptr); + EXPECT_EQ(node_b->hashes_by_tier.at(TierType::SSD), std::vector({"h1"})); + + EXPECT_EQ(index.UnregisterByNode("node-A"), 3u); + matches = index.Match({"h1", "h2", "h3"}); + ASSERT_EQ(matches.size(), 1u); + EXPECT_EQ(matches[0].node_id, "node-B"); +} + +} // namespace mori::umbp diff --git a/tests/cpp/umbp/distributed/test_external_kv_hit_index.cpp b/tests/cpp/umbp/distributed/test_external_kv_hit_index.cpp new file mode 100644 index 000000000..20685f0b7 --- /dev/null +++ b/tests/cpp/umbp/distributed/test_external_kv_hit_index.cpp @@ -0,0 +1,116 @@ +// Copyright © Advanced Micro Devices, Inc. All rights reserved. +// +// MIT License +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in all +// copies or substantial portions of the Software. 
+// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +// SOFTWARE. +#include + +#include +#include +#include +#include +#include +#include + +#include "umbp/distributed/master/external_kv_hit_index.h" + +namespace mori::umbp { +namespace { + +std::unordered_map LookupMap(ExternalKvHitIndex& index, + const std::vector& hashes) { + std::vector> entries; + index.Lookup(hashes, &entries); + std::unordered_map out; + for (const auto& [hash, total] : entries) out[hash] = total; + return out; +} + +TEST(ExternalKvHitIndexTest, IncrementAndLookup) { + ExternalKvHitIndex index; + index.IncrementHits({"h1", "h2"}, 100); + + auto counts = LookupMap(index, {"h1", "h2", "missing"}); + ASSERT_EQ(counts.size(), 2); + EXPECT_EQ(counts["h1"], 1); + EXPECT_EQ(counts["h2"], 1); +} + +TEST(ExternalKvHitIndexTest, RepeatedIncrementsAccumulate) { + ExternalKvHitIndex index; + for (int i = 0; i < 10; ++i) index.IncrementHits({"hot"}, 100 + i); + + auto counts = LookupMap(index, {"hot"}); + ASSERT_EQ(counts.size(), 1); + EXPECT_EQ(counts["hot"], 10); +} + +TEST(ExternalKvHitIndexTest, LookupSkipsMissingAndDedupesRequestHashes) { + ExternalKvHitIndex index; + index.IncrementHits({"h1"}, 100); + + std::vector> entries; + index.Lookup({"missing", "h1", "h1", "missing"}, &entries); + ASSERT_EQ(entries.size(), 1); + EXPECT_EQ(entries[0].first, "h1"); + EXPECT_EQ(entries[0].second, 1); +} + +TEST(ExternalKvHitIndexTest, GarbageCollectUsesLastSeenCutoff) { + ExternalKvHitIndex index; + index.IncrementHits({"old"}, 100); + index.IncrementHits({"fresh"}, 200); + + 
EXPECT_EQ(index.GarbageCollect(150), 1); + EXPECT_EQ(index.Size(), 1); + + auto counts = LookupMap(index, {"old", "fresh"}); + ASSERT_EQ(counts.size(), 1); + EXPECT_EQ(counts["fresh"], 1); +} + +TEST(ExternalKvHitIndexTest, ConcurrentCreationKeepsAllIncrements) { + ExternalKvHitIndex index; + constexpr int kThreads = 32; + constexpr int kIterations = 1000; + + std::atomic start{false}; + std::vector threads; + threads.reserve(kThreads); + for (int t = 0; t < kThreads; ++t) { + threads.emplace_back([&] { + while (!start.load(std::memory_order_acquire)) { + std::this_thread::yield(); + } + for (int i = 0; i < kIterations; ++i) { + index.IncrementHits({"shared"}, static_cast(100 + i)); + } + }); + } + + start.store(true, std::memory_order_release); + for (auto& thread : threads) thread.join(); + + auto counts = LookupMap(index, {"shared"}); + ASSERT_EQ(counts.size(), 1); + EXPECT_EQ(counts["shared"], static_cast(kThreads * kIterations)); +} + +} // namespace +} // namespace mori::umbp diff --git a/tests/cpp/umbp/distributed/test_global_block_index_events.cpp b/tests/cpp/umbp/distributed/test_global_block_index_events.cpp new file mode 100644 index 000000000..1b82e43ea --- /dev/null +++ b/tests/cpp/umbp/distributed/test_global_block_index_events.cpp @@ -0,0 +1,505 @@ +// Copyright © Advanced Micro Devices, Inc. All rights reserved. +// +// MIT License +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in all +// copies or substantial portions of the Software. 
+// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +// SOFTWARE. +#include + +#include +#include +#include +#include + +#include "umbp/distributed/master/client_registry.h" +#include "umbp/distributed/master/global_block_index.h" +#include "umbp/distributed/types.h" + +namespace mori::umbp { + +namespace { + +KvEvent Add(std::string key, TierType tier, uint64_t size) { + return KvEvent{KvEvent::Kind::ADD, std::move(key), tier, size}; +} + +KvEvent Remove(std::string key, TierType tier) { + return KvEvent{KvEvent::Kind::REMOVE, std::move(key), tier, 0}; +} + +EventBundle Bundle(uint64_t seq, std::vector events) { + return EventBundle{seq, std::move(events)}; +} + +bool HasLocation(const std::vector& locs, const std::string& node, TierType tier, + uint64_t size) { + for (const auto& l : locs) { + if (l.node_id == node && l.tier == tier && l.size == size) return true; + } + return false; +} + +} // namespace + +// ---- ApplyEvents: ADD/REMOVE round-trip ------------------------------------ + +TEST(GlobalBlockIndexEvents, ApplyAddInsertsLocation) { + GlobalBlockIndex idx; + ASSERT_EQ(idx.ApplyEvents("node-A", {Add("k1", TierType::DRAM, 1024)}), 1u); + auto locs = idx.Lookup("k1"); + ASSERT_EQ(locs.size(), 1u); + EXPECT_EQ(locs[0].node_id, "node-A"); + EXPECT_EQ(locs[0].tier, TierType::DRAM); + EXPECT_EQ(locs[0].size, 1024u); +} + +// Duplicate ADD keeps the first observed size; only REMOVE retires it. 
+TEST(GlobalBlockIndexEvents, ApplyAddSameNodeTierKeepsExistingSize) { + GlobalBlockIndex idx; + idx.ApplyEvents("node-A", {Add("k", TierType::DRAM, 1024)}); + idx.ApplyEvents("node-A", {Add("k", TierType::DRAM, 2048)}); + auto locs = idx.Lookup("k"); + ASSERT_EQ(locs.size(), 1u); + EXPECT_EQ(locs[0].size, 1024u); +} + +TEST(GlobalBlockIndexEvents, MultipleNodesCoexistForSameKey) { + GlobalBlockIndex idx; + idx.ApplyEvents("node-A", {Add("k", TierType::DRAM, 100)}); + idx.ApplyEvents("node-B", {Add("k", TierType::DRAM, 200)}); + idx.ApplyEvents("node-A", {Add("k", TierType::HBM, 300)}); // different tier on A + auto locs = idx.Lookup("k"); + EXPECT_EQ(locs.size(), 3u); + EXPECT_TRUE(HasLocation(locs, "node-A", TierType::DRAM, 100)); + EXPECT_TRUE(HasLocation(locs, "node-B", TierType::DRAM, 200)); + EXPECT_TRUE(HasLocation(locs, "node-A", TierType::HBM, 300)); +} + +TEST(GlobalBlockIndexEvents, RemoveErasesMatchingLocationOnly) { + GlobalBlockIndex idx; + idx.ApplyEvents("node-A", {Add("k", TierType::DRAM, 100)}); + idx.ApplyEvents("node-B", {Add("k", TierType::DRAM, 200)}); + + idx.ApplyEvents("node-A", {Remove("k", TierType::DRAM)}); + auto locs = idx.Lookup("k"); + ASSERT_EQ(locs.size(), 1u); + EXPECT_EQ(locs[0].node_id, "node-B"); +} + +TEST(GlobalBlockIndexEvents, RemoveLastLocationErasesEntry) { + GlobalBlockIndex idx; + idx.ApplyEvents("node-A", {Add("k", TierType::DRAM, 100)}); + idx.ApplyEvents("node-A", {Remove("k", TierType::DRAM)}); + EXPECT_TRUE(idx.Lookup("k").empty()); + EXPECT_FALSE(idx.GetMetrics("k").has_value()); +} + +TEST(GlobalBlockIndexEvents, RemoveUnknownIsNoop) { + GlobalBlockIndex idx; + EXPECT_EQ(idx.ApplyEvents("ghost", {Remove("ghost-key", TierType::DRAM)}), 0u); +} + +// A key mirrored on both DRAM and SSD of one node: a DRAM eviction +// (REMOVE DRAM) must drop only the DRAM bucket and leave the SSD location +// readable. 
This is the additive-index invariant the SSD tier relies +// on (DRAM evict never touches the SSD copy); no master code is exercised here +// beyond ApplyEvents. +TEST(GlobalBlockIndexEvents, RemoveDramKeepsSsdBucket) { + GlobalBlockIndex idx; + idx.ApplyEvents("node-A", {Add("k", TierType::DRAM, 100), Add("k", TierType::SSD, 100)}); + ASSERT_TRUE(HasLocation(idx.Lookup("k"), "node-A", TierType::DRAM, 100)); + ASSERT_TRUE(HasLocation(idx.Lookup("k"), "node-A", TierType::SSD, 100)); + + idx.ApplyEvents("node-A", {Remove("k", TierType::DRAM)}); + + auto locs = idx.Lookup("k"); + EXPECT_FALSE(HasLocation(locs, "node-A", TierType::DRAM, 100)); // DRAM bucket gone + EXPECT_TRUE(HasLocation(locs, "node-A", TierType::SSD, 100)); // SSD bucket retained +} + +TEST(GlobalBlockIndexEvents, ClearAtTierClearsOnlyTargetNodeTier) { + GlobalBlockIndex idx; + idx.ApplyEvents("node-A", {Add("k1", TierType::DRAM, 1), Add("k2", TierType::SSD, 2), + Add("k3", TierType::DRAM, 3)}); + idx.ApplyEvents("node-B", {Add("k1", TierType::DRAM, 10)}); + + EXPECT_EQ( + idx.ApplyEvents("node-A", {KvEvent{KvEvent::Kind::CLEAR_AT_TIER, "", TierType::DRAM, 0}}), + 2u); + + EXPECT_FALSE(HasLocation(idx.Lookup("k1"), "node-A", TierType::DRAM, 1)); + EXPECT_TRUE(HasLocation(idx.Lookup("k1"), "node-B", TierType::DRAM, 10)); + EXPECT_TRUE(HasLocation(idx.Lookup("k2"), "node-A", TierType::SSD, 2)); + EXPECT_TRUE(idx.Lookup("k3").empty()); +} + +// ---- ReplaceNodeLocations: full-sync recovery ------------------------------ + +TEST(GlobalBlockIndexEvents, ReplaceNodeLocationsClearsThenInserts) { + GlobalBlockIndex idx; + idx.ApplyEvents("node-A", {Add("k1", TierType::DRAM, 100), Add("k2", TierType::DRAM, 200)}); + idx.ApplyEvents("node-B", {Add("k1", TierType::DRAM, 999)}); // shared key, different node + + // Full-sync from node-A: k1 stays (different size), k2 is gone, new k3 appears. 
+ idx.ReplaceNodeLocations("node-A", + {Add("k1", TierType::DRAM, 150), Add("k3", TierType::DRAM, 300)}); + + auto k1 = idx.Lookup("k1"); + EXPECT_TRUE(HasLocation(k1, "node-A", TierType::DRAM, 150)); + EXPECT_TRUE(HasLocation(k1, "node-B", TierType::DRAM, 999)); // node-B untouched + + EXPECT_TRUE(idx.Lookup("k2").empty()); // dropped — node-A's full-sync didn't include it + + auto k3 = idx.Lookup("k3"); + EXPECT_TRUE(HasLocation(k3, "node-A", TierType::DRAM, 300)); +} + +TEST(GlobalBlockIndexEvents, ReplaceNodeLocationsEmptyClearsAllForNode) { + // Used by ClientRegistry::UnregisterClient and the reaper to drop a + // dead node's index entries. + GlobalBlockIndex idx; + idx.ApplyEvents("node-A", {Add("k1", TierType::DRAM, 1), Add("k2", TierType::HBM, 2)}); + idx.ApplyEvents("node-B", {Add("k1", TierType::DRAM, 3)}); + + idx.ReplaceNodeLocations("node-A", {}); + EXPECT_EQ(idx.Lookup("k1").size(), 1u); // node-B still owns k1 + EXPECT_TRUE(idx.Lookup("k2").empty()); // node-A's only HBM location is gone +} + +TEST(GlobalBlockIndexEvents, ReplaceNodeLocationsIgnoresRemoveEntries) { + GlobalBlockIndex idx; + idx.ApplyEvents("node-A", {Add("k1", TierType::DRAM, 100)}); + // Snapshot full-sync conventionally carries only ADDs; sneaking + // a REMOVE in is silently skipped (the snapshot is the truth). + idx.ReplaceNodeLocations("node-A", + {Add("k2", TierType::DRAM, 200), Remove("k3", TierType::DRAM)}); + EXPECT_TRUE(idx.Lookup("k1").empty()); + EXPECT_FALSE(idx.Lookup("k2").empty()); + EXPECT_TRUE(idx.Lookup("k3").empty()); +} + +// ---- Reverse-index (node_to_keys_) invariants ------------------------------ + +TEST(GlobalBlockIndexEvents, ReplaceNodeLocationsAfterMultiTierRemoveKeepsKeyClean) { + GlobalBlockIndex idx; + idx.ApplyEvents("node-A", {Add("k", TierType::DRAM, 100), Add("k", TierType::HBM, 200)}); + idx.ApplyEvents("node-B", {Add("k", TierType::DRAM, 300)}); + + // A still owns (k, HBM): reverse index must keep k. 
+ idx.ApplyEvents("node-A", {Remove("k", TierType::DRAM)}); + auto mid = idx.Lookup("k"); + ASSERT_EQ(mid.size(), 2u); + EXPECT_TRUE(HasLocation(mid, "node-A", TierType::HBM, 200)); + EXPECT_TRUE(HasLocation(mid, "node-B", TierType::DRAM, 300)); + + idx.ReplaceNodeLocations("node-A", {}); + auto after = idx.Lookup("k"); + ASSERT_EQ(after.size(), 1u); + EXPECT_EQ(after[0].node_id, "node-B"); + EXPECT_EQ(after[0].tier, TierType::DRAM); + EXPECT_EQ(after[0].size, 300u); +} + +TEST(GlobalBlockIndexEvents, ReplaceNodeLocationsLeavesOtherNodesIntact) { + GlobalBlockIndex idx; + idx.ApplyEvents("node-A", {Add("k1", TierType::DRAM, 1), Add("k2", TierType::DRAM, 2)}); + idx.ApplyEvents("node-B", {Add("k1", TierType::DRAM, 10), Add("k3", TierType::HBM, 30)}); + idx.ApplyEvents("node-C", {Add("k2", TierType::HBM, 200), Add("k4", TierType::DRAM, 400)}); + + idx.ReplaceNodeLocations("node-A", {Add("k_new", TierType::DRAM, 999)}); + + auto k1 = idx.Lookup("k1"); + ASSERT_EQ(k1.size(), 1u); + EXPECT_EQ(k1[0].node_id, "node-B"); + EXPECT_EQ(k1[0].size, 10u); + + auto k2 = idx.Lookup("k2"); + ASSERT_EQ(k2.size(), 1u); + EXPECT_EQ(k2[0].node_id, "node-C"); + EXPECT_EQ(k2[0].tier, TierType::HBM); + + EXPECT_TRUE(HasLocation(idx.Lookup("k_new"), "node-A", TierType::DRAM, 999)); + + auto k3 = idx.Lookup("k3"); + ASSERT_EQ(k3.size(), 1u); + EXPECT_EQ(k3[0].node_id, "node-B"); + EXPECT_EQ(k3[0].size, 30u); + + auto k4 = idx.Lookup("k4"); + ASSERT_EQ(k4.size(), 1u); + EXPECT_EQ(k4[0].node_id, "node-C"); + EXPECT_EQ(k4[0].size, 400u); +} + +// 2nd sync must see reverse index repopulated by 1st sync's replay. 
+TEST(GlobalBlockIndexEvents, ReplaceNodeLocationsTwiceRotatesKeys) { + GlobalBlockIndex idx; + idx.ApplyEvents("node-A", {Add("k_old", TierType::DRAM, 1)}); + + idx.ReplaceNodeLocations("node-A", + {Add("k_mid_a", TierType::DRAM, 2), Add("k_mid_b", TierType::HBM, 3)}); + EXPECT_TRUE(idx.Lookup("k_old").empty()); + EXPECT_FALSE(idx.Lookup("k_mid_a").empty()); + EXPECT_FALSE(idx.Lookup("k_mid_b").empty()); + + idx.ReplaceNodeLocations("node-A", {Add("k_final", TierType::DRAM, 4)}); + EXPECT_TRUE(idx.Lookup("k_mid_a").empty()); + EXPECT_TRUE(idx.Lookup("k_mid_b").empty()); + auto final_locs = idx.Lookup("k_final"); + ASSERT_EQ(final_locs.size(), 1u); + EXPECT_EQ(final_locs[0].node_id, "node-A"); + EXPECT_EQ(final_locs[0].size, 4u); +} + +// Reverse-index insert must run even when inserted==false. +TEST(GlobalBlockIndexEvents, DuplicateAddKeepsReverseConsistent) { + GlobalBlockIndex idx; + idx.ApplyEvents("node-A", {Add("dup", TierType::DRAM, 1024)}); + idx.ApplyEvents("node-A", {Add("dup", TierType::DRAM, 2048)}); + idx.ApplyEvents("node-A", {Add("dup", TierType::DRAM, 4096)}); + + auto locs = idx.Lookup("dup"); + ASSERT_EQ(locs.size(), 1u); + + idx.ReplaceNodeLocations("node-A", {}); + EXPECT_TRUE(idx.Lookup("dup").empty()); +} + +// No-op REMOVE must leave node_to_keys_ untouched on both sides. 
+TEST(GlobalBlockIndexEvents, RemoveNonMatchingTierLeavesReverseUntouched) { + GlobalBlockIndex idx; + idx.ApplyEvents("node-A", {Add("k", TierType::DRAM, 100)}); + + idx.ApplyEvents("node-A", {Remove("k", TierType::HBM)}); + auto mid = idx.Lookup("k"); + ASSERT_EQ(mid.size(), 1u); + EXPECT_EQ(mid[0].tier, TierType::DRAM); + + idx.ReplaceNodeLocations("node-A", {}); + EXPECT_TRUE(idx.Lookup("k").empty()); + + idx.ApplyEvents("node-A", {Add("k", TierType::DRAM, 100)}); + idx.ApplyEvents("node-B", {Remove("k", TierType::DRAM)}); + idx.ReplaceNodeLocations("node-B", {}); + auto after = idx.Lookup("k"); + ASSERT_EQ(after.size(), 1u); + EXPECT_EQ(after[0].node_id, "node-A"); +} + +// ---- ClientRegistry::Heartbeat applies events end-to-end -------------------- + +TEST(ClientRegistryHeartbeat, AppliesEventsAndAdvancesSeq) { + GlobalBlockIndex idx; + ClientRegistryConfig cfg; + ClientRegistry reg(cfg, idx); + + ASSERT_TRUE(reg.RegisterClient("node-A", "10.0.0.1:1", /*caps=*/{}, "10.0.0.1:2", {})); + + uint64_t acked = 0; + bool need_full = false; + auto status = reg.Heartbeat("node-A", /*caps=*/{}, {Bundle(1, {Add("k", TierType::DRAM, 42)})}, + /*is_full_sync=*/false, /*delta_seq_baseline=*/0, &acked, &need_full); + EXPECT_EQ(status, ClientStatus::ALIVE); + EXPECT_EQ(acked, 1u); + EXPECT_FALSE(need_full); + EXPECT_FALSE(idx.Lookup("k").empty()); +} + +TEST(ClientRegistryHeartbeat, SeqGapTriggersFullSyncRequest) { + GlobalBlockIndex idx; + ClientRegistryConfig cfg; + ClientRegistry reg(cfg, idx); + reg.RegisterClient("node-A", "10.0.0.1:1", {}, "10.0.0.1:2", {}); + + uint64_t acked = 0; + bool need_full = false; + // First heartbeat seq=1 — applied normally. + reg.Heartbeat("node-A", {}, {Bundle(1, {Add("k1", TierType::DRAM, 1)})}, + /*is_full_sync=*/false, 0, &acked, &need_full); + ASSERT_FALSE(need_full); + ASSERT_EQ(acked, 1u); + + // Second heartbeat skips seq=2: master detects the gap. 
+ reg.Heartbeat("node-A", {}, {Bundle(3, {Add("k2", TierType::DRAM, 2)})}, + /*is_full_sync=*/false, 0, &acked, &need_full); + EXPECT_TRUE(need_full); + EXPECT_EQ(acked, 1u); // unchanged — no events applied from this batch + + // k2 is NOT in the index because the gap-batch was rejected. + EXPECT_TRUE(idx.Lookup("k2").empty()); + EXPECT_FALSE(idx.Lookup("k1").empty()); +} + +TEST(ClientRegistryHeartbeat, FullSyncReplacesNodeLocations) { + GlobalBlockIndex idx; + ClientRegistryConfig cfg; + ClientRegistry reg(cfg, idx); + reg.RegisterClient("node-A", "10.0.0.1:1", {}, "10.0.0.1:2", {}); + + uint64_t acked = 0; + bool need_full = false; + reg.Heartbeat("node-A", {}, + {Bundle(1, {Add("k1", TierType::DRAM, 1), Add("k2", TierType::DRAM, 2)})}, + /*is_full_sync=*/false, 0, &acked, &need_full); + ASSERT_FALSE(idx.Lookup("k1").empty()); + ASSERT_FALSE(idx.Lookup("k2").empty()); + + // Full-sync: only k1 + k3 should remain for node-A. + reg.Heartbeat("node-A", {}, + {Bundle(2, {Add("k1", TierType::DRAM, 10), Add("k3", TierType::DRAM, 30)})}, + /*is_full_sync=*/true, /*delta_seq_baseline=*/2, &acked, &need_full); + EXPECT_EQ(acked, 2u); + EXPECT_FALSE(need_full); + + auto k1 = idx.Lookup("k1"); + ASSERT_EQ(k1.size(), 1u); + EXPECT_EQ(k1[0].size, 10u); // updated via full-sync + EXPECT_TRUE(idx.Lookup("k2").empty()); + EXPECT_FALSE(idx.Lookup("k3").empty()); +} + +TEST(ClientRegistryHeartbeat, UnregisterClearsNodeFromIndex) { + GlobalBlockIndex idx; + ClientRegistryConfig cfg; + ClientRegistry reg(cfg, idx); + reg.RegisterClient("node-A", "10.0.0.1:1", {}, "10.0.0.1:2", {}); + + uint64_t acked = 0; + bool need_full = false; + reg.Heartbeat("node-A", {}, {Bundle(1, {Add("k1", TierType::DRAM, 1)})}, + /*is_full_sync=*/false, 0, &acked, &need_full); + ASSERT_FALSE(idx.Lookup("k1").empty()); + + reg.UnregisterClient("node-A"); + EXPECT_TRUE(idx.Lookup("k1").empty()); + EXPECT_FALSE(reg.IsClientAlive("node-A")); +} + +// ---- FindEvictionCandidates 
-------------------------------------------------- + +TEST(GlobalBlockIndexEvents, FindEvictionCandidatesFiltersByOverloadedNodeTier) { + GlobalBlockIndex idx; + idx.ApplyEvents("node-A", {Add("k1", TierType::DRAM, 100), Add("k2", TierType::HBM, 200)}); + idx.ApplyEvents("node-B", {Add("k1", TierType::DRAM, 100)}); + + std::set overloaded = { + {"node-A", TierType::DRAM}, + }; + auto candidates = idx.FindEvictionCandidates(overloaded); + // Only node-A's DRAM location of k1 is a candidate. + ASSERT_EQ(candidates.size(), 1u); + EXPECT_EQ(candidates[0].key, "k1"); + EXPECT_EQ(candidates[0].location.node_id, "node-A"); + EXPECT_EQ(candidates[0].location.tier, TierType::DRAM); +} + +// ---- BatchLookupForRouteGet ------------------------------------------------ + +TEST(GlobalBlockIndexEvents, BatchLookupForRouteGetEmptyInputReturnsEmpty) { + GlobalBlockIndex idx; + EXPECT_TRUE(idx.BatchLookupForRouteGet({}, {}, std::chrono::seconds{1}).empty()); +} + +TEST(GlobalBlockIndexEvents, BatchLookupForRouteGetMixedHitsAndMisses) { + GlobalBlockIndex idx; + idx.ApplyEvents("node-A", {Add("k1", TierType::DRAM, 100)}); + idx.ApplyEvents("node-B", {Add("k1", TierType::DRAM, 200), Add("k2", TierType::HBM, 300)}); + + auto ref_k1 = idx.Lookup("k1"); + auto ref_k2 = idx.Lookup("k2"); + auto before_k1 = idx.GetMetrics("k1"); + auto before_k2 = idx.GetMetrics("k2"); + ASSERT_TRUE(before_k1.has_value()); + ASSERT_TRUE(before_k2.has_value()); + + auto results = idx.BatchLookupForRouteGet({"k1", "ghost", "k2"}, {}, std::chrono::seconds{10}); + ASSERT_EQ(results.size(), 3u); + EXPECT_EQ(results[0], ref_k1); + EXPECT_TRUE(results[1].empty()); + EXPECT_EQ(results[2], ref_k2); + + auto after_k1 = idx.GetMetrics("k1"); + auto after_k2 = idx.GetMetrics("k2"); + ASSERT_TRUE(after_k1.has_value()); + ASSERT_TRUE(after_k2.has_value()); + EXPECT_EQ(after_k1->access_count, before_k1->access_count + 1); + EXPECT_EQ(after_k2->access_count, before_k2->access_count + 1); + 
EXPECT_FALSE(idx.GetMetrics("ghost").has_value()); +} + +TEST(GlobalBlockIndexEvents, BatchLookupForRouteGetGrantsLeaseForHitsOnly) { + GlobalBlockIndex idx; + idx.ApplyEvents("node-A", {Add("hit", TierType::DRAM, 100), Add("other", TierType::DRAM, 200)}); + + std::set overloaded{{"node-A", TierType::DRAM}}; + ASSERT_EQ(idx.FindEvictionCandidates(overloaded).size(), 2u); + + idx.BatchLookupForRouteGet({"hit", "ghost"}, {}, std::chrono::seconds{10}); + + auto candidates = idx.FindEvictionCandidates(overloaded); + ASSERT_EQ(candidates.size(), 1u); + EXPECT_EQ(candidates[0].key, "other"); +} + +// All replicas excluded -> slot empty, access_count NOT bumped, +// lease NOT granted. A key whose every replica is unreachable must +// not pollute LRU or block eviction. +TEST(GlobalBlockIndexEvents, BatchLookupForRouteGetSkipsSideEffectsWhenAllReplicasExcluded) { + GlobalBlockIndex idx; + idx.ApplyEvents("node-A", {Add("k", TierType::DRAM, 100)}); + idx.ApplyEvents("node-B", {Add("k", TierType::DRAM, 200)}); + + auto before = idx.GetMetrics("k"); + ASSERT_TRUE(before.has_value()); + std::set overloaded{{"node-A", TierType::DRAM}, + {"node-B", TierType::DRAM}}; + ASSERT_EQ(idx.FindEvictionCandidates(overloaded).size(), 2u); + + std::unordered_set excludes{"node-A", "node-B"}; + auto results = idx.BatchLookupForRouteGet({"k"}, excludes, std::chrono::seconds{10}); + ASSERT_EQ(results.size(), 1u); + EXPECT_TRUE(results[0].empty()); + + auto after = idx.GetMetrics("k"); + ASSERT_TRUE(after.has_value()); + EXPECT_EQ(after->access_count, before->access_count); + EXPECT_EQ(idx.FindEvictionCandidates(overloaded).size(), 2u); +} + +// Some replicas excluded but not all -> returned slot has only the +// survivors, access_count IS bumped, lease IS granted. 
+TEST(GlobalBlockIndexEvents, BatchLookupForRouteGetFiltersAndLeasesWhenSomeReplicasSurvive) { + GlobalBlockIndex idx; + idx.ApplyEvents("node-A", {Add("k", TierType::DRAM, 100)}); + idx.ApplyEvents("node-B", {Add("k", TierType::DRAM, 200)}); + + auto before = idx.GetMetrics("k"); + ASSERT_TRUE(before.has_value()); + + std::unordered_set excludes{"node-A"}; + auto results = idx.BatchLookupForRouteGet({"k"}, excludes, std::chrono::seconds{10}); + ASSERT_EQ(results.size(), 1u); + ASSERT_EQ(results[0].size(), 1u); + EXPECT_EQ(results[0][0].node_id, "node-B"); + + auto after = idx.GetMetrics("k"); + ASSERT_TRUE(after.has_value()); + EXPECT_EQ(after->access_count, before->access_count + 1); +} + +} // namespace mori::umbp diff --git a/tests/cpp/umbp/distributed/test_in_memory_master_metadata_store.cpp b/tests/cpp/umbp/distributed/test_in_memory_master_metadata_store.cpp new file mode 100644 index 000000000..a8ca2f6a2 --- /dev/null +++ b/tests/cpp/umbp/distributed/test_in_memory_master_metadata_store.cpp @@ -0,0 +1,693 @@ +// Copyright © Advanced Micro Devices, Inc. All rights reserved. +// +// MIT License +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in all +// copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +// SOFTWARE. + +// Phase 2 behavioral suite for InMemoryMasterMetadataStore (§6a). Written +// against IMasterMetadataStore& so the same cases validate the Redis backend +// later. Tests use injected system_clock times (no real-time sleeps) so they +// are deterministic in CI. +// +// State that the interface does not expose directly — lease_expiry and +// last_accessed_at on block entries — is observed through EnumerateLruForEviction: +// a leased entry is filtered out, and LRU ordering reflects last_accessed_at. + +#include + +#include +#include +#include +#include +#include +#include +#include + +#include "umbp/distributed/master/in_memory_master_metadata_store.h" +#include "umbp/distributed/types.h" + +namespace mori::umbp { +namespace { + +using namespace std::chrono_literals; +using Clock = std::chrono::system_clock; + +// Fixed, NTP-plausible base instant so offsets read cleanly. +const Clock::time_point kT0 = Clock::time_point(std::chrono::hours(24 * 365 * 50)); + +std::map Caps(uint64_t total = 1000, uint64_t available = 1000) { + return {{TierType::HBM, TierCapacity{total, available}}}; +} + +ClientRegistration MakeReg(const std::string& node_id) { + ClientRegistration reg; + reg.node_id = node_id; + reg.node_address = "addr:" + node_id; + reg.peer_address = "peer:" + node_id; + reg.tier_capacities = Caps(); + reg.tags = {"role=test"}; + return reg; +} + +KvEvent Add(const std::string& key, TierType tier, uint64_t size) { + return KvEvent{KvEvent::Kind::ADD, key, tier, size}; +} +KvEvent Remove(const std::string& key, TierType tier) { + return KvEvent{KvEvent::Kind::REMOVE, key, tier, 0}; +} + +// Register `node` ALIVE at `now`. 
+void RegisterAlive(IMasterMetadataStore& store, const std::string& node, + Clock::time_point now = kT0) { + ASSERT_TRUE(store.RegisterClient(MakeReg(node), now, 30s)); +} + +// Apply a delta heartbeat carrying `events` at sequence `seq`. +HeartbeatResult Beat(IMasterMetadataStore& store, const std::string& node, uint64_t seq, + std::vector events, Clock::time_point now) { + return store.ApplyHeartbeat(node, seq, now, Caps(), events, /*is_full_sync=*/false); +} + +// --------------------------------------------------------------------------- +// RegisterClient +// --------------------------------------------------------------------------- + +TEST(InMemoryStore, RegisterNewClient) { + InMemoryMasterMetadataStore store; + EXPECT_TRUE(store.RegisterClient(MakeReg("n1"), kT0, 30s)); + EXPECT_TRUE(store.IsClientAlive("n1")); + EXPECT_EQ(store.AliveClientCount(), 1u); + + auto rec = store.GetClient("n1"); + ASSERT_TRUE(rec.has_value()); + EXPECT_EQ(rec->status, ClientStatus::ALIVE); + EXPECT_EQ(rec->last_applied_seq, 0u); + EXPECT_EQ(rec->peer_address, "peer:n1"); + EXPECT_EQ(rec->last_heartbeat, kT0); + EXPECT_EQ(rec->registered_at, kT0); +} + +TEST(InMemoryStore, RejectReRegisterNonStaleAlive) { + InMemoryMasterMetadataStore store; + ASSERT_TRUE(store.RegisterClient(MakeReg("n1"), kT0, 30s)); + // Still well within stale_after window. + EXPECT_FALSE(store.RegisterClient(MakeReg("n1"), kT0 + 5s, 30s)); +} + +TEST(InMemoryStore, AcceptReRegisterStaleAlive) { + InMemoryMasterMetadataStore store; + ASSERT_TRUE(store.RegisterClient(MakeReg("n1"), kT0, 30s)); + // last_heartbeat is kT0; now - last_heartbeat > stale_after → re-register OK + // even though the reaper has not flipped the status yet (hazard #2). 
+ EXPECT_TRUE(store.RegisterClient(MakeReg("n1"), kT0 + 31s, 30s)); + EXPECT_TRUE(store.IsClientAlive("n1")); +} + +TEST(InMemoryStore, AcceptReRegisterExpired) { + InMemoryMasterMetadataStore store; + ASSERT_TRUE(store.RegisterClient(MakeReg("n1"), kT0, 30s)); + ASSERT_EQ(store.ExpireStaleClients(kT0 + 1s).size(), 1u); + EXPECT_FALSE(store.IsClientAlive("n1")); + // An EXPIRED record may re-register even inside the stale_after window: back to ALIVE. + EXPECT_TRUE(store.RegisterClient(MakeReg("n1"), kT0 + 2s, 30s)); + EXPECT_TRUE(store.IsClientAlive("n1")); +} + +// --------------------------------------------------------------------------- +// UnregisterClient — cascade to block locations AND external KV +// --------------------------------------------------------------------------- + +TEST(InMemoryStore, UnregisterClientCascades) { + InMemoryMasterMetadataStore store; + RegisterAlive(store, "n1"); + ASSERT_EQ(Beat(store, "n1", 1, {Add("k1", TierType::HBM, 10)}, kT0).status, + HeartbeatResult::APPLIED); + ASSERT_TRUE(store.RegisterExternalKvIfAlive("n1", {"h1", "h2"}, TierType::HBM)); + + ASSERT_FALSE(store.LookupBlock("k1").empty()); + ASSERT_EQ(store.GetExternalKvCount("n1"), 2u); + + store.UnregisterClient("n1"); + + EXPECT_FALSE(store.GetClient("n1").has_value()); + EXPECT_TRUE(store.LookupBlock("k1").empty()); + EXPECT_EQ(store.GetExternalKvCount("n1"), 0u); + EXPECT_TRUE(store.MatchExternalKv({"h1", "h2"}, false, kT0).empty()); +} + +TEST(InMemoryStore, UnregisterUnknownIsNoOp) { + InMemoryMasterMetadataStore store; + store.UnregisterClient("ghost"); // must not crash + EXPECT_EQ(store.AliveClientCount(), 0u); +} + +// --------------------------------------------------------------------------- +// ApplyHeartbeat +// --------------------------------------------------------------------------- + +TEST(InMemoryStore, HeartbeatUnknownNode) { + InMemoryMasterMetadataStore store; + auto r = Beat(store, "ghost", 1, {}, kT0); + EXPECT_EQ(r.status, HeartbeatResult::UNKNOWN); 
+} + +TEST(InMemoryStore, HeartbeatCasSequence) { + InMemoryMasterMetadataStore store; + RegisterAlive(store, "n1"); + EXPECT_EQ(Beat(store, "n1", 1, {Add("k1", TierType::HBM, 10)}, kT0).status, + HeartbeatResult::APPLIED); + EXPECT_EQ(Beat(store, "n1", 2, {Add("k2", TierType::HBM, 20)}, kT0).status, + HeartbeatResult::APPLIED); + // Out-of-order seq → SEQ_GAP, acked echoes last applied (2). + auto gap = Beat(store, "n1", 4, {Add("k3", TierType::HBM, 30)}, kT0); + EXPECT_EQ(gap.status, HeartbeatResult::SEQ_GAP); + EXPECT_EQ(gap.acked_seq, 2u); + // k3 must not have been applied. + EXPECT_TRUE(store.LookupBlock("k3").empty()); +} + +TEST(InMemoryStore, SeqGapKeepsLivenessNotCapsOrSeq) { + InMemoryMasterMetadataStore store; + RegisterAlive(store, "n1"); + ASSERT_EQ(Beat(store, "n1", 1, {}, kT0).status, HeartbeatResult::APPLIED); + + // A gap heartbeat at a later time with different caps. + std::map new_caps = {{TierType::HBM, TierCapacity{9999, 9999}}}; + auto gap = store.ApplyHeartbeat("n1", 5, kT0 + 10s, new_caps, {}, /*is_full_sync=*/false); + ASSERT_EQ(gap.status, HeartbeatResult::SEQ_GAP); + + auto rec = store.GetClient("n1"); + ASSERT_TRUE(rec.has_value()); + EXPECT_EQ(rec->status, ClientStatus::ALIVE); // kept alive + EXPECT_EQ(rec->last_heartbeat, kT0 + 10s); // last_heartbeat bumped + EXPECT_EQ(rec->last_applied_seq, 1u); // seq NOT advanced + EXPECT_EQ(rec->tier_capacities.at(TierType::HBM).total_bytes, 1000u); // caps NOT replaced +} + +TEST(InMemoryStore, HeartbeatDeltaAddRemove) { + InMemoryMasterMetadataStore store; + RegisterAlive(store, "n1"); + ASSERT_EQ(Beat(store, "n1", 1, {Add("k1", TierType::HBM, 10)}, kT0).status, + HeartbeatResult::APPLIED); + ASSERT_EQ(store.LookupBlock("k1").size(), 1u); + + ASSERT_EQ(Beat(store, "n1", 2, {Remove("k1", TierType::HBM)}, kT0).status, + HeartbeatResult::APPLIED); + EXPECT_TRUE(store.LookupBlock("k1").empty()); +} + +TEST(InMemoryStore, HeartbeatFullSyncReplaces) { + InMemoryMasterMetadataStore store; + 
RegisterAlive(store, "n1"); + ASSERT_EQ(Beat(store, "n1", 1, {Add("k1", TierType::HBM, 10), Add("k2", TierType::HBM, 20)}, kT0) + .status, + HeartbeatResult::APPLIED); + + // full_sync wipes prior locations and installs only the ADDs carried here. + auto r = store.ApplyHeartbeat("n1", 7, kT0, Caps(), {Add("k3", TierType::HBM, 30)}, + /*is_full_sync=*/true); + EXPECT_EQ(r.status, HeartbeatResult::APPLIED); + EXPECT_EQ(r.acked_seq, 7u); + + EXPECT_TRUE(store.LookupBlock("k1").empty()); + EXPECT_TRUE(store.LookupBlock("k2").empty()); + EXPECT_EQ(store.LookupBlock("k3").size(), 1u); + + auto rec = store.GetClient("n1"); + ASSERT_TRUE(rec.has_value()); + EXPECT_EQ(rec->last_applied_seq, 7u); // full_sync re-baselines the seq +} + +// --------------------------------------------------------------------------- +// ExpireStaleClients — flip to EXPIRED, keep row, cascade, idempotent +// --------------------------------------------------------------------------- + +TEST(InMemoryStore, ExpireStaleFlipsKeepsRowAndCascades) { + InMemoryMasterMetadataStore store; + RegisterAlive(store, "n1", kT0); + RegisterAlive(store, "n2", kT0 + 20s); // fresher + ASSERT_EQ(Beat(store, "n1", 1, {Add("k1", TierType::HBM, 10)}, kT0).status, + HeartbeatResult::APPLIED); + ASSERT_TRUE(store.RegisterExternalKvIfAlive("n1", {"h1"}, TierType::HBM)); + + // Cutoff after n1's heartbeat but before n2's. + auto dead = store.ExpireStaleClients(kT0 + 10s); + ASSERT_EQ(dead.size(), 1u); + EXPECT_EQ(dead[0], "n1"); + + // Row KEPT but EXPIRED (hazard #3). + auto rec = store.GetClient("n1"); + ASSERT_TRUE(rec.has_value()); + EXPECT_EQ(rec->status, ClientStatus::EXPIRED); + EXPECT_FALSE(store.IsClientAlive("n1")); + + // Cascade dropped its blocks and external KV. + EXPECT_TRUE(store.LookupBlock("k1").empty()); + EXPECT_EQ(store.GetExternalKvCount("n1"), 0u); + + // n2 untouched. 
+ EXPECT_TRUE(store.IsClientAlive("n2")); +} + +TEST(InMemoryStore, ExpireStaleIsIdempotent) { + InMemoryMasterMetadataStore store; + RegisterAlive(store, "n1", kT0); + ASSERT_EQ(store.ExpireStaleClients(kT0 + 10s).size(), 1u); + // Re-tick: already EXPIRED, nothing new to report. + EXPECT_TRUE(store.ExpireStaleClients(kT0 + 10s).empty()); +} + +TEST(InMemoryStore, ExpiredRowExcludedFromAliveAccounting) { + InMemoryMasterMetadataStore store; + RegisterAlive(store, "n1", kT0); + RegisterAlive(store, "n2", kT0); + ASSERT_EQ(store.AliveClientCount(), 2u); + ASSERT_EQ(store.ExpireStaleClients(kT0 + 10s).size(), 2u); + + EXPECT_EQ(store.AliveClientCount(), 0u); // not 2, even though rows remain + EXPECT_TRUE(store.ListAliveClients().empty()); + EXPECT_TRUE(store.GetClient("n1").has_value()); // row still present +} + +// --------------------------------------------------------------------------- +// Block reads — lease/access observed via EnumerateLruForEviction +// --------------------------------------------------------------------------- + +// Helper: one-bucket eviction budget of `bytes` for the (node, tier) pair. +std::map Budget(const std::string& node, TierType tier, uint64_t bytes) { + return {{NodeTierKey{node, tier}, bytes}}; +} + +TEST(InMemoryStore, LookupBlockHasNoLeaseOrAccessSideEffects) { + InMemoryMasterMetadataStore store; + RegisterAlive(store, "n1"); + ASSERT_EQ(Beat(store, "n1", 1, {Add("k1", TierType::HBM, 10)}, kT0).status, + HeartbeatResult::APPLIED); + + // Plain read twice. + EXPECT_EQ(store.LookupBlock("k1").size(), 1u); + EXPECT_EQ(store.LookupBlock("k1").size(), 1u); + + // Not leased → still an eviction candidate at kT0. 
+ auto cands = store.EnumerateLruForEviction(Budget("n1", TierType::HBM, 1000), kT0); + ASSERT_EQ(cands.size(), 1u); + EXPECT_EQ(cands.begin()->second.size(), 1u); +} + +TEST(InMemoryStore, LookupBlockForRouteGetGrantsLeaseAndAccess) { + InMemoryMasterMetadataStore store; + RegisterAlive(store, "n1"); + ASSERT_EQ(Beat(store, "n1", 1, {Add("k1", TierType::HBM, 10)}, kT0).status, + HeartbeatResult::APPLIED); + + auto locs = store.LookupBlockForRouteGet("k1", {}, kT0, 60s); + ASSERT_EQ(locs.size(), 1u); + + // Leased until kT0+60s → filtered out of eviction at kT0+10s. + EXPECT_TRUE(store.EnumerateLruForEviction(Budget("n1", TierType::HBM, 1000), kT0 + 10s).empty()); + // After lease expiry it is a candidate again. + EXPECT_FALSE(store.EnumerateLruForEviction(Budget("n1", TierType::HBM, 1000), kT0 + 61s).empty()); +} + +TEST(InMemoryStore, RouteGetExcludeNodesNoLeaseWhenFullyExcluded) { + InMemoryMasterMetadataStore store; + RegisterAlive(store, "n1"); + ASSERT_EQ(Beat(store, "n1", 1, {Add("k1", TierType::HBM, 10)}, kT0).status, + HeartbeatResult::APPLIED); + + std::unordered_set exclude = {"n1"}; + auto locs = store.LookupBlockForRouteGet("k1", exclude, kT0, 60s); + EXPECT_TRUE(locs.empty()); // every location excluded + + // No lease granted (hazard #4) → still an eviction candidate immediately. 
+ EXPECT_FALSE(store.EnumerateLruForEviction(Budget("n1", TierType::HBM, 1000), kT0).empty()); +} + +TEST(InMemoryStore, BatchLookupForRouteGetParallelToKeys) { + InMemoryMasterMetadataStore store; + RegisterAlive(store, "n1"); + ASSERT_EQ(Beat(store, "n1", 1, {Add("k1", TierType::HBM, 10), Add("k3", TierType::HBM, 30)}, kT0) + .status, + HeartbeatResult::APPLIED); + + auto out = store.BatchLookupBlockForRouteGet({"k1", "missing", "k3"}, {}, kT0, 60s); + ASSERT_EQ(out.size(), 3u); + EXPECT_EQ(out[0].size(), 1u); + EXPECT_TRUE(out[1].empty()); // missing key + EXPECT_EQ(out[2].size(), 1u); +} + +TEST(InMemoryStore, BatchExistsBlockNoSideEffects) { + InMemoryMasterMetadataStore store; + RegisterAlive(store, "n1"); + ASSERT_EQ(Beat(store, "n1", 1, {Add("k1", TierType::HBM, 10)}, kT0).status, + HeartbeatResult::APPLIED); + + auto exists = store.BatchExistsBlock({"k1", "missing"}); + ASSERT_EQ(exists.size(), 2u); + EXPECT_TRUE(exists[0]); + EXPECT_FALSE(exists[1]); + + // No lease granted by an existence check. + EXPECT_FALSE(store.EnumerateLruForEviction(Budget("n1", TierType::HBM, 1000), kT0).empty()); +} + +// --------------------------------------------------------------------------- +// EnumerateLruForEviction +// --------------------------------------------------------------------------- + +TEST(InMemoryStore, EvictionLruOrderAndBudget) { + InMemoryMasterMetadataStore store; + RegisterAlive(store, "n1"); + // Three keys, each 100 bytes, accessed at increasing times so LRU order is + // k_old < k_mid < k_new. + ASSERT_EQ(Beat(store, "n1", 1, {Add("k_old", TierType::HBM, 100)}, kT0).status, + HeartbeatResult::APPLIED); + ASSERT_EQ(Beat(store, "n1", 2, {Add("k_mid", TierType::HBM, 100)}, kT0 + 1s).status, + HeartbeatResult::APPLIED); + ASSERT_EQ(Beat(store, "n1", 3, {Add("k_new", TierType::HBM, 100)}, kT0 + 2s).status, + HeartbeatResult::APPLIED); + + // Budget 150 bytes → should take the two oldest (200 bytes ≥ 150 after second). 
+ auto cands = store.EnumerateLruForEviction(Budget("n1", TierType::HBM, 150), kT0 + 10s); + ASSERT_EQ(cands.size(), 1u); + auto& bucket = cands.at(NodeTierKey{"n1", TierType::HBM}); + ASSERT_EQ(bucket.size(), 2u); + EXPECT_EQ(bucket[0].key, "k_old"); // oldest first + EXPECT_EQ(bucket[1].key, "k_mid"); +} + +TEST(InMemoryStore, EvictionSkipsLeased) { + InMemoryMasterMetadataStore store; + RegisterAlive(store, "n1"); + ASSERT_EQ(Beat(store, "n1", 1, {Add("k1", TierType::HBM, 100)}, kT0).status, + HeartbeatResult::APPLIED); + // Lease k1 well past the enumeration time. + store.LookupBlockForRouteGet("k1", {}, kT0, 1h); + EXPECT_TRUE(store.EnumerateLruForEviction(Budget("n1", TierType::HBM, 1000), kT0 + 1s).empty()); +} + +TEST(InMemoryStore, EvictionTieTimestampsAllSurvive) { + // §2d correctness claim: many candidates sharing one identical last_accessed_at + // (the common case, since a batch RouteGet stamps one `now` across all keys) + // must all be enumerable — none dropped by tie collisions. + InMemoryMasterMetadataStore store; + RegisterAlive(store, "n1"); + std::vector adds; + for (int i = 0; i < 50; ++i) { + adds.push_back(Add("k" + std::to_string(i), TierType::HBM, 10)); + } + // All keys created (and thus last_accessed) at the identical instant kT0. + ASSERT_EQ(Beat(store, "n1", 1, adds, kT0).status, HeartbeatResult::APPLIED); + + // Huge budget → take everything; all 50 tied-timestamp candidates must appear. + auto cands = store.EnumerateLruForEviction(Budget("n1", TierType::HBM, 100000), kT0 + 10s); + ASSERT_EQ(cands.size(), 1u); + EXPECT_EQ(cands.at(NodeTierKey{"n1", TierType::HBM}).size(), 50u); +} + +TEST(InMemoryStore, EvictionOnlyBudgetedBuckets) { + InMemoryMasterMetadataStore store; + RegisterAlive(store, "n1"); + ASSERT_EQ(Beat(store, "n1", 1, {Add("kh", TierType::HBM, 10), Add("kd", TierType::DRAM, 10)}, kT0) + .status, + HeartbeatResult::APPLIED); + // Only ask about the HBM bucket. 
+ auto cands = store.EnumerateLruForEviction(Budget("n1", TierType::HBM, 1000), kT0 + 1s); + ASSERT_EQ(cands.size(), 1u); + EXPECT_EQ(cands.begin()->first.tier, TierType::HBM); +} + +// --------------------------------------------------------------------------- +// External KV +// --------------------------------------------------------------------------- + +TEST(InMemoryStore, RegisterExternalKvAliveGate) { + InMemoryMasterMetadataStore store; + // Dead/unknown node → rejected, nothing written. + EXPECT_FALSE(store.RegisterExternalKvIfAlive("ghost", {"h1"}, TierType::HBM)); + EXPECT_TRUE(store.MatchExternalKv({"h1"}, false, kT0).empty()); + + RegisterAlive(store, "n1"); + EXPECT_TRUE(store.RegisterExternalKvIfAlive("n1", {"h1"}, TierType::HBM)); + EXPECT_EQ(store.MatchExternalKv({"h1"}, false, kT0).size(), 1u); +} + +TEST(InMemoryStore, UnregisterExternalKvAndByTier) { + InMemoryMasterMetadataStore store; + RegisterAlive(store, "n1"); + ASSERT_TRUE(store.RegisterExternalKvIfAlive("n1", {"h1"}, TierType::HBM)); + ASSERT_TRUE(store.RegisterExternalKvIfAlive("n1", {"h1"}, TierType::DRAM)); + + // Remove only the HBM tier; DRAM remains. + store.UnregisterExternalKv("n1", {"h1"}, TierType::HBM); + auto m = store.MatchExternalKv({"h1"}, false, kT0); + ASSERT_EQ(m.size(), 1u); + EXPECT_EQ(m[0].hashes_by_tier.count(TierType::HBM), 0u); + EXPECT_EQ(m[0].hashes_by_tier.count(TierType::DRAM), 1u); + + // Whole-tier wipe of DRAM → entry gone. + store.UnregisterExternalKvByTier("n1", TierType::DRAM); + EXPECT_TRUE(store.MatchExternalKv({"h1"}, false, kT0).empty()); +} + +TEST(InMemoryStore, MatchCountsHitsWhenRequested) { + InMemoryMasterMetadataStore store; + RegisterAlive(store, "n1"); + ASSERT_TRUE(store.RegisterExternalKvIfAlive("n1", {"h1", "h2"}, TierType::HBM)); + + // count_as_hit=false: pure read, hit map untouched. 
+ store.MatchExternalKv({"h1", "h2"}, /*count_as_hit=*/false, kT0); + EXPECT_TRUE(store.GetExternalKvHitCounts({"h1", "h2"}).empty()); + + // count_as_hit=true: increments accumulate across calls. + store.MatchExternalKv({"h1", "h2"}, /*count_as_hit=*/true, kT0); + store.MatchExternalKv({"h1"}, /*count_as_hit=*/true, kT0 + 1s); + + auto counts = store.GetExternalKvHitCounts({"h1", "h2"}); + std::map by_hash; + for (const auto& e : counts) by_hash[e.hash] = e.hit_count_total; + EXPECT_EQ(by_hash["h1"], 2u); + EXPECT_EQ(by_hash["h2"], 1u); +} + +TEST(InMemoryStore, MatchedHashCountAcrossTiers) { + // Preserves the NodeMatch::MatchedHashCount coverage from + // test_external_kv_block_index.cpp:57 — one hash mirrored across two tiers + // counts once. + InMemoryMasterMetadataStore store; + RegisterAlive(store, "n1"); + ASSERT_TRUE(store.RegisterExternalKvIfAlive("n1", {"h1"}, TierType::HBM)); + ASSERT_TRUE(store.RegisterExternalKvIfAlive("n1", {"h1"}, TierType::DRAM)); + + auto m = store.MatchExternalKv({"h1"}, false, kT0); + ASSERT_EQ(m.size(), 1u); + EXPECT_EQ(m[0].hashes_by_tier.size(), 2u); // appears in two tier buckets + EXPECT_EQ(m[0].MatchedHashCount(), 1u); // but is one unique hash +} + +TEST(InMemoryStore, GetExternalKvHitCountsDedupesAndSkipsMissing) { + InMemoryMasterMetadataStore store; + RegisterAlive(store, "n1"); + ASSERT_TRUE(store.RegisterExternalKvIfAlive("n1", {"h1"}, TierType::HBM)); + store.MatchExternalKv({"h1"}, true, kT0); + + auto counts = store.GetExternalKvHitCounts({"missing", "h1", "h1"}); + ASSERT_EQ(counts.size(), 1u); + EXPECT_EQ(counts[0].hash, "h1"); + EXPECT_EQ(counts[0].hit_count_total, 1u); +} + +TEST(InMemoryStore, GarbageCollectHitsByLastSeen) { + InMemoryMasterMetadataStore store; + RegisterAlive(store, "n1"); + ASSERT_TRUE(store.RegisterExternalKvIfAlive("n1", {"old", "fresh"}, TierType::HBM)); + store.MatchExternalKv({"old"}, true, kT0); + store.MatchExternalKv({"fresh"}, true, kT0 + 100s); + + // Drop entries last seen 
before kT0+50s → only "old" goes. + EXPECT_EQ(store.GarbageCollectHits(kT0 + 50s), 1u); + + auto counts = store.GetExternalKvHitCounts({"old", "fresh"}); + ASSERT_EQ(counts.size(), 1u); + EXPECT_EQ(counts[0].hash, "fresh"); +} + +TEST(InMemoryStore, UnregisterExternalKvByNodeWipesAllTiersOnly) { + // Whole-node external-KV wipe (backs RevokeAllExternalKvBlocksForNode). Unlike + // UnregisterClient, it must NOT touch the client record or block locations. + InMemoryMasterMetadataStore store; + RegisterAlive(store, "n1"); + ASSERT_EQ(Beat(store, "n1", 1, {Add("k1", TierType::HBM, 10)}, kT0).status, + HeartbeatResult::APPLIED); + ASSERT_TRUE(store.RegisterExternalKvIfAlive("n1", {"h1", "h2"}, TierType::HBM)); + ASSERT_TRUE(store.RegisterExternalKvIfAlive("n1", {"h1"}, TierType::DRAM)); + ASSERT_EQ(store.GetExternalKvCount("n1"), 2u); + + store.UnregisterExternalKvByNode("n1"); + + // External KV gone across every tier. + EXPECT_EQ(store.GetExternalKvCount("n1"), 0u); + EXPECT_TRUE(store.MatchExternalKv({"h1", "h2"}, false, kT0).empty()); + + // Client record and block locations untouched (distinguishes from UnregisterClient). + EXPECT_TRUE(store.IsClientAlive("n1")); + EXPECT_EQ(store.LookupBlock("k1").size(), 1u); +} + +TEST(InMemoryStore, UnregisterExternalKvByNodeUnknownIsNoOp) { + InMemoryMasterMetadataStore store; + store.UnregisterExternalKvByNode("ghost"); // must not crash + EXPECT_EQ(store.GetExternalKvCount("ghost"), 0u); +} + +// --------------------------------------------------------------------------- +// Client reads — GetPeerAddress, GetClientTags, ListAliveClients content +// --------------------------------------------------------------------------- + +TEST(InMemoryStore, GetPeerAddressAliveExpiredAndUnknown) { + InMemoryMasterMetadataStore store; + RegisterAlive(store, "n1"); + + // ALIVE → peer surfaced (MakeReg sets peer_address to "peer:" + node_id). 
+ auto alive = store.GetPeerAddress("n1"); + ASSERT_TRUE(alive.has_value()); + EXPECT_EQ(*alive, "peer:n1"); + + // EXPIRED rows still surface their peer_address (contract: the row is kept). + ASSERT_EQ(store.ExpireStaleClients(kT0 + 10s).size(), 1u); + auto expired = store.GetPeerAddress("n1"); + ASSERT_TRUE(expired.has_value()); + EXPECT_EQ(*expired, "peer:n1"); + + // Unknown node → nullopt. + EXPECT_FALSE(store.GetPeerAddress("ghost").has_value()); +} + +TEST(InMemoryStore, GetClientTagsReturnsRegisteredTagsAndEmptyForUnknown) { + InMemoryMasterMetadataStore store; + RegisterAlive(store, "n1"); // MakeReg sets tags = {"role=test"} + + auto tags = store.GetClientTags("n1"); + ASSERT_EQ(tags.size(), 1u); + EXPECT_EQ(tags[0], "role=test"); + + EXPECT_TRUE(store.GetClientTags("ghost").empty()); +} + +TEST(InMemoryStore, ListAliveClientsReturnsAliveRecordsExcludingExpired) { + InMemoryMasterMetadataStore store; + RegisterAlive(store, "n1", kT0); + RegisterAlive(store, "n2", kT0 + 20s); // fresher, survives the cutoff below + + // Expire only n1. + ASSERT_EQ(store.ExpireStaleClients(kT0 + 10s).size(), 1u); + + auto alive = store.ListAliveClients(); + ASSERT_EQ(alive.size(), 1u); // n1 excluded even though its row still exists + EXPECT_EQ(alive[0].node_id, "n2"); + EXPECT_EQ(alive[0].status, ClientStatus::ALIVE); + EXPECT_EQ(alive[0].peer_address, "peer:n2"); +} + +// --------------------------------------------------------------------------- +// Concurrency +// --------------------------------------------------------------------------- + +TEST(InMemoryStore, ConcurrentHeartbeatCasExactlyOneApplied) { + InMemoryMasterMetadataStore store; + RegisterAlive(store, "n1"); + + std::atomic applied{0}; + std::atomic gap{0}; + std::atomic start{false}; + std::vector threads; + for (int t = 0; t < 2; ++t) { + threads.emplace_back([&] { + while (!start.load(std::memory_order_acquire)) std::this_thread::yield(); + // Both race to apply seq=1 (last_applied starts at 0). 
+ auto r = store.ApplyHeartbeat("n1", 1, kT0, Caps(), {}, /*is_full_sync=*/false); + if (r.status == HeartbeatResult::APPLIED) { + applied.fetch_add(1); + } else if (r.status == HeartbeatResult::SEQ_GAP) { + gap.fetch_add(1); + } + }); + } + start.store(true, std::memory_order_release); + for (auto& th : threads) th.join(); + + EXPECT_EQ(applied.load(), 1); + EXPECT_EQ(gap.load(), 1); + EXPECT_EQ(store.GetClient("n1")->last_applied_seq, 1u); +} + +// ThreadSanitizer safety net for collapsing four lock domains into one: a mixed +// read/write workload across the shared/unique split must be race-free. +TEST(InMemoryStore, MixedWorkloadIsRaceFree) { + InMemoryMasterMetadataStore store; + RegisterAlive(store, "n1"); + for (int i = 0; i < 100; ++i) { + store.ApplyHeartbeat("n1", i + 1, kT0, Caps(), + {Add("k" + std::to_string(i), TierType::HBM, 10)}, + /*is_full_sync=*/false); + } + ASSERT_TRUE(store.RegisterExternalKvIfAlive("n1", {"h1", "h2", "h3"}, TierType::HBM)); + + std::atomic start{false}; + std::vector threads; + + // RouteGet readers (shared-lock path with atomic lease/access mutation). + for (int r = 0; r < 4; ++r) { + threads.emplace_back([&] { + while (!start.load(std::memory_order_acquire)) std::this_thread::yield(); + for (int i = 0; i < 500; ++i) { + store.BatchLookupBlockForRouteGet({"k1", "k50", "k99"}, {}, kT0 + std::chrono::seconds(i), + 30s); + store.BatchExistsBlock({"k1", "k2"}); + } + }); + } + // Hit writers (the formerly-shared path that becomes exclusive). + threads.emplace_back([&] { + while (!start.load(std::memory_order_acquire)) std::this_thread::yield(); + for (int i = 0; i < 500; ++i) { + store.MatchExternalKv({"h1", "h2", "h3"}, /*count_as_hit=*/true, + kT0 + std::chrono::seconds(i)); + } + }); + // Eviction-enumeration reader. 
+ threads.emplace_back([&] { + while (!start.load(std::memory_order_acquire)) std::this_thread::yield(); + for (int i = 0; i < 500; ++i) { + store.EnumerateLruForEviction(Budget("n1", TierType::HBM, 50), kT0 + std::chrono::seconds(i)); + } + }); + + start.store(true, std::memory_order_release); + for (auto& th : threads) th.join(); + + // After the storm, hit counts reflect exactly the 500 hit-writer iterations. + auto counts = store.GetExternalKvHitCounts({"h1"}); + ASSERT_EQ(counts.size(), 1u); + EXPECT_EQ(counts[0].hit_count_total, 500u); +} + +} // namespace +} // namespace mori::umbp diff --git a/tests/cpp/umbp/distributed/test_master_metadata_store_interface.cpp b/tests/cpp/umbp/distributed/test_master_metadata_store_interface.cpp new file mode 100644 index 000000000..ed82a3d2a --- /dev/null +++ b/tests/cpp/umbp/distributed/test_master_metadata_store_interface.cpp @@ -0,0 +1,133 @@ +// Copyright © Advanced Micro Devices, Inc. All rights reserved. +// +// MIT License +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in all +// copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +// SOFTWARE. + +// Phase 1 compile/instantiation gate for IMasterMetadataStore. +// +// The interface is abstract with no implementation yet, so there is no runtime +// behavior to exercise. The bar for Phase 1 is that the contract is well-formed +// and instantiable: +// 1. MockMasterMetadataStore overrides every pure-virtual (a missing or +// ill-typed override makes the mock abstract → fails to instantiate). +// 2. A MockMasterMetadataStore is usable through an IMasterMetadataStore&, +// proving the override set is complete. +// Behavioral assertions arrive with InMemoryMasterMetadataStore in Phase 2. + +#include + +#include + +#include "mock_master_metadata_store.h" +#include "umbp/distributed/master/master_metadata_store.h" + +namespace mori::umbp { +namespace { + +// Instantiation gate: if the interface had an orphaned or ill-typed pure +// virtual, MockMasterMetadataStore would stay abstract and this would not +// compile. +TEST(MasterMetadataStoreInterface, MockIsInstantiableThroughInterface) { + MockMasterMetadataStore mock; + IMasterMetadataStore& store = mock; + (void)store; + SUCCEED(); +} + +// Signature-completeness spot check: name every interface method once through +// the base-class pointer with a default ON_CALL, mirroring the §1b delta table +// plus the two added hit-count methods (GetExternalKvHitCounts, +// GarbageCollectHits) and the `now` parameter on MatchExternalKv. This guards +// against silently dropping the live GetExternalKvHitCounts RPC path. 
+TEST(MasterMetadataStoreInterface, EveryMethodIsCallableThroughInterface) { + using ::testing::_; + using ::testing::NiceMock; + using ::testing::Return; + using namespace std::chrono_literals; + + // NiceMock: these are default-action calls, not behavior under test, so the + // "uninteresting call" warnings would just be noise. + NiceMock mock; + const auto now = std::chrono::system_clock::now(); + + ON_CALL(mock, RegisterClient(_, _, _)).WillByDefault(Return(true)); + ON_CALL(mock, ApplyHeartbeat(_, _, _, _, _, _)) + .WillByDefault(Return(HeartbeatResult{HeartbeatResult::APPLIED, 0})); + ON_CALL(mock, ExpireStaleClients(_)).WillByDefault(Return(std::vector{})); + ON_CALL(mock, RegisterExternalKvIfAlive(_, _, _)).WillByDefault(Return(true)); + ON_CALL(mock, GarbageCollectHits(_)).WillByDefault(Return(0)); + ON_CALL(mock, LookupBlock(_)).WillByDefault(Return(std::vector{})); + ON_CALL(mock, LookupBlockForRouteGet(_, _, _, _)).WillByDefault(Return(std::vector{})); + ON_CALL(mock, BatchLookupBlockForRouteGet(_, _, _, _)) + .WillByDefault(Return(std::vector>{})); + ON_CALL(mock, BatchExistsBlock(_)).WillByDefault(Return(std::vector{})); + ON_CALL(mock, EnumerateLruForEviction(_, _)) + .WillByDefault(Return(std::map>{})); + ON_CALL(mock, GetClient(_)).WillByDefault(Return(std::nullopt)); + ON_CALL(mock, IsClientAlive(_)).WillByDefault(Return(false)); + ON_CALL(mock, GetPeerAddress(_)).WillByDefault(Return(std::nullopt)); + ON_CALL(mock, ListAliveClients()).WillByDefault(Return(std::vector{})); + ON_CALL(mock, AliveClientCount()).WillByDefault(Return(0)); + ON_CALL(mock, GetClientTags(_)).WillByDefault(Return(std::vector{})); + ON_CALL(mock, MatchExternalKv(_, _, _)).WillByDefault(Return(std::vector{})); + ON_CALL(mock, GetExternalKvHitCounts(_)) + .WillByDefault(Return(std::vector{})); + ON_CALL(mock, GetExternalKvCount(_)).WillByDefault(Return(0)); + + IMasterMetadataStore& store = mock; + + // Cross-store writes. 
+ ClientRegistration reg; + reg.node_id = "node-a"; + EXPECT_TRUE(store.RegisterClient(reg, now, 30s)); + store.UnregisterClient("node-a"); + EXPECT_EQ(store.ApplyHeartbeat("node-a", 1, now, {}, {}, false).status, HeartbeatResult::APPLIED); + EXPECT_TRUE(store.ExpireStaleClients(now).empty()); + + // External-KV writes. + EXPECT_TRUE(store.RegisterExternalKvIfAlive("node-a", {"h0"}, TierType::HBM)); + store.UnregisterExternalKv("node-a", {"h0"}, TierType::HBM); + store.UnregisterExternalKvByTier("node-a", TierType::HBM); + store.UnregisterExternalKvByNode("node-a"); + EXPECT_EQ(store.GarbageCollectHits(now), 0u); + + // Block reads. + EXPECT_TRUE(store.LookupBlock("k0").empty()); + EXPECT_TRUE(store.LookupBlockForRouteGet("k0", {}, now, 5s).empty()); + EXPECT_TRUE(store.BatchLookupBlockForRouteGet({"k0"}, {}, now, 5s).empty()); + EXPECT_TRUE(store.BatchExistsBlock({"k0"}).empty()); + EXPECT_TRUE(store.EnumerateLruForEviction({}, now).empty()); + + // Client reads. + EXPECT_FALSE(store.GetClient("node-a").has_value()); + EXPECT_FALSE(store.IsClientAlive("node-a")); + EXPECT_FALSE(store.GetPeerAddress("node-a").has_value()); + EXPECT_TRUE(store.ListAliveClients().empty()); + EXPECT_EQ(store.AliveClientCount(), 0u); + EXPECT_TRUE(store.GetClientTags("node-a").empty()); + + // External-KV reads, incl. the two added hit-count methods + `now` param. + EXPECT_TRUE(store.MatchExternalKv({"h0"}, /*count_as_hit=*/true, now).empty()); + EXPECT_TRUE(store.GetExternalKvHitCounts({"h0"}).empty()); + EXPECT_EQ(store.GetExternalKvCount("node-a"), 0u); +} + +} // namespace +} // namespace mori::umbp diff --git a/tests/cpp/umbp/distributed/test_peer_dram_allocator.cpp b/tests/cpp/umbp/distributed/test_peer_dram_allocator.cpp new file mode 100644 index 000000000..f76b43a5c --- /dev/null +++ b/tests/cpp/umbp/distributed/test_peer_dram_allocator.cpp @@ -0,0 +1,899 @@ +// Copyright © Advanced Micro Devices, Inc. All rights reserved. 
+// +// MIT License +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in all +// copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +// SOFTWARE. +#include + +#include +#include +#include +#include +#include +#include +#include + +#include "umbp/distributed/peer/peer_dram_allocator.h" + +namespace mori::umbp { + +namespace { + +// 3 buffers x 4 pages of 1 KiB = 12 KiB total DRAM. 
+constexpr uint64_t kPageSize = 1024; + +PeerDramAllocator::TierConfig MakeDramCfg() { + PeerDramAllocator::TierConfig cfg; + cfg.buffer_sizes = {kPageSize * 4, kPageSize * 4, kPageSize * 4}; + cfg.buffer_descs = {{0xA0, 0xA1}, {0xB0, 0xB1}, {0xC0, 0xC1}}; + return cfg; +} + +PeerDramAllocator::TierConfig EmptyCfg() { return {}; } + +std::unique_ptr MakeAllocator( + std::chrono::milliseconds pending_ttl = std::chrono::milliseconds{5000}, + std::chrono::milliseconds read_lease_ttl = std::chrono::milliseconds{500}) { + return std::make_unique(kPageSize, MakeDramCfg(), EmptyCfg(), pending_ttl, + read_lease_ttl); +} + +// Strip AllocateResult down to its slot for tests that don't exercise +// the dedup outcome. +std::optional AllocateOk(PeerDramAllocator& a, + const std::string& key, uint64_t size, + TierType tier) { + return a.Allocate(key, size, tier).slot; +} + +} // namespace + +// ---- Allocate / Commit / Resolve happy path --------------------------------- + +TEST(PeerDramAllocator, CommitMakesKeyResolvable) { + auto a = MakeAllocator(); + auto pending = AllocateOk(*a, "key-1", kPageSize, TierType::DRAM); + ASSERT_TRUE(pending.has_value()); + EXPECT_EQ(pending->size, kPageSize); + EXPECT_EQ(pending->pages.size(), 1u); + + uint64_t committed_bytes = 0; + ASSERT_TRUE(a->Commit(pending->slot_id, "key-1", committed_bytes)); + EXPECT_EQ(committed_bytes, pending->size); + auto r = a->Resolve("key-1"); + EXPECT_TRUE(r.found); + EXPECT_EQ(r.size, kPageSize); + EXPECT_EQ(r.tier, TierType::DRAM); + EXPECT_EQ(r.pages, pending->pages); + + auto events = a->DrainPendingEvents(); + ASSERT_EQ(events.size(), 1u); + EXPECT_EQ(events[0].kind, KvEvent::Kind::ADD); + EXPECT_EQ(events[0].key, "key-1"); + EXPECT_EQ(events[0].size, kPageSize); + EXPECT_EQ(events[0].tier, TierType::DRAM); +} + +// ---- Allocate-side dedup ---------------------------------------------------- +// Defensive layer for master-index lag (primary dedup is at BatchRoutePut). 
+ +TEST(PeerDramAllocator, AllocateRejectsAlreadyOwnedKey) { + auto a = MakeAllocator(); + + auto first = AllocateOk(*a, "A", kPageSize, TierType::DRAM); + ASSERT_TRUE(first.has_value()); + uint64_t committed_bytes = 0; + ASSERT_TRUE(a->Commit(first->slot_id, "A", committed_bytes)); + a->DrainPendingEvents(); + + const auto cap_after_commit = a->TierCapacitiesSnapshot()[TierType::DRAM]; + + auto second = a->Allocate("A", kPageSize, TierType::DRAM); + EXPECT_EQ(second.outcome, PeerDramAllocator::Outcome::kSuccessAlreadyExists); + EXPECT_FALSE(second.slot.has_value()); + + // No pages reserved -> capacity unchanged. + const auto cap_after_dedup = a->TierCapacitiesSnapshot()[TierType::DRAM]; + EXPECT_EQ(cap_after_dedup.available_bytes, cap_after_commit.available_bytes); +} + +TEST(PeerDramAllocator, AllocateAllowsDifferentKey) { + auto a = MakeAllocator(); + + auto first = AllocateOk(*a, "A", kPageSize, TierType::DRAM); + ASSERT_TRUE(first.has_value()); + uint64_t committed_bytes = 0; + ASSERT_TRUE(a->Commit(first->slot_id, "A", committed_bytes)); + + auto second = a->Allocate("B", kPageSize, TierType::DRAM); + EXPECT_EQ(second.outcome, PeerDramAllocator::Outcome::kSuccessAllocated); + ASSERT_TRUE(second.slot.has_value()); + EXPECT_TRUE(a->Commit(second.slot->slot_id, "B", committed_bytes)); +} + +// Lax mode: pending_ not checked. Two same-key Allocates before any +// Commit both succeed; race absorbed by Commit() (see +// DuplicateCommitIsIdempotentAndKeepsFirst). 
+TEST(PeerDramAllocator, AllocateDoesNotRejectOnPendingDuplicate) { + auto a = MakeAllocator(); + + auto first = a->Allocate("A", kPageSize, TierType::DRAM); + EXPECT_EQ(first.outcome, PeerDramAllocator::Outcome::kSuccessAllocated); + ASSERT_TRUE(first.slot.has_value()); + + auto second = a->Allocate("A", kPageSize, TierType::DRAM); + EXPECT_EQ(second.outcome, PeerDramAllocator::Outcome::kSuccessAllocated); + ASSERT_TRUE(second.slot.has_value()); + ASSERT_NE(second.slot->slot_id, first.slot->slot_id); +} + +// ---- Duplicate Commit idempotency ------------------------------------------- +// Race-window safety net. Both Allocates must happen BEFORE either +// Commit — once owned_["dup-key"] is set, the new owned_-check in +// Allocate would reject the second slot before it could reach Commit. + +TEST(PeerDramAllocator, DuplicateCommitIsIdempotentAndKeepsFirst) { + auto a = MakeAllocator(); + + auto first = AllocateOk(*a, "dup-key", kPageSize, TierType::DRAM); + ASSERT_TRUE(first.has_value()); + auto second = AllocateOk(*a, "dup-key", kPageSize, TierType::DRAM); + ASSERT_TRUE(second.has_value()); + ASSERT_NE(second->slot_id, first->slot_id); + + const auto first_pages = first->pages; + + uint64_t committed_bytes = 0; + ASSERT_TRUE(a->Commit(first->slot_id, "dup-key", committed_bytes)); + EXPECT_EQ(committed_bytes, kPageSize); + + auto events = a->DrainPendingEvents(); + ASSERT_EQ(events.size(), 1u); + EXPECT_EQ(events[0].kind, KvEvent::Kind::ADD); + EXPECT_EQ(events[0].key, "dup-key"); + + // First owned (1 page) + second still pending (1 page) = 2 occupied. + const auto cap_after_first_commit = a->TierCapacitiesSnapshot()[TierType::DRAM]; + EXPECT_EQ(cap_after_first_commit.available_bytes, + cap_after_first_commit.total_bytes - 2 * kPageSize); + + // Duplicate Commit: idempotent success, consumes the second pending + // (caller never needs to Abort it), prior owned slot unchanged. 
+ committed_bytes = 0; + ASSERT_TRUE(a->Commit(second->slot_id, "dup-key", committed_bytes)); + EXPECT_EQ(committed_bytes, kPageSize); + + // Master's view unchanged: no REMOVE, no second ADD. + EXPECT_TRUE(a->DrainPendingEvents().empty()); + + // Resolve still returns the first commit's pages. + auto r = a->Resolve("dup-key"); + ASSERT_TRUE(r.found); + EXPECT_EQ(r.pages, first_pages); + EXPECT_EQ(r.size, kPageSize); + + // Second slot's pages freed -> only first occupies (1 page). + const auto cap_after_dup = a->TierCapacitiesSnapshot()[TierType::DRAM]; + EXPECT_EQ(cap_after_dup.available_bytes, cap_after_dup.total_bytes - kPageSize); + EXPECT_EQ(cap_after_dup.total_bytes, cap_after_first_commit.total_bytes); + + // Second slot_id no longer pending; idempotent Abort returns true. + EXPECT_TRUE(a->Abort(second->slot_id)); + EXPECT_TRUE(a->DrainPendingEvents().empty()); +} + +// ---- ENOSPC ----------------------------------------------------------------- + +TEST(PeerDramAllocator, AllocateReturnsNulloptWhenFull) { + auto a = MakeAllocator(); + std::vector slot_ids; + for (int i = 0; i < 12; ++i) { + auto p = AllocateOk(*a, "k-" + std::to_string(i), kPageSize, TierType::DRAM); + ASSERT_TRUE(p.has_value()) << "i=" << i; + slot_ids.push_back(p->slot_id); + } + EXPECT_FALSE(AllocateOk(*a, "k-overflow", kPageSize, TierType::DRAM).has_value()); + + EXPECT_TRUE(a->Abort(slot_ids.back())); + EXPECT_TRUE(AllocateOk(*a, "k-recovered", kPageSize, TierType::DRAM).has_value()); +} + +TEST(PeerDramAllocator, UnconfiguredTierReturnsNullopt) { + auto a = MakeAllocator(); + EXPECT_FALSE(AllocateOk(*a, "k", kPageSize, TierType::HBM).has_value()); +} + +// ---- Pending TTL ------------------------------------------------------------ + +TEST(PeerDramAllocator, PendingSlotExpiresAfterTtl) { + auto a = std::make_unique(kPageSize, MakeDramCfg(), EmptyCfg(), + /*pending_ttl=*/std::chrono::milliseconds{1}); + auto pending = AllocateOk(*a, "key-late", kPageSize, TierType::DRAM); + 
ASSERT_TRUE(pending.has_value()); + + std::this_thread::sleep_for(std::chrono::milliseconds{20}); + a->RunReaperOnceForTest(); + + uint64_t committed_bytes = 0; + EXPECT_FALSE(a->Commit(pending->slot_id, "key-late", committed_bytes)); + EXPECT_EQ(committed_bytes, 0u); + EXPECT_TRUE(a->DrainPendingEvents().empty()); + + auto cap = a->TierCapacitiesSnapshot(); + EXPECT_EQ(cap[TierType::DRAM].available_bytes, cap[TierType::DRAM].total_bytes); +} + +// ---- Abort idempotency ------------------------------------------------------ + +TEST(PeerDramAllocator, AbortIsIdempotent) { + auto a = MakeAllocator(); + auto pending = AllocateOk(*a, "k", kPageSize, TierType::DRAM); + ASSERT_TRUE(pending.has_value()); + EXPECT_TRUE(a->Abort(pending->slot_id)); + EXPECT_TRUE(a->Abort(pending->slot_id)); + EXPECT_TRUE(a->Abort(999999)); + EXPECT_TRUE(a->DrainPendingEvents().empty()); +} + +// ---- Evict idempotency + REMOVE event --------------------------------------- + +TEST(PeerDramAllocator, EvictRemovesKeyAndQueuesEvent) { + auto a = MakeAllocator(); + auto p = AllocateOk(*a, "k", kPageSize, TierType::DRAM); + uint64_t committed_bytes = 0; + ASSERT_TRUE(a->Commit(p->slot_id, "k", committed_bytes)); + EXPECT_EQ(committed_bytes, p->size); + a->DrainPendingEvents(); + + auto results = a->Evict({"k", "ghost"}); + ASSERT_EQ(results.size(), 2u); + EXPECT_EQ(results[0].key, "k"); + EXPECT_EQ(results[0].bytes_freed, kPageSize); + EXPECT_EQ(results[1].key, "ghost"); + EXPECT_EQ(results[1].bytes_freed, 0u); + + auto events = a->DrainPendingEvents(); + ASSERT_EQ(events.size(), 1u); + EXPECT_EQ(events[0].kind, KvEvent::Kind::REMOVE); + EXPECT_EQ(events[0].key, "k"); + + EXPECT_FALSE(a->Resolve("k").found); + + results = a->Evict({"k"}); + EXPECT_EQ(results[0].bytes_freed, 0u); + EXPECT_TRUE(a->DrainPendingEvents().empty()); +} + +// ---- Resolve-during-Evict race --------------------------------------------- + +TEST(PeerDramAllocator, EvictDefersWhenReadLeaseActive) { + auto a = 
std::make_unique(kPageSize, MakeDramCfg(), EmptyCfg(), + /*pending_ttl=*/std::chrono::milliseconds{5000}, + /*read_lease_ttl=*/std::chrono::milliseconds{200}); + auto p = AllocateOk(*a, "k", kPageSize, TierType::DRAM); + uint64_t committed_bytes = 0; + ASSERT_TRUE(a->Commit(p->slot_id, "k", committed_bytes)); + EXPECT_EQ(committed_bytes, p->size); + a->DrainPendingEvents(); + + auto r = a->Resolve("k"); + ASSERT_TRUE(r.found); + + auto results = a->Evict({"k"}); + EXPECT_EQ(results[0].bytes_freed, 0u); + EXPECT_TRUE(a->Resolve("k").found); + EXPECT_TRUE(a->DrainPendingEvents().empty()); + + std::this_thread::sleep_for(std::chrono::milliseconds{300}); + a->RunReaperOnceForTest(); + results = a->Evict({"k"}); + EXPECT_EQ(results[0].bytes_freed, kPageSize); + auto events = a->DrainPendingEvents(); + ASSERT_EQ(events.size(), 1u); + EXPECT_EQ(events[0].kind, KvEvent::Kind::REMOVE); +} + +// ---- Full-sync snapshot ----------------------------------------------------- + +TEST(PeerDramAllocator, SnapshotOwnedKeysReturnsEveryAdd) { + auto a = MakeAllocator(); + for (int i = 0; i < 5; ++i) { + const std::string k = "k-" + std::to_string(i); + auto p = AllocateOk(*a, k, kPageSize, TierType::DRAM); + ASSERT_TRUE(p.has_value()); + uint64_t committed_bytes = 0; + ASSERT_TRUE(a->Commit(p->slot_id, k, committed_bytes)); + EXPECT_EQ(committed_bytes, p->size); + } + a->DrainPendingEvents(); + + auto snap = a->SnapshotOwnedKeys(); + ASSERT_EQ(snap.size(), 5u); + for (const auto& ev : snap) { + EXPECT_EQ(ev.kind, KvEvent::Kind::ADD); + EXPECT_EQ(ev.size, kPageSize); + EXPECT_EQ(ev.tier, TierType::DRAM); + } + EXPECT_TRUE(a->DrainPendingEvents().empty()); +} + +// ---- Buffer descs filtered to the page set --------------------------------- + +TEST(PeerDramAllocator, BufferDescsForPagesDedupAndOrder) { + auto a = MakeAllocator(); + auto p = AllocateOk(*a, "k", kPageSize * 5, TierType::DRAM); + ASSERT_TRUE(p.has_value()); + ASSERT_EQ(p->pages.size(), 5u); + + auto descs = 
a->BufferDescsForPages(TierType::DRAM, p->pages); + ASSERT_EQ(descs.size(), 2u); + EXPECT_EQ(descs[0].buffer_index, 0u); + EXPECT_EQ(descs[1].buffer_index, 1u); + EXPECT_EQ(descs[0].desc_bytes, std::vector({0xA0, 0xA1})); + EXPECT_EQ(descs[1].desc_bytes, std::vector({0xB0, 0xB1})); +} + +// ---- BatchAllocate / BatchCommit / BatchAbort ------------------------------- + +TEST(PeerDramAllocator, BatchAllocateEmptyInputReturnsEmpty) { + auto a = MakeAllocator(); + EXPECT_TRUE(a->BatchAllocate({}).empty()); +} + +TEST(PeerDramAllocator, BatchAllocateMixedOutcomesAndDescs) { + auto a = MakeAllocator(); + auto owned = AllocateOk(*a, "owned", kPageSize, TierType::DRAM); + ASSERT_TRUE(owned.has_value()); + uint64_t committed_bytes = 0; + ASSERT_TRUE(a->Commit(owned->slot_id, "owned", committed_bytes)); + a->DrainPendingEvents(); + + std::vector requests; + requests.push_back({"owned", kPageSize, TierType::DRAM}); + requests.push_back({"ok", kPageSize * 5, TierType::DRAM}); + requests.push_back({"bad-tier", kPageSize, TierType::HBM}); + requests.push_back({"zero", 0, TierType::DRAM}); + requests.push_back({"too-big", kPageSize * 20, TierType::DRAM}); + + auto results = a->BatchAllocate(requests); + ASSERT_EQ(results.size(), requests.size()); + + EXPECT_EQ(results[0].outcome, PeerDramAllocator::Outcome::kSuccessAlreadyExists); + EXPECT_FALSE(results[0].slot.has_value()); + EXPECT_TRUE(results[0].descs.empty()); + + EXPECT_EQ(results[1].outcome, PeerDramAllocator::Outcome::kSuccessAllocated); + ASSERT_TRUE(results[1].slot.has_value()); + EXPECT_EQ(results[1].slot->size, kPageSize * 5); + EXPECT_EQ(results[1].slot->pages.size(), 5u); + ASSERT_EQ(results[1].descs.size(), 2u); + EXPECT_EQ(results[1].descs[0].buffer_index, 0u); + EXPECT_EQ(results[1].descs[1].buffer_index, 1u); + + EXPECT_EQ(results[2].outcome, PeerDramAllocator::Outcome::kFailed); + EXPECT_FALSE(results[2].slot.has_value()); + EXPECT_EQ(results[3].outcome, PeerDramAllocator::Outcome::kFailed); + 
EXPECT_FALSE(results[3].slot.has_value()); + EXPECT_EQ(results[4].outcome, PeerDramAllocator::Outcome::kFailedNoSpace); + EXPECT_FALSE(results[4].slot.has_value()); +} + +TEST(PeerDramAllocator, BatchCommitMixedSuccessAndFailure) { + auto a = MakeAllocator(); + auto allocated = a->BatchAllocate({ + {"dup", kPageSize, TierType::DRAM}, + {"dup", kPageSize * 2, TierType::DRAM}, + {"unique", kPageSize, TierType::DRAM}, + }); + ASSERT_EQ(allocated.size(), 3u); + ASSERT_TRUE(allocated[0].slot.has_value()); + ASSERT_TRUE(allocated[1].slot.has_value()); + ASSERT_TRUE(allocated[2].slot.has_value()); + + auto committed = a->BatchCommit({ + {allocated[0].slot->slot_id, "dup"}, + {999999, "missing"}, + {allocated[1].slot->slot_id, "dup"}, + {allocated[2].slot->slot_id, "unique"}, + }); + ASSERT_EQ(committed.size(), 4u); + EXPECT_TRUE(committed[0].success); + EXPECT_EQ(committed[0].bytes_committed, kPageSize); + EXPECT_FALSE(committed[1].success); + EXPECT_EQ(committed[1].bytes_committed, 0u); + EXPECT_TRUE(committed[2].success); + EXPECT_EQ(committed[2].bytes_committed, kPageSize); + EXPECT_TRUE(committed[3].success); + EXPECT_EQ(committed[3].bytes_committed, kPageSize); + + auto dup = a->Resolve("dup"); + ASSERT_TRUE(dup.found); + EXPECT_EQ(dup.pages, allocated[0].slot->pages); + EXPECT_EQ(dup.size, kPageSize); + auto unique = a->Resolve("unique"); + ASSERT_TRUE(unique.found); + EXPECT_EQ(unique.size, kPageSize); + + auto events = a->DrainPendingEvents(); + ASSERT_EQ(events.size(), 2u); + EXPECT_EQ(events[0].kind, KvEvent::Kind::ADD); + EXPECT_EQ(events[0].key, "dup"); + EXPECT_EQ(events[1].kind, KvEvent::Kind::ADD); + EXPECT_EQ(events[1].key, "unique"); +} + +TEST(PeerDramAllocator, BatchAbortMixedSlotsIsIdempotent) { + auto a = MakeAllocator(); + auto allocated = a->BatchAllocate({ + {"drop", kPageSize, TierType::DRAM}, + {"keep", kPageSize, TierType::DRAM}, + }); + ASSERT_EQ(allocated.size(), 2u); + ASSERT_TRUE(allocated[0].slot.has_value()); + 
ASSERT_TRUE(allocated[1].slot.has_value()); + + auto aborted = a->BatchAbort({allocated[0].slot->slot_id, 999999}); + ASSERT_EQ(aborted.size(), 2u); + EXPECT_TRUE(aborted[0]); + EXPECT_TRUE(aborted[1]); + + uint64_t committed_bytes = 0; + EXPECT_FALSE(a->Commit(allocated[0].slot->slot_id, "drop", committed_bytes)); + EXPECT_TRUE(a->Commit(allocated[1].slot->slot_id, "keep", committed_bytes)); + EXPECT_EQ(committed_bytes, kPageSize); + EXPECT_TRUE(a->Resolve("keep").found); +} + +// ---- BatchResolve ---------------------------------------------------------- + +TEST(PeerDramAllocator, BatchResolveEmptyInputReturnsEmpty) { + auto a = MakeAllocator(); + EXPECT_TRUE(a->BatchResolve({}).empty()); +} + +TEST(PeerDramAllocator, BatchResolveMixedHitsAndMisses) { + auto a = MakeAllocator(); + // 5 pages over 4-pages-per-buffer config -> exercises dedup'd descs. + auto p_hit = AllocateOk(*a, "hit", kPageSize * 5, TierType::DRAM); + ASSERT_TRUE(p_hit.has_value()); + uint64_t committed_bytes = 0; + ASSERT_TRUE(a->Commit(p_hit->slot_id, "hit", committed_bytes)); + auto p_small = AllocateOk(*a, "small", kPageSize, TierType::DRAM); + ASSERT_TRUE(p_small.has_value()); + ASSERT_TRUE(a->Commit(p_small->slot_id, "small", committed_bytes)); + a->DrainPendingEvents(); + + auto ref_hit = a->Resolve("hit"); + auto ref_descs_hit = a->BufferDescsForPages(ref_hit.tier, ref_hit.pages); + auto ref_small = a->Resolve("small"); + auto ref_descs_small = a->BufferDescsForPages(ref_small.tier, ref_small.pages); + ASSERT_TRUE(ref_hit.found); + ASSERT_TRUE(ref_small.found); + + auto results = a->BatchResolve({"hit", "ghost-a", "small", "ghost-b"}); + ASSERT_EQ(results.size(), 4u); + + EXPECT_TRUE(results[0].found); + EXPECT_EQ(results[0].tier, ref_hit.tier); + EXPECT_EQ(results[0].pages, ref_hit.pages); + EXPECT_EQ(results[0].size, ref_hit.size); + ASSERT_EQ(results[0].descs.size(), ref_descs_hit.size()); + for (size_t i = 0; i < ref_descs_hit.size(); ++i) { + 
EXPECT_EQ(results[0].descs[i].buffer_index, ref_descs_hit[i].buffer_index); + EXPECT_EQ(results[0].descs[i].desc_bytes, ref_descs_hit[i].desc_bytes); + } + + EXPECT_FALSE(results[1].found); + EXPECT_TRUE(results[1].pages.empty()); + EXPECT_EQ(results[1].size, 0u); + EXPECT_TRUE(results[1].descs.empty()); + + EXPECT_TRUE(results[2].found); + EXPECT_EQ(results[2].tier, ref_small.tier); + EXPECT_EQ(results[2].pages, ref_small.pages); + EXPECT_EQ(results[2].size, ref_small.size); + ASSERT_EQ(results[2].descs.size(), ref_descs_small.size()); + for (size_t i = 0; i < ref_descs_small.size(); ++i) { + EXPECT_EQ(results[2].descs[i].buffer_index, ref_descs_small[i].buffer_index); + EXPECT_EQ(results[2].descs[i].desc_bytes, ref_descs_small[i].desc_bytes); + } + + EXPECT_FALSE(results[3].found); +} + +TEST(PeerDramAllocator, BatchResolveExtendsLeaseForHitsOnly) { + auto a = std::make_unique(kPageSize, MakeDramCfg(), EmptyCfg(), + /*pending_ttl=*/std::chrono::milliseconds{5000}, + /*read_lease_ttl=*/std::chrono::milliseconds{500}); + auto p_x = AllocateOk(*a, "x", kPageSize, TierType::DRAM); + ASSERT_TRUE(p_x.has_value()); + uint64_t committed_bytes = 0; + ASSERT_TRUE(a->Commit(p_x->slot_id, "x", committed_bytes)); + auto p_y = AllocateOk(*a, "y", kPageSize, TierType::DRAM); + ASSERT_TRUE(p_y.has_value()); + ASSERT_TRUE(a->Commit(p_y->slot_id, "y", committed_bytes)); + a->DrainPendingEvents(); + + auto results = a->BatchResolve({"x", "missing", "y"}); + ASSERT_EQ(results.size(), 3u); + ASSERT_TRUE(results[0].found); + ASSERT_FALSE(results[1].found); + ASSERT_TRUE(results[2].found); + + auto evict = a->Evict({"x", "y"}); + ASSERT_EQ(evict.size(), 2u); + EXPECT_EQ(evict[0].bytes_freed, 0u); + EXPECT_EQ(evict[1].bytes_freed, 0u); + EXPECT_TRUE(a->Resolve("x").found); + EXPECT_TRUE(a->Resolve("y").found); + EXPECT_TRUE(a->DrainPendingEvents().empty()); + + // Miss must not poison read_lease_until_: a subsequent + // Allocate+Commit+Evict on the same key must free as if never 
touched. + auto p_miss = AllocateOk(*a, "missing", kPageSize, TierType::DRAM); + ASSERT_TRUE(p_miss.has_value()); + ASSERT_TRUE(a->Commit(p_miss->slot_id, "missing", committed_bytes)); + a->DrainPendingEvents(); + auto evict_missing = a->Evict({"missing"}); + ASSERT_EQ(evict_missing.size(), 1u); + EXPECT_EQ(evict_missing[0].bytes_freed, kPageSize); +} + +TEST(PeerDramAllocator, BatchResolveLeaseExpiresLikeSingleKeyResolve) { + auto a = std::make_unique(kPageSize, MakeDramCfg(), EmptyCfg(), + /*pending_ttl=*/std::chrono::milliseconds{5000}, + /*read_lease_ttl=*/std::chrono::milliseconds{50}); + auto p = AllocateOk(*a, "k", kPageSize, TierType::DRAM); + ASSERT_TRUE(p.has_value()); + uint64_t committed_bytes = 0; + ASSERT_TRUE(a->Commit(p->slot_id, "k", committed_bytes)); + a->DrainPendingEvents(); + + auto results = a->BatchResolve({"k"}); + ASSERT_EQ(results.size(), 1u); + ASSERT_TRUE(results[0].found); + + EXPECT_EQ(a->Evict({"k"})[0].bytes_freed, 0u); + + std::this_thread::sleep_for(std::chrono::milliseconds{100}); + auto evicted = a->Evict({"k"}); + ASSERT_EQ(evicted.size(), 1u); + EXPECT_EQ(evicted[0].bytes_freed, kPageSize); +} + +// ---- Capacities snapshot ---------------------------------------------------- + +TEST(PeerDramAllocator, TierCapacitiesReflectAllocations) { + auto a = MakeAllocator(); + auto cap0 = a->TierCapacitiesSnapshot(); + ASSERT_EQ(cap0.count(TierType::DRAM), 1u); + const uint64_t total = cap0[TierType::DRAM].total_bytes; + EXPECT_EQ(cap0[TierType::DRAM].available_bytes, total); + + auto p = AllocateOk(*a, "k", kPageSize * 3, TierType::DRAM); + ASSERT_TRUE(p.has_value()); + auto cap1 = a->TierCapacitiesSnapshot(); + EXPECT_EQ(cap1[TierType::DRAM].available_bytes, total - 3 * kPageSize); + + uint64_t committed_bytes = 0; + ASSERT_TRUE(a->Commit(p->slot_id, "k", committed_bytes)); + EXPECT_EQ(committed_bytes, p->size); + auto cap2 = a->TierCapacitiesSnapshot(); + EXPECT_EQ(cap2[TierType::DRAM].available_bytes, total - 3 * kPageSize); + + 
ASSERT_EQ(a->Evict({"k"})[0].bytes_freed, 3 * kPageSize); + auto cap3 = a->TierCapacitiesSnapshot(); + EXPECT_EQ(cap3[TierType::DRAM].available_bytes, total); +} + +// ---- Commit after reap ------------------------------------------------------ + +TEST(PeerDramAllocator, CommitAfterReapReturnsFalse) { + auto a = std::make_unique(kPageSize, MakeDramCfg(), EmptyCfg(), + std::chrono::milliseconds{1}); + auto p = AllocateOk(*a, "doomed", kPageSize, TierType::DRAM); + ASSERT_TRUE(p.has_value()); + std::this_thread::sleep_for(std::chrono::milliseconds{20}); + a->RunReaperOnceForTest(); + uint64_t committed_bytes = 0; + EXPECT_FALSE(a->Commit(p->slot_id, "doomed", committed_bytes)); + EXPECT_EQ(committed_bytes, 0u); + EXPECT_TRUE(a->DrainPendingEvents().empty()); +} + +// ---- Distributed Clear ------------------------------------------------------ + +TEST(PeerDramAllocator, ClearLocalReleasesOwnedAndCancelsPending) { + auto a = MakeAllocator(); + + auto pA = AllocateOk(*a, "A", kPageSize, TierType::DRAM); + ASSERT_TRUE(pA.has_value()); + uint64_t committed_bytes = 0; + ASSERT_TRUE(a->Commit(pA->slot_id, "A", committed_bytes)); + + auto pB = AllocateOk(*a, "B", kPageSize * 2, TierType::DRAM); + ASSERT_TRUE(pB.has_value()); + a->DrainPendingEvents(); // discard the A ADD + + const auto cap_before = a->TierCapacitiesSnapshot()[TierType::DRAM]; + ASSERT_LT(cap_before.available_bytes, cap_before.total_bytes); + + a->ClearLocal(); + + EXPECT_TRUE(a->IsClearFullSyncPending()); + EXPECT_FALSE(a->Resolve("A").found); + EXPECT_TRUE(a->SnapshotOwnedKeys().empty()); + EXPECT_TRUE(a->DrainPendingEvents().empty()); + + // Owned pages (A) returned immediately; pending pages (B) still held. + auto cap_after_clear = a->TierCapacitiesSnapshot()[TierType::DRAM]; + EXPECT_EQ(cap_after_clear.available_bytes, cap_before.total_bytes - 2 * kPageSize); + + // Committing the cancelled pending fails AND releases its pages + // without emitting an ADD. 
+ EXPECT_FALSE(a->Commit(pB->slot_id, "B", committed_bytes)); + EXPECT_EQ(committed_bytes, 0u); + EXPECT_TRUE(a->DrainPendingEvents().empty()); + EXPECT_FALSE(a->Resolve("B").found); + + auto cap_final = a->TierCapacitiesSnapshot()[TierType::DRAM]; + EXPECT_EQ(cap_final.available_bytes, cap_final.total_bytes); +} + +TEST(PeerDramAllocator, ClearLocalGatesAllocateUntilAcked) { + auto a = MakeAllocator(); + + a->ClearLocal(); + EXPECT_FALSE(AllocateOk(*a, "blocked", kPageSize, TierType::DRAM).has_value()); + + a->ClearFullSyncAcked(); + EXPECT_FALSE(a->IsClearFullSyncPending()); + EXPECT_TRUE(AllocateOk(*a, "ok-after-ack", kPageSize, TierType::DRAM).has_value()); +} + +TEST(PeerDramAllocator, ClearLocalDropsQueuedAdds) { + auto a = MakeAllocator(); + auto p = AllocateOk(*a, "k", kPageSize, TierType::DRAM); + ASSERT_TRUE(p.has_value()); + uint64_t committed_bytes = 0; + ASSERT_TRUE(a->Commit(p->slot_id, "k", committed_bytes)); + // ADD is sitting in the outbox, not yet drained. + + a->ClearLocal(); + + EXPECT_TRUE(a->DrainPendingEvents().empty()); + EXPECT_TRUE(a->SnapshotOwnedKeys().empty()); +} + +TEST(PeerDramAllocator, AbortReleasesCancelledPending) { + auto a = MakeAllocator(); + auto p = AllocateOk(*a, "p1", kPageSize, TierType::DRAM); + ASSERT_TRUE(p.has_value()); + + a->ClearLocal(); + // Abort on a cancelled pending is idempotent and frees the pages. + EXPECT_TRUE(a->Abort(p->slot_id)); + a->ClearFullSyncAcked(); + + auto cap = a->TierCapacitiesSnapshot()[TierType::DRAM]; + EXPECT_EQ(cap.available_bytes, cap.total_bytes); +} + +// Pre-clear pending Commit fails; post-ack new Allocate+Commit succeeds. 
+TEST(PeerDramAllocator, PendingGenerationRejectsPreClearCommit) { + auto a = MakeAllocator(); + + auto pB = AllocateOk(*a, "B", kPageSize * 2, TierType::DRAM); + ASSERT_TRUE(pB.has_value()); + const auto cap_before = a->TierCapacitiesSnapshot()[TierType::DRAM]; + EXPECT_EQ(cap_before.available_bytes, cap_before.total_bytes - 2 * kPageSize); + + a->ClearLocal(); + + uint64_t committed_bytes = 0; + EXPECT_FALSE(a->Commit(pB->slot_id, "B", committed_bytes)); + EXPECT_EQ(committed_bytes, 0u); + EXPECT_TRUE(a->DrainPendingEvents().empty()); + auto cap_after_reject = a->TierCapacitiesSnapshot()[TierType::DRAM]; + EXPECT_EQ(cap_after_reject.available_bytes, cap_after_reject.total_bytes); + + a->ClearFullSyncAcked(); + + auto pC = AllocateOk(*a, "C", kPageSize, TierType::DRAM); + ASSERT_TRUE(pC.has_value()); + ASSERT_TRUE(a->Commit(pC->slot_id, "C", committed_bytes)); + EXPECT_EQ(committed_bytes, kPageSize); + EXPECT_TRUE(a->Resolve("C").found); +} + +// Two Clears with no ack in between still reject the pre-clear pending Commit and free its pages. +TEST(PeerDramAllocator, PendingGenerationSurvivesDoubleClear) { + auto a = MakeAllocator(); + + auto pB = AllocateOk(*a, "B", kPageSize, TierType::DRAM); + ASSERT_TRUE(pB.has_value()); + + a->ClearLocal(); + a->ClearLocal(); // second Clear before any ClearFullSyncAcked + + uint64_t committed_bytes = 0; + EXPECT_FALSE(a->Commit(pB->slot_id, "B", committed_bytes)); + auto cap_after_reject = a->TierCapacitiesSnapshot()[TierType::DRAM]; + EXPECT_EQ(cap_after_reject.available_bytes, cap_after_reject.total_bytes); + + a->ClearFullSyncAcked(); // after the ack a fresh Allocate + Commit succeeds + auto pC = AllocateOk(*a, "C", kPageSize, TierType::DRAM); + ASSERT_TRUE(pC.has_value()); + EXPECT_TRUE(a->Commit(pC->slot_id, "C", committed_bytes)); +} + +// Leased owned key: logically gone at Clear, pages freed by reaper after +// the lease expires.
+TEST(PeerDramAllocator, ClearLocalDefersLeasedOwnedPages) { + auto a = MakeAllocator(/*pending_ttl=*/std::chrono::milliseconds{5000}, + /*read_lease_ttl=*/std::chrono::milliseconds{200}); + + auto p = AllocateOk(*a, "A", kPageSize, TierType::DRAM); + ASSERT_TRUE(p.has_value()); + uint64_t committed_bytes = 0; + ASSERT_TRUE(a->Commit(p->slot_id, "A", committed_bytes)); + a->DrainPendingEvents(); + + const auto cap_committed = a->TierCapacitiesSnapshot()[TierType::DRAM]; + EXPECT_EQ(cap_committed.available_bytes, cap_committed.total_bytes - kPageSize); + + ASSERT_TRUE(a->Resolve("A").found); // lease. + + a->ClearLocal(); + + EXPECT_FALSE(a->Resolve("A").found); + EXPECT_TRUE(a->SnapshotOwnedKeys().empty()); + + auto cap_after_clear = a->TierCapacitiesSnapshot()[TierType::DRAM]; + EXPECT_EQ(cap_after_clear.available_bytes, cap_committed.total_bytes - kPageSize); + + // Pre-TTL sweep: no-op. + a->RunReaperOnceForTest(); + auto cap_no_op_sweep = a->TierCapacitiesSnapshot()[TierType::DRAM]; + EXPECT_EQ(cap_no_op_sweep.available_bytes, cap_committed.total_bytes - kPageSize); + + // Past TTL: pages return to bitmap. + std::this_thread::sleep_for(std::chrono::milliseconds{300}); + a->RunReaperOnceForTest(); + auto cap_swept = a->TierCapacitiesSnapshot()[TierType::DRAM]; + EXPECT_EQ(cap_swept.available_bytes, cap_swept.total_bytes); +} + +// Leased owned A defers; pending B rejects via generation. +TEST(PeerDramAllocator, ClearLocalMixedPendingAndLeased) { + auto a = MakeAllocator(/*pending_ttl=*/std::chrono::milliseconds{5000}, + /*read_lease_ttl=*/std::chrono::milliseconds{200}); + + auto pA = AllocateOk(*a, "A", kPageSize, TierType::DRAM); + ASSERT_TRUE(pA.has_value()); + uint64_t committed_bytes = 0; + ASSERT_TRUE(a->Commit(pA->slot_id, "A", committed_bytes)); + ASSERT_TRUE(a->Resolve("A").found); // lease. 
+ + auto pB = AllocateOk(*a, "B", kPageSize * 2, TierType::DRAM); + ASSERT_TRUE(pB.has_value()); + a->DrainPendingEvents(); + + const auto total = a->TierCapacitiesSnapshot()[TierType::DRAM].total_bytes; + + a->ClearLocal(); + + EXPECT_FALSE(a->Resolve("A").found); + EXPECT_TRUE(a->SnapshotOwnedKeys().empty()); + + // A deferred + B pending: 3 pages occupied. + auto cap_after_clear = a->TierCapacitiesSnapshot()[TierType::DRAM]; + EXPECT_EQ(cap_after_clear.available_bytes, total - 3 * kPageSize); + + // Commit(B) fails on generation mismatch, releases B. + EXPECT_FALSE(a->Commit(pB->slot_id, "B", committed_bytes)); + auto cap_after_reject = a->TierCapacitiesSnapshot()[TierType::DRAM]; + EXPECT_EQ(cap_after_reject.available_bytes, total - kPageSize); // only A. + + // Past lease + sweep: A released. + std::this_thread::sleep_for(std::chrono::milliseconds{300}); + a->RunReaperOnceForTest(); + auto cap_final = a->TierCapacitiesSnapshot()[TierType::DRAM]; + EXPECT_EQ(cap_final.available_bytes, total); +} + +// Sweeps are no-ops while the deferred lease is still active. +TEST(PeerDramAllocator, ClearLocalSweepRespectsTtl) { + auto a = MakeAllocator(/*pending_ttl=*/std::chrono::milliseconds{5000}, + /*read_lease_ttl=*/std::chrono::milliseconds{10000}); + + auto p = AllocateOk(*a, "A", kPageSize, TierType::DRAM); + ASSERT_TRUE(p.has_value()); + uint64_t committed_bytes = 0; + ASSERT_TRUE(a->Commit(p->slot_id, "A", committed_bytes)); + ASSERT_TRUE(a->Resolve("A").found); + + const auto cap_committed = a->TierCapacitiesSnapshot()[TierType::DRAM]; + + a->ClearLocal(); + + // Lease still live: every sweep is a no-op. 
+ for (int i = 0; i < 3; ++i) { + a->RunReaperOnceForTest(); + auto cap = a->TierCapacitiesSnapshot()[TierType::DRAM]; + EXPECT_EQ(cap.available_bytes, cap_committed.available_bytes) << "sweep i=" << i; + } +} + +// ---- OwnedKeyCountByTier ---------------------------------------------------- + +TEST(PeerDramAllocator, OwnedKeyCountByTierTracksCommitsAndEvicts) { + auto a = MakeAllocator(); + + auto counts0 = a->OwnedKeyCountByTier(); + EXPECT_EQ(counts0[TierType::DRAM], 0u); + EXPECT_EQ(counts0[TierType::HBM], 0u); + EXPECT_EQ(counts0[TierType::SSD], 0u); + + for (int i = 0; i < 3; ++i) { + const std::string k = "key-dram-" + std::to_string(i); + auto p = AllocateOk(*a, k, kPageSize, TierType::DRAM); + ASSERT_TRUE(p.has_value()) << "i=" << i; + uint64_t committed_bytes = 0; + ASSERT_TRUE(a->Commit(p->slot_id, k, committed_bytes)); + } + auto counts1 = a->OwnedKeyCountByTier(); + EXPECT_EQ(counts1[TierType::DRAM], 3u); + EXPECT_EQ(counts1[TierType::HBM], 0u); + EXPECT_EQ(counts1[TierType::SSD], 0u); + + a->Evict({"key-dram-0"}); + auto counts2 = a->OwnedKeyCountByTier(); + EXPECT_EQ(counts2[TierType::DRAM], 2u); + EXPECT_EQ(counts2[TierType::HBM], 0u); +} + +TEST(PeerDramAllocator, OwnedKeyCountByTierMultiTier) { + PeerDramAllocator::TierConfig hbm_cfg; + hbm_cfg.buffer_sizes = {kPageSize * 4}; + hbm_cfg.buffer_descs = {{0xD0, 0xD1}}; + auto a = std::make_unique(kPageSize, MakeDramCfg(), hbm_cfg, + std::chrono::milliseconds{5000}); + + for (int i = 0; i < 2; ++i) { + const std::string k = "d-" + std::to_string(i); + auto p = AllocateOk(*a, k, kPageSize, TierType::DRAM); + ASSERT_TRUE(p.has_value()); + uint64_t committed_bytes = 0; + ASSERT_TRUE(a->Commit(p->slot_id, k, committed_bytes)); + } + { + auto p = AllocateOk(*a, "h-0", kPageSize, TierType::HBM); + ASSERT_TRUE(p.has_value()); + uint64_t committed_bytes = 0; + ASSERT_TRUE(a->Commit(p->slot_id, "h-0", committed_bytes)); + } + + auto counts = a->OwnedKeyCountByTier(); + EXPECT_EQ(counts[TierType::DRAM], 2u); + 
EXPECT_EQ(counts[TierType::HBM], 1u); + EXPECT_EQ(counts[TierType::SSD], 0u); +} + +} // namespace mori::umbp diff --git a/tests/cpp/umbp/distributed/test_peer_ssd_eviction.cpp b/tests/cpp/umbp/distributed/test_peer_ssd_eviction.cpp new file mode 100644 index 000000000..591d3b1d3 --- /dev/null +++ b/tests/cpp/umbp/distributed/test_peer_ssd_eviction.cpp @@ -0,0 +1,406 @@ +// Copyright © Advanced Micro Devices, Inc. All rights reserved. +// +// MIT License +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in all +// copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +// SOFTWARE. +// +// SSD local capacity management + eviction. Drives PeerSsdManager +// through a controllable in-memory TierBackend (the test-only constructor) so +// LRU ordering, watermark eviction, the in-flight-read guard, idempotent Write, +// backend-evict failure, concurrent eviction, and physical Clear are all +// deterministic without real disk IO. 
+#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "umbp/distributed/peer/peer_ssd_manager.h" +#include "umbp/local/tiers/tier_backend.h" + +namespace mori::umbp { +namespace { + +// In-memory TierBackend with test hooks: blockable reads (to hold the in-flight +// guard), forced evict failure, and call counters. +class FakeBackend : public TierBackend { + public: + explicit FakeBackend(size_t capacity) + : TierBackend(StorageTier::LOCAL_SSD), capacity_(capacity) {} + + bool Write(const std::string& key, const void* data, size_t size) override { + std::lock_guard lk(mu_); + ++write_calls_; + auto it = store_.find(key); + size_t prev = (it == store_.end()) ? 0 : it->second.size(); + if (used_ - prev + size > capacity_) return false; // ENOSPC + store_[key].assign(static_cast(data), static_cast(data) + size); + used_ = used_ - prev + size; + return true; + } + + bool ReadIntoPtr(const std::string& key, uintptr_t dst, size_t size) override { + { + std::unique_lock lk(gate_mu_); + ++reads_started_; + started_cv_.notify_all(); + gate_cv_.wait(lk, [this] { return !read_blocked_; }); + } + std::lock_guard lk(mu_); + auto it = store_.find(key); + if (it == store_.end() || it->second.size() != size) return false; + std::memcpy(reinterpret_cast(dst), it->second.data(), size); + return true; + } + + bool Exists(const std::string& key) const override { + std::lock_guard lk(mu_); + return store_.count(key) != 0; + } + + bool Evict(const std::string& key) override { + std::lock_guard lk(mu_); + if (fail_evict_) return false; + auto it = store_.find(key); + if (it == store_.end()) return false; + used_ -= it->second.size(); + store_.erase(it); + return true; + } + + std::pair Capacity() const override { + std::lock_guard lk(mu_); + return {used_, capacity_}; + } + + void Clear() override { + std::lock_guard lk(mu_); + ++clear_calls_; + store_.clear(); + used_ = 0; + } + + // --- test controls --- + void 
BlockReads() { + std::lock_guard lk(gate_mu_); + read_blocked_ = true; + } + void UnblockReads() { + { + std::lock_guard lk(gate_mu_); + read_blocked_ = false; + } + gate_cv_.notify_all(); + } + void WaitReadsStarted(int n) { + std::unique_lock lk(gate_mu_); + started_cv_.wait(lk, [&] { return reads_started_ >= n; }); + } + void SetFailEvict(bool f) { + std::lock_guard lk(mu_); + fail_evict_ = f; + } + int write_calls() const { + std::lock_guard lk(mu_); + return write_calls_; + } + int clear_calls() const { + std::lock_guard lk(mu_); + return clear_calls_; + } + + private: + mutable std::mutex mu_; + std::unordered_map> store_; + size_t used_ = 0; + size_t capacity_; + int write_calls_ = 0; + int clear_calls_ = 0; + bool fail_evict_ = false; + + std::mutex gate_mu_; + std::condition_variable gate_cv_; + std::condition_variable started_cv_; + bool read_blocked_ = false; + int reads_started_ = 0; +}; + +std::vector> OneSeg(const std::string& s) { + return {{s.data(), s.size()}}; +} + +// Manager owning a FakeBackend we keep a raw pointer to for test inspection. 
+struct Harness { + FakeBackend* backend; + std::unique_ptr mgr; +}; + +Harness MakeHarness(size_t capacity, double high = 0.9, double low = 0.7) { + auto be = std::make_unique(capacity); + FakeBackend* raw = be.get(); + return Harness{raw, std::make_unique(std::move(be), high, low)}; +} + +int CountKind(const std::vector& events, KvEvent::Kind kind) { + int n = 0; + for (const auto& e : events) { + if (e.kind == kind && e.tier == TierType::SSD) ++n; + } + return n; +} + +bool HasRemove(const std::vector& events, const std::string& key) { + for (const auto& e : events) { + if (e.kind == KvEvent::Kind::REMOVE && e.tier == TierType::SSD && e.key == key) return true; + } + return false; +} + +// --------------------------------------------------------------------------- + +TEST(PeerSsdEviction, WriteAndPrepareReadRefreshLru) { + auto h = MakeHarness(/*capacity=*/1'000'000); + ASSERT_TRUE(h.mgr->Write("A", OneSeg("aaaa"), 4)); + ASSERT_TRUE(h.mgr->Write("B", OneSeg("bbbb"), 4)); + ASSERT_TRUE(h.mgr->Write("C", OneSeg("cccc"), 4)); + + // LRU now (oldest->newest): A, B, C. Reading A promotes it to MRU, so the + // oldest becomes B and SelectVictims must pick B first (not the just-read A). + std::vector buf(4); + auto out = h.mgr->PrepareRead("A", buf.data(), buf.size()); + ASSERT_EQ(out.status, SsdReadStatus::kOk); + EXPECT_EQ(std::string(buf.data(), out.size), "aaaa"); + + auto victims = h.mgr->SelectVictims(/*bytes_to_free=*/1); + ASSERT_FALSE(victims.empty()); + EXPECT_EQ(victims.front(), "B"); + EXPECT_NE(victims.front(), "A"); +} + +TEST(PeerSsdEviction, WatermarkTriggersEvictionDownToLow) { + // capacity 1000, high 0.9 (=>900), low 0.7 (=>700); 100-byte values. + auto h = MakeHarness(/*capacity=*/1000, /*high=*/0.9, /*low=*/0.7); + std::string val(100, 'x'); + for (int i = 1; i <= 9; ++i) { + ASSERT_TRUE(h.mgr->Write("k" + std::to_string(i), OneSeg(val), val.size())); + } + // After k9: used hit 900 >= high -> evict oldest down to <= 700. 
+ auto [used, total] = h.mgr->Capacity(); + EXPECT_EQ(total, 1000u); + EXPECT_LE(used, 700u); + + // Oldest (k1, k2) evicted first; newest still present. + EXPECT_FALSE(h.mgr->Exists("k1")); + EXPECT_FALSE(h.mgr->Exists("k2")); + EXPECT_TRUE(h.mgr->Exists("k9")); + + auto events = h.mgr->DrainPendingEvents(); + EXPECT_TRUE(HasRemove(events, "k1")); + EXPECT_TRUE(HasRemove(events, "k2")); + EXPECT_EQ(CountKind(events, KvEvent::Kind::REMOVE), 2); +} + +TEST(PeerSsdEviction, EnospcTriggersEvictThenRetry) { + // Fill to 800/1000 (below the 0.9 high watermark, so no watermark eviction + // fires during the fill), then write a 300-byte value that overflows the + // device: backend Write -> ENOSPC -> one evict round (frees the oldest down + // to the 0.5 low watermark) -> retry succeeds. After the retry used is 800, + // still below high, so no second round disturbs the just-written key. + auto h = MakeHarness(/*capacity=*/1000, /*high=*/0.9, /*low=*/0.5); + for (int i = 1; i <= 8; ++i) { + ASSERT_TRUE(h.mgr->Write("k" + std::to_string(i), OneSeg(std::string(100, 'a')), 100)); + } + ASSERT_TRUE(h.mgr->Write("big", OneSeg(std::string(300, 'c')), 300)); + EXPECT_TRUE(h.mgr->Exists("big")); + EXPECT_FALSE(h.mgr->Exists("k1")); // oldest reclaimed to make room + EXPECT_LE(h.mgr->Capacity().first, 1000u); +} + +TEST(PeerSsdEviction, InFlightReadIsNotEvicted) { + auto h = MakeHarness(/*capacity=*/1'000'000); + const std::string val = "payload-payload"; + ASSERT_TRUE(h.mgr->Write("K", OneSeg(val), val.size())); + + h.backend->BlockReads(); + std::vector buf(val.size()); + SsdReadOutcome out{}; + std::thread reader([&] { out = h.mgr->PrepareRead("K", buf.data(), buf.size()); }); + h.backend->WaitReadsStarted(1); // PrepareRead has marked K in-flight and is blocked in the read + + // Eviction must skip a key that is being read. 
+ EXPECT_FALSE(h.mgr->Evict("K")); + EXPECT_TRUE(h.mgr->SelectVictims(1'000'000).empty()); + EXPECT_TRUE(h.mgr->Exists("K")); + + h.backend->UnblockReads(); + reader.join(); + EXPECT_EQ(out.status, SsdReadStatus::kOk); + EXPECT_EQ(std::string(buf.data(), out.size), val); + + // Once the read finished, the key can be evicted. + EXPECT_TRUE(h.mgr->Evict("K")); + EXPECT_FALSE(h.mgr->Exists("K")); +} + +TEST(PeerSsdEviction, StaleRouteReadAfterEvictIsNotFound) { + auto h = MakeHarness(/*capacity=*/1'000'000); + ASSERT_TRUE(h.mgr->Write("K", OneSeg("data"), 4)); + ASSERT_TRUE(h.mgr->Evict("K")); + + std::vector buf(4); + auto out = h.mgr->PrepareRead("K", buf.data(), buf.size()); + EXPECT_EQ(out.status, SsdReadStatus::kNotFound); +} + +TEST(PeerSsdEviction, DuplicateWriteIsIdempotent) { + auto h = MakeHarness(/*capacity=*/1'000'000); + ASSERT_TRUE(h.mgr->Write("K", OneSeg("data"), 4)); + ASSERT_TRUE(h.mgr->Write("K", OneSeg("data"), 4)); // same content-addressed key + + EXPECT_EQ(h.backend->write_calls(), 1); // no second backend write + auto events = h.mgr->DrainPendingEvents(); + EXPECT_EQ(CountKind(events, KvEvent::Kind::ADD), 1); // no duplicate ADD SSD +} + +TEST(PeerSsdEviction, BackendEvictFailureKeepsMetadata) { + auto h = MakeHarness(/*capacity=*/1'000'000); + ASSERT_TRUE(h.mgr->Write("K", OneSeg("data"), 4)); + h.mgr->DrainPendingEvents(); // discard ADD + + h.backend->SetFailEvict(true); + EXPECT_FALSE(h.mgr->Evict("K")); + EXPECT_TRUE(h.mgr->Exists("K")); // kept for retry + EXPECT_TRUE(h.mgr->DrainPendingEvents().empty()); // no REMOVE emitted + + h.backend->SetFailEvict(false); + EXPECT_TRUE(h.mgr->Evict("K")); // retry succeeds + EXPECT_FALSE(h.mgr->Exists("K")); + auto events = h.mgr->DrainPendingEvents(); + EXPECT_EQ(CountKind(events, KvEvent::Kind::REMOVE), 1); +} + +TEST(PeerSsdEviction, ConcurrentEvictOfSameKeyRemovesOnce) { + auto h = MakeHarness(/*capacity=*/1'000'000); + ASSERT_TRUE(h.mgr->Write("K", OneSeg("data"), 4)); + 
h.mgr->DrainPendingEvents(); + + std::atomic wins{0}; + std::vector threads; + for (int i = 0; i < 4; ++i) { + threads.emplace_back([&] { + if (h.mgr->Evict("K")) wins.fetch_add(1); + }); + } + for (auto& t : threads) t.join(); + + EXPECT_EQ(wins.load(), 1); // exactly one evictor wins + EXPECT_FALSE(h.mgr->Exists("K")); + auto events = h.mgr->DrainPendingEvents(); + EXPECT_EQ(CountKind(events, KvEvent::Kind::REMOVE), 1); // no double REMOVE +} + +TEST(PeerSsdEviction, ClearLocalWipesPhysicalBytes) { + auto h = MakeHarness(/*capacity=*/1'000'000); + ASSERT_TRUE(h.mgr->Write("a", OneSeg("1111"), 4)); + ASSERT_TRUE(h.mgr->Write("b", OneSeg("2222"), 4)); + + h.mgr->ClearLocal(); + + EXPECT_EQ(h.backend->clear_calls(), 1); // physical wipe happened + EXPECT_FALSE(h.mgr->Exists("a")); + EXPECT_FALSE(h.mgr->Exists("b")); + EXPECT_TRUE(h.mgr->SnapshotOwnedKeys().empty()); + auto [used, total] = h.mgr->Capacity(); + EXPECT_EQ(used, 0u); +} + +TEST(PeerSsdEviction, ClearLocalWaitsForInFlightRead) { + auto h = MakeHarness(/*capacity=*/1'000'000); + const std::string val = "read-priority"; + ASSERT_TRUE(h.mgr->Write("K", OneSeg(val), val.size())); + + h.backend->BlockReads(); + std::vector buf(val.size()); + SsdReadOutcome out{}; + std::thread reader([&] { out = h.mgr->PrepareRead("K", buf.data(), buf.size()); }); + h.backend->WaitReadsStarted(1); + + std::thread clearer([&] { h.mgr->ClearLocal(); }); + + // The read is in flight; let it complete, then ClearLocal may wipe. If + // ClearLocal had wiped the backend before the read finished, the read would + // return kError instead of the correct bytes — so kOk proves read priority. 
+ h.backend->UnblockReads(); + reader.join(); + clearer.join(); + + EXPECT_EQ(out.status, SsdReadStatus::kOk); + EXPECT_EQ(std::string(buf.data(), out.size), val); + EXPECT_EQ(h.backend->clear_calls(), 1); + EXPECT_FALSE(h.mgr->Exists("K")); +} + +TEST(PeerSsdEviction, InvalidWatermarksThrow) { + // low >= high, high > 1, and low == 0 are each rejected (fail-fast, no silent clamp). + EXPECT_THROW(PeerSsdManager(std::make_unique(1000), 0.5, 0.7), std::runtime_error); + EXPECT_THROW(PeerSsdManager(std::make_unique(1000), 1.5, 0.7), std::runtime_error); + EXPECT_THROW(PeerSsdManager(std::make_unique(1000), 0.9, 0.0), std::runtime_error); +} + +TEST(PeerSsdEviction, SelectVictimsBoundaries) { + auto h = MakeHarness(/*capacity=*/1'000'000); + ASSERT_TRUE(h.mgr->Write("K", OneSeg("data"), 4)); + + EXPECT_TRUE(h.mgr->SelectVictims(0).empty()); // nothing to free + + // All candidates in flight -> no victim, no spin. + h.backend->BlockReads(); + std::vector buf(4); + SsdReadOutcome out{}; + std::thread reader([&] { out = h.mgr->PrepareRead("K", buf.data(), buf.size()); }); + h.backend->WaitReadsStarted(1); + EXPECT_TRUE(h.mgr->SelectVictims(1'000'000).empty()); + h.backend->UnblockReads(); + reader.join(); + EXPECT_EQ(out.status, SsdReadStatus::kOk); +} + +TEST(PeerSsdEviction, DisabledManagerIsInert) { + PeerSsdConfig cfg; + cfg.enabled = false; + PeerSsdManager mgr(cfg); + + EXPECT_FALSE(mgr.Write("K", OneSeg("data"), 4)); + EXPECT_FALSE(mgr.Evict("K")); + EXPECT_TRUE(mgr.SelectVictims(100).empty()); + std::vector buf(4); + EXPECT_EQ(mgr.PrepareRead("K", buf.data(), buf.size()).status, SsdReadStatus::kNotFound); + mgr.ClearLocal(); // no backend -> no crash, no-op + EXPECT_TRUE(mgr.SnapshotOwnedKeys().empty()); +} + +} // namespace +} // namespace mori::umbp diff --git a/tests/cpp/umbp/distributed/test_peer_ssd_manager.cpp b/tests/cpp/umbp/distributed/test_peer_ssd_manager.cpp new file mode 100644 index 000000000..0fd2aa861 --- /dev/null +++
b/tests/cpp/umbp/distributed/test_peer_ssd_manager.cpp @@ -0,0 +1,233 @@ +// Copyright © Advanced Micro Devices, Inc. All rights reserved. +// +// MIT License +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in all +// copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +// SOFTWARE. +#include +#include + +#include +#include +#include +#include +#include +#include +#include + +#include "umbp/distributed/peer/owned_location_source.h" +#include "umbp/distributed/peer/peer_ssd_manager.h" + +namespace mori::umbp { +namespace { + +namespace fs = std::filesystem; + +// Unique temp dir per fixture instance; backend uses Posix I/O to avoid +// io_uring availability differences inside the build container. 
+class PeerSsdManagerTest : public ::testing::Test { + protected: + void SetUp() override { + static std::atomic counter{0}; + dir_ = fs::temp_directory_path() / ("umbp_ssd_test_" + std::to_string(::getpid()) + "_" + + std::to_string(counter.fetch_add(1))); + fs::remove_all(dir_); + } + + void TearDown() override { + std::error_code ec; + fs::remove_all(dir_, ec); + } + + PeerSsdConfig MakeConfig(size_t capacity = 64ULL * 1024 * 1024) const { + PeerSsdConfig cfg; + cfg.enabled = true; + cfg.ssd.enabled = true; + cfg.ssd.storage_dir = dir_.string(); + cfg.ssd.capacity_bytes = capacity; + cfg.ssd.io.backend = UMBPIoBackend::Posix; // avoid io_uring container flakiness + return cfg; + } + + static std::vector> OneSegment(const std::string& s) { + return {{s.data(), s.size()}}; + } + + fs::path dir_; +}; + +TEST_F(PeerSsdManagerTest, WriteRecordsOwnershipAndQueuesAddEvent) { + PeerSsdManager mgr(MakeConfig()); + const std::string key = "key-1"; + const std::string value = "hello-ssd-payload"; + + ASSERT_TRUE(mgr.Write(key, OneSegment(value), value.size())); + EXPECT_TRUE(mgr.Exists(key)); + + auto events = mgr.DrainPendingEvents(); + ASSERT_EQ(events.size(), 1u); + EXPECT_EQ(events[0].kind, KvEvent::Kind::ADD); + EXPECT_EQ(events[0].key, key); + EXPECT_EQ(events[0].tier, TierType::SSD); + EXPECT_EQ(events[0].size, value.size()); + + // Drain is destructive. 
+ EXPECT_TRUE(mgr.DrainPendingEvents().empty()); + + auto snap = mgr.SnapshotOwnedKeys(); + ASSERT_EQ(snap.size(), 1u); + EXPECT_EQ(snap[0].key, key); + EXPECT_EQ(snap[0].tier, TierType::SSD); + EXPECT_EQ(snap[0].size, value.size()); +} + +TEST_F(PeerSsdManagerTest, WriteAssemblesNonContiguousSegments) { + PeerSsdManager mgr(MakeConfig()); + const std::string a = "abc"; + const std::string b = "defgh"; + std::vector> segs = {{a.data(), a.size()}, {b.data(), b.size()}}; + + ASSERT_TRUE(mgr.Write("multi", segs, a.size() + b.size())); + EXPECT_TRUE(mgr.Exists("multi")); + auto snap = mgr.SnapshotOwnedKeys(); + ASSERT_EQ(snap.size(), 1u); + EXPECT_EQ(snap[0].size, a.size() + b.size()); +} + +TEST_F(PeerSsdManagerTest, CapacityReportsTotalAndGrowsWithWrites) { + const size_t cap = 32ULL * 1024 * 1024; + PeerSsdManager mgr(MakeConfig(cap)); + + auto [used_before, total_before] = mgr.Capacity(); + EXPECT_EQ(total_before, cap); + + std::string value(4096, 'x'); + ASSERT_TRUE(mgr.Write("big", OneSegment(value), value.size())); + + auto [used_after, total_after] = mgr.Capacity(); + EXPECT_EQ(total_after, cap); + EXPECT_GE(used_after, used_before); +} + +TEST_F(PeerSsdManagerTest, EvictRemovesOwnershipAndQueuesRemoveEvent) { + PeerSsdManager mgr(MakeConfig()); + const std::string key = "key-evict"; + const std::string value = "payload"; + ASSERT_TRUE(mgr.Write(key, OneSegment(value), value.size())); + mgr.DrainPendingEvents(); // discard the ADD + + EXPECT_TRUE(mgr.Evict(key)); + EXPECT_FALSE(mgr.Exists(key)); + + auto events = mgr.DrainPendingEvents(); + ASSERT_EQ(events.size(), 1u); + EXPECT_EQ(events[0].kind, KvEvent::Kind::REMOVE); + EXPECT_EQ(events[0].key, key); + EXPECT_EQ(events[0].tier, TierType::SSD); + + // Evicting an unknown key is a no-op (no event, returns false). 
+ EXPECT_FALSE(mgr.Evict("never-written")); + EXPECT_TRUE(mgr.DrainPendingEvents().empty()); +} + +TEST_F(PeerSsdManagerTest, PrepareReadReturnsBytesForOwnedKey) { + PeerSsdManager mgr(MakeConfig()); + const std::string key = "key-read"; + const std::string value = "hello-ssd-read-path"; + ASSERT_TRUE(mgr.Write(key, OneSegment(value), value.size())); + + std::vector staging(value.size()); + auto out = mgr.PrepareRead(key, staging.data(), staging.size()); + EXPECT_EQ(out.status, SsdReadStatus::kOk); + EXPECT_EQ(out.size, value.size()); + EXPECT_EQ(std::string(staging.data(), out.size), value); +} + +TEST_F(PeerSsdManagerTest, PrepareReadUnknownKeyIsNotFound) { + PeerSsdManager mgr(MakeConfig()); + std::vector staging(64); + auto out = mgr.PrepareRead("never-written", staging.data(), staging.size()); + EXPECT_EQ(out.status, SsdReadStatus::kNotFound); +} + +TEST_F(PeerSsdManagerTest, PrepareReadRejectsOverCapBeforeIo) { + PeerSsdManager mgr(MakeConfig()); + const std::string key = "key-big"; + const std::string value(4096, 'z'); + ASSERT_TRUE(mgr.Write(key, OneSegment(value), value.size())); + + // Capacity smaller than the actual size must be rejected as kSizeTooLarge + // (and the reported size is the real size) without reading into the buffer. + std::vector staging(value.size() / 2); + auto out = mgr.PrepareRead(key, staging.data(), staging.size()); + EXPECT_EQ(out.status, SsdReadStatus::kSizeTooLarge); + EXPECT_EQ(out.size, value.size()); +} + +// ---- Unified owned-location source aggregation ------------------------------ + +// Minimal OwnedLocationSource that replays a fixed event list, used to verify +// MasterClient's multi-source concat logic without a live master. 
+class FakeSource : public OwnedLocationSource { + public: + explicit FakeSource(std::vector events) : events_(std::move(events)) {} + std::vector DrainPendingEvents() override { + auto out = events_; + drained_ = true; + return out; + } + std::vector SnapshotOwnedKeys() const override { return events_; } + bool drained_ = false; + + private: + std::vector events_; +}; + +TEST(OwnedLocationSourceAgg, DrainAndSnapshotConcatAcrossSourcesInOrder) { + FakeSource dram({{KvEvent::Kind::ADD, "d1", TierType::DRAM, 10}, + {KvEvent::Kind::ADD, "d2", TierType::DRAM, 20}}); + FakeSource ssd({{KvEvent::Kind::ADD, "s1", TierType::SSD, 30}}); + + std::vector sources = {&dram, &ssd}; + + auto drained = DrainAllSources(sources); + ASSERT_EQ(drained.size(), 3u); + EXPECT_EQ(drained[0].key, "d1"); + EXPECT_EQ(drained[0].tier, TierType::DRAM); + EXPECT_EQ(drained[1].key, "d2"); + EXPECT_EQ(drained[2].key, "s1"); + EXPECT_EQ(drained[2].tier, TierType::SSD); + EXPECT_TRUE(dram.drained_); + EXPECT_TRUE(ssd.drained_); + + auto snap = SnapshotAllSources(sources); + ASSERT_EQ(snap.size(), 3u); + EXPECT_EQ(snap[2].tier, TierType::SSD); +} + +TEST(OwnedLocationSourceAgg, NullSourcesAreSkipped) { + FakeSource only({{KvEvent::Kind::ADD, "x", TierType::SSD, 1}}); + std::vector sources = {nullptr, &only, nullptr}; + auto drained = DrainAllSources(sources); + ASSERT_EQ(drained.size(), 1u); + EXPECT_EQ(drained[0].key, "x"); + EXPECT_TRUE(SnapshotAllSources({nullptr}).empty()); +} + +} // namespace +} // namespace mori::umbp diff --git a/tests/cpp/umbp/distributed/test_peer_ssd_read_rpc.cpp b/tests/cpp/umbp/distributed/test_peer_ssd_read_rpc.cpp new file mode 100644 index 000000000..949996cdb --- /dev/null +++ b/tests/cpp/umbp/distributed/test_peer_ssd_read_rpc.cpp @@ -0,0 +1,258 @@ +// Copyright © Advanced Micro Devices, Inc. All rights reserved. 
+// +// MIT License +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in all +// copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +// SOFTWARE. +// +// RPC-level integration test for the SSD read path: a real PeerServiceServer +// (backed by a real PeerSsdManager / POSIX SSDTier) served over a gRPC loopback +// channel. It exercises prepare -> read-from-staging -> release / TTL and, +// crucially, asserts that OK / NOT_FOUND / NO_SLOT / SIZE_TOO_LARGE are each +// reported as distinct statuses so a transient failure is never collapsed into +// a NOT_FOUND miss. RDMA is intentionally out of scope here (the staging buffer +// is read directly); the full BatchGet -> RDMA path needs a live cluster. 
+#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "umbp/distributed/config.h" +#include "umbp/distributed/peer/peer_service.h" +#include "umbp/distributed/peer/peer_ssd_manager.h" +#include "umbp_peer.grpc.pb.h" + +namespace mori::umbp { +namespace { + +namespace fs = std::filesystem; + +constexpr size_t kStagingSize = 4096; +constexpr int kNumReadSlots = 4; // -> 1024 B per slot +constexpr int kLeaseTimeoutS = 2; + +uint16_t AllocPort() { + static std::atomic next{51300}; + return next.fetch_add(1); +} + +class PeerSsdReadRpcTest : public ::testing::Test { + protected: + void SetUp() override { + staging_buffer_ = std::malloc(kStagingSize); + ASSERT_NE(staging_buffer_, nullptr); + std::memset(staging_buffer_, 0, kStagingSize); + + dir_ = fs::temp_directory_path() / + ("umbp_ssd_rpc_" + std::to_string(::getpid()) + "_" + std::to_string(AllocPort())); + fs::remove_all(dir_); + + PeerSsdConfig cfg; + cfg.enabled = true; + cfg.ssd.enabled = true; + cfg.ssd.storage_dir = dir_.string(); + cfg.ssd.capacity_bytes = 1 << 20; + cfg.ssd.io.backend = UMBPIoBackend::Posix; // avoid io_uring container flakiness + peer_ssd_ = std::make_unique(cfg); + + // Fake staging MemoryDesc bytes — GetPeerInfo just echoes them; this test + // reads the staging buffer directly rather than RDMA-ing it. 
+ staging_desc_ = {0xAB, 0xCD}; + + port_ = AllocPort(); + server_ = std::make_unique( + /*dram_alloc=*/nullptr, peer_ssd_.get(), staging_buffer_, kStagingSize, staging_desc_, + kNumReadSlots, kLeaseTimeoutS); + ASSERT_TRUE(server_->Start(port_)); + std::this_thread::sleep_for(std::chrono::milliseconds(150)); + + auto channel = grpc::CreateChannel("localhost:" + std::to_string(port_), + grpc::InsecureChannelCredentials()); + stub_ = ::umbp::UMBPPeer::NewStub(channel); + } + + void TearDown() override { + server_->Stop(); + server_.reset(); + peer_ssd_.reset(); + std::free(staging_buffer_); + fs::remove_all(dir_); + } + + void WriteSsd(const std::string& key, const std::string& data) { + ASSERT_TRUE(peer_ssd_->Write(key, {{data.data(), data.size()}}, data.size())); + } + + ::umbp::PrepareSsdReadResponse Prepare(const std::string& key, uint64_t max_size) { + ::umbp::PrepareSsdReadRequest req; + req.set_key(key); + req.set_max_size(max_size); + ::umbp::PrepareSsdReadResponse resp; + grpc::ClientContext ctx; + EXPECT_TRUE(stub_->PrepareSsdRead(&ctx, req, &resp).ok()); + return resp; + } + + void* staging_buffer_ = nullptr; + fs::path dir_; + std::vector staging_desc_; + uint16_t port_ = 0; + std::unique_ptr peer_ssd_; + std::unique_ptr server_; + std::unique_ptr<::umbp::UMBPPeer::Stub> stub_; +}; + +TEST_F(PeerSsdReadRpcTest, OkReadsBytesIntoStaging) { + const std::string data = "ssd-read-rpc-ok"; + WriteSsd("k-ok", data); + + auto resp = Prepare("k-ok", data.size()); + ASSERT_EQ(resp.status(), ::umbp::SSD_READ_OK); + EXPECT_EQ(resp.size(), data.size()); + EXPECT_LT(resp.staging_offset(), kStagingSize); + EXPECT_GT(resp.lease_id(), 0u); + std::string loaded(static_cast(staging_buffer_) + resp.staging_offset(), + resp.size()); + EXPECT_EQ(loaded, data); +} + +TEST_F(PeerSsdReadRpcTest, NotFoundIsADistinctMiss) { + auto resp = Prepare("absent", 64); + EXPECT_EQ(resp.status(), ::umbp::SSD_READ_NOT_FOUND); +} + +TEST_F(PeerSsdReadRpcTest, SizeTooLargeIsDistinct) { + // A 
key bigger than one slot (1024 B) must report SIZE_TOO_LARGE, not OK and + // not NOT_FOUND. + const std::string big(2048, 'q'); + WriteSsd("k-big", big); + auto resp = Prepare("k-big", kStagingSize); + EXPECT_EQ(resp.status(), ::umbp::SSD_READ_SIZE_TOO_LARGE); +} + +// The key assertion the review asked for: slot exhaustion is NO_SLOT +// (retryable), never collapsed into NOT_FOUND. A present key and an absent key +// under exhaustion are distinguishable. +TEST_F(PeerSsdReadRpcTest, NoSlotIsDistinctFromNotFound) { + std::vector<::umbp::PrepareSsdReadResponse> held; + for (int i = 0; i < kNumReadSlots; ++i) { + const std::string key = "hold-" + std::to_string(i); + WriteSsd(key, "payload"); + auto resp = Prepare(key, 64); + ASSERT_EQ(resp.status(), ::umbp::SSD_READ_OK); + held.push_back(resp); // keep the lease held (no release) so slots stay busy + } + + // A present key with all slots busy -> NO_SLOT (retryable), NOT a miss. + WriteSsd("present-extra", "payload"); + EXPECT_EQ(Prepare("present-extra", 64).status(), ::umbp::SSD_READ_NO_SLOT); + + // An absent key under the same exhaustion is still NO_SLOT (slot check + // precedes the key lookup), so the caller cannot mistake exhaustion for a + // definitive miss. + EXPECT_EQ(Prepare("absent-extra", 64).status(), ::umbp::SSD_READ_NO_SLOT); +} + +// Many concurrent readers contend for a fixed pool of staging slots: at most +// kNumReadSlots win OK, the rest get NO_SLOT (a retryable transient), and NEVER +// NOT_FOUND for a present key. Also exercises the staging observability +// (slot_full_rejects counter + the in-use gauge accessor). 
+TEST_F(PeerSsdReadRpcTest, ConcurrentReadersExhaustSlotsWithoutFalseMiss) {
+  const std::string data = "concurrent-payload";
+  constexpr int kReaders = 24;  // >> kNumReadSlots, leases held (never released)
+
+  // One present key per reader. A failed write would make the "no false miss"
+  // check below meaningless, so propagate the helper's fatal failure.
+  for (int i = 0; i < kReaders; ++i) {
+    ASSERT_NO_FATAL_FAILURE(WriteSsd("ck-" + std::to_string(i), data));
+  }
+
+  const uint64_t slot_full_before = server_->Metrics().slot_full_rejects.load();
+
+  std::atomic ok{0}, no_slot{0}, other{0};
+  std::vector threads;
+  threads.reserve(kReaders);
+  for (int i = 0; i < kReaders; ++i) {
+    threads.emplace_back([&, i] {
+      auto resp = Prepare("ck-" + std::to_string(i), data.size());
+      switch (resp.status()) {
+        case ::umbp::SSD_READ_OK:
+          ok.fetch_add(1);
+          break;
+        case ::umbp::SSD_READ_NO_SLOT:
+          no_slot.fetch_add(1);
+          break;
+        default:
+          other.fetch_add(1);  // NOT_FOUND / SIZE_TOO_LARGE / ERROR must never happen here
+          break;
+      }
+    });
+  }
+  for (auto& t : threads) t.join();
+
+  EXPECT_EQ(other.load(), 0) << "present keys never report a false miss under contention";
+  EXPECT_LE(ok.load(), kNumReadSlots) << "at most one OK per staging slot";
+  EXPECT_EQ(ok.load() + no_slot.load(), kReaders);
+  EXPECT_GT(no_slot.load(), 0) << "with more readers than slots, some must see NO_SLOT";
+
+  // The NO_SLOT rejections were counted, and the gauge sees the held leases.
+  EXPECT_GE(server_->Metrics().slot_full_rejects.load() - slot_full_before,
+            static_cast(no_slot.load()));
+  EXPECT_EQ(server_->SnapshotReadSlotsInUse(), static_cast(ok.load()));
+}
+
+// A best-effort release frees the slot; double release reports false.
+TEST_F(PeerSsdReadRpcTest, ReleaseFreesSlotAndIsBestEffort) { + WriteSsd("k-rel", "payload"); + auto resp = Prepare("k-rel", 64); + ASSERT_EQ(resp.status(), ::umbp::SSD_READ_OK); + + ::umbp::ReleaseSsdLeaseRequest rel; + rel.set_lease_id(resp.lease_id()); + ::umbp::ReleaseSsdLeaseResponse rel_resp; + grpc::ClientContext ctx; + ASSERT_TRUE(stub_->ReleaseSsdLease(&ctx, rel, &rel_resp).ok()); + EXPECT_TRUE(rel_resp.success()); + + ::umbp::ReleaseSsdLeaseResponse rel_resp2; + grpc::ClientContext ctx2; + ASSERT_TRUE(stub_->ReleaseSsdLease(&ctx2, rel, &rel_resp2).ok()); + EXPECT_FALSE(rel_resp2.success()) << "double release is a no-op"; +} + +// Leased slots are reclaimed by TTL even without a release, so a fresh prepare +// succeeds after the lease elapses (slot lifecycle: Leased -> reclaimed). +TEST_F(PeerSsdReadRpcTest, LeasedSlotsReclaimedByTtl) { + for (int i = 0; i < kNumReadSlots; ++i) { + const std::string key = "ttl-" + std::to_string(i); + WriteSsd(key, "payload"); + ASSERT_EQ(Prepare(key, 64).status(), ::umbp::SSD_READ_OK); // never released + } + EXPECT_EQ(Prepare("ttl-0", 64).status(), ::umbp::SSD_READ_NO_SLOT); // all busy + + std::this_thread::sleep_for(std::chrono::seconds(kLeaseTimeoutS + 1)); + + WriteSsd("ttl-after", "payload"); + EXPECT_EQ(Prepare("ttl-after", 64).status(), ::umbp::SSD_READ_OK) << "TTL should reclaim a slot"; +} + +} // namespace +} // namespace mori::umbp diff --git a/tests/cpp/umbp/distributed/test_router_dedup.cpp b/tests/cpp/umbp/distributed/test_router_dedup.cpp new file mode 100644 index 000000000..6de571fd1 --- /dev/null +++ b/tests/cpp/umbp/distributed/test_router_dedup.cpp @@ -0,0 +1,126 @@ +// Copyright © Advanced Micro Devices, Inc. All rights reserved. 
+// +// MIT License +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in all +// copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +// SOFTWARE. +// +// Master-side dedup for Router::BatchRoutePut: indexed keys come back +// with already_exists=true and bypass node selection. 
+#include
+
+#include
+#include
+#include
+#include
+#include
+#include
+
+#include "umbp/distributed/master/in_memory_master_metadata_store.h"
+#include "umbp/distributed/routing/router.h"
+#include "umbp/distributed/types.h"
+
+namespace mori::umbp {
+
+namespace {
+
+constexpr uint64_t kGB = 1024ULL * 1024 * 1024;
+
+// Builds a capacity map holding a single DRAM entry whose two fields are both
+// `total` (presumably total and available bytes — confirm against the capacity
+// struct in types.h). Defaults to 8 GiB.
+std::map MakeDramCaps(uint64_t total = 8 * kGB) {
+  std::map caps;
+  caps[TierType::DRAM] = {total, total};
+  return caps;
+}
+
+// Fills a ClientRegistration with the given ids/addresses and the default DRAM
+// capacities from MakeDramCaps(); every other field keeps its default value.
+ClientRegistration MakeRegistration(const std::string& node_id, const std::string& node_address,
+                                    const std::string& peer_address) {
+  ClientRegistration reg;
+  reg.node_id = node_id;
+  reg.node_address = node_address;
+  reg.tier_capacities = MakeDramCaps();
+  reg.peer_address = peer_address;
+  return reg;
+}
+
+// Register `node_id` ALIVE and apply one ADD event for `key` so it has a block
+// location in the store. Under the merged store a location can only be created
+// through an ApplyHeartbeat from a registered (alive) node — locations no
+// longer exist independently of a client record the way the old
+// GlobalBlockIndex allowed.
+//
+// Uses fatal assertions, which only abort this helper: a caller that must stop
+// on failure has to check for the fatal failure itself.
+void RegisterWithKey(InMemoryMasterMetadataStore& store, const std::string& node_id,
+                     const std::string& key, std::chrono::system_clock::time_point now) {
+  // Node address and peer address are derived from the node id.
+  ASSERT_TRUE(store.RegisterClient(MakeRegistration(node_id, node_id + ":1", node_id + ":peer"),
+                                   now, std::chrono::seconds{30}));
+  // A single delta heartbeat (seq 1) carrying one 4096-byte DRAM ADD for `key`.
+  auto hb = store.ApplyHeartbeat(node_id, /*seq=*/1, now, MakeDramCaps(),
+                                 {KvEvent{KvEvent::Kind::ADD, key, TierType::DRAM, 4096}},
+                                 /*is_full_sync=*/false);
+  ASSERT_EQ(hb.status, HeartbeatResult::APPLIED);
+}
+
+}  // namespace
+
+// Indexed keys are marked already_exists; unknown keys still routed.
+TEST(RouterDedup, BatchRoutePutMarksAlreadyExistsForIndexedKey) {
+  const auto now = std::chrono::system_clock::now();
+  InMemoryMasterMetadataStore store;
+  Router router(store);
+
+  // RegisterWithKey asserts fatally inside a void helper; without this wrapper
+  // a failed registration would only abort the helper and the test would carry
+  // on against an empty store.
+  ASSERT_NO_FATAL_FAILURE(RegisterWithKey(store, "node-a", "key-X", now));
+
+  std::vector keys{"key-X", "key-Y"};
+  std::vector sizes{4096, 4096};
+  std::unordered_set excludes;
+
+  auto results = router.BatchRoutePut(keys, "requester", sizes, excludes);
+  ASSERT_EQ(results.size(), 2u);
+
+  // key-X is already indexed: reported as existing, with no target node chosen.
+  ASSERT_TRUE(results[0].has_value());
+  EXPECT_EQ(results[0]->outcome, RoutePutOutcome::kAlreadyExists);
+  EXPECT_TRUE(results[0]->node_id.empty());
+
+  // key-Y is unknown: routed to the only registered node.
+  ASSERT_TRUE(results[1].has_value());
+  EXPECT_EQ(results[1]->outcome, RoutePutOutcome::kRouted);
+  EXPECT_EQ(results[1]->node_id, "node-a");
+}
+
+// already_exists wins over an unroutable Put: an existing key is marked
+// kAlreadyExists even when no node can accept the write. In the old design
+// "no node" meant an empty registry while a foreign node owned the key; under
+// the merged store a location can't outlive its alive owner, so the
+// unroutable condition is expressed by excluding the only candidate node. The
+// property under test is unchanged: dedup wins over node selection.
+TEST(RouterDedup, BatchRoutePutAlreadyExistsBypassesUnroutablePut) { + const auto now = std::chrono::system_clock::now(); + InMemoryMasterMetadataStore store; + Router router(store); + + RegisterWithKey(store, "node-a", "key-X", now); + + std::vector keys{"key-X", "key-Y"}; + std::vector sizes{4096, 4096}; + std::unordered_set excludes{"node-a"}; // no routable target left + + auto results = router.BatchRoutePut(keys, "requester", sizes, excludes); + ASSERT_EQ(results.size(), 2u); + + ASSERT_TRUE(results[0].has_value()); + EXPECT_EQ(results[0]->outcome, RoutePutOutcome::kAlreadyExists); + EXPECT_FALSE(results[1].has_value()); // distinct from kAlreadyExists +} + +} // namespace mori::umbp diff --git a/tests/cpp/umbp/distributed/test_ssd_copy_pipeline.cpp b/tests/cpp/umbp/distributed/test_ssd_copy_pipeline.cpp new file mode 100644 index 000000000..1d144cbb6 --- /dev/null +++ b/tests/cpp/umbp/distributed/test_ssd_copy_pipeline.cpp @@ -0,0 +1,341 @@ +// Copyright © Advanced Micro Devices, Inc. All rights reserved. +// +// MIT License +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in all +// copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +// SOFTWARE. +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "umbp/distributed/peer/peer_dram_allocator.h" +#include "umbp/distributed/peer/peer_ssd_manager.h" +#include "umbp/distributed/peer/ssd_copy_pipeline.h" + +namespace mori::umbp { +namespace { + +namespace fs = std::filesystem; + +constexpr uint64_t kPageSize = 1024; + +// Concatenate a pin's segments into one buffer for content comparison. +std::string Concat(const PeerDramAllocator::DramCopyPin& pin) { + std::string out; + for (const auto& [ptr, len] : pin.segments) { + out.append(static_cast(ptr), len); + } + return out; +} + +// ---- DramCopyPin unit tests (direct on PeerDramAllocator) ------------------- + +class DramCopyPinTest : public ::testing::Test { + protected: + void SetUp() override { + backing_.assign(kPageSize * 8, 0); + PeerDramAllocator::TierConfig dram; + dram.buffer_sizes = {kPageSize * 8}; + dram.buffer_descs = {{0x01, 0x02}}; + dram.buffer_bases = {backing_.data()}; + dram_ = std::make_unique(kPageSize, std::move(dram), + PeerDramAllocator::TierConfig{}, + /*pending_ttl=*/std::chrono::milliseconds{5000}, + /*read_lease_ttl=*/std::chrono::milliseconds{0}); + } + + // Allocate, write `value` into its pages, commit. Backing memory is owned + // by the test, so we resolve pages -> offset exactly like the real writer. 
+ void PutLocal(const std::string& key, const std::string& value) { + auto res = dram_->Allocate(key, value.size(), TierType::DRAM); + ASSERT_EQ(res.outcome, PeerDramAllocator::Outcome::kSuccessAllocated); + const auto& slot = *res.slot; + size_t off = 0; + for (const auto& p : slot.pages) { + const size_t bytes = std::min(kPageSize, value.size() - off); + std::memcpy(backing_.data() + static_cast(p.page_index) * kPageSize, + value.data() + off, bytes); + off += bytes; + } + uint64_t committed = 0; + ASSERT_TRUE(dram_->Commit(slot.slot_id, key, committed)); + ASSERT_EQ(committed, value.size()); + } + + std::vector backing_; + std::unique_ptr dram_; +}; + +TEST_F(DramCopyPinTest, AcquireResolvesSegmentsToCommittedBytes) { + const std::string value(kPageSize + 17, 'Z'); // spans 2 pages, last partial + PutLocal("k", value); + + auto pin = dram_->AcquireDramCopyPin("k"); + ASSERT_TRUE(pin.has_value()); + EXPECT_EQ(pin->total_size, value.size()); + EXPECT_EQ(Concat(*pin), value); + dram_->ReleaseDramCopyPin("k", pin->pin_token); +} + +TEST_F(DramCopyPinTest, AcquireMissingKeyReturnsNullopt) { + EXPECT_FALSE(dram_->AcquireDramCopyPin("never").has_value()); +} + +TEST_F(DramCopyPinTest, DuplicatePinReturnsNullopt) { + PutLocal("k", "payload"); + auto first = dram_->AcquireDramCopyPin("k"); + ASSERT_TRUE(first.has_value()); + EXPECT_FALSE(dram_->AcquireDramCopyPin("k").has_value()); // already pinned + dram_->ReleaseDramCopyPin("k", first->pin_token); +} + +TEST_F(DramCopyPinTest, EvictBlockedWhilePinnedThenAllowedAfterRelease) { + PutLocal("k", "payload"); + dram_->DrainPendingEvents(); // discard the commit's ADD DRAM event + auto pin = dram_->AcquireDramCopyPin("k"); + ASSERT_TRUE(pin.has_value()); + + // Pinned: Evict must not free, not emit REMOVE, keep ownership. 
+ auto evicted = dram_->Evict({"k"}); + ASSERT_EQ(evicted.size(), 1u); + EXPECT_EQ(evicted[0].bytes_freed, 0u); + EXPECT_TRUE(dram_->DrainPendingEvents().empty()); // no REMOVE DRAM + ASSERT_EQ(dram_->SnapshotOwnedKeys().size(), 1u); // still owned + + // Release -> next Evict frees and emits REMOVE. + dram_->ReleaseDramCopyPin("k", pin->pin_token); + auto evicted2 = dram_->Evict({"k"}); + ASSERT_EQ(evicted2.size(), 1u); + EXPECT_GT(evicted2[0].bytes_freed, 0u); + auto events = dram_->DrainPendingEvents(); + ASSERT_EQ(events.size(), 1u); + EXPECT_EQ(events[0].kind, KvEvent::Kind::REMOVE); + EXPECT_EQ(events[0].tier, TierType::DRAM); +} + +TEST(DramCopyPinNonContiguous, SegmentsSpanMultipleBuffers) { + // Two 1-page buffers force a cross-buffer page set for a 2-page key. + std::vector b0(kPageSize, 0), b1(kPageSize, 0); + PeerDramAllocator::TierConfig dram; + dram.buffer_sizes = {kPageSize, kPageSize}; + dram.buffer_descs = {{0x01}, {0x02}}; + dram.buffer_bases = {b0.data(), b1.data()}; + PeerDramAllocator alloc(kPageSize, std::move(dram), PeerDramAllocator::TierConfig{}, + std::chrono::milliseconds{5000}, std::chrono::milliseconds{0}); + + const std::string value(kPageSize + 5, 'Q'); + auto res = alloc.Allocate("k", value.size(), TierType::DRAM); + ASSERT_EQ(res.outcome, PeerDramAllocator::Outcome::kSuccessAllocated); + const auto& slot = *res.slot; + ASSERT_EQ(slot.pages.size(), 2u); + std::vector bases = {b0.data(), b1.data()}; + size_t off = 0; + for (const auto& p : slot.pages) { + const size_t bytes = std::min(kPageSize, value.size() - off); + std::memcpy(bases[p.buffer_index] + static_cast(p.page_index) * kPageSize, + value.data() + off, bytes); + off += bytes; + } + uint64_t committed = 0; + ASSERT_TRUE(alloc.Commit(slot.slot_id, "k", committed)); + + auto pin = alloc.AcquireDramCopyPin("k"); + ASSERT_TRUE(pin.has_value()); + EXPECT_EQ(pin->segments.size(), 2u); + EXPECT_EQ(Concat(*pin), value); + alloc.ReleaseDramCopyPin("k", pin->pin_token); +} + +// ---- 
Pipeline integration tests (allocator + SSD manager + pipeline) -------- + +class SsdCopyPipelineTest : public ::testing::Test { + protected: + void SetUp() override { + static std::atomic counter{0}; + dir_ = fs::temp_directory_path() / ("umbp_copy_test_" + std::to_string(::getpid()) + "_" + + std::to_string(counter.fetch_add(1))); + fs::remove_all(dir_); + + backing_.assign(kPageSize * 16, 0); + PeerDramAllocator::TierConfig dram; + dram.buffer_sizes = {kPageSize * 16}; + dram.buffer_descs = {{0x01}}; + dram.buffer_bases = {backing_.data()}; + dram_ = std::make_unique( + kPageSize, std::move(dram), PeerDramAllocator::TierConfig{}, + std::chrono::milliseconds{5000}, std::chrono::milliseconds{0}); + + PeerSsdConfig ssd_cfg; + ssd_cfg.enabled = true; + ssd_cfg.ssd.enabled = true; + ssd_cfg.ssd.storage_dir = dir_.string(); + ssd_cfg.ssd.capacity_bytes = 64ULL * 1024 * 1024; + ssd_cfg.ssd.io.backend = UMBPIoBackend::Posix; // avoid io_uring container flakiness + ssd_ = std::make_unique(ssd_cfg); + } + + void TearDown() override { + std::error_code ec; + fs::remove_all(dir_, ec); + } + + void PutLocal(const std::string& key, const std::string& value) { + auto res = dram_->Allocate(key, value.size(), TierType::DRAM); + ASSERT_EQ(res.outcome, PeerDramAllocator::Outcome::kSuccessAllocated); + const auto& slot = *res.slot; + size_t off = 0; + for (const auto& p : slot.pages) { + const size_t bytes = std::min(kPageSize, value.size() - off); + std::memcpy(backing_.data() + static_cast(p.page_index) * kPageSize, + value.data() + off, bytes); + off += bytes; + } + uint64_t committed = 0; + ASSERT_TRUE(dram_->Commit(slot.slot_id, key, committed)); + } + + bool WaitForSsd(const std::string& key, std::chrono::milliseconds timeout) { + const auto deadline = std::chrono::steady_clock::now() + timeout; + while (std::chrono::steady_clock::now() < deadline) { + if (ssd_->Exists(key)) return true; + std::this_thread::sleep_for(std::chrono::milliseconds(2)); + } + return 
ssd_->Exists(key); + } + + fs::path dir_; + std::vector backing_; + std::unique_ptr dram_; + std::unique_ptr ssd_; +}; + +TEST_F(SsdCopyPipelineTest, CommitCopiesToSsdAndEmitsAddEvent) { + SsdCopyPipeline pipeline(dram_.get(), ssd_.get()); + pipeline.Start(); + + PutLocal("k", "hello-ssd-copy-on-commit"); + ASSERT_TRUE(pipeline.Enqueue(SsdCopyTask{"k", TierType::DRAM, 24})); + + ASSERT_TRUE(WaitForSsd("k", std::chrono::seconds(2))); + EXPECT_GE(pipeline.CopiedOk(), 1u); + EXPECT_GE(pipeline.Enqueued(), 1u); // observability: task was accepted + EXPECT_EQ(pipeline.Failed(), 0u); + + auto events = ssd_->DrainPendingEvents(); + ASSERT_EQ(events.size(), 1u); + EXPECT_EQ(events[0].kind, KvEvent::Kind::ADD); + EXPECT_EQ(events[0].tier, TierType::SSD); + EXPECT_EQ(events[0].key, "k"); + + pipeline.Stop(); +} + +TEST_F(SsdCopyPipelineTest, QueuedTaskForEvictedKeyIsDropped) { + SsdCopyPipeline pipeline(dram_.get(), ssd_.get()); + + PutLocal("gone", "data"); + // Evict before the copy ever runs (no pin held) -> key removed from owned_. + auto ev = dram_->Evict({"gone"}); + ASSERT_EQ(ev.size(), 1u); + EXPECT_GT(ev[0].bytes_freed, 0u); + + // Now start draining: the worker's AcquireDramCopyPin returns nullopt -> drop. + ASSERT_TRUE(pipeline.Enqueue(SsdCopyTask{"gone", TierType::DRAM, 4})); + pipeline.Start(); + + std::this_thread::sleep_for(std::chrono::milliseconds(100)); + EXPECT_FALSE(ssd_->Exists("gone")); + EXPECT_EQ(pipeline.CopiedOk(), 0u); + pipeline.Stop(); +} + +TEST_F(SsdCopyPipelineTest, FullQueueDropsWithoutBlocking) { + // queue_depth=2, no workers started -> nothing drains, so the 3rd+ enqueue + // overflows and is dropped (and returns immediately). 
+ SsdCopyPipeline pipeline(dram_.get(), ssd_.get(), /*queue_depth=*/2, /*workers=*/1); + EXPECT_TRUE(pipeline.Enqueue(SsdCopyTask{"a", TierType::DRAM, 1})); + EXPECT_TRUE(pipeline.Enqueue(SsdCopyTask{"b", TierType::DRAM, 1})); + EXPECT_FALSE(pipeline.Enqueue(SsdCopyTask{"c", TierType::DRAM, 1})); // full -> drop + EXPECT_FALSE(pipeline.Enqueue(SsdCopyTask{"d", TierType::DRAM, 1})); + EXPECT_EQ(pipeline.Dropped(), 2u); + EXPECT_EQ(pipeline.Enqueued(), 2u); // only the two accepted tasks + EXPECT_EQ(pipeline.DroppedStopped(), 0u); // these are queue-full, not stopped, drops +} + +TEST_F(SsdCopyPipelineTest, EnqueueRejectedWhileStopped) { + SsdCopyPipeline pipeline(dram_.get(), ssd_.get()); + pipeline.Start(); + pipeline.Stop(); + EXPECT_FALSE(pipeline.Enqueue(SsdCopyTask{"k", TierType::DRAM, 1})); + EXPECT_EQ(pipeline.Dropped(), 0u); // stopped path is not counted as a full-drop + EXPECT_EQ(pipeline.DroppedStopped(), 1u); // counted under the stopped reason instead +} + +TEST_F(SsdCopyPipelineTest, StopAfterCopyIsCleanAndReleasesPin) { + SsdCopyPipeline pipeline(dram_.get(), ssd_.get()); + pipeline.Start(); + + const std::string value(8 * 1024, 'X'); + PutLocal("big", value); + ASSERT_TRUE(pipeline.Enqueue(SsdCopyTask{"big", TierType::DRAM, value.size()})); + + // Let the copy run, then Stop(). Stop() joins the worker; the RAII pin guard + // guarantees the pin is released before the worker exits (Stop() never + // force-frees an in-flight pin — that join is the in-flight-wait guarantee). + ASSERT_TRUE(WaitForSsd("big", std::chrono::seconds(2))); + pipeline.Stop(); + + EXPECT_EQ(pipeline.CopiedOk(), 1u); + // Pin released -> the key is now evictable (no copy holding its pages). 
+ auto ev = dram_->Evict({"big"}); + ASSERT_EQ(ev.size(), 1u); + EXPECT_GT(ev[0].bytes_freed, 0u); +} + +TEST_F(SsdCopyPipelineTest, QuiesceThenClearLeavesNoStaleSsdState) { + SsdCopyPipeline pipeline(dram_.get(), ssd_.get()); + pipeline.Start(); + + PutLocal("k", "payload"); + ASSERT_TRUE(pipeline.Enqueue(SsdCopyTask{"k", TierType::DRAM, 7})); + ASSERT_TRUE(WaitForSsd("k", std::chrono::seconds(2))); + + // Clear path: quiesce (drain in-flight) then clear both tiers. + pipeline.Quiesce(); + dram_->ClearLocal(); + ssd_->ClearLocal(); + pipeline.Resume(); + + EXPECT_FALSE(ssd_->Exists("k")); + EXPECT_TRUE(ssd_->SnapshotOwnedKeys().empty()); + EXPECT_TRUE(ssd_->DrainPendingEvents().empty()); + + pipeline.Stop(); +} + +} // namespace +} // namespace mori::umbp diff --git a/tests/cpp/umbp/distributed/test_ssd_read_lease_gating.cpp b/tests/cpp/umbp/distributed/test_ssd_read_lease_gating.cpp new file mode 100644 index 000000000..1b105d0d1 --- /dev/null +++ b/tests/cpp/umbp/distributed/test_ssd_read_lease_gating.cpp @@ -0,0 +1,97 @@ +// Copyright © Advanced Micro Devices, Inc. All rights reserved. +// +// MIT License +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in all +// copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE
+// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+// SOFTWARE.
+//
+// Pure-logic unit tests for the reader-side remote SSD read lease gating
+// (umbp/distributed/ssd_read_lease.h). These cover the decision policy without
+// a cluster / RDMA: the full PrepareSsdRead -> RDMA path is exercised at the
+// RPC level in test_peer_ssd_read_rpc.cpp. Retryable outcomes are NO_SLOT and
+// a reader-local lease expiry; rpc failures are not-served (RPC-test covered).
+#include
+
+#include
+
+#include "umbp/distributed/ssd_read_lease.h"
+
+namespace mori::umbp::ssd_read_lease {
+namespace {
+
+using std::chrono::milliseconds;
+using std::chrono::steady_clock;
+
+// ---- LeaseExpired ----
+// Every test passes (send time, TTL in ms, "now") explicitly, so no test
+// depends on wall-clock progress.
+
+// Halfway through a 1000 ms lease: still valid.
+TEST(SsdReadLeaseGating, NotExpiredBeforeDeadline) {
+  const auto t_send = steady_clock::now();
+  EXPECT_FALSE(LeaseExpired(t_send, /*lease_ttl_ms=*/1000, t_send + milliseconds(500)));
+}
+
+// One millisecond past a 1000 ms lease: expired.
+TEST(SsdReadLeaseGating, ExpiredAfterDeadline) {
+  const auto t_send = steady_clock::now();
+  EXPECT_TRUE(LeaseExpired(t_send, /*lease_ttl_ms=*/1000, t_send + milliseconds(1001)));
+}
+
+TEST(SsdReadLeaseGating, ExactlyAtDeadlineIsNotExpired) {
+  // Boundary: now == t_send + ttl uses '>' so it is still valid.
+  const auto t_send = steady_clock::now();
+  EXPECT_FALSE(LeaseExpired(t_send, /*lease_ttl_ms=*/1000, t_send + milliseconds(1000)));
+}
+
+// Zero TTL follows the same strict '>' boundary: the lease is valid only at
+// the exact send instant and expired at any later time. "Born expired" in the
+// test name is therefore shorthand for "expires immediately after t_send".
+TEST(SsdReadLeaseGating, ZeroTtlIsBornExpired) {
+  const auto t_send = steady_clock::now();
+  EXPECT_FALSE(LeaseExpired(t_send, /*lease_ttl_ms=*/0, t_send));  // exactly t_send: valid
+  EXPECT_TRUE(LeaseExpired(t_send, /*lease_ttl_ms=*/0, t_send + milliseconds(1)));
+}
+
+// ---- DecideSsdReadOutcome ----
+// Situation A (not expired): a good RDMA serves + releases; a failed RDMA is a
+// hard error but still releases (the lease is still ours).
+// Situation B (expired): always a transient retry, and NEVER release (the slot +// is left for the peer's TTL reclaim), regardless of whether the RDMA "worked". + +TEST(SsdReadLeaseGating, ValidAndRdmaOk_ServesAndReleases) { + const auto d = DecideSsdReadOutcome(/*expired=*/false, /*rdma_ok=*/true); + EXPECT_EQ(d.outcome, GateOutcome::kSuccess); + EXPECT_TRUE(d.release); +} + +TEST(SsdReadLeaseGating, ValidAndRdmaFailed_ErrorButReleases) { + const auto d = DecideSsdReadOutcome(/*expired=*/false, /*rdma_ok=*/false); + EXPECT_EQ(d.outcome, GateOutcome::kError); + EXPECT_TRUE(d.release); +} + +TEST(SsdReadLeaseGating, ExpiredWithRdmaOk_RetryNoRelease) { + // The dangerous case: RDMA "succeeded" but the lease elapsed, so the bytes + // are untrusted (the peer may have recycled the slot). Must NOT be success. + const auto d = DecideSsdReadOutcome(/*expired=*/true, /*rdma_ok=*/true); + EXPECT_EQ(d.outcome, GateOutcome::kRetry); + EXPECT_FALSE(d.release); +} + +TEST(SsdReadLeaseGating, ExpiredWithRdmaFailed_RetryNoRelease) { + const auto d = DecideSsdReadOutcome(/*expired=*/true, /*rdma_ok=*/false); + EXPECT_EQ(d.outcome, GateOutcome::kRetry); + EXPECT_FALSE(d.release); +} + +} // namespace +} // namespace mori::umbp::ssd_read_lease diff --git a/tests/cpp/umbp/distributed/test_ssd_reliability.cpp b/tests/cpp/umbp/distributed/test_ssd_reliability.cpp new file mode 100644 index 000000000..e7f1a37ac --- /dev/null +++ b/tests/cpp/umbp/distributed/test_ssd_reliability.cpp @@ -0,0 +1,345 @@ +// Copyright © Advanced Micro Devices, Inc. All rights reserved. 
+// +// MIT License +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in all +// copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +// SOFTWARE. +// +// Cross-component reliability tests: combinations no single-component test +// covers. All deterministic, no real disk / RDMA / master RPC: +// * the unified owned-location event source merges DRAM + SSD into one +// snapshot/delta (so a heartbeat full-sync ships SSD owned keys too); +// * a local SSD eviction's REMOVE SSD event converges the master +// GlobalBlockIndex while leaving the DRAM bucket intact; +// * tier-priority RouteGet over the real index picks DRAM, then SSD once the +// DRAM replica is removed; +// * crash-restart leftover is discarded at startup; +// * the SSD observability counters increment at the right events. +// +// (copy-pin vs DRAM evict is covered by test_ssd_copy_pipeline's +// EvictBlockedWhilePinnedThenAllowedAfterRelease; seq-gap -> full-sync by +// test_global_block_index_events' ClientRegistryHeartbeat.SeqGap*.) 
+#include + +#include +#include +#include +#include +#include +#include +#include + +#include "umbp/distributed/master/global_block_index.h" +#include "umbp/distributed/peer/owned_location_source.h" +#include "umbp/distributed/peer/peer_ssd_manager.h" +#include "umbp/distributed/routing/route_get_strategy.h" +#include "umbp/distributed/types.h" +#include "umbp/local/tiers/tier_backend.h" + +namespace mori::umbp { +namespace { + +// Minimal in-memory TierBackend (mirrors the one in test_peer_ssd_eviction) so +// PeerSsdManager runs without real disk IO. Exposes a forced evict failure for +// the backend-failure counter and lets the test pre-seed bytes (crash leftover). +class FakeBackend : public TierBackend { + public: + explicit FakeBackend(size_t capacity) + : TierBackend(StorageTier::LOCAL_SSD), capacity_(capacity) {} + + bool Write(const std::string& key, const void* data, size_t size) override { + std::lock_guard lk(mu_); + auto it = store_.find(key); + size_t prev = (it == store_.end()) ? 
0 : it->second.size(); + if (used_ - prev + size > capacity_) return false; + store_[key].assign(static_cast(data), static_cast(data) + size); + used_ = used_ - prev + size; + return true; + } + bool ReadIntoPtr(const std::string& key, uintptr_t dst, size_t size) override { + std::lock_guard lk(mu_); + auto it = store_.find(key); + if (it == store_.end() || it->second.size() != size) return false; + std::memcpy(reinterpret_cast(dst), it->second.data(), size); + return true; + } + bool Exists(const std::string& key) const override { + std::lock_guard lk(mu_); + return store_.count(key) != 0; + } + bool Evict(const std::string& key) override { + std::lock_guard lk(mu_); + if (fail_evict_) return false; + auto it = store_.find(key); + if (it == store_.end()) return false; + used_ -= it->second.size(); + store_.erase(it); + return true; + } + std::pair Capacity() const override { + std::lock_guard lk(mu_); + return {used_, capacity_}; + } + void Clear() override { + std::lock_guard lk(mu_); + ++clear_calls_; + store_.clear(); + used_ = 0; + } + void SetFailEvict(bool f) { + std::lock_guard lk(mu_); + fail_evict_ = f; + } + int clear_calls() const { + std::lock_guard lk(mu_); + return clear_calls_; + } + + private: + mutable std::mutex mu_; + std::unordered_map> store_; + size_t used_ = 0; + size_t capacity_; + bool fail_evict_ = false; + int clear_calls_ = 0; +}; + +std::vector> OneSeg(const std::string& s) { + return {{s.data(), s.size()}}; +} + +bool HasLoc(const std::vector& locs, const std::string& node, TierType tier) { + for (const auto& l : locs) { + if (l.node_id == node && l.tier == tier) return true; + } + return false; +} + +int CountTier(const std::vector& events, KvEvent::Kind kind, TierType tier) { + int n = 0; + for (const auto& e : events) { + if (e.kind == kind && e.tier == tier) ++n; + } + return n; +} + +// A canned owned-location source standing in for PeerDramAllocator so the +// aggregation can be tested without standing up a DRAM allocator. 
+class FakeOwnedSource : public OwnedLocationSource { + public: + std::vector delta; + std::vector snapshot; + std::vector DrainPendingEvents() override { + std::vector out; + out.swap(delta); + return out; + } + std::vector SnapshotOwnedKeys() const override { return snapshot; } +}; + +// --------------------------------------------------------------------------- +// Unified owned-location source: DRAM + SSD merge into one bundle. +// --------------------------------------------------------------------------- + +// A heartbeat full-sync snapshots ALL sources; SSD owned keys must be present +// alongside DRAM in the merged snapshot (otherwise master would drop the SSD +// tier on a seq-gap recovery). +TEST(SsdReliability, FullSyncSnapshotMergesDramAndSsdOwnedKeys) { + FakeOwnedSource dram; + dram.snapshot = {KvEvent{KvEvent::Kind::ADD, "d-key", TierType::DRAM, 10}}; + + auto be = std::make_unique(1'000'000); + PeerSsdManager ssd(std::move(be), 0.9, 0.7); + ASSERT_TRUE(ssd.Write("s-key", OneSeg("ssddata"), 7)); + ssd.DrainPendingEvents(); // the ADD SSD delta; snapshot is independent + + std::vector sources = {&dram, &ssd}; + auto snap = SnapshotAllSources(sources); + + EXPECT_EQ(CountTier(snap, KvEvent::Kind::ADD, TierType::DRAM), 1); + EXPECT_EQ(CountTier(snap, KvEvent::Kind::ADD, TierType::SSD), 1); +} + +// A delta heartbeat drains ALL sources and concatenates into one list. 
+TEST(SsdReliability, DeltaDrainMergesDramAndSsdEvents) { + FakeOwnedSource dram; + dram.delta = {KvEvent{KvEvent::Kind::REMOVE, "d-key", TierType::DRAM, 0}}; + + auto be = std::make_unique(1'000'000); + PeerSsdManager ssd(std::move(be), 0.9, 0.7); + ASSERT_TRUE(ssd.Write("s-key", OneSeg("ssddata"), 7)); // queues ADD SSD delta + + std::vector sources = {&dram, &ssd}; + auto merged = DrainAllSources(sources); + + EXPECT_EQ(CountTier(merged, KvEvent::Kind::REMOVE, TierType::DRAM), 1); + EXPECT_EQ(CountTier(merged, KvEvent::Kind::ADD, TierType::SSD), 1); + // Draining again yields nothing (outbox cleared on both sources). + EXPECT_TRUE(DrainAllSources(sources).empty()); +} + +// --------------------------------------------------------------------------- +// SSD local eviction -> REMOVE SSD -> master GlobalBlockIndex converges. +// --------------------------------------------------------------------------- + +// A key mirrored on DRAM + SSD of one owner: a local SSD eviction emits +// REMOVE SSD, and applying that to the master index drops only the SSD bucket +// (the DRAM replica, owned independently, stays routable). +TEST(SsdReliability, LocalSsdEvictionRemoveConvergesMasterIndex) { + GlobalBlockIndex idx; + + auto be = std::make_unique(1'000'000); + PeerSsdManager ssd(std::move(be), 0.9, 0.7); + + // DRAM replica added independently (a DRAM owner would emit this). + idx.ApplyEvents("owner", {KvEvent{KvEvent::Kind::ADD, "k", TierType::DRAM, 100}}); + // SSD copy lands -> ADD SSD drained into the index. + ASSERT_TRUE(ssd.Write("k", OneSeg(std::string(100, 'x')), 100)); + idx.ApplyEvents("owner", ssd.DrainPendingEvents()); + + auto both = idx.Lookup("k"); + ASSERT_TRUE(HasLoc(both, "owner", TierType::DRAM)); + ASSERT_TRUE(HasLoc(both, "owner", TierType::SSD)); + + // Local SSD eviction -> REMOVE SSD -> index drops only the SSD bucket. 
+ ASSERT_TRUE(ssd.Evict("k")); + auto ssd_events = ssd.DrainPendingEvents(); + EXPECT_EQ(CountTier(ssd_events, KvEvent::Kind::REMOVE, TierType::SSD), 1); + idx.ApplyEvents("owner", ssd_events); + + auto after = idx.Lookup("k"); + EXPECT_TRUE(HasLoc(after, "owner", TierType::DRAM)); // DRAM replica still routable + EXPECT_FALSE(HasLoc(after, "owner", TierType::SSD)); // SSD bucket converged away +} + +// --------------------------------------------------------------------------- +// Tier-priority RouteGet over the real index: DRAM first, SSD after evict. +// --------------------------------------------------------------------------- + +TEST(SsdReliability, TierPriorityRoutesDramThenSsdAfterDramRemoved) { + GlobalBlockIndex idx; + idx.ApplyEvents("owner", {KvEvent{KvEvent::Kind::ADD, "k", TierType::DRAM, 100}, + KvEvent{KvEvent::Kind::ADD, "k", TierType::SSD, 100}}); + + TierPriorityRouteGetStrategy strategy; + + auto locs = idx.BatchLookupForRouteGet({"k"}, {}, std::chrono::seconds{10}); + ASSERT_EQ(locs.size(), 1u); + auto dram_pick = strategy.Select(locs[0], "reader"); + EXPECT_EQ(dram_pick.tier, TierType::DRAM) << "prefers the fast DRAM replica"; + + // DRAM evicted -> only the SSD bucket remains -> RouteGet must serve from SSD. + idx.ApplyEvents("owner", {KvEvent{KvEvent::Kind::REMOVE, "k", TierType::DRAM, 0}}); + auto locs2 = idx.BatchLookupForRouteGet({"k"}, {}, std::chrono::seconds{10}); + ASSERT_EQ(locs2.size(), 1u); + auto ssd_pick = strategy.Select(locs2[0], "reader"); + EXPECT_EQ(ssd_pick.tier, TierType::SSD) << "falls back to the surviving SSD replica"; + EXPECT_EQ(ssd_pick.node_id, "owner"); +} + +// --------------------------------------------------------------------------- +// Crash-restart leftover handling (discard). +// --------------------------------------------------------------------------- + +// After a crash owned_ is empty but the backend still holds bytes from the +// previous run. 
DiscardLeftoverOnStartup wipes them so used capacity starts +// at 0 (no divergence between the empty owned_ map and the physical device). +TEST(SsdReliability, StartupDiscardWipesLeftoverBytes) { + auto be = std::make_unique(1'000'000); + FakeBackend* raw = be.get(); + // Simulate a previous process's bytes left on the device. + ASSERT_TRUE(raw->Write("orphan-1", "leftover-a", 10)); + ASSERT_TRUE(raw->Write("orphan-2", "leftover-b", 10)); + ASSERT_GT(raw->Capacity().first, 0u); + + // Fresh manager: owned_ is empty, but the backend reports used > 0. + PeerSsdManager ssd(std::move(be), 0.9, 0.7); + EXPECT_TRUE(ssd.SnapshotOwnedKeys().empty()); + ASSERT_GT(ssd.Capacity().first, 0u); + + ssd.DiscardLeftoverOnStartup(); + + EXPECT_EQ(raw->clear_calls(), 1); + EXPECT_EQ(ssd.Capacity().first, 0u); // leftover gone -> consistent with empty owned_ +} + +TEST(SsdReliability, StartupDiscardOnCleanTierIsNoop) { + auto be = std::make_unique(1'000'000); + FakeBackend* raw = be.get(); + PeerSsdManager ssd(std::move(be), 0.9, 0.7); + + ssd.DiscardLeftoverOnStartup(); // used == 0 -> skip the wipe entirely + EXPECT_EQ(raw->clear_calls(), 0); +} + +// --------------------------------------------------------------------------- +// Observability counters increment at the right events. 
+// --------------------------------------------------------------------------- + +TEST(SsdReliability, ReadCountersTrackOutcomes) { + auto be = std::make_unique(1'000'000); + PeerSsdManager ssd(std::move(be), 0.9, 0.7); + ASSERT_TRUE(ssd.Write("k", OneSeg("0123456789"), 10)); + + std::vector buf(10); + EXPECT_EQ(ssd.PrepareRead("k", buf.data(), buf.size()).status, SsdReadStatus::kOk); + EXPECT_EQ(ssd.PrepareRead("absent", buf.data(), buf.size()).status, SsdReadStatus::kNotFound); + EXPECT_EQ(ssd.PrepareRead("k", buf.data(), /*cap=*/1).status, SsdReadStatus::kSizeTooLarge); + + EXPECT_EQ(ssd.ReadOk(), 1u); + EXPECT_EQ(ssd.ReadNotFound(), 1u); + EXPECT_EQ(ssd.ReadSizeTooLarge(), 1u); + EXPECT_EQ(ssd.ReadError(), 0u); +} + +TEST(SsdReliability, EvictionCountersTrackVictimsBytesAndBackendFailures) { + auto be = std::make_unique(1'000'000); + FakeBackend* raw = be.get(); + PeerSsdManager ssd(std::move(be), 0.9, 0.7); + ASSERT_TRUE(ssd.Write("a", OneSeg(std::string(40, 'a')), 40)); + ASSERT_TRUE(ssd.Write("b", OneSeg(std::string(60, 'b')), 60)); + + ASSERT_TRUE(ssd.Evict("a")); + EXPECT_EQ(ssd.EvictionVictims(), 1u); + EXPECT_EQ(ssd.EvictionBytesFreed(), 40u); + EXPECT_EQ(ssd.EvictionBackendFailures(), 0u); + + // Backend refuses the next evict -> the failure is counted, the key kept. + raw->SetFailEvict(true); + EXPECT_FALSE(ssd.Evict("b")); + EXPECT_EQ(ssd.EvictionBackendFailures(), 1u); + EXPECT_EQ(ssd.EvictionVictims(), 1u); // unchanged + EXPECT_TRUE(ssd.Exists("b")); +} + +TEST(SsdReliability, WatermarkEvictionCountsARound) { + // capacity 1000, high 0.9 (=>900), low 0.5 (=>500); 100-byte values. After + // the 9th write used hits 900 -> one eviction round runs. 
+ auto be = std::make_unique(1000); + PeerSsdManager ssd(std::move(be), 0.9, 0.5); + std::string val(100, 'x'); + for (int i = 1; i <= 9; ++i) { + ASSERT_TRUE(ssd.Write("k" + std::to_string(i), OneSeg(val), val.size())); + } + EXPECT_GE(ssd.EvictionRounds(), 1u); + EXPECT_GE(ssd.EvictionVictims(), 1u); + EXPECT_LE(ssd.Capacity().first, 500u); +} + +} // namespace +} // namespace mori::umbp diff --git a/tests/cpp/umbp/distributed/test_tier_priority_route_get.cpp b/tests/cpp/umbp/distributed/test_tier_priority_route_get.cpp new file mode 100644 index 000000000..b6b2abe2d --- /dev/null +++ b/tests/cpp/umbp/distributed/test_tier_priority_route_get.cpp @@ -0,0 +1,112 @@ +// Copyright © Advanced Micro Devices, Inc. All rights reserved. +// +// MIT License +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in all +// copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +// SOFTWARE. 
+#include + +#include +#include +#include + +#include "umbp/distributed/routing/route_get_strategy.h" + +namespace mori::umbp { +namespace { + +Location MakeLoc(const std::string& node_id, TierType tier) { + Location loc; + loc.node_id = node_id; + loc.size = 4096; + loc.tier = tier; + return loc; +} + +// With a DRAM (or HBM) replica present alongside an SSD one, the strategy must +// never route to the slow SSD tier. +TEST(TierPriorityRouteGetStrategyTest, PrefersDramOverSsd) { + TierPriorityRouteGetStrategy strategy; + std::vector locations = { + MakeLoc("ssd-node", TierType::SSD), + MakeLoc("dram-node", TierType::DRAM), + }; + for (int i = 0; i < 100; ++i) { + auto selected = strategy.Select(locations, "requester"); + EXPECT_EQ(selected.tier, TierType::DRAM); + EXPECT_EQ(selected.node_id, "dram-node"); + } +} + +// HBM beats both DRAM and SSD. +TEST(TierPriorityRouteGetStrategyTest, PrefersHbmOverDramAndSsd) { + TierPriorityRouteGetStrategy strategy; + std::vector locations = { + MakeLoc("ssd-node", TierType::SSD), + MakeLoc("dram-node", TierType::DRAM), + MakeLoc("hbm-node", TierType::HBM), + }; + for (int i = 0; i < 100; ++i) { + auto selected = strategy.Select(locations, "requester"); + EXPECT_EQ(selected.tier, TierType::HBM); + } +} + +// When SSD is the only tier present it is selected (read-from-SSD is valid). +TEST(TierPriorityRouteGetStrategyTest, FallsBackToSsdWhenOnlyTier) { + TierPriorityRouteGetStrategy strategy; + std::vector locations = { + MakeLoc("ssd-a", TierType::SSD), + MakeLoc("ssd-b", TierType::SSD), + }; + for (int i = 0; i < 50; ++i) { + auto selected = strategy.Select(locations, "requester"); + EXPECT_EQ(selected.tier, TierType::SSD); + } +} + +// Within the winning tier, selection spreads across all replicas on that tier +// and never leaks to a lower tier. 
+TEST(TierPriorityRouteGetStrategyTest, RandomWithinBestTierOnly) { + TierPriorityRouteGetStrategy strategy; + std::vector locations = { + MakeLoc("dram-a", TierType::DRAM), + MakeLoc("dram-b", TierType::DRAM), + MakeLoc("dram-c", TierType::DRAM), + MakeLoc("ssd-x", TierType::SSD), + }; + std::set seen; + for (int i = 0; i < 2000; ++i) { + auto selected = strategy.Select(locations, "requester"); + ASSERT_EQ(selected.tier, TierType::DRAM) << "must never pick the SSD replica"; + seen.insert(selected.node_id); + } + EXPECT_EQ(seen.size(), 3u) << "all three DRAM replicas should be reachable"; + EXPECT_EQ(seen.count("ssd-x"), 0u); +} + +TEST(TierPriorityRouteGetStrategyTest, EmptyReturnsDefault) { + TierPriorityRouteGetStrategy strategy; + std::vector locations; + auto selected = strategy.Select(locations, "requester"); + EXPECT_EQ(selected.tier, TierType::UNKNOWN); + EXPECT_TRUE(selected.node_id.empty()); +} + +} // namespace +} // namespace mori::umbp From 4e87eb5e8b8be5aea340907c58856c125cfe59e0 Mon Sep 17 00:00:00 2001 From: TianDi101 Date: Mon, 15 Jun 2026 15:17:01 +0000 Subject: [PATCH 7/8] umbp: delete old master state classes (Phase 4) Now that all consumers route through IMasterMetadataStore (Phase 3), remove the four superseded master-side state holders and their class-specific test suites: - GlobalBlockIndex, ClientRegistry, ExternalKvBlockIndex, ExternalKvHitIndex (header + cpp each) - their Phase 0a `using NodeTierKey` / `using NodeMatch` aliases (lived inside the deleted headers) - GlobalBlockIndex::GetMetrics (zero callers outside the class) Behavioral coverage moved to the store suite (test_in_memory_master_metadata_store), so the five class-specific tests are dropped. 
Two suites that still constructed the old classes as fixtures are migrated to InMemoryMasterMetadataStore instead: - test_ssd_reliability: ApplyEvents/Lookup/BatchLookupForRouteGet -> RegisterClient + ApplyHeartbeat (ascending seq) + LookupBlock / BatchLookupBlockForRouteGet - test_umbp_tags Suite 1: ClientRegistry -> store RegisterClient/ApplyHeartbeat/GetClientTags/UnregisterClient BlockMetrics and EvictionCandidate stay in types.h (still used by the store's BlockEntry, eviction_manager, and test_types). Co-Authored-By: Claude Opus 4.8 --- src/umbp/CMakeLists.txt | 4 - .../distributed/master/client_registry.cpp | 278 ---------- .../master/external_kv_block_index.cpp | 134 ----- .../master/external_kv_hit_index.cpp | 116 ---- .../distributed/master/global_block_index.cpp | 276 ---------- .../umbp/distributed/master/client_registry.h | 125 ----- .../master/external_kv_block_index.h | 67 --- .../master/external_kv_hit_index.h | 81 --- .../distributed/master/global_block_index.h | 139 ----- tests/cpp/umbp/distributed/CMakeLists.txt | 38 -- .../umbp/distributed/test_client_registry.cpp | 289 ---------- .../test_client_registry_external_kv.cpp | 55 -- .../test_external_kv_block_index.cpp | 103 ---- .../test_external_kv_hit_index.cpp | 116 ---- .../test_global_block_index_events.cpp | 505 ------------------ .../umbp/distributed/test_ssd_reliability.cpp | 94 +++- tests/cpp/umbp/distributed/test_umbp_tags.cpp | 95 ++-- 17 files changed, 133 insertions(+), 2382 deletions(-) delete mode 100644 src/umbp/distributed/master/client_registry.cpp delete mode 100644 src/umbp/distributed/master/external_kv_block_index.cpp delete mode 100644 src/umbp/distributed/master/external_kv_hit_index.cpp delete mode 100644 src/umbp/distributed/master/global_block_index.cpp delete mode 100644 src/umbp/include/umbp/distributed/master/client_registry.h delete mode 100644 src/umbp/include/umbp/distributed/master/external_kv_block_index.h delete mode 100644 
src/umbp/include/umbp/distributed/master/external_kv_hit_index.h delete mode 100644 src/umbp/include/umbp/distributed/master/global_block_index.h delete mode 100644 tests/cpp/umbp/distributed/test_client_registry.cpp delete mode 100644 tests/cpp/umbp/distributed/test_client_registry_external_kv.cpp delete mode 100644 tests/cpp/umbp/distributed/test_external_kv_block_index.cpp delete mode 100644 tests/cpp/umbp/distributed/test_external_kv_hit_index.cpp delete mode 100644 tests/cpp/umbp/distributed/test_global_block_index_events.cpp diff --git a/src/umbp/CMakeLists.txt b/src/umbp/CMakeLists.txt index e680050c9..044967693 100644 --- a/src/umbp/CMakeLists.txt +++ b/src/umbp/CMakeLists.txt @@ -308,10 +308,6 @@ add_library( ${UMBP_GRPC_SRCS} ${UMBP_PEER_PROTO_SRCS} ${UMBP_PEER_GRPC_SRCS} - distributed/master/global_block_index.cpp - distributed/master/external_kv_block_index.cpp - distributed/master/client_registry.cpp - distributed/master/external_kv_hit_index.cpp distributed/master/in_memory_master_metadata_store.cpp distributed/master/master_server.cpp distributed/master/master_client.cpp diff --git a/src/umbp/distributed/master/client_registry.cpp b/src/umbp/distributed/master/client_registry.cpp deleted file mode 100644 index b9e1b96fb..000000000 --- a/src/umbp/distributed/master/client_registry.cpp +++ /dev/null @@ -1,278 +0,0 @@ -// Copyright © Advanced Micro Devices, Inc. All rights reserved. 
-// -// MIT License -// -// Permission is hereby granted, free of charge, to any person obtaining a copy -// of this software and associated documentation files (the "Software"), to deal -// in the Software without restriction, including without limitation the rights -// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -// copies of the Software, and to permit persons to whom the Software is -// furnished to do so, subject to the following conditions: -// -// The above copyright notice and this permission notice shall be included in all -// copies or substantial portions of the Software. -// -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -// SOFTWARE. 
-#include "umbp/distributed/master/client_registry.h" - -#include - -#include "mori/utils/mori_log.hpp" -#include "umbp/distributed/master/external_kv_block_index.h" -#include "umbp/distributed/master/global_block_index.h" - -namespace mori::umbp { - -ClientRegistry::ClientRegistry(const ClientRegistryConfig& config) : config_(config) {} - -ClientRegistry::ClientRegistry(const ClientRegistryConfig& config, GlobalBlockIndex& index, - ExternalKvBlockIndex* external_kv_index) - : config_(config), index_(&index), external_kv_index_(external_kv_index) {} - -ClientRegistry::~ClientRegistry() { StopReaper(); } - -void ClientRegistry::SetBlockIndex(GlobalBlockIndex* index) { - std::unique_lock lock(mutex_); - index_ = index; -} - -void ClientRegistry::SetExternalKvBlockIndex(ExternalKvBlockIndex* index) { - std::unique_lock lock(mutex_); - external_kv_index_ = index; -} - -bool ClientRegistry::RegisterClient(const std::string& node_id, const std::string& node_address, - const std::map& tier_capacities, - const std::string& peer_address, - const std::vector& engine_desc_bytes, - const std::vector& tags) { - std::unique_lock lock(mutex_); - const auto now = std::chrono::system_clock::now(); - - auto it = clients_.find(node_id); - if (it != clients_.end()) { - const bool is_expired = (now - it->second.last_heartbeat > ExpiryDuration()) || - (it->second.status == ClientStatus::EXPIRED); - if (it->second.status == ClientStatus::ALIVE && !is_expired) { - MORI_UMBP_WARN("[Registry] Rejecting re-registration for alive node: {}", node_id); - return false; - } - MORI_UMBP_INFO("[Registry] Re-registering expired node: {}", node_id); - } - - ClientRecord record; - record.node_id = node_id; - record.node_address = node_address; - record.status = ClientStatus::ALIVE; - record.last_heartbeat = now; - record.registered_at = now; - record.tier_capacities = tier_capacities; - record.peer_address = peer_address; - record.engine_desc_bytes = engine_desc_bytes; - record.last_applied_seq = 0; - 
record.tags = tags; - - clients_[node_id] = std::move(record); - - std::string tags_str; - for (const auto& t : tags) { - if (!tags_str.empty()) tags_str += ','; - tags_str += t; - } - MORI_UMBP_INFO("[Registry] Registered node: {} at {} (peer={}) tags=[{}]", node_id, node_address, - peer_address, tags_str); - return true; -} - -void ClientRegistry::UnregisterClient(const std::string& node_id) { - GlobalBlockIndex* idx = nullptr; - ExternalKvBlockIndex* external_idx = nullptr; - { - std::unique_lock lock(mutex_); - auto it = clients_.find(node_id); - if (it == clients_.end()) return; - idx = index_; - external_idx = external_kv_index_; - clients_.erase(it); - } - if (idx != nullptr) { - idx->RemoveByNode(node_id); - } - if (external_idx != nullptr) { - external_idx->UnregisterByNode(node_id); - } - MORI_UMBP_INFO("[Registry] Unregistered node: {}", node_id); -} - -ClientStatus ClientRegistry::Heartbeat(const std::string& node_id, - const std::map& tier_capacities, - const std::vector& bundles, bool is_full_sync, - uint64_t delta_seq_baseline, uint64_t* out_acked_seq, - bool* out_request_full_sync) { - if (out_acked_seq != nullptr) *out_acked_seq = 0; - if (out_request_full_sync != nullptr) *out_request_full_sync = false; - - GlobalBlockIndex* idx = nullptr; - std::vector bundles_to_apply; - std::vector full_sync_adds; - bool do_full_sync = false; - - { - std::unique_lock lock(mutex_); - auto it = clients_.find(node_id); - if (it == clients_.end()) { - MORI_UMBP_WARN("[Registry] Heartbeat from unknown node: {}", node_id); - return ClientStatus::UNKNOWN; - } - auto& record = it->second; - - record.last_heartbeat = std::chrono::system_clock::now(); - record.status = ClientStatus::ALIVE; - record.tier_capacities = tier_capacities; - - idx = index_; - - if (is_full_sync) { - for (const auto& bundle : bundles) { - for (auto ev : bundle.events) { - if (ev.kind != KvEvent::Kind::ADD) continue; - full_sync_adds.push_back(std::move(ev)); - } - } - record.last_applied_seq = 
delta_seq_baseline; - if (out_acked_seq != nullptr) *out_acked_seq = record.last_applied_seq; - do_full_sync = true; - } else { - for (const auto& bundle : bundles) { - if (bundle.seq <= record.last_applied_seq) continue; - if (bundle.seq != record.last_applied_seq + 1) { - MORI_UMBP_WARN( - "[Registry] Heartbeat bundle seq gap from {}: got {}, expected {} — requesting " - "full sync", - node_id, bundle.seq, record.last_applied_seq + 1); - if (out_acked_seq != nullptr) *out_acked_seq = record.last_applied_seq; - if (out_request_full_sync != nullptr) *out_request_full_sync = true; - return ClientStatus::ALIVE; - } - bundles_to_apply.push_back(bundle); - record.last_applied_seq = bundle.seq; - } - if (out_acked_seq != nullptr) *out_acked_seq = record.last_applied_seq; - } - } - - if (idx != nullptr) { - if (do_full_sync) { - idx->ReplaceNodeLocations(node_id, full_sync_adds); - } else { - for (const auto& bundle : bundles_to_apply) { - if (!bundle.events.empty()) idx->ApplyEvents(node_id, bundle.events); - } - } - } - return ClientStatus::ALIVE; -} - -bool ClientRegistry::IsClientAlive(const std::string& node_id) const { - std::shared_lock lock(mutex_); - auto it = clients_.find(node_id); - return it != clients_.end() && it->second.status == ClientStatus::ALIVE; -} - -size_t ClientRegistry::ClientCount() const { - std::shared_lock lock(mutex_); - return clients_.size(); -} - -std::vector ClientRegistry::GetAliveClients() const { - std::shared_lock lock(mutex_); - std::vector result; - for (const auto& [id, record] : clients_) { - if (record.status == ClientStatus::ALIVE) result.push_back(record); - } - return result; -} - -std::vector ClientRegistry::GetClientTags(const std::string& node_id) const { - std::shared_lock lock(mutex_); - auto it = clients_.find(node_id); - if (it == clients_.end()) return {}; - return it->second.tags; -} - -void ClientRegistry::StartReaper() { - reaper_running_ = true; - reaper_thread_ = std::thread(&ClientRegistry::ReaperLoop, this); - 
MORI_UMBP_INFO("[Reaper] Started (interval={}s, expiry={}s)", config_.reaper_interval.count(), - ExpiryDuration().count()); -} - -void ClientRegistry::StopReaper() { - if (reaper_running_) { - reaper_running_ = false; - reaper_cv_.notify_one(); - if (reaper_thread_.joinable()) reaper_thread_.join(); - MORI_UMBP_INFO("[Reaper] Stopped"); - } -} - -void ClientRegistry::ReaperLoop() { - while (reaper_running_) { - { - std::unique_lock cv_lock(reaper_cv_mutex_); - reaper_cv_.wait_for(cv_lock, config_.reaper_interval, - [this] { return !reaper_running_.load(); }); - } - if (!reaper_running_) break; - ReapExpiredClients(); - } -} - -void ClientRegistry::ReapExpiredClients() { - const auto now = std::chrono::system_clock::now(); - const auto expiry = ExpiryDuration(); - std::vector dead_nodes; - - { - std::unique_lock lock(mutex_); - auto it = clients_.begin(); - while (it != clients_.end()) { - if (now - it->second.last_heartbeat > expiry) { - MORI_UMBP_WARN("[Reaper] Reaping expired client: {}", it->first); - dead_nodes.push_back(it->first); - it = clients_.erase(it); - } else { - ++it; - } - } - } - - GlobalBlockIndex* idx = nullptr; - ExternalKvBlockIndex* external_idx = nullptr; - { - std::shared_lock lock(mutex_); - idx = index_; - external_idx = external_kv_index_; - } - - if (idx != nullptr) { - for (const auto& dead_id : dead_nodes) { - // Clear every index entry belonging to the dead node. Capacity - // numbers vanish with the ClientRecord above. - idx->RemoveByNode(dead_id); - } - } - if (external_idx != nullptr) { - for (const auto& dead_id : dead_nodes) { - external_idx->UnregisterByNode(dead_id); - } - } -} - -} // namespace mori::umbp diff --git a/src/umbp/distributed/master/external_kv_block_index.cpp b/src/umbp/distributed/master/external_kv_block_index.cpp deleted file mode 100644 index f6e93ac33..000000000 --- a/src/umbp/distributed/master/external_kv_block_index.cpp +++ /dev/null @@ -1,134 +0,0 @@ -// Copyright © Advanced Micro Devices, Inc. 
All rights reserved. -// -// MIT License -// -// Permission is hereby granted, free of charge, to any person obtaining a copy -// of this software and associated documentation files (the "Software"), to deal -// in the Software without restriction, including without limitation the rights -// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -// copies of the Software, and to permit persons to whom the Software is -// furnished to do so, subject to the following conditions: -// -// The above copyright notice and this permission notice shall be included in all -// copies or substantial portions of the Software. -// -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -// SOFTWARE. 
-#include "umbp/distributed/master/external_kv_block_index.h" - -#include -#include -#include -#include - -namespace mori::umbp { - -size_t ExternalKvBlockIndex::Register(const std::string& node_id, - const std::vector& hashes, TierType tier) { - std::unique_lock lock(mutex_); - size_t mutated = 0; - for (const auto& hash : hashes) { - auto [it, inserted] = entries_[hash][node_id].insert(tier); - (void)it; - if (inserted) ++mutated; - } - return mutated; -} - -size_t ExternalKvBlockIndex::Unregister(const std::string& node_id, - const std::vector& hashes, TierType tier) { - std::unique_lock lock(mutex_); - size_t mutated = 0; - for (const auto& hash : hashes) { - auto it = entries_.find(hash); - if (it == entries_.end()) continue; - - auto node_it = it->second.find(node_id); - if (node_it == it->second.end()) continue; - - mutated += node_it->second.erase(tier); - if (node_it->second.empty()) it->second.erase(node_it); - if (it->second.empty()) entries_.erase(it); - } - return mutated; -} - -size_t ExternalKvBlockIndex::UnregisterByNodeAtTier(const std::string& node_id, TierType tier) { - std::unique_lock lock(mutex_); - size_t mutated = 0; - auto it = entries_.begin(); - while (it != entries_.end()) { - auto node_it = it->second.find(node_id); - if (node_it != it->second.end()) { - mutated += node_it->second.erase(tier); - if (node_it->second.empty()) it->second.erase(node_it); - } - if (it->second.empty()) { - it = entries_.erase(it); - } else { - ++it; - } - } - return mutated; -} - -size_t ExternalKvBlockIndex::UnregisterByNode(const std::string& node_id) { - std::unique_lock lock(mutex_); - size_t mutated = 0; - auto it = entries_.begin(); - while (it != entries_.end()) { - auto node_it = it->second.find(node_id); - if (node_it != it->second.end()) { - mutated += node_it->second.size(); - it->second.erase(node_it); - } - if (it->second.empty()) { - it = entries_.erase(it); - } else { - ++it; - } - } - return mutated; -} - -std::vector 
ExternalKvBlockIndex::Match( - const std::vector& hashes) const { - std::shared_lock lock(mutex_); - - std::unordered_map>> acc; - for (const auto& hash : hashes) { - auto it = entries_.find(hash); - if (it == entries_.end()) continue; - for (const auto& [node_id, tiers] : it->second) { - auto& by_tier = acc[node_id]; - for (TierType tier : tiers) by_tier[tier].push_back(hash); - } - } - - std::vector result; - result.reserve(acc.size()); - for (auto& [node_id, by_tier] : acc) { - NodeMatch m; - m.node_id = std::move(node_id); - m.hashes_by_tier = std::move(by_tier); - result.push_back(std::move(m)); - } - return result; -} - -size_t ExternalKvBlockIndex::GetKvCount(const std::string& node_id) const { - std::shared_lock lock(mutex_); - size_t count = 0; - for (const auto& [hash, nodes] : entries_) { - (void)hash; - if (nodes.count(node_id)) ++count; - } - return count; -} - -} // namespace mori::umbp diff --git a/src/umbp/distributed/master/external_kv_hit_index.cpp b/src/umbp/distributed/master/external_kv_hit_index.cpp deleted file mode 100644 index 84d775c0e..000000000 --- a/src/umbp/distributed/master/external_kv_hit_index.cpp +++ /dev/null @@ -1,116 +0,0 @@ -// Copyright © Advanced Micro Devices, Inc. All rights reserved. -// -// MIT License -// -// Permission is hereby granted, free of charge, to any person obtaining a copy -// of this software and associated documentation files (the "Software"), to deal -// in the Software without restriction, including without limitation the rights -// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -// copies of the Software, and to permit persons to whom the Software is -// furnished to do so, subject to the following conditions: -// -// The above copyright notice and this permission notice shall be included in all -// copies or substantial portions of the Software. 
-// -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -// SOFTWARE. -#include "umbp/distributed/master/external_kv_hit_index.h" - -#include -#include -#include - -namespace mori::umbp { - -size_t ExternalKvHitIndex::ShardIdx(std::string_view hash) { - return std::hash{}(hash) % kShards; -} - -void ExternalKvHitIndex::UpdateLastSeen(Entry* entry, uint64_t now_ns) { - uint64_t old = entry->last_seen_ns.load(std::memory_order_relaxed); - while (old < now_ns && !entry->last_seen_ns.compare_exchange_weak( - old, now_ns, std::memory_order_relaxed, std::memory_order_relaxed)) { - } -} - -void ExternalKvHitIndex::IncrementHits(const std::vector& unique_hashes, - uint64_t now_ns) { - for (const auto& hash : unique_hashes) { - auto& shard = shards_[ShardIdx(hash)]; - { - std::shared_lock lock(shard.mu); - auto it = shard.entries.find(hash); - if (it != shard.entries.end()) { - Entry* entry = it->second.get(); - entry->total.fetch_add(1, std::memory_order_relaxed); - UpdateLastSeen(entry, now_ns); - continue; - } - } - - std::unique_lock lock(shard.mu); - auto [it, inserted] = shard.entries.try_emplace(hash); - if (inserted) { - auto entry = std::make_unique(); - entry->total.store(1, std::memory_order_relaxed); - entry->last_seen_ns.store(now_ns, std::memory_order_relaxed); - it->second = std::move(entry); - } else { - Entry* entry = it->second.get(); - entry->total.fetch_add(1, std::memory_order_relaxed); - UpdateLastSeen(entry, now_ns); - } - } -} - -size_t ExternalKvHitIndex::Lookup(const std::vector& hashes, - std::vector>* out) const { - if (out == 
nullptr) return 0; - std::unordered_set seen; - seen.reserve(hashes.size()); - size_t filled = 0; - for (const auto& hash : hashes) { - if (!seen.insert(hash).second) continue; - const auto& shard = shards_[ShardIdx(hash)]; - std::shared_lock lock(shard.mu); - auto it = shard.entries.find(hash); - if (it == shard.entries.end()) continue; - out->push_back({hash, it->second->total.load(std::memory_order_relaxed)}); - ++filled; - } - return filled; -} - -size_t ExternalKvHitIndex::GarbageCollect(uint64_t cutoff_ns) { - size_t dropped = 0; - for (auto& shard : shards_) { - std::unique_lock lock(shard.mu); - auto it = shard.entries.begin(); - while (it != shard.entries.end()) { - const uint64_t last_seen = it->second->last_seen_ns.load(std::memory_order_relaxed); - if (last_seen < cutoff_ns) { - it = shard.entries.erase(it); - ++dropped; - } else { - ++it; - } - } - } - return dropped; -} - -size_t ExternalKvHitIndex::Size() const { - size_t size = 0; - for (const auto& shard : shards_) { - std::shared_lock lock(shard.mu); - size += shard.entries.size(); - } - return size; -} - -} // namespace mori::umbp diff --git a/src/umbp/distributed/master/global_block_index.cpp b/src/umbp/distributed/master/global_block_index.cpp deleted file mode 100644 index 24114dcaa..000000000 --- a/src/umbp/distributed/master/global_block_index.cpp +++ /dev/null @@ -1,276 +0,0 @@ -// Copyright © Advanced Micro Devices, Inc. All rights reserved. 
-// -// MIT License -// -// Permission is hereby granted, free of charge, to any person obtaining a copy -// of this software and associated documentation files (the "Software"), to deal -// in the Software without restriction, including without limitation the rights -// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -// copies of the Software, and to permit persons to whom the Software is -// furnished to do so, subject to the following conditions: -// -// The above copyright notice and this permission notice shall be included in all -// copies or substantial portions of the Software. -// -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -// SOFTWARE. -#include "umbp/distributed/master/global_block_index.h" - -#include -#include -#include -#include -#include -#include -#include -#include - -#include "mori/utils/mori_log.hpp" - -namespace mori::umbp { - -namespace { - -// Locate (or insert) the location for (node_id, tier) within an entry's -// location list. Caller MUST hold the unique lock. Returns a pointer -// into entry.locations that's stable until the next mutation. 
-std::pair FindOrInsertLocation(BlockEntry& entry, const std::string& node_id, - TierType tier) { - for (auto& loc : entry.locations) { - if (loc.node_id == node_id && loc.tier == tier) return {&loc, false}; - } - entry.locations.push_back(Location{node_id, /*size=*/0, tier}); - return {&entry.locations.back(), true}; -} - -bool HasLocationForNode(const BlockEntry& entry, const std::string& node_id) { - return std::any_of(entry.locations.begin(), entry.locations.end(), - [&](const Location& loc) { return loc.node_id == node_id; }); -} - -size_t RemoveLocationsLocked( - std::unordered_map& entries, - std::unordered_map>& node_to_keys, - const std::string& node_id, std::optional tier) { - size_t removed = 0; - for (auto it = entries.begin(); it != entries.end();) { - auto& locs = it->second.locations; - const size_t before = locs.size(); - locs.erase(std::remove_if(locs.begin(), locs.end(), - [&](const Location& l) { - if (l.node_id != node_id) return false; - if (tier.has_value() && l.tier != *tier) return false; - return true; - }), - locs.end()); - const size_t removed_from_entry = before - locs.size(); - removed += removed_from_entry; - if (removed_from_entry != 0 && !HasLocationForNode(it->second, node_id)) { - auto rev_it = node_to_keys.find(node_id); - if (rev_it != node_to_keys.end()) { - rev_it->second.erase(it->first); - if (rev_it->second.empty()) node_to_keys.erase(rev_it); - } - } - if (locs.empty()) { - it = entries.erase(it); - } else { - ++it; - } - } - return removed; -} - -} // namespace - -size_t GlobalBlockIndex::ApplyEvents(const std::string& node_id, - const std::vector& events) { - if (events.empty()) return 0; - std::unique_lock lock(mutex_); - size_t mutated = 0; - const auto now = std::chrono::system_clock::now(); - - for (const auto& ev : events) { - if (ev.kind == KvEvent::Kind::CLEAR_AT_TIER) { - mutated += RemoveLocationsLocked(entries_, node_to_keys_, node_id, ev.tier); - } else if (ev.kind == KvEvent::Kind::ADD) { - auto& entry = 
entries_[ev.key]; - if (entry.locations.empty()) { - entry.metrics.created_at = now; - entry.metrics.last_accessed_at = now; - entry.metrics.access_count = 0; - entry.last_accessed_rep.store(now.time_since_epoch().count(), std::memory_order_release); - entry.atomic_access_count.store(0, std::memory_order_relaxed); - } - auto [loc, inserted] = FindOrInsertLocation(entry, node_id, ev.tier); - // Idempotent; must run on duplicate ADDs too. - node_to_keys_[node_id].insert(ev.key); - if (!inserted) { - MORI_UMBP_WARN( - "[GlobalBlockIndex] duplicate ADD for key='{}' node={} tier={} old_size={} " - "new_size={}; keeping existing location", - ev.key, node_id, TierTypeName(ev.tier), loc->size, ev.size); - } else { - loc->size = ev.size; - ++mutated; - } - } else { // REMOVE - auto it = entries_.find(ev.key); - if (it == entries_.end()) continue; - auto& locs = it->second.locations; - const size_t before = locs.size(); - locs.erase(std::remove_if( - locs.begin(), locs.end(), - [&](const Location& l) { return l.node_id == node_id && l.tier == ev.tier; }), - locs.end()); - if (locs.size() != before) { - ++mutated; - // find(), not operator[]: don't grow an empty bucket for strangers. - if (!HasLocationForNode(it->second, node_id)) { - auto rev_it = node_to_keys_.find(node_id); - if (rev_it != node_to_keys_.end()) { - rev_it->second.erase(ev.key); - if (rev_it->second.empty()) node_to_keys_.erase(rev_it); - } - } - if (locs.empty()) entries_.erase(it); - } - } - } - return mutated; -} - -void GlobalBlockIndex::ReplaceNodeLocations(const std::string& node_id, - const std::vector& adds) { - std::unique_lock lock(mutex_); - const auto now = std::chrono::system_clock::now(); - - // O(N_node + |adds|) via the reverse index. 
- auto rev_it = node_to_keys_.find(node_id); - if (rev_it != node_to_keys_.end()) { - auto old_keys = std::move(rev_it->second); - node_to_keys_.erase(rev_it); - for (const auto& key : old_keys) { - auto eit = entries_.find(key); - if (eit == entries_.end()) continue; - auto& locs = eit->second.locations; - locs.erase(std::remove_if(locs.begin(), locs.end(), - [&](const Location& l) { return l.node_id == node_id; }), - locs.end()); - if (locs.empty()) { - entries_.erase(eit); - } - } - } - - for (const auto& ev : adds) { - if (ev.kind != KvEvent::Kind::ADD) continue; - auto& entry = entries_[ev.key]; - if (entry.locations.empty()) { - entry.metrics.created_at = now; - entry.metrics.last_accessed_at = now; - entry.metrics.access_count = 0; - entry.last_accessed_rep.store(now.time_since_epoch().count(), std::memory_order_release); - entry.atomic_access_count.store(0, std::memory_order_relaxed); - } - auto [loc, inserted] = FindOrInsertLocation(entry, node_id, ev.tier); - (void)inserted; - loc->size = ev.size; - node_to_keys_[node_id].insert(ev.key); - } -} - -void GlobalBlockIndex::RemoveByNode(const std::string& node_id) { - std::unique_lock lock(mutex_); - RemoveLocationsLocked(entries_, node_to_keys_, node_id, std::nullopt); -} - -void GlobalBlockIndex::RecordAccess(const std::string& key) { - std::shared_lock lock(mutex_); - auto it = entries_.find(key); - if (it == entries_.end()) return; - it->second.RecordAccessAtomic(); -} - -void GlobalBlockIndex::GrantLease(const std::string& key, - std::chrono::system_clock::duration duration) { - std::shared_lock lock(mutex_); - auto it = entries_.find(key); - if (it != entries_.end()) it->second.GrantLease(duration); -} - -std::vector GlobalBlockIndex::Lookup(const std::string& key) const { - std::shared_lock lock(mutex_); - auto it = entries_.find(key); - if (it == entries_.end()) return {}; - return it->second.locations; -} - -std::vector GlobalBlockIndex::BatchLookupExists(const std::vector& keys) const { - 
std::vector results(keys.size(), false); - if (keys.empty()) return results; - std::shared_lock lock(mutex_); - for (size_t i = 0; i < keys.size(); ++i) { - auto it = entries_.find(keys[i]); - results[i] = (it != entries_.end()) && !it->second.locations.empty(); - } - return results; -} - -std::optional GlobalBlockIndex::GetMetrics(const std::string& key) const { - std::shared_lock lock(mutex_); - auto it = entries_.find(key); - if (it == entries_.end()) return std::nullopt; - BlockMetrics result = it->second.metrics; - result.last_accessed_at = it->second.GetLastAccessed(); - result.access_count = it->second.atomic_access_count.load(std::memory_order_acquire); - return result; -} - -std::vector> GlobalBlockIndex::BatchLookupForRouteGet( - const std::vector& keys, const std::unordered_set& exclude_nodes, - std::chrono::system_clock::duration lease_duration) { - std::vector> out(keys.size()); - if (keys.empty()) return out; - std::shared_lock lock(mutex_); - for (size_t i = 0; i < keys.size(); ++i) { - auto it = entries_.find(keys[i]); - if (it == entries_.end()) continue; - auto& locs = out[i]; - for (const auto& loc : it->second.locations) { - if (!exclude_nodes.empty() && exclude_nodes.count(loc.node_id)) continue; - locs.push_back(loc); - } - if (locs.empty()) continue; - it->second.RecordAccessAtomic(); - it->second.GrantLease(lease_duration); - } - return out; -} - -std::vector GlobalBlockIndex::FindEvictionCandidates( - const std::set& overloaded_node_tiers) const { - std::vector candidates; - std::shared_lock lock(mutex_); - for (const auto& [key, entry] : entries_) { - if (entry.IsLeased()) continue; - for (const auto& loc : entry.locations) { - if (overloaded_node_tiers.count({loc.node_id, loc.tier})) { - EvictionCandidate c; - c.key = key; - c.location = loc; - c.last_accessed_at = entry.GetLastAccessed(); - c.size = loc.size; - candidates.push_back(std::move(c)); - } - } - } - return candidates; -} - -} // namespace mori::umbp diff --git 
a/src/umbp/include/umbp/distributed/master/client_registry.h b/src/umbp/include/umbp/distributed/master/client_registry.h deleted file mode 100644 index daa81e226..000000000 --- a/src/umbp/include/umbp/distributed/master/client_registry.h +++ /dev/null @@ -1,125 +0,0 @@ -// Copyright © Advanced Micro Devices, Inc. All rights reserved. -// -// MIT License -// -// Permission is hereby granted, free of charge, to any person obtaining a copy -// of this software and associated documentation files (the "Software"), to deal -// in the Software without restriction, including without limitation the rights -// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -// copies of the Software, and to permit persons to whom the Software is -// furnished to do so, subject to the following conditions: -// -// The above copyright notice and this permission notice shall be included in all -// copies or substantial portions of the Software. -// -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -// SOFTWARE. -#pragma once - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include "umbp/distributed/config.h" -#include "umbp/distributed/types.h" - -namespace mori::umbp { - -class GlobalBlockIndex; -class ExternalKvBlockIndex; - -// Master-side membership ledger + heartbeat ingestion. In the -// master-as-advisor design this class no longer owns any allocator -// state; every per-tier capacity number it stores is the value the peer -// reported in its most recent heartbeat. 
Heartbeat is also the channel -// through which peer-shipped KvEvents reach GlobalBlockIndex. -class ClientRegistry { - public: - explicit ClientRegistry(const ClientRegistryConfig& config); - ClientRegistry(const ClientRegistryConfig& config, GlobalBlockIndex& index, - ExternalKvBlockIndex* external_kv_index = nullptr); - ~ClientRegistry(); - - ClientRegistry(const ClientRegistry&) = delete; - ClientRegistry& operator=(const ClientRegistry&) = delete; - - void SetBlockIndex(GlobalBlockIndex* index); - void SetExternalKvBlockIndex(ExternalKvBlockIndex* index); - - // --- Client lifecycle --- - - // Returns false when a live node with the same id already exists. - // Returns true for new registrations or re-registration of expired - // nodes. In the new design the only state master holds for a node is - // membership + last-reported tier capacities; the peer owns its own - // allocators. - bool RegisterClient(const std::string& node_id, const std::string& node_address, - const std::map& tier_capacities, - const std::string& peer_address = "", - const std::vector& engine_desc_bytes = {}, - const std::vector& tags = {}); - - // Drops the node from the registry and clears every index entry that - // belonged to it. - void UnregisterClient(const std::string& node_id); - - // Apply one heartbeat request. Returns the resulting status - // (UNKNOWN if the node isn't registered). On the success path: - // - tier_capacities replace the stored values unconditionally, - // - delta bundles are applied in seq order, with retransmissions skipped, - // - full-sync replaces this node's UMBP-owned locations. 
- ClientStatus Heartbeat(const std::string& node_id, - const std::map& tier_capacities, - const std::vector& bundles, bool is_full_sync, - uint64_t delta_seq_baseline, uint64_t* out_acked_seq, - bool* out_request_full_sync); - - // --- Queries --- - bool IsClientAlive(const std::string& node_id) const; - size_t ClientCount() const; - std::vector GetAliveClients() const; - // Returns the tags registered for node_id, or empty if not found. - std::vector GetClientTags(const std::string& node_id) const; - - // --- Reaper control --- - // The reaper only expires nodes whose last_heartbeat has aged past - // `heartbeat_ttl × max_missed_heartbeats`. No allocation reaper — - // pending state lives at the peer in this design. - void StartReaper(); - void StopReaper(); - - private: - ClientRegistryConfig config_; - GlobalBlockIndex* index_ = nullptr; - ExternalKvBlockIndex* external_kv_index_ = nullptr; - - mutable std::shared_mutex mutex_; - std::unordered_map clients_; - - std::thread reaper_thread_; - std::atomic reaper_running_{false}; - std::mutex reaper_cv_mutex_; - std::condition_variable reaper_cv_; - - void ReaperLoop(); - void ReapExpiredClients(); - - std::chrono::seconds ExpiryDuration() const { - return config_.heartbeat_ttl * config_.max_missed_heartbeats; - } -}; - -} // namespace mori::umbp diff --git a/src/umbp/include/umbp/distributed/master/external_kv_block_index.h b/src/umbp/include/umbp/distributed/master/external_kv_block_index.h deleted file mode 100644 index b573098af..000000000 --- a/src/umbp/include/umbp/distributed/master/external_kv_block_index.h +++ /dev/null @@ -1,67 +0,0 @@ -// Copyright © Advanced Micro Devices, Inc. All rights reserved. 
-// -// MIT License -// -// Permission is hereby granted, free of charge, to any person obtaining a copy -// of this software and associated documentation files (the "Software"), to deal -// in the Software without restriction, including without limitation the rights -// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -// copies of the Software, and to permit persons to whom the Software is -// furnished to do so, subject to the following conditions: -// -// The above copyright notice and this permission notice shall be included in all -// copies or substantial portions of the Software. -// -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -// SOFTWARE. -#pragma once - -#include -#include -#include -#include -#include -#include -#include -#include - -#include "umbp/distributed/types.h" - -namespace mori::umbp { - -/// Lightweight index for externally-managed KV blocks (e.g. sglang HiCache). -/// Each (node, hash) pair tracks the set of tiers the node has reported. -class ExternalKvBlockIndex { - public: - ExternalKvBlockIndex() = default; - ~ExternalKvBlockIndex() = default; - - ExternalKvBlockIndex(const ExternalKvBlockIndex&) = delete; - ExternalKvBlockIndex& operator=(const ExternalKvBlockIndex&) = delete; - - // Mutators return the count of actually changed (hash, node, tier) tuples. 
- size_t Register(const std::string& node_id, const std::vector& hashes, - TierType tier); - size_t Unregister(const std::string& node_id, const std::vector& hashes, - TierType tier); - size_t UnregisterByNodeAtTier(const std::string& node_id, TierType tier); - size_t UnregisterByNode(const std::string& node_id); - - // Hoisted to umbp/distributed/types.h; alias kept temporarily so existing - // callers (e.g. ExternalKvBlockIndex::NodeMatch) compile. Removed in Phase 5. - using NodeMatch = mori::umbp::NodeMatch; - - std::vector Match(const std::vector& hashes) const; - size_t GetKvCount(const std::string& node_id) const; - - private: - mutable std::shared_mutex mutex_; - std::unordered_map>> entries_; -}; - -} // namespace mori::umbp diff --git a/src/umbp/include/umbp/distributed/master/external_kv_hit_index.h b/src/umbp/include/umbp/distributed/master/external_kv_hit_index.h deleted file mode 100644 index 4f4bc7720..000000000 --- a/src/umbp/include/umbp/distributed/master/external_kv_hit_index.h +++ /dev/null @@ -1,81 +0,0 @@ -// Copyright © Advanced Micro Devices, Inc. All rights reserved. -// -// MIT License -// -// Permission is hereby granted, free of charge, to any person obtaining a copy -// of this software and associated documentation files (the "Software"), to deal -// in the Software without restriction, including without limitation the rights -// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -// copies of the Software, and to permit persons to whom the Software is -// furnished to do so, subject to the following conditions: -// -// The above copyright notice and this permission notice shall be included in all -// copies or substantial portions of the Software. -// -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE -// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -// SOFTWARE. -#pragma once - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -namespace mori::umbp { - -// Per-hash cumulative hit counter for external KV placement matches. -// Entries are created only for hashes that were actually matched by -// MatchExternalKv(count_as_hit=true); revoke does not remove them. -class ExternalKvHitIndex { - public: - static constexpr size_t kShards = 256; - - ExternalKvHitIndex() = default; - ~ExternalKvHitIndex() = default; - - ExternalKvHitIndex(const ExternalKvHitIndex&) = delete; - ExternalKvHitIndex& operator=(const ExternalKvHitIndex&) = delete; - - // Caller owns request-level de-duplication. Each input hash is incremented - // exactly once by this call. - void IncrementHits(const std::vector& unique_hashes, uint64_t now_ns); - - // Sparse lookup. Missing hashes are skipped, and duplicate query hashes - // produce at most one output entry. - size_t Lookup(const std::vector& hashes, - std::vector>* out) const; - - // Drop entries whose last activity is older than cutoff_ns. 
- size_t GarbageCollect(uint64_t cutoff_ns); - - size_t Size() const; - - private: - struct Entry { - std::atomic total{0}; - std::atomic last_seen_ns{0}; - }; - - struct Shard { - mutable std::shared_mutex mu; - std::unordered_map> entries; - }; - - static size_t ShardIdx(std::string_view hash); - static void UpdateLastSeen(Entry* entry, uint64_t now_ns); - - std::array shards_; -}; - -} // namespace mori::umbp diff --git a/src/umbp/include/umbp/distributed/master/global_block_index.h b/src/umbp/include/umbp/distributed/master/global_block_index.h deleted file mode 100644 index ba28a37aa..000000000 --- a/src/umbp/include/umbp/distributed/master/global_block_index.h +++ /dev/null @@ -1,139 +0,0 @@ -// Copyright © Advanced Micro Devices, Inc. All rights reserved. -// -// MIT License -// -// Permission is hereby granted, free of charge, to any person obtaining a copy -// of this software and associated documentation files (the "Software"), to deal -// in the Software without restriction, including without limitation the rights -// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -// copies of the Software, and to permit persons to whom the Software is -// furnished to do so, subject to the following conditions: -// -// The above copyright notice and this permission notice shall be included in all -// copies or substantial portions of the Software. -// -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -// SOFTWARE. 
-#pragma once - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include "umbp/distributed/types.h" - -namespace mori::umbp { - -struct BlockEntry { - std::vector locations; - BlockMetrics metrics; - - std::atomic lease_expiry_rep{0}; - std::atomic last_accessed_rep{0}; - std::atomic atomic_access_count{0}; - - void GrantLease(std::chrono::system_clock::duration duration) { - auto expiry = std::chrono::system_clock::now() + duration; - lease_expiry_rep.store(expiry.time_since_epoch().count(), std::memory_order_release); - } - - bool IsLeased() const { - auto now_rep = std::chrono::system_clock::now().time_since_epoch().count(); - return lease_expiry_rep.load(std::memory_order_acquire) > now_rep; - } - - void RecordAccessAtomic() { - last_accessed_rep.store(std::chrono::system_clock::now().time_since_epoch().count(), - std::memory_order_release); - atomic_access_count.fetch_add(1, std::memory_order_relaxed); - } - - std::chrono::system_clock::time_point GetLastAccessed() const { - auto rep = last_accessed_rep.load(std::memory_order_acquire); - return std::chrono::system_clock::time_point(std::chrono::system_clock::duration(rep)); - } -}; - -// EvictionCandidate hoisted to umbp/distributed/types.h since it is part of the -// IMasterMetadataStore contract; visible here via the types.h include above. - -// Master-side projection of every peer's owned-key set. In the -// master-as-advisor design this index is *only* mutated through the -// event-shipping heartbeat — there are no per-Put or per-Eviction -// master RPCs. Routing and eviction read from here. -class GlobalBlockIndex { - public: - GlobalBlockIndex() = default; - ~GlobalBlockIndex() = default; - - GlobalBlockIndex(const GlobalBlockIndex&) = delete; - GlobalBlockIndex& operator=(const GlobalBlockIndex&) = delete; - - // --- Mutators (event-driven only) --- - - // Apply one peer's heartbeat-shipped event batch. Returns the count - // of location mutations. 
ADD with a (node_id, tier) that already exists - // for the key is an idempotent no-op on the location's size. - // REMOVE for an unknown (key, node_id, tier) is a silent no-op. - // CLEAR_AT_TIER drops every key for (node_id, tier) and returns - // the number of locations removed. - size_t ApplyEvents(const std::string& node_id, const std::vector& events); - - // Replace this node's full set of locations with the ADDs carried in `adds`. - void ReplaceNodeLocations(const std::string& node_id, const std::vector& adds); - - void RemoveByNode(const std::string& node_id); - - // Bump last_accessed_at and access_count. Lock-free under the shared lock. - void RecordAccess(const std::string& key); - - // Grant a time-limited lease to protect a key from eviction. - void GrantLease(const std::string& key, std::chrono::system_clock::duration duration); - - // Batched Lookup + filter + (on non-empty result) RecordAccess + GrantLease, - // under a single shared_lock. - std::vector> BatchLookupForRouteGet( - const std::vector& keys, const std::unordered_set& exclude_nodes, - std::chrono::system_clock::duration lease_duration); - - // --- Queries --- - - std::vector Lookup(const std::string& key) const; - - // Batched existence check — single shared_lock acquisition for the - // whole batch. Read-only, no access-count or lease side-effects. - // Returns a vector parallel to `keys` where entry i is true iff the - // key has at least one registered Location. - std::vector BatchLookupExists(const std::vector& keys) const; - - std::optional GetMetrics(const std::string& key) const; - - // --- Eviction --- - - // Hoisted to umbp/distributed/types.h; alias kept temporarily so existing - // callers (e.g. GlobalBlockIndex::NodeTierKey) compile. Removed in Phase 5. 
- using NodeTierKey = mori::umbp::NodeTierKey; - - std::vector FindEvictionCandidates( - const std::set& overloaded_node_tiers) const; - - private: - mutable std::shared_mutex mutex_; - std::unordered_map entries_; - // Reverse index: lets ReplaceNodeLocations skip a full entries_ scan. - std::unordered_map> node_to_keys_; -}; - -} // namespace mori::umbp diff --git a/tests/cpp/umbp/distributed/CMakeLists.txt b/tests/cpp/umbp/distributed/CMakeLists.txt index be147569c..86f384dcd 100644 --- a/tests/cpp/umbp/distributed/CMakeLists.txt +++ b/tests/cpp/umbp/distributed/CMakeLists.txt @@ -206,36 +206,6 @@ target_link_libraries(test_in_memory_master_metadata_store target_compile_features(test_in_memory_master_metadata_store PRIVATE cxx_std_17) gtest_discover_tests(test_in_memory_master_metadata_store) -# test_external_kv_block_index -add_executable(test_external_kv_block_index test_external_kv_block_index.cpp) -target_link_libraries(test_external_kv_block_index PRIVATE umbp_common - GTest::gtest_main) -target_compile_features(test_external_kv_block_index PRIVATE cxx_std_17) -gtest_discover_tests(test_external_kv_block_index) - -# test_client_registry — membership ledger: register/re-register, capacity -# round-trip, heartbeat status, and the silent-node reaper. 
-add_executable(test_client_registry test_client_registry.cpp) -target_link_libraries(test_client_registry PRIVATE umbp_common - GTest::gtest_main) -target_compile_features(test_client_registry PRIVATE cxx_std_17) -gtest_discover_tests(test_client_registry) - -# test_client_registry_external_kv -add_executable(test_client_registry_external_kv - test_client_registry_external_kv.cpp) -target_link_libraries(test_client_registry_external_kv - PRIVATE umbp_common GTest::gtest_main) -target_compile_features(test_client_registry_external_kv PRIVATE cxx_std_17) -gtest_discover_tests(test_client_registry_external_kv) - -# test_external_kv_hit_index -add_executable(test_external_kv_hit_index test_external_kv_hit_index.cpp) -target_link_libraries(test_external_kv_hit_index PRIVATE umbp_common - GTest::gtest_main) -target_compile_features(test_external_kv_hit_index PRIVATE cxx_std_17) -gtest_discover_tests(test_external_kv_hit_index) - # test_peer_dram_allocator add_executable(test_peer_dram_allocator test_peer_dram_allocator.cpp) target_link_libraries(test_peer_dram_allocator PRIVATE umbp_common @@ -243,14 +213,6 @@ target_link_libraries(test_peer_dram_allocator PRIVATE umbp_common target_compile_features(test_peer_dram_allocator PRIVATE cxx_std_17) gtest_discover_tests(test_peer_dram_allocator) -# test_global_block_index_events -add_executable(test_global_block_index_events - test_global_block_index_events.cpp) -target_link_libraries(test_global_block_index_events PRIVATE umbp_common - GTest::gtest_main) -target_compile_features(test_global_block_index_events PRIVATE cxx_std_17) -gtest_discover_tests(test_global_block_index_events) - # test_router_dedup — master-side BatchRoutePut dedup via IMasterMetadataStore add_executable(test_router_dedup test_router_dedup.cpp) target_link_libraries(test_router_dedup PRIVATE umbp_common GTest::gtest_main) diff --git a/tests/cpp/umbp/distributed/test_client_registry.cpp b/tests/cpp/umbp/distributed/test_client_registry.cpp deleted file 
mode 100644 index adb363a30..000000000 --- a/tests/cpp/umbp/distributed/test_client_registry.cpp +++ /dev/null @@ -1,289 +0,0 @@ -// Copyright © Advanced Micro Devices, Inc. All rights reserved. -// -// MIT License -// -// Permission is hereby granted, free of charge, to any person obtaining a copy -// of this software and associated documentation files (the "Software"), to deal -// in the Software without restriction, including without limitation the rights -// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -// copies of the Software, and to permit persons to whom the Software is -// furnished to do so, subject to the following conditions: -// -// The above copyright notice and this permission notice shall be included in all -// copies or substantial portions of the Software. -// -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -// SOFTWARE. -// -// Membership-ledger unit tests for ClientRegistry: registration / re- -// registration semantics, capacity round-trip, heartbeat status, and the -// background reaper that expires silent nodes. These exercise the registry -// in isolation (no GlobalBlockIndex / RPC), complementing the index- and -// external-kv-focused suites. In the master-as-advisor design the registry -// stores only membership + the capacities a peer last reported, so the -// assertions below check reported values verbatim rather than any allocator- -// derived view. 
-#include - -#include -#include -#include -#include -#include - -#include "umbp/distributed/master/client_registry.h" -#include "umbp/distributed/types.h" - -namespace mori::umbp { -namespace { - -std::map Caps(uint64_t total, uint64_t available) { - return {{TierType::HBM, TierCapacity{total, available}}}; -} - -const ClientRecord* FindClient(const std::vector& clients, const std::string& id) { - for (const auto& c : clients) { - if (c.node_id == id) return &c; - } - return nullptr; -} - -// Drive the current 7-arg Heartbeat with no events — the membership-keepalive -// path the reaper cares about. -ClientStatus Beat(ClientRegistry& registry, const std::string& node_id, - const std::map& caps) { - uint64_t acked = 0; - bool need_full = false; - return registry.Heartbeat(node_id, caps, /*bundles=*/{}, /*is_full_sync=*/false, - /*delta_seq_baseline=*/0, &acked, &need_full); -} - -template -bool WaitUntil(Predicate&& predicate, std::chrono::milliseconds timeout, - std::chrono::milliseconds poll = std::chrono::milliseconds(100)) { - const auto deadline = std::chrono::steady_clock::now() + timeout; - while (std::chrono::steady_clock::now() < deadline) { - if (predicate()) return true; - std::this_thread::sleep_for(poll); - } - return predicate(); -} - -// heartbeat_ttl * max_missed_heartbeats == 1s, so a node ages out ~1s after -// its last heartbeat. reaper_interval keeps the sweep responsive. 
-ClientRegistryConfig FastExpiryConfig() { - ClientRegistryConfig config; - config.heartbeat_ttl = std::chrono::seconds(1); - config.max_missed_heartbeats = 1; - config.reaper_interval = std::chrono::seconds(1); - return config; -} - -} // namespace - -// --- Registration / membership ---------------------------------------------- - -TEST(ClientRegistryTest, RegisterSingle) { - ClientRegistry registry(ClientRegistryConfig{}); - EXPECT_TRUE(registry.RegisterClient("node-1", "127.0.0.1:8080", Caps(80, 64))); - EXPECT_EQ(registry.ClientCount(), 1u); - EXPECT_TRUE(registry.IsClientAlive("node-1")); -} - -TEST(ClientRegistryTest, RegisterMultiple) { - ClientRegistry registry(ClientRegistryConfig{}); - EXPECT_TRUE(registry.RegisterClient("c1", "127.0.0.1:1001", Caps(100, 90))); - EXPECT_TRUE(registry.RegisterClient("c2", "127.0.0.1:1002", Caps(110, 80))); - EXPECT_TRUE(registry.RegisterClient("c3", "127.0.0.1:1003", Caps(120, 70))); - - EXPECT_EQ(registry.ClientCount(), 3u); - EXPECT_TRUE(registry.IsClientAlive("c1")); - EXPECT_TRUE(registry.IsClientAlive("c2")); - EXPECT_TRUE(registry.IsClientAlive("c3")); -} - -TEST(ClientRegistryTest, GetAliveClientsReportsMembershipAndCapacities) { - ClientRegistry registry(ClientRegistryConfig{}); - EXPECT_TRUE(registry.RegisterClient("c1", "host-a:8080", Caps(80, 64))); - EXPECT_TRUE(registry.RegisterClient("c2", "host-b:8080", Caps(96, 32))); - - const auto clients = registry.GetAliveClients(); - ASSERT_EQ(clients.size(), 2u); - - const ClientRecord* c1 = FindClient(clients, "c1"); - const ClientRecord* c2 = FindClient(clients, "c2"); - ASSERT_NE(c1, nullptr); - ASSERT_NE(c2, nullptr); - - EXPECT_EQ(c1->node_address, "host-a:8080"); - EXPECT_EQ(c2->node_address, "host-b:8080"); - EXPECT_EQ(c1->status, ClientStatus::ALIVE); - EXPECT_EQ(c2->status, ClientStatus::ALIVE); - - // Master stores the peer-reported capacities verbatim. 
- ASSERT_TRUE(c1->tier_capacities.count(TierType::HBM) > 0); - ASSERT_TRUE(c2->tier_capacities.count(TierType::HBM) > 0); - EXPECT_EQ(c1->tier_capacities.at(TierType::HBM).total_bytes, 80u); - EXPECT_EQ(c1->tier_capacities.at(TierType::HBM).available_bytes, 64u); - EXPECT_EQ(c2->tier_capacities.at(TierType::HBM).available_bytes, 32u); -} - -TEST(ClientRegistryTest, ReRegisterAliveRejected) { - ClientRegistry registry(ClientRegistryConfig{}); - EXPECT_TRUE(registry.RegisterClient("c1", "addr-1", Caps(80, 64))); - // A live node may not silently take over its own id with a new address. - EXPECT_FALSE(registry.RegisterClient("c1", "addr-2", Caps(80, 32))); - - EXPECT_EQ(registry.ClientCount(), 1u); - const auto clients = registry.GetAliveClients(); - ASSERT_EQ(clients.size(), 1u); - EXPECT_EQ(clients[0].node_address, "addr-1"); // original record untouched -} - -TEST(ClientRegistryTest, ReRegisterExpiredAllowed) { - // No reaper here: the aged-out branch in RegisterClient (now - last_heartbeat - // > expiry) must accept the re-registration on its own. 
- ClientRegistry registry(FastExpiryConfig()); - EXPECT_TRUE(registry.RegisterClient("c1", "addr-1", Caps(80, 64))); - - const bool reregistered = - WaitUntil([®istry] { return registry.RegisterClient("c1", "addr-2", Caps(80, 32)); }, - std::chrono::seconds(5)); - EXPECT_TRUE(reregistered); - - EXPECT_EQ(registry.ClientCount(), 1u); - const auto clients = registry.GetAliveClients(); - ASSERT_EQ(clients.size(), 1u); - EXPECT_EQ(clients[0].node_address, "addr-2"); // new address wins - EXPECT_EQ(clients[0].status, ClientStatus::ALIVE); -} - -// --- Unregister -------------------------------------------------------------- - -TEST(ClientRegistryTest, UnregisterExisting) { - ClientRegistry registry(ClientRegistryConfig{}); - EXPECT_TRUE(registry.RegisterClient("c1", "addr", Caps(80, 64))); - - registry.UnregisterClient("c1"); - EXPECT_EQ(registry.ClientCount(), 0u); - EXPECT_FALSE(registry.IsClientAlive("c1")); -} - -TEST(ClientRegistryTest, UnregisterUnknownIsNoop) { - ClientRegistry registry(ClientRegistryConfig{}); - EXPECT_TRUE(registry.RegisterClient("c1", "addr", Caps(80, 64))); - - registry.UnregisterClient("nonexistent"); - EXPECT_EQ(registry.ClientCount(), 1u); - EXPECT_TRUE(registry.IsClientAlive("c1")); -} - -TEST(ClientRegistryTest, UnregisterTwiceIsSafe) { - ClientRegistry registry(ClientRegistryConfig{}); - EXPECT_TRUE(registry.RegisterClient("c1", "addr", Caps(80, 64))); - - registry.UnregisterClient("c1"); - registry.UnregisterClient("c1"); - EXPECT_EQ(registry.ClientCount(), 0u); -} - -// --- Heartbeat --------------------------------------------------------------- - -TEST(ClientRegistryTest, HeartbeatAliveReplacesCapacities) { - ClientRegistry registry(ClientRegistryConfig{}); - EXPECT_TRUE(registry.RegisterClient("c1", "addr", Caps(80, 64))); - - EXPECT_EQ(Beat(registry, "c1", Caps(80, 16)), ClientStatus::ALIVE); - EXPECT_TRUE(registry.IsClientAlive("c1")); - - const auto clients = registry.GetAliveClients(); - ASSERT_EQ(clients.size(), 1u); - 
ASSERT_TRUE(clients[0].tier_capacities.count(TierType::HBM) > 0); - // The most recent heartbeat's capacities replace the stored values. - EXPECT_EQ(clients[0].tier_capacities.at(TierType::HBM).available_bytes, 16u); -} - -TEST(ClientRegistryTest, HeartbeatUnknownReturnsUnknown) { - ClientRegistry registry(ClientRegistryConfig{}); - EXPECT_EQ(Beat(registry, "nonexistent", Caps(80, 48)), ClientStatus::UNKNOWN); -} - -// --- Reaper ------------------------------------------------------------------ - -TEST(ClientRegistryTest, ReaperExpiresIdleClient) { - ClientRegistry registry(FastExpiryConfig()); - EXPECT_TRUE(registry.RegisterClient("c1", "addr", Caps(80, 64))); - registry.StartReaper(); - - const bool reaped = - WaitUntil([®istry] { return registry.ClientCount() == 0; }, std::chrono::seconds(6)); - - registry.StopReaper(); - EXPECT_TRUE(reaped); - EXPECT_FALSE(registry.IsClientAlive("c1")); -} - -TEST(ClientRegistryTest, ReaperKeepsClientAliveWithHeartbeats) { - ClientRegistry registry(FastExpiryConfig()); - EXPECT_TRUE(registry.RegisterClient("c1", "addr", Caps(80, 64))); - registry.StartReaper(); - - const auto start = std::chrono::steady_clock::now(); - while (std::chrono::steady_clock::now() - start < std::chrono::seconds(3)) { - EXPECT_EQ(Beat(registry, "c1", Caps(80, 48)), ClientStatus::ALIVE); - std::this_thread::sleep_for(std::chrono::milliseconds(300)); - } - - registry.StopReaper(); - EXPECT_EQ(registry.ClientCount(), 1u); - EXPECT_TRUE(registry.IsClientAlive("c1")); -} - -TEST(ClientRegistryTest, ReaperSelectiveExpiry) { - ClientRegistry registry(FastExpiryConfig()); - EXPECT_TRUE(registry.RegisterClient("c1", "addr-1", Caps(80, 64))); - EXPECT_TRUE(registry.RegisterClient("c2", "addr-2", Caps(80, 64))); - registry.StartReaper(); - - // Keep c1 fed; let c2 go silent. c2 must be reaped while c1 survives. 
- const bool reached = WaitUntil( - [®istry] { - Beat(registry, "c1", Caps(80, 48)); - return registry.IsClientAlive("c1") && !registry.IsClientAlive("c2"); - }, - std::chrono::seconds(6), std::chrono::milliseconds(200)); - - registry.StopReaper(); - EXPECT_TRUE(reached); - EXPECT_TRUE(registry.IsClientAlive("c1")); - EXPECT_FALSE(registry.IsClientAlive("c2")); -} - -TEST(ClientRegistryTest, StopReaperWhenNeverStarted) { - ClientRegistry registry(ClientRegistryConfig{}); - registry.StopReaper(); // must not hang or crash - SUCCEED(); -} - -TEST(ClientRegistryTest, StartStopReaperMultiple) { - ClientRegistry registry(ClientRegistryConfig{}); - registry.StartReaper(); - registry.StopReaper(); - registry.StartReaper(); - registry.StopReaper(); - SUCCEED(); -} - -TEST(ClientRegistryTest, DestructorStopsRunningReaper) { - ClientRegistry registry(ClientRegistryConfig{}); - registry.StartReaper(); - EXPECT_TRUE(registry.RegisterClient("c1", "addr", Caps(80, 64))); - // Falling out of scope must join the reaper thread cleanly. -} - -} // namespace mori::umbp diff --git a/tests/cpp/umbp/distributed/test_client_registry_external_kv.cpp b/tests/cpp/umbp/distributed/test_client_registry_external_kv.cpp deleted file mode 100644 index e232fe58a..000000000 --- a/tests/cpp/umbp/distributed/test_client_registry_external_kv.cpp +++ /dev/null @@ -1,55 +0,0 @@ -// Copyright © Advanced Micro Devices, Inc. All rights reserved. 
-// -// MIT License -// -// Permission is hereby granted, free of charge, to any person obtaining a copy -// of this software and associated documentation files (the "Software"), to deal -// in the Software without restriction, including without limitation the rights -// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -// copies of the Software, and to permit persons to whom the Software is -// furnished to do so, subject to the following conditions: -// -// The above copyright notice and this permission notice shall be included in all -// copies or substantial portions of the Software. -// -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -// SOFTWARE. 
-#include - -#include "umbp/distributed/master/client_registry.h" -#include "umbp/distributed/master/external_kv_block_index.h" -#include "umbp/distributed/master/global_block_index.h" - -namespace mori::umbp { - -TEST(ClientRegistryExternalKv, UnregisterClientClearsBothIndices) { - GlobalBlockIndex global_index; - ExternalKvBlockIndex external_index; - ClientRegistry registry(ClientRegistryConfig{}, global_index, &external_index); - - ASSERT_TRUE(registry.RegisterClient("node-A", "127.0.0.1:9000", {}, "127.0.0.1:9001")); - ASSERT_EQ(global_index.ApplyEvents("node-A", - {KvEvent{KvEvent::Kind::ADD, "owned", TierType::DRAM, 128}}), - 1u); - ASSERT_EQ(external_index.Register("node-A", {"external"}, TierType::DRAM), 1u); - - registry.UnregisterClient("node-A"); - - EXPECT_TRUE(global_index.Lookup("owned").empty()); - EXPECT_TRUE(external_index.Match({"external"}).empty()); -} - -TEST(ClientRegistryExternalKv, UnregisterWithoutExternalIndexDoesNotCrash) { - GlobalBlockIndex global_index; - ClientRegistry registry(ClientRegistryConfig{}, global_index); - - ASSERT_TRUE(registry.RegisterClient("node-A", "127.0.0.1:9000", {}, "127.0.0.1:9001")); - EXPECT_NO_THROW(registry.UnregisterClient("node-A")); -} - -} // namespace mori::umbp diff --git a/tests/cpp/umbp/distributed/test_external_kv_block_index.cpp b/tests/cpp/umbp/distributed/test_external_kv_block_index.cpp deleted file mode 100644 index 18ee654d5..000000000 --- a/tests/cpp/umbp/distributed/test_external_kv_block_index.cpp +++ /dev/null @@ -1,103 +0,0 @@ -// Copyright © Advanced Micro Devices, Inc. All rights reserved. 
-// -// MIT License -// -// Permission is hereby granted, free of charge, to any person obtaining a copy -// of this software and associated documentation files (the "Software"), to deal -// in the Software without restriction, including without limitation the rights -// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -// copies of the Software, and to permit persons to whom the Software is -// furnished to do so, subject to the following conditions: -// -// The above copyright notice and this permission notice shall be included in all -// copies or substantial portions of the Software. -// -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -// SOFTWARE. 
-#include - -#include -#include -#include - -#include "umbp/distributed/master/external_kv_block_index.h" - -namespace mori::umbp { -namespace { - -const ExternalKvBlockIndex::NodeMatch* FindMatch( - const std::vector& matches, const std::string& node_id) { - for (const auto& match : matches) { - if (match.node_id == node_id) return &match; - } - return nullptr; -} - -std::vector Sorted(std::vector values) { - std::sort(values.begin(), values.end()); - return values; -} - -} // namespace - -TEST(ExternalKvBlockIndex, RegisterIsAdditiveAcrossTiersAndCountsMutations) { - ExternalKvBlockIndex index; - - EXPECT_EQ(index.Register("node-A", {"h1"}, TierType::HBM), 1u); - EXPECT_EQ(index.Register("node-A", {"h1"}, TierType::DRAM), 1u); - EXPECT_EQ(index.Register("node-A", {"h1"}, TierType::DRAM), 0u); - - auto matches = index.Match({"h1"}); - ASSERT_EQ(matches.size(), 1u); - EXPECT_EQ(matches[0].MatchedHashCount(), 1u); - EXPECT_EQ(matches[0].hashes_by_tier.at(TierType::HBM), std::vector({"h1"})); - EXPECT_EQ(matches[0].hashes_by_tier.at(TierType::DRAM), std::vector({"h1"})); - EXPECT_EQ(index.GetKvCount("node-A"), 1u); -} - -TEST(ExternalKvBlockIndex, UnregisterRemovesOnlyRequestedTier) { - ExternalKvBlockIndex index; - ASSERT_EQ(index.Register("node-A", {"h1", "h2"}, TierType::HBM), 2u); - ASSERT_EQ(index.Register("node-A", {"h1"}, TierType::DRAM), 1u); - - EXPECT_EQ(index.Unregister("node-A", {"h1", "missing"}, TierType::HBM), 1u); - EXPECT_EQ(index.Unregister("node-A", {"h1"}, TierType::HBM), 0u); - - auto matches = index.Match({"h1", "h2"}); - ASSERT_EQ(matches.size(), 1u); - const auto& match = matches[0]; - EXPECT_EQ(match.hashes_by_tier.at(TierType::DRAM), std::vector({"h1"})); - EXPECT_EQ(match.hashes_by_tier.at(TierType::HBM), std::vector({"h2"})); - EXPECT_EQ(index.GetKvCount("node-A"), 2u); -} - -TEST(ExternalKvBlockIndex, BulkUnregisterByTierAndNode) { - ExternalKvBlockIndex index; - ASSERT_EQ(index.Register("node-A", {"h1", "h2", "h3"}, TierType::DRAM), 3u); 
- ASSERT_EQ(index.Register("node-A", {"h1", "h2"}, TierType::SSD), 2u); - ASSERT_EQ(index.Register("node-B", {"h1"}, TierType::SSD), 1u); - - EXPECT_EQ(index.UnregisterByNodeAtTier("node-A", TierType::SSD), 2u); - auto matches = index.Match({"h1", "h2", "h3"}); - ASSERT_EQ(matches.size(), 2u); - const auto* node_a = FindMatch(matches, "node-A"); - ASSERT_NE(node_a, nullptr); - ASSERT_EQ(node_a->hashes_by_tier.size(), 1u); - EXPECT_EQ(Sorted(node_a->hashes_by_tier.at(TierType::DRAM)), - (std::vector{"h1", "h2", "h3"})); - const auto* node_b = FindMatch(matches, "node-B"); - ASSERT_NE(node_b, nullptr); - EXPECT_EQ(node_b->hashes_by_tier.at(TierType::SSD), std::vector({"h1"})); - - EXPECT_EQ(index.UnregisterByNode("node-A"), 3u); - matches = index.Match({"h1", "h2", "h3"}); - ASSERT_EQ(matches.size(), 1u); - EXPECT_EQ(matches[0].node_id, "node-B"); -} - -} // namespace mori::umbp diff --git a/tests/cpp/umbp/distributed/test_external_kv_hit_index.cpp b/tests/cpp/umbp/distributed/test_external_kv_hit_index.cpp deleted file mode 100644 index 20685f0b7..000000000 --- a/tests/cpp/umbp/distributed/test_external_kv_hit_index.cpp +++ /dev/null @@ -1,116 +0,0 @@ -// Copyright © Advanced Micro Devices, Inc. All rights reserved. -// -// MIT License -// -// Permission is hereby granted, free of charge, to any person obtaining a copy -// of this software and associated documentation files (the "Software"), to deal -// in the Software without restriction, including without limitation the rights -// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -// copies of the Software, and to permit persons to whom the Software is -// furnished to do so, subject to the following conditions: -// -// The above copyright notice and this permission notice shall be included in all -// copies or substantial portions of the Software. 
-// -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -// SOFTWARE. -#include - -#include -#include -#include -#include -#include -#include - -#include "umbp/distributed/master/external_kv_hit_index.h" - -namespace mori::umbp { -namespace { - -std::unordered_map LookupMap(ExternalKvHitIndex& index, - const std::vector& hashes) { - std::vector> entries; - index.Lookup(hashes, &entries); - std::unordered_map out; - for (const auto& [hash, total] : entries) out[hash] = total; - return out; -} - -TEST(ExternalKvHitIndexTest, IncrementAndLookup) { - ExternalKvHitIndex index; - index.IncrementHits({"h1", "h2"}, 100); - - auto counts = LookupMap(index, {"h1", "h2", "missing"}); - ASSERT_EQ(counts.size(), 2); - EXPECT_EQ(counts["h1"], 1); - EXPECT_EQ(counts["h2"], 1); -} - -TEST(ExternalKvHitIndexTest, RepeatedIncrementsAccumulate) { - ExternalKvHitIndex index; - for (int i = 0; i < 10; ++i) index.IncrementHits({"hot"}, 100 + i); - - auto counts = LookupMap(index, {"hot"}); - ASSERT_EQ(counts.size(), 1); - EXPECT_EQ(counts["hot"], 10); -} - -TEST(ExternalKvHitIndexTest, LookupSkipsMissingAndDedupesRequestHashes) { - ExternalKvHitIndex index; - index.IncrementHits({"h1"}, 100); - - std::vector> entries; - index.Lookup({"missing", "h1", "h1", "missing"}, &entries); - ASSERT_EQ(entries.size(), 1); - EXPECT_EQ(entries[0].first, "h1"); - EXPECT_EQ(entries[0].second, 1); -} - -TEST(ExternalKvHitIndexTest, GarbageCollectUsesLastSeenCutoff) { - ExternalKvHitIndex index; - index.IncrementHits({"old"}, 100); - index.IncrementHits({"fresh"}, 200); - - 
EXPECT_EQ(index.GarbageCollect(150), 1); - EXPECT_EQ(index.Size(), 1); - - auto counts = LookupMap(index, {"old", "fresh"}); - ASSERT_EQ(counts.size(), 1); - EXPECT_EQ(counts["fresh"], 1); -} - -TEST(ExternalKvHitIndexTest, ConcurrentCreationKeepsAllIncrements) { - ExternalKvHitIndex index; - constexpr int kThreads = 32; - constexpr int kIterations = 1000; - - std::atomic start{false}; - std::vector threads; - threads.reserve(kThreads); - for (int t = 0; t < kThreads; ++t) { - threads.emplace_back([&] { - while (!start.load(std::memory_order_acquire)) { - std::this_thread::yield(); - } - for (int i = 0; i < kIterations; ++i) { - index.IncrementHits({"shared"}, static_cast(100 + i)); - } - }); - } - - start.store(true, std::memory_order_release); - for (auto& thread : threads) thread.join(); - - auto counts = LookupMap(index, {"shared"}); - ASSERT_EQ(counts.size(), 1); - EXPECT_EQ(counts["shared"], static_cast(kThreads * kIterations)); -} - -} // namespace -} // namespace mori::umbp diff --git a/tests/cpp/umbp/distributed/test_global_block_index_events.cpp b/tests/cpp/umbp/distributed/test_global_block_index_events.cpp deleted file mode 100644 index 1b82e43ea..000000000 --- a/tests/cpp/umbp/distributed/test_global_block_index_events.cpp +++ /dev/null @@ -1,505 +0,0 @@ -// Copyright © Advanced Micro Devices, Inc. All rights reserved. -// -// MIT License -// -// Permission is hereby granted, free of charge, to any person obtaining a copy -// of this software and associated documentation files (the "Software"), to deal -// in the Software without restriction, including without limitation the rights -// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -// copies of the Software, and to permit persons to whom the Software is -// furnished to do so, subject to the following conditions: -// -// The above copyright notice and this permission notice shall be included in all -// copies or substantial portions of the Software. 
-// -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -// SOFTWARE. -#include - -#include -#include -#include -#include - -#include "umbp/distributed/master/client_registry.h" -#include "umbp/distributed/master/global_block_index.h" -#include "umbp/distributed/types.h" - -namespace mori::umbp { - -namespace { - -KvEvent Add(std::string key, TierType tier, uint64_t size) { - return KvEvent{KvEvent::Kind::ADD, std::move(key), tier, size}; -} - -KvEvent Remove(std::string key, TierType tier) { - return KvEvent{KvEvent::Kind::REMOVE, std::move(key), tier, 0}; -} - -EventBundle Bundle(uint64_t seq, std::vector events) { - return EventBundle{seq, std::move(events)}; -} - -bool HasLocation(const std::vector& locs, const std::string& node, TierType tier, - uint64_t size) { - for (const auto& l : locs) { - if (l.node_id == node && l.tier == tier && l.size == size) return true; - } - return false; -} - -} // namespace - -// ---- ApplyEvents: ADD/REMOVE round-trip ------------------------------------ - -TEST(GlobalBlockIndexEvents, ApplyAddInsertsLocation) { - GlobalBlockIndex idx; - ASSERT_EQ(idx.ApplyEvents("node-A", {Add("k1", TierType::DRAM, 1024)}), 1u); - auto locs = idx.Lookup("k1"); - ASSERT_EQ(locs.size(), 1u); - EXPECT_EQ(locs[0].node_id, "node-A"); - EXPECT_EQ(locs[0].tier, TierType::DRAM); - EXPECT_EQ(locs[0].size, 1024u); -} - -// Duplicate ADD keeps the first observed size; only REMOVE retires it. 
-TEST(GlobalBlockIndexEvents, ApplyAddSameNodeTierKeepsExistingSize) { - GlobalBlockIndex idx; - idx.ApplyEvents("node-A", {Add("k", TierType::DRAM, 1024)}); - idx.ApplyEvents("node-A", {Add("k", TierType::DRAM, 2048)}); - auto locs = idx.Lookup("k"); - ASSERT_EQ(locs.size(), 1u); - EXPECT_EQ(locs[0].size, 1024u); -} - -TEST(GlobalBlockIndexEvents, MultipleNodesCoexistForSameKey) { - GlobalBlockIndex idx; - idx.ApplyEvents("node-A", {Add("k", TierType::DRAM, 100)}); - idx.ApplyEvents("node-B", {Add("k", TierType::DRAM, 200)}); - idx.ApplyEvents("node-A", {Add("k", TierType::HBM, 300)}); // different tier on A - auto locs = idx.Lookup("k"); - EXPECT_EQ(locs.size(), 3u); - EXPECT_TRUE(HasLocation(locs, "node-A", TierType::DRAM, 100)); - EXPECT_TRUE(HasLocation(locs, "node-B", TierType::DRAM, 200)); - EXPECT_TRUE(HasLocation(locs, "node-A", TierType::HBM, 300)); -} - -TEST(GlobalBlockIndexEvents, RemoveErasesMatchingLocationOnly) { - GlobalBlockIndex idx; - idx.ApplyEvents("node-A", {Add("k", TierType::DRAM, 100)}); - idx.ApplyEvents("node-B", {Add("k", TierType::DRAM, 200)}); - - idx.ApplyEvents("node-A", {Remove("k", TierType::DRAM)}); - auto locs = idx.Lookup("k"); - ASSERT_EQ(locs.size(), 1u); - EXPECT_EQ(locs[0].node_id, "node-B"); -} - -TEST(GlobalBlockIndexEvents, RemoveLastLocationErasesEntry) { - GlobalBlockIndex idx; - idx.ApplyEvents("node-A", {Add("k", TierType::DRAM, 100)}); - idx.ApplyEvents("node-A", {Remove("k", TierType::DRAM)}); - EXPECT_TRUE(idx.Lookup("k").empty()); - EXPECT_FALSE(idx.GetMetrics("k").has_value()); -} - -TEST(GlobalBlockIndexEvents, RemoveUnknownIsNoop) { - GlobalBlockIndex idx; - EXPECT_EQ(idx.ApplyEvents("ghost", {Remove("ghost-key", TierType::DRAM)}), 0u); -} - -// A key mirrored on both DRAM and SSD of one node: a DRAM eviction -// (REMOVE DRAM) must drop only the DRAM bucket and leave the SSD location -// readable. 
This is the additive-index invariant the SSD tier relies -// on (DRAM evict never touches the SSD copy); no master code is exercised here -// beyond ApplyEvents. -TEST(GlobalBlockIndexEvents, RemoveDramKeepsSsdBucket) { - GlobalBlockIndex idx; - idx.ApplyEvents("node-A", {Add("k", TierType::DRAM, 100), Add("k", TierType::SSD, 100)}); - ASSERT_TRUE(HasLocation(idx.Lookup("k"), "node-A", TierType::DRAM, 100)); - ASSERT_TRUE(HasLocation(idx.Lookup("k"), "node-A", TierType::SSD, 100)); - - idx.ApplyEvents("node-A", {Remove("k", TierType::DRAM)}); - - auto locs = idx.Lookup("k"); - EXPECT_FALSE(HasLocation(locs, "node-A", TierType::DRAM, 100)); // DRAM bucket gone - EXPECT_TRUE(HasLocation(locs, "node-A", TierType::SSD, 100)); // SSD bucket retained -} - -TEST(GlobalBlockIndexEvents, ClearAtTierClearsOnlyTargetNodeTier) { - GlobalBlockIndex idx; - idx.ApplyEvents("node-A", {Add("k1", TierType::DRAM, 1), Add("k2", TierType::SSD, 2), - Add("k3", TierType::DRAM, 3)}); - idx.ApplyEvents("node-B", {Add("k1", TierType::DRAM, 10)}); - - EXPECT_EQ( - idx.ApplyEvents("node-A", {KvEvent{KvEvent::Kind::CLEAR_AT_TIER, "", TierType::DRAM, 0}}), - 2u); - - EXPECT_FALSE(HasLocation(idx.Lookup("k1"), "node-A", TierType::DRAM, 1)); - EXPECT_TRUE(HasLocation(idx.Lookup("k1"), "node-B", TierType::DRAM, 10)); - EXPECT_TRUE(HasLocation(idx.Lookup("k2"), "node-A", TierType::SSD, 2)); - EXPECT_TRUE(idx.Lookup("k3").empty()); -} - -// ---- ReplaceNodeLocations: full-sync recovery ------------------------------ - -TEST(GlobalBlockIndexEvents, ReplaceNodeLocationsClearsThenInserts) { - GlobalBlockIndex idx; - idx.ApplyEvents("node-A", {Add("k1", TierType::DRAM, 100), Add("k2", TierType::DRAM, 200)}); - idx.ApplyEvents("node-B", {Add("k1", TierType::DRAM, 999)}); // shared key, different node - - // Full-sync from node-A: k1 stays (different size), k2 is gone, new k3 appears. 
- idx.ReplaceNodeLocations("node-A", - {Add("k1", TierType::DRAM, 150), Add("k3", TierType::DRAM, 300)}); - - auto k1 = idx.Lookup("k1"); - EXPECT_TRUE(HasLocation(k1, "node-A", TierType::DRAM, 150)); - EXPECT_TRUE(HasLocation(k1, "node-B", TierType::DRAM, 999)); // node-B untouched - - EXPECT_TRUE(idx.Lookup("k2").empty()); // dropped — node-A's full-sync didn't include it - - auto k3 = idx.Lookup("k3"); - EXPECT_TRUE(HasLocation(k3, "node-A", TierType::DRAM, 300)); -} - -TEST(GlobalBlockIndexEvents, ReplaceNodeLocationsEmptyClearsAllForNode) { - // Used by ClientRegistry::UnregisterClient and the reaper to drop a - // dead node's index entries. - GlobalBlockIndex idx; - idx.ApplyEvents("node-A", {Add("k1", TierType::DRAM, 1), Add("k2", TierType::HBM, 2)}); - idx.ApplyEvents("node-B", {Add("k1", TierType::DRAM, 3)}); - - idx.ReplaceNodeLocations("node-A", {}); - EXPECT_EQ(idx.Lookup("k1").size(), 1u); // node-B still owns k1 - EXPECT_TRUE(idx.Lookup("k2").empty()); // node-A's only HBM location is gone -} - -TEST(GlobalBlockIndexEvents, ReplaceNodeLocationsIgnoresRemoveEntries) { - GlobalBlockIndex idx; - idx.ApplyEvents("node-A", {Add("k1", TierType::DRAM, 100)}); - // Snapshot full-sync conventionally carries only ADDs; sneaking - // a REMOVE in is silently skipped (the snapshot is the truth). - idx.ReplaceNodeLocations("node-A", - {Add("k2", TierType::DRAM, 200), Remove("k3", TierType::DRAM)}); - EXPECT_TRUE(idx.Lookup("k1").empty()); - EXPECT_FALSE(idx.Lookup("k2").empty()); - EXPECT_TRUE(idx.Lookup("k3").empty()); -} - -// ---- Reverse-index (node_to_keys_) invariants ------------------------------ - -TEST(GlobalBlockIndexEvents, ReplaceNodeLocationsAfterMultiTierRemoveKeepsKeyClean) { - GlobalBlockIndex idx; - idx.ApplyEvents("node-A", {Add("k", TierType::DRAM, 100), Add("k", TierType::HBM, 200)}); - idx.ApplyEvents("node-B", {Add("k", TierType::DRAM, 300)}); - - // A still owns (k, HBM): reverse index must keep k. 
- idx.ApplyEvents("node-A", {Remove("k", TierType::DRAM)}); - auto mid = idx.Lookup("k"); - ASSERT_EQ(mid.size(), 2u); - EXPECT_TRUE(HasLocation(mid, "node-A", TierType::HBM, 200)); - EXPECT_TRUE(HasLocation(mid, "node-B", TierType::DRAM, 300)); - - idx.ReplaceNodeLocations("node-A", {}); - auto after = idx.Lookup("k"); - ASSERT_EQ(after.size(), 1u); - EXPECT_EQ(after[0].node_id, "node-B"); - EXPECT_EQ(after[0].tier, TierType::DRAM); - EXPECT_EQ(after[0].size, 300u); -} - -TEST(GlobalBlockIndexEvents, ReplaceNodeLocationsLeavesOtherNodesIntact) { - GlobalBlockIndex idx; - idx.ApplyEvents("node-A", {Add("k1", TierType::DRAM, 1), Add("k2", TierType::DRAM, 2)}); - idx.ApplyEvents("node-B", {Add("k1", TierType::DRAM, 10), Add("k3", TierType::HBM, 30)}); - idx.ApplyEvents("node-C", {Add("k2", TierType::HBM, 200), Add("k4", TierType::DRAM, 400)}); - - idx.ReplaceNodeLocations("node-A", {Add("k_new", TierType::DRAM, 999)}); - - auto k1 = idx.Lookup("k1"); - ASSERT_EQ(k1.size(), 1u); - EXPECT_EQ(k1[0].node_id, "node-B"); - EXPECT_EQ(k1[0].size, 10u); - - auto k2 = idx.Lookup("k2"); - ASSERT_EQ(k2.size(), 1u); - EXPECT_EQ(k2[0].node_id, "node-C"); - EXPECT_EQ(k2[0].tier, TierType::HBM); - - EXPECT_TRUE(HasLocation(idx.Lookup("k_new"), "node-A", TierType::DRAM, 999)); - - auto k3 = idx.Lookup("k3"); - ASSERT_EQ(k3.size(), 1u); - EXPECT_EQ(k3[0].node_id, "node-B"); - EXPECT_EQ(k3[0].size, 30u); - - auto k4 = idx.Lookup("k4"); - ASSERT_EQ(k4.size(), 1u); - EXPECT_EQ(k4[0].node_id, "node-C"); - EXPECT_EQ(k4[0].size, 400u); -} - -// 2nd sync must see reverse index repopulated by 1st sync's replay. 
-TEST(GlobalBlockIndexEvents, ReplaceNodeLocationsTwiceRotatesKeys) { - GlobalBlockIndex idx; - idx.ApplyEvents("node-A", {Add("k_old", TierType::DRAM, 1)}); - - idx.ReplaceNodeLocations("node-A", - {Add("k_mid_a", TierType::DRAM, 2), Add("k_mid_b", TierType::HBM, 3)}); - EXPECT_TRUE(idx.Lookup("k_old").empty()); - EXPECT_FALSE(idx.Lookup("k_mid_a").empty()); - EXPECT_FALSE(idx.Lookup("k_mid_b").empty()); - - idx.ReplaceNodeLocations("node-A", {Add("k_final", TierType::DRAM, 4)}); - EXPECT_TRUE(idx.Lookup("k_mid_a").empty()); - EXPECT_TRUE(idx.Lookup("k_mid_b").empty()); - auto final_locs = idx.Lookup("k_final"); - ASSERT_EQ(final_locs.size(), 1u); - EXPECT_EQ(final_locs[0].node_id, "node-A"); - EXPECT_EQ(final_locs[0].size, 4u); -} - -// Reverse-index insert must run even when inserted==false. -TEST(GlobalBlockIndexEvents, DuplicateAddKeepsReverseConsistent) { - GlobalBlockIndex idx; - idx.ApplyEvents("node-A", {Add("dup", TierType::DRAM, 1024)}); - idx.ApplyEvents("node-A", {Add("dup", TierType::DRAM, 2048)}); - idx.ApplyEvents("node-A", {Add("dup", TierType::DRAM, 4096)}); - - auto locs = idx.Lookup("dup"); - ASSERT_EQ(locs.size(), 1u); - - idx.ReplaceNodeLocations("node-A", {}); - EXPECT_TRUE(idx.Lookup("dup").empty()); -} - -// No-op REMOVE must leave node_to_keys_ untouched on both sides. 
-TEST(GlobalBlockIndexEvents, RemoveNonMatchingTierLeavesReverseUntouched) { - GlobalBlockIndex idx; - idx.ApplyEvents("node-A", {Add("k", TierType::DRAM, 100)}); - - idx.ApplyEvents("node-A", {Remove("k", TierType::HBM)}); - auto mid = idx.Lookup("k"); - ASSERT_EQ(mid.size(), 1u); - EXPECT_EQ(mid[0].tier, TierType::DRAM); - - idx.ReplaceNodeLocations("node-A", {}); - EXPECT_TRUE(idx.Lookup("k").empty()); - - idx.ApplyEvents("node-A", {Add("k", TierType::DRAM, 100)}); - idx.ApplyEvents("node-B", {Remove("k", TierType::DRAM)}); - idx.ReplaceNodeLocations("node-B", {}); - auto after = idx.Lookup("k"); - ASSERT_EQ(after.size(), 1u); - EXPECT_EQ(after[0].node_id, "node-A"); -} - -// ---- ClientRegistry::Heartbeat applies events end-to-end -------------------- - -TEST(ClientRegistryHeartbeat, AppliesEventsAndAdvancesSeq) { - GlobalBlockIndex idx; - ClientRegistryConfig cfg; - ClientRegistry reg(cfg, idx); - - ASSERT_TRUE(reg.RegisterClient("node-A", "10.0.0.1:1", /*caps=*/{}, "10.0.0.1:2", {})); - - uint64_t acked = 0; - bool need_full = false; - auto status = reg.Heartbeat("node-A", /*caps=*/{}, {Bundle(1, {Add("k", TierType::DRAM, 42)})}, - /*is_full_sync=*/false, /*delta_seq_baseline=*/0, &acked, &need_full); - EXPECT_EQ(status, ClientStatus::ALIVE); - EXPECT_EQ(acked, 1u); - EXPECT_FALSE(need_full); - EXPECT_FALSE(idx.Lookup("k").empty()); -} - -TEST(ClientRegistryHeartbeat, SeqGapTriggersFullSyncRequest) { - GlobalBlockIndex idx; - ClientRegistryConfig cfg; - ClientRegistry reg(cfg, idx); - reg.RegisterClient("node-A", "10.0.0.1:1", {}, "10.0.0.1:2", {}); - - uint64_t acked = 0; - bool need_full = false; - // First heartbeat seq=1 — applied normally. - reg.Heartbeat("node-A", {}, {Bundle(1, {Add("k1", TierType::DRAM, 1)})}, - /*is_full_sync=*/false, 0, &acked, &need_full); - ASSERT_FALSE(need_full); - ASSERT_EQ(acked, 1u); - - // Second heartbeat skips seq=2: master detects the gap. 
- reg.Heartbeat("node-A", {}, {Bundle(3, {Add("k2", TierType::DRAM, 2)})}, - /*is_full_sync=*/false, 0, &acked, &need_full); - EXPECT_TRUE(need_full); - EXPECT_EQ(acked, 1u); // unchanged — no events applied from this batch - - // k2 is NOT in the index because the gap-batch was rejected. - EXPECT_TRUE(idx.Lookup("k2").empty()); - EXPECT_FALSE(idx.Lookup("k1").empty()); -} - -TEST(ClientRegistryHeartbeat, FullSyncReplacesNodeLocations) { - GlobalBlockIndex idx; - ClientRegistryConfig cfg; - ClientRegistry reg(cfg, idx); - reg.RegisterClient("node-A", "10.0.0.1:1", {}, "10.0.0.1:2", {}); - - uint64_t acked = 0; - bool need_full = false; - reg.Heartbeat("node-A", {}, - {Bundle(1, {Add("k1", TierType::DRAM, 1), Add("k2", TierType::DRAM, 2)})}, - /*is_full_sync=*/false, 0, &acked, &need_full); - ASSERT_FALSE(idx.Lookup("k1").empty()); - ASSERT_FALSE(idx.Lookup("k2").empty()); - - // Full-sync: only k1 + k3 should remain for node-A. - reg.Heartbeat("node-A", {}, - {Bundle(2, {Add("k1", TierType::DRAM, 10), Add("k3", TierType::DRAM, 30)})}, - /*is_full_sync=*/true, /*delta_seq_baseline=*/2, &acked, &need_full); - EXPECT_EQ(acked, 2u); - EXPECT_FALSE(need_full); - - auto k1 = idx.Lookup("k1"); - ASSERT_EQ(k1.size(), 1u); - EXPECT_EQ(k1[0].size, 10u); // updated via full-sync - EXPECT_TRUE(idx.Lookup("k2").empty()); - EXPECT_FALSE(idx.Lookup("k3").empty()); -} - -TEST(ClientRegistryHeartbeat, UnregisterClearsNodeFromIndex) { - GlobalBlockIndex idx; - ClientRegistryConfig cfg; - ClientRegistry reg(cfg, idx); - reg.RegisterClient("node-A", "10.0.0.1:1", {}, "10.0.0.1:2", {}); - - uint64_t acked = 0; - bool need_full = false; - reg.Heartbeat("node-A", {}, {Bundle(1, {Add("k1", TierType::DRAM, 1)})}, - /*is_full_sync=*/false, 0, &acked, &need_full); - ASSERT_FALSE(idx.Lookup("k1").empty()); - - reg.UnregisterClient("node-A"); - EXPECT_TRUE(idx.Lookup("k1").empty()); - EXPECT_FALSE(reg.IsClientAlive("node-A")); -} - -// ---- FindEvictionCandidates 
-------------------------------------------------- - -TEST(GlobalBlockIndexEvents, FindEvictionCandidatesFiltersByOverloadedNodeTier) { - GlobalBlockIndex idx; - idx.ApplyEvents("node-A", {Add("k1", TierType::DRAM, 100), Add("k2", TierType::HBM, 200)}); - idx.ApplyEvents("node-B", {Add("k1", TierType::DRAM, 100)}); - - std::set overloaded = { - {"node-A", TierType::DRAM}, - }; - auto candidates = idx.FindEvictionCandidates(overloaded); - // Only node-A's DRAM location of k1 is a candidate. - ASSERT_EQ(candidates.size(), 1u); - EXPECT_EQ(candidates[0].key, "k1"); - EXPECT_EQ(candidates[0].location.node_id, "node-A"); - EXPECT_EQ(candidates[0].location.tier, TierType::DRAM); -} - -// ---- BatchLookupForRouteGet ------------------------------------------------ - -TEST(GlobalBlockIndexEvents, BatchLookupForRouteGetEmptyInputReturnsEmpty) { - GlobalBlockIndex idx; - EXPECT_TRUE(idx.BatchLookupForRouteGet({}, {}, std::chrono::seconds{1}).empty()); -} - -TEST(GlobalBlockIndexEvents, BatchLookupForRouteGetMixedHitsAndMisses) { - GlobalBlockIndex idx; - idx.ApplyEvents("node-A", {Add("k1", TierType::DRAM, 100)}); - idx.ApplyEvents("node-B", {Add("k1", TierType::DRAM, 200), Add("k2", TierType::HBM, 300)}); - - auto ref_k1 = idx.Lookup("k1"); - auto ref_k2 = idx.Lookup("k2"); - auto before_k1 = idx.GetMetrics("k1"); - auto before_k2 = idx.GetMetrics("k2"); - ASSERT_TRUE(before_k1.has_value()); - ASSERT_TRUE(before_k2.has_value()); - - auto results = idx.BatchLookupForRouteGet({"k1", "ghost", "k2"}, {}, std::chrono::seconds{10}); - ASSERT_EQ(results.size(), 3u); - EXPECT_EQ(results[0], ref_k1); - EXPECT_TRUE(results[1].empty()); - EXPECT_EQ(results[2], ref_k2); - - auto after_k1 = idx.GetMetrics("k1"); - auto after_k2 = idx.GetMetrics("k2"); - ASSERT_TRUE(after_k1.has_value()); - ASSERT_TRUE(after_k2.has_value()); - EXPECT_EQ(after_k1->access_count, before_k1->access_count + 1); - EXPECT_EQ(after_k2->access_count, before_k2->access_count + 1); - 
EXPECT_FALSE(idx.GetMetrics("ghost").has_value()); -} - -TEST(GlobalBlockIndexEvents, BatchLookupForRouteGetGrantsLeaseForHitsOnly) { - GlobalBlockIndex idx; - idx.ApplyEvents("node-A", {Add("hit", TierType::DRAM, 100), Add("other", TierType::DRAM, 200)}); - - std::set overloaded{{"node-A", TierType::DRAM}}; - ASSERT_EQ(idx.FindEvictionCandidates(overloaded).size(), 2u); - - idx.BatchLookupForRouteGet({"hit", "ghost"}, {}, std::chrono::seconds{10}); - - auto candidates = idx.FindEvictionCandidates(overloaded); - ASSERT_EQ(candidates.size(), 1u); - EXPECT_EQ(candidates[0].key, "other"); -} - -// All replicas excluded -> slot empty, access_count NOT bumped, -// lease NOT granted. A key whose every replica is unreachable must -// not pollute LRU or block eviction. -TEST(GlobalBlockIndexEvents, BatchLookupForRouteGetSkipsSideEffectsWhenAllReplicasExcluded) { - GlobalBlockIndex idx; - idx.ApplyEvents("node-A", {Add("k", TierType::DRAM, 100)}); - idx.ApplyEvents("node-B", {Add("k", TierType::DRAM, 200)}); - - auto before = idx.GetMetrics("k"); - ASSERT_TRUE(before.has_value()); - std::set overloaded{{"node-A", TierType::DRAM}, - {"node-B", TierType::DRAM}}; - ASSERT_EQ(idx.FindEvictionCandidates(overloaded).size(), 2u); - - std::unordered_set excludes{"node-A", "node-B"}; - auto results = idx.BatchLookupForRouteGet({"k"}, excludes, std::chrono::seconds{10}); - ASSERT_EQ(results.size(), 1u); - EXPECT_TRUE(results[0].empty()); - - auto after = idx.GetMetrics("k"); - ASSERT_TRUE(after.has_value()); - EXPECT_EQ(after->access_count, before->access_count); - EXPECT_EQ(idx.FindEvictionCandidates(overloaded).size(), 2u); -} - -// Some replicas excluded but not all -> returned slot has only the -// survivors, access_count IS bumped, lease IS granted. 
-TEST(GlobalBlockIndexEvents, BatchLookupForRouteGetFiltersAndLeasesWhenSomeReplicasSurvive) { - GlobalBlockIndex idx; - idx.ApplyEvents("node-A", {Add("k", TierType::DRAM, 100)}); - idx.ApplyEvents("node-B", {Add("k", TierType::DRAM, 200)}); - - auto before = idx.GetMetrics("k"); - ASSERT_TRUE(before.has_value()); - - std::unordered_set excludes{"node-A"}; - auto results = idx.BatchLookupForRouteGet({"k"}, excludes, std::chrono::seconds{10}); - ASSERT_EQ(results.size(), 1u); - ASSERT_EQ(results[0].size(), 1u); - EXPECT_EQ(results[0][0].node_id, "node-B"); - - auto after = idx.GetMetrics("k"); - ASSERT_TRUE(after.has_value()); - EXPECT_EQ(after->access_count, before->access_count + 1); -} - -} // namespace mori::umbp diff --git a/tests/cpp/umbp/distributed/test_ssd_reliability.cpp b/tests/cpp/umbp/distributed/test_ssd_reliability.cpp index e7f1a37ac..81e927baa 100644 --- a/tests/cpp/umbp/distributed/test_ssd_reliability.cpp +++ b/tests/cpp/umbp/distributed/test_ssd_reliability.cpp @@ -25,7 +25,7 @@ // * the unified owned-location event source merges DRAM + SSD into one // snapshot/delta (so a heartbeat full-sync ships SSD owned keys too); // * a local SSD eviction's REMOVE SSD event converges the master -// GlobalBlockIndex while leaving the DRAM bucket intact; +// metadata store while leaving the DRAM bucket intact; // * tier-priority RouteGet over the real index picks DRAM, then SSD once the // DRAM replica is removed; // * crash-restart leftover is discarded at startup; @@ -33,18 +33,21 @@ // // (copy-pin vs DRAM evict is covered by test_ssd_copy_pipeline's // EvictBlockedWhilePinnedThenAllowedAfterRelease; seq-gap -> full-sync by -// test_global_block_index_events' ClientRegistryHeartbeat.SeqGap*.) +// test_in_memory_master_metadata_store's heartbeat SeqGap cases.) 
#include +#include #include +#include #include #include #include #include +#include #include #include -#include "umbp/distributed/master/global_block_index.h" +#include "umbp/distributed/master/in_memory_master_metadata_store.h" #include "umbp/distributed/peer/owned_location_source.h" #include "umbp/distributed/peer/peer_ssd_manager.h" #include "umbp/distributed/routing/route_get_strategy.h" @@ -138,6 +141,29 @@ int CountTier(const std::vector& events, KvEvent::Kind kind, TierType t return n; } +// Under the merged store a block location can only be created through an +// ApplyHeartbeat from a registered (alive) node — locations no longer exist +// independently of a client record the way the old GlobalBlockIndex allowed. +// These helpers register a node once and apply heartbeat deltas with an +// ascending seq, standing in for the old GlobalBlockIndex::ApplyEvents. +constexpr uint64_t kGB = 1024ULL * 1024 * 1024; + +std::map MakeCaps() { + std::map caps; + caps[TierType::DRAM] = {8 * kGB, 8 * kGB}; + caps[TierType::SSD] = {8 * kGB, 8 * kGB}; + return caps; +} + +ClientRegistration MakeReg(const std::string& node_id) { + ClientRegistration reg; + reg.node_id = node_id; + reg.node_address = node_id + ":1"; + reg.peer_address = node_id + ":peer"; + reg.tier_capacities = MakeCaps(); + return reg; +} + // A canned owned-location source standing in for PeerDramAllocator so the // aggregation can be tested without standing up a DRAM allocator. class FakeOwnedSource : public OwnedLocationSource { @@ -194,58 +220,86 @@ TEST(SsdReliability, DeltaDrainMergesDramAndSsdEvents) { } // --------------------------------------------------------------------------- -// SSD local eviction -> REMOVE SSD -> master GlobalBlockIndex converges. +// SSD local eviction -> REMOVE SSD -> master metadata store converges. 
// --------------------------------------------------------------------------- // A key mirrored on DRAM + SSD of one owner: a local SSD eviction emits -// REMOVE SSD, and applying that to the master index drops only the SSD bucket +// REMOVE SSD, and applying that to the master store drops only the SSD bucket // (the DRAM replica, owned independently, stays routable). TEST(SsdReliability, LocalSsdEvictionRemoveConvergesMasterIndex) { - GlobalBlockIndex idx; + InMemoryMasterMetadataStore store; + const auto now = std::chrono::system_clock::now(); + ASSERT_TRUE(store.RegisterClient(MakeReg("owner"), now, std::chrono::seconds{30})); auto be = std::make_unique(1'000'000); PeerSsdManager ssd(std::move(be), 0.9, 0.7); // DRAM replica added independently (a DRAM owner would emit this). - idx.ApplyEvents("owner", {KvEvent{KvEvent::Kind::ADD, "k", TierType::DRAM, 100}}); - // SSD copy lands -> ADD SSD drained into the index. + ASSERT_EQ(store + .ApplyHeartbeat("owner", /*seq=*/1, now, MakeCaps(), + {KvEvent{KvEvent::Kind::ADD, "k", TierType::DRAM, 100}}, + /*is_full_sync=*/false) + .status, + HeartbeatResult::APPLIED); + // SSD copy lands -> ADD SSD drained into the store. ASSERT_TRUE(ssd.Write("k", OneSeg(std::string(100, 'x')), 100)); - idx.ApplyEvents("owner", ssd.DrainPendingEvents()); + ASSERT_EQ(store + .ApplyHeartbeat("owner", /*seq=*/2, now, MakeCaps(), ssd.DrainPendingEvents(), + /*is_full_sync=*/false) + .status, + HeartbeatResult::APPLIED); - auto both = idx.Lookup("k"); + auto both = store.LookupBlock("k"); ASSERT_TRUE(HasLoc(both, "owner", TierType::DRAM)); ASSERT_TRUE(HasLoc(both, "owner", TierType::SSD)); - // Local SSD eviction -> REMOVE SSD -> index drops only the SSD bucket. + // Local SSD eviction -> REMOVE SSD -> store drops only the SSD bucket. 
ASSERT_TRUE(ssd.Evict("k")); auto ssd_events = ssd.DrainPendingEvents(); EXPECT_EQ(CountTier(ssd_events, KvEvent::Kind::REMOVE, TierType::SSD), 1); - idx.ApplyEvents("owner", ssd_events); + ASSERT_EQ(store + .ApplyHeartbeat("owner", /*seq=*/3, now, MakeCaps(), ssd_events, + /*is_full_sync=*/false) + .status, + HeartbeatResult::APPLIED); - auto after = idx.Lookup("k"); + auto after = store.LookupBlock("k"); EXPECT_TRUE(HasLoc(after, "owner", TierType::DRAM)); // DRAM replica still routable EXPECT_FALSE(HasLoc(after, "owner", TierType::SSD)); // SSD bucket converged away } // --------------------------------------------------------------------------- -// Tier-priority RouteGet over the real index: DRAM first, SSD after evict. +// Tier-priority RouteGet over the real store: DRAM first, SSD after evict. // --------------------------------------------------------------------------- TEST(SsdReliability, TierPriorityRoutesDramThenSsdAfterDramRemoved) { - GlobalBlockIndex idx; - idx.ApplyEvents("owner", {KvEvent{KvEvent::Kind::ADD, "k", TierType::DRAM, 100}, - KvEvent{KvEvent::Kind::ADD, "k", TierType::SSD, 100}}); + InMemoryMasterMetadataStore store; + const auto now = std::chrono::system_clock::now(); + const std::unordered_set kNoExclude; + ASSERT_TRUE(store.RegisterClient(MakeReg("owner"), now, std::chrono::seconds{30})); + ASSERT_EQ(store + .ApplyHeartbeat("owner", /*seq=*/1, now, MakeCaps(), + {KvEvent{KvEvent::Kind::ADD, "k", TierType::DRAM, 100}, + KvEvent{KvEvent::Kind::ADD, "k", TierType::SSD, 100}}, + /*is_full_sync=*/false) + .status, + HeartbeatResult::APPLIED); TierPriorityRouteGetStrategy strategy; - auto locs = idx.BatchLookupForRouteGet({"k"}, {}, std::chrono::seconds{10}); + auto locs = store.BatchLookupBlockForRouteGet({"k"}, kNoExclude, now, std::chrono::seconds{10}); ASSERT_EQ(locs.size(), 1u); auto dram_pick = strategy.Select(locs[0], "reader"); EXPECT_EQ(dram_pick.tier, TierType::DRAM) << "prefers the fast DRAM replica"; // DRAM evicted -> only the 
SSD bucket remains -> RouteGet must serve from SSD. - idx.ApplyEvents("owner", {KvEvent{KvEvent::Kind::REMOVE, "k", TierType::DRAM, 0}}); - auto locs2 = idx.BatchLookupForRouteGet({"k"}, {}, std::chrono::seconds{10}); + ASSERT_EQ(store + .ApplyHeartbeat("owner", /*seq=*/2, now, MakeCaps(), + {KvEvent{KvEvent::Kind::REMOVE, "k", TierType::DRAM, 0}}, + /*is_full_sync=*/false) + .status, + HeartbeatResult::APPLIED); + auto locs2 = store.BatchLookupBlockForRouteGet({"k"}, kNoExclude, now, std::chrono::seconds{10}); ASSERT_EQ(locs2.size(), 1u); auto ssd_pick = strategy.Select(locs2[0], "reader"); EXPECT_EQ(ssd_pick.tier, TierType::SSD) << "falls back to the surviving SSD replica"; diff --git a/tests/cpp/umbp/distributed/test_umbp_tags.cpp b/tests/cpp/umbp/distributed/test_umbp_tags.cpp index 51b965198..3a0084c32 100644 --- a/tests/cpp/umbp/distributed/test_umbp_tags.cpp +++ b/tests/cpp/umbp/distributed/test_umbp_tags.cpp @@ -21,9 +21,9 @@ // SOFTWARE. // Tests for the client-tag feature: // -// Suite 1 — ClientRegistryTagsTest -// Unit tests directly on ClientRegistry: verify tags are stored on -// RegisterClient and returned verbatim by GetClientTags. +// Suite 1 — ClientTagsTest +// Unit tests directly on InMemoryMasterMetadataStore: verify tags are stored +// on RegisterClient and returned verbatim by GetClientTags. 
// // Suite 2 — MasterClientTagsE2ETest // Integration test: MasterClient registers with tags via gRPC, the real @@ -53,7 +53,7 @@ #include "umbp.grpc.pb.h" #include "umbp/distributed/config.h" -#include "umbp/distributed/master/client_registry.h" +#include "umbp/distributed/master/in_memory_master_metadata_store.h" #include "umbp/distributed/master/master_client.h" #include "umbp/distributed/master/master_server.h" #include "umbp/distributed/types.h" @@ -94,54 +94,77 @@ static bool WaitFor(std::function pred, std::chrono::milliseconds timeou } // --------------------------------------------------------------------------- -// Suite 1: ClientRegistry unit tests (no gRPC) +// Suite 1: InMemoryMasterMetadataStore unit tests (no gRPC) // --------------------------------------------------------------------------- -TEST(ClientRegistryTagsTest, TagsStoredOnRegister) { - ClientRegistry reg(ClientRegistryConfig{}); +// Build a registration for `node_id` carrying `tags`. The merged store creates +// client records through RegisterClient(ClientRegistration, now, stale_after), +// replacing the old ClientRegistry::RegisterClient(node, addr, caps, ...) form. 
+ClientRegistration MakeReg(const std::string& node_id, const std::string& node_address, + std::vector tags = {}) { + ClientRegistration reg; + reg.node_id = node_id; + reg.node_address = node_address; + reg.tags = std::move(tags); + return reg; +} + +TEST(ClientTagsTest, TagsStoredOnRegister) { + InMemoryMasterMetadataStore store; + const auto now = std::chrono::system_clock::now(); const std::vector tags = {"sgl_role=prefill", "env=test"}; - ASSERT_TRUE(reg.RegisterClient("n1", "127.0.0.1:9001", {}, /*peer=*/"", - /*engine=*/{}, tags)); - EXPECT_EQ(reg.GetClientTags("n1"), tags); + ASSERT_TRUE( + store.RegisterClient(MakeReg("n1", "127.0.0.1:9001", tags), now, std::chrono::seconds{30})); + EXPECT_EQ(store.GetClientTags("n1"), tags); } -TEST(ClientRegistryTagsTest, EmptyTagsReturnedForUnknownNode) { - ClientRegistry reg(ClientRegistryConfig{}); - EXPECT_TRUE(reg.GetClientTags("ghost").empty()); +TEST(ClientTagsTest, EmptyTagsReturnedForUnknownNode) { + InMemoryMasterMetadataStore store; + EXPECT_TRUE(store.GetClientTags("ghost").empty()); } -TEST(ClientRegistryTagsTest, EmptyTagsWhenNoneProvided) { - ClientRegistry reg(ClientRegistryConfig{}); - ASSERT_TRUE(reg.RegisterClient("n1", "127.0.0.1:9002", {})); - EXPECT_TRUE(reg.GetClientTags("n1").empty()); +TEST(ClientTagsTest, EmptyTagsWhenNoneProvided) { + InMemoryMasterMetadataStore store; + const auto now = std::chrono::system_clock::now(); + ASSERT_TRUE(store.RegisterClient(MakeReg("n1", "127.0.0.1:9002"), now, std::chrono::seconds{30})); + EXPECT_TRUE(store.GetClientTags("n1").empty()); } -TEST(ClientRegistryTagsTest, TagsUnchangedByHeartbeat) { - ClientRegistry reg(ClientRegistryConfig{}); +TEST(ClientTagsTest, TagsUnchangedByHeartbeat) { + InMemoryMasterMetadataStore store; + const auto now = std::chrono::system_clock::now(); const std::vector tags = {"sgl_role=decode"}; - ASSERT_TRUE(reg.RegisterClient("n1", "127.0.0.1:9003", {}, "", {}, tags)); + ASSERT_TRUE( + store.RegisterClient(MakeReg("n1", 
"127.0.0.1:9003", tags), now, std::chrono::seconds{30})); - uint64_t acked = 0; - bool request_full_sync = false; - reg.Heartbeat("n1", {}, {}, /*is_full_sync=*/false, 0, &acked, &request_full_sync); + ASSERT_EQ(store + .ApplyHeartbeat("n1", /*seq=*/1, now, /*caps=*/{}, /*events=*/{}, + /*is_full_sync=*/false) + .status, + HeartbeatResult::APPLIED); - EXPECT_EQ(reg.GetClientTags("n1"), tags); + EXPECT_EQ(store.GetClientTags("n1"), tags); } -TEST(ClientRegistryTagsTest, TagsClearedAfterUnregister) { - ClientRegistry reg(ClientRegistryConfig{}); - ASSERT_TRUE(reg.RegisterClient("n1", "127.0.0.1:9004", {}, "", {}, {"sgl_role=prefill"})); - reg.UnregisterClient("n1"); - EXPECT_TRUE(reg.GetClientTags("n1").empty()); +TEST(ClientTagsTest, TagsClearedAfterUnregister) { + InMemoryMasterMetadataStore store; + const auto now = std::chrono::system_clock::now(); + ASSERT_TRUE(store.RegisterClient(MakeReg("n1", "127.0.0.1:9004", {"sgl_role=prefill"}), now, + std::chrono::seconds{30})); + store.UnregisterClient("n1"); + EXPECT_TRUE(store.GetClientTags("n1").empty()); } -TEST(ClientRegistryTagsTest, MultipleNodesHaveIndependentTags) { - ClientRegistry reg(ClientRegistryConfig{}); - ASSERT_TRUE(reg.RegisterClient("p", "127.0.0.1:9005", {}, "", {}, {"sgl_role=prefill"})); - ASSERT_TRUE(reg.RegisterClient("d", "127.0.0.1:9006", {}, "", {}, {"sgl_role=decode"})); +TEST(ClientTagsTest, MultipleNodesHaveIndependentTags) { + InMemoryMasterMetadataStore store; + const auto now = std::chrono::system_clock::now(); + ASSERT_TRUE(store.RegisterClient(MakeReg("p", "127.0.0.1:9005", {"sgl_role=prefill"}), now, + std::chrono::seconds{30})); + ASSERT_TRUE(store.RegisterClient(MakeReg("d", "127.0.0.1:9006", {"sgl_role=decode"}), now, + std::chrono::seconds{30})); - EXPECT_EQ(reg.GetClientTags("p"), std::vector{"sgl_role=prefill"}); - EXPECT_EQ(reg.GetClientTags("d"), std::vector{"sgl_role=decode"}); + EXPECT_EQ(store.GetClientTags("p"), std::vector{"sgl_role=prefill"}); + 
EXPECT_EQ(store.GetClientTags("d"), std::vector{"sgl_role=decode"}); } // --------------------------------------------------------------------------- @@ -282,7 +305,7 @@ TEST_F(MasterClientTagsE2ETest, EmptyTagsSentWhenNoneConfigured) { // Verify that MasterClient::AddCounter forwards labels to the server and // that a capturing server would see the node label. The real tag-injection // into ReportMetrics base labels is exercised in the real-MasterServer suite -// below because CapturingMasterService doesn't run ClientRegistry. +// below because CapturingMasterService doesn't run the master metadata store. TEST_F(MasterClientTagsE2ETest, ReportMetricsCarriesNodeId) { setenv("UMBP_METRICS_REPORT_INTERVAL_MS", "50", 1); client_ = std::make_unique(MakeConfig({"sgl_role=decode"})); From 0732c532e276502a253f204e010cc26bdbbd66b9 Mon Sep 17 00:00:00 2001 From: TianDi101 Date: Mon, 15 Jun 2026 15:19:39 +0000 Subject: [PATCH 8/8] test(umbp): drop relocated src/umbp/tests copies; ignore .rocprofv3 in docker The UMBP C++ tests were consolidated under tests/cpp/umbp/distributed (commit 00652faf); remove the now-duplicate originals left behind under src/umbp/tests (their CMakeLists was already deleted in that commit, so these files were orphaned and unbuilt). Also add .rocprofv3/ to .dockerignore so profiler output stays out of the docker build context. 
Co-Authored-By: Claude Opus 4.8 --- .dockerignore | 1 + .../master_metadata_store_self_compile.cpp | 27 - src/umbp/tests/mock_master_metadata_store.h | 125 --- src/umbp/tests/test_client_registry.cpp | 289 ------ .../test_client_registry_external_kv.cpp | 55 -- .../tests/test_external_kv_block_index.cpp | 103 -- src/umbp/tests/test_external_kv_hit_index.cpp | 116 --- .../tests/test_global_block_index_events.cpp | 505 ---------- .../test_in_memory_master_metadata_store.cpp | 693 -------------- .../test_master_metadata_store_interface.cpp | 133 --- src/umbp/tests/test_peer_dram_allocator.cpp | 899 ------------------ src/umbp/tests/test_peer_ssd_eviction.cpp | 406 -------- src/umbp/tests/test_peer_ssd_manager.cpp | 233 ----- src/umbp/tests/test_peer_ssd_read_rpc.cpp | 258 ----- src/umbp/tests/test_router_dedup.cpp | 126 --- src/umbp/tests/test_ssd_copy_pipeline.cpp | 341 ------- src/umbp/tests/test_ssd_read_lease_gating.cpp | 97 -- src/umbp/tests/test_ssd_reliability.cpp | 345 ------- .../tests/test_tier_priority_route_get.cpp | 112 --- 19 files changed, 1 insertion(+), 4863 deletions(-) delete mode 100644 src/umbp/tests/master_metadata_store_self_compile.cpp delete mode 100644 src/umbp/tests/mock_master_metadata_store.h delete mode 100644 src/umbp/tests/test_client_registry.cpp delete mode 100644 src/umbp/tests/test_client_registry_external_kv.cpp delete mode 100644 src/umbp/tests/test_external_kv_block_index.cpp delete mode 100644 src/umbp/tests/test_external_kv_hit_index.cpp delete mode 100644 src/umbp/tests/test_global_block_index_events.cpp delete mode 100644 src/umbp/tests/test_in_memory_master_metadata_store.cpp delete mode 100644 src/umbp/tests/test_master_metadata_store_interface.cpp delete mode 100644 src/umbp/tests/test_peer_dram_allocator.cpp delete mode 100644 src/umbp/tests/test_peer_ssd_eviction.cpp delete mode 100644 src/umbp/tests/test_peer_ssd_manager.cpp delete mode 100644 src/umbp/tests/test_peer_ssd_read_rpc.cpp delete mode 100644 
src/umbp/tests/test_router_dedup.cpp delete mode 100644 src/umbp/tests/test_ssd_copy_pipeline.cpp delete mode 100644 src/umbp/tests/test_ssd_read_lease_gating.cpp delete mode 100644 src/umbp/tests/test_ssd_reliability.cpp delete mode 100644 src/umbp/tests/test_tier_priority_route_get.cpp diff --git a/.dockerignore b/.dockerignore index 3f691b5bc..e0d9d8be6 100644 --- a/.dockerignore +++ b/.dockerignore @@ -1,6 +1,7 @@ # Keep build outputs, VCS data, and caches out of the docker build context. build/ .git/ +.rocprofv3/ **/__pycache__/ **/*.egg-info/ **/*.pyc diff --git a/src/umbp/tests/master_metadata_store_self_compile.cpp b/src/umbp/tests/master_metadata_store_self_compile.cpp deleted file mode 100644 index e8e016ce8..000000000 --- a/src/umbp/tests/master_metadata_store_self_compile.cpp +++ /dev/null @@ -1,27 +0,0 @@ -// Copyright © Advanced Micro Devices, Inc. All rights reserved. -// -// MIT License -// -// Permission is hereby granted, free of charge, to any person obtaining a copy -// of this software and associated documentation files (the "Software"), to deal -// in the Software without restriction, including without limitation the rights -// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -// copies of the Software, and to permit persons to whom the Software is -// furnished to do so, subject to the following conditions: -// -// The above copyright notice and this permission notice shall be included in all -// copies or substantial portions of the Software. -// -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE -// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -// SOFTWARE. - -// Phase 1 "header self-compiles" gate: this translation unit includes ONLY the -// interface header (which in turn includes types.h). If it compiles, the header -// is self-contained — no missing includes or forward declarations. Deliberately -// has no other includes and no symbols of its own. -#include "umbp/distributed/master/master_metadata_store.h" diff --git a/src/umbp/tests/mock_master_metadata_store.h b/src/umbp/tests/mock_master_metadata_store.h deleted file mode 100644 index ae62ea4d7..000000000 --- a/src/umbp/tests/mock_master_metadata_store.h +++ /dev/null @@ -1,125 +0,0 @@ -// Copyright © Advanced Micro Devices, Inc. All rights reserved. -// -// MIT License -// -// Permission is hereby granted, free of charge, to any person obtaining a copy -// of this software and associated documentation files (the "Software"), to deal -// in the Software without restriction, including without limitation the rights -// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -// copies of the Software, and to permit persons to whom the Software is -// furnished to do so, subject to the following conditions: -// -// The above copyright notice and this permission notice shall be included in all -// copies or substantial portions of the Software. -// -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE -// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -// SOFTWARE. - -// GMock mock for IMasterMetadataStore. -// -// Phase 1 use: instantiation gate. If this type compiles and instantiates, -// every pure-virtual on the interface is overridden with a well-typed -// signature — proving the contract has no orphaned/ill-typed methods. -// -// Reused in Phase 3 (consumer-integration) to assert that each rewired -// consumer (Router / EvictionManager / UMBPMasterServiceImpl handlers) calls -// the right store method with correctly-translated arguments. -#pragma once - -#include - -#include -#include -#include -#include -#include -#include -#include -#include - -#include "umbp/distributed/master/master_metadata_store.h" -#include "umbp/distributed/types.h" - -namespace mori::umbp { - -class MockMasterMetadataStore : public IMasterMetadataStore { - public: - // Aliases for types whose commas would otherwise break the MOCK_METHOD macro - // parser (it splits the argument list on top-level commas). 
- using CapsMap = std::map; - using BudgetMap = std::map; - using LruResult = std::map>; - using LocationBatch = std::vector>; - - // --- Cross-store writes --- - MOCK_METHOD(bool, RegisterClient, - (const ClientRegistration& registration, std::chrono::system_clock::time_point now, - std::chrono::system_clock::duration stale_after), - (override)); - MOCK_METHOD(void, UnregisterClient, (const std::string& node_id), (override)); - MOCK_METHOD(HeartbeatResult, ApplyHeartbeat, - (const std::string& node_id, uint64_t seq, std::chrono::system_clock::time_point now, - const CapsMap& caps, (const std::vector&)events, bool is_full_sync), - (override)); - MOCK_METHOD(std::vector, ExpireStaleClients, - (std::chrono::system_clock::time_point cutoff), (override)); - - // --- External-KV writes --- - MOCK_METHOD(bool, RegisterExternalKvIfAlive, - (const std::string& node_id, (const std::vector&)hashes, TierType tier), - (override)); - MOCK_METHOD(void, UnregisterExternalKv, - (const std::string& node_id, (const std::vector&)hashes, TierType tier), - (override)); - MOCK_METHOD(void, UnregisterExternalKvByTier, (const std::string& node_id, TierType tier), - (override)); - MOCK_METHOD(void, UnregisterExternalKvByNode, (const std::string& node_id), (override)); - MOCK_METHOD(std::size_t, GarbageCollectHits, (std::chrono::system_clock::time_point cutoff), - (override)); - - // --- Block reads --- - MOCK_METHOD(std::vector, LookupBlock, (const std::string& key), (const, override)); - MOCK_METHOD(std::vector, LookupBlockForRouteGet, - (const std::string& key, (const std::unordered_set&)exclude_nodes, - std::chrono::system_clock::time_point now, - std::chrono::system_clock::duration lease_duration), - (override)); - MOCK_METHOD(LocationBatch, BatchLookupBlockForRouteGet, - ((const std::vector&)keys, - (const std::unordered_set&)exclude_nodes, - std::chrono::system_clock::time_point now, - std::chrono::system_clock::duration lease_duration), - (override)); - MOCK_METHOD(std::vector, 
BatchExistsBlock, ((const std::vector&)keys), - (const, override)); - MOCK_METHOD(LruResult, EnumerateLruForEviction, - (const BudgetMap& bytes_to_free, std::chrono::system_clock::time_point now), - (const, override)); - - // --- Client reads --- - MOCK_METHOD(std::optional, GetClient, (const std::string& node_id), - (const, override)); - MOCK_METHOD(bool, IsClientAlive, (const std::string& node_id), (const, override)); - MOCK_METHOD(std::optional, GetPeerAddress, (const std::string& node_id), - (const, override)); - MOCK_METHOD(std::vector, ListAliveClients, (), (const, override)); - MOCK_METHOD(std::size_t, AliveClientCount, (), (const, override)); - MOCK_METHOD(std::vector, GetClientTags, (const std::string& node_id), - (const, override)); - - // --- External-KV reads --- - MOCK_METHOD(std::vector, MatchExternalKv, - ((const std::vector&)hashes, bool count_as_hit, - std::chrono::system_clock::time_point now), - (override)); - MOCK_METHOD(std::vector, GetExternalKvHitCounts, - ((const std::vector&)hashes), (const, override)); - MOCK_METHOD(std::size_t, GetExternalKvCount, (const std::string& node_id), (const, override)); -}; - -} // namespace mori::umbp diff --git a/src/umbp/tests/test_client_registry.cpp b/src/umbp/tests/test_client_registry.cpp deleted file mode 100644 index adb363a30..000000000 --- a/src/umbp/tests/test_client_registry.cpp +++ /dev/null @@ -1,289 +0,0 @@ -// Copyright © Advanced Micro Devices, Inc. All rights reserved. 
-// -// MIT License -// -// Permission is hereby granted, free of charge, to any person obtaining a copy -// of this software and associated documentation files (the "Software"), to deal -// in the Software without restriction, including without limitation the rights -// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -// copies of the Software, and to permit persons to whom the Software is -// furnished to do so, subject to the following conditions: -// -// The above copyright notice and this permission notice shall be included in all -// copies or substantial portions of the Software. -// -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -// SOFTWARE. -// -// Membership-ledger unit tests for ClientRegistry: registration / re- -// registration semantics, capacity round-trip, heartbeat status, and the -// background reaper that expires silent nodes. These exercise the registry -// in isolation (no GlobalBlockIndex / RPC), complementing the index- and -// external-kv-focused suites. In the master-as-advisor design the registry -// stores only membership + the capacities a peer last reported, so the -// assertions below check reported values verbatim rather than any allocator- -// derived view. 
-#include - -#include -#include -#include -#include -#include - -#include "umbp/distributed/master/client_registry.h" -#include "umbp/distributed/types.h" - -namespace mori::umbp { -namespace { - -std::map Caps(uint64_t total, uint64_t available) { - return {{TierType::HBM, TierCapacity{total, available}}}; -} - -const ClientRecord* FindClient(const std::vector& clients, const std::string& id) { - for (const auto& c : clients) { - if (c.node_id == id) return &c; - } - return nullptr; -} - -// Drive the current 7-arg Heartbeat with no events — the membership-keepalive -// path the reaper cares about. -ClientStatus Beat(ClientRegistry& registry, const std::string& node_id, - const std::map& caps) { - uint64_t acked = 0; - bool need_full = false; - return registry.Heartbeat(node_id, caps, /*bundles=*/{}, /*is_full_sync=*/false, - /*delta_seq_baseline=*/0, &acked, &need_full); -} - -template -bool WaitUntil(Predicate&& predicate, std::chrono::milliseconds timeout, - std::chrono::milliseconds poll = std::chrono::milliseconds(100)) { - const auto deadline = std::chrono::steady_clock::now() + timeout; - while (std::chrono::steady_clock::now() < deadline) { - if (predicate()) return true; - std::this_thread::sleep_for(poll); - } - return predicate(); -} - -// heartbeat_ttl * max_missed_heartbeats == 1s, so a node ages out ~1s after -// its last heartbeat. reaper_interval keeps the sweep responsive. 
-ClientRegistryConfig FastExpiryConfig() { - ClientRegistryConfig config; - config.heartbeat_ttl = std::chrono::seconds(1); - config.max_missed_heartbeats = 1; - config.reaper_interval = std::chrono::seconds(1); - return config; -} - -} // namespace - -// --- Registration / membership ---------------------------------------------- - -TEST(ClientRegistryTest, RegisterSingle) { - ClientRegistry registry(ClientRegistryConfig{}); - EXPECT_TRUE(registry.RegisterClient("node-1", "127.0.0.1:8080", Caps(80, 64))); - EXPECT_EQ(registry.ClientCount(), 1u); - EXPECT_TRUE(registry.IsClientAlive("node-1")); -} - -TEST(ClientRegistryTest, RegisterMultiple) { - ClientRegistry registry(ClientRegistryConfig{}); - EXPECT_TRUE(registry.RegisterClient("c1", "127.0.0.1:1001", Caps(100, 90))); - EXPECT_TRUE(registry.RegisterClient("c2", "127.0.0.1:1002", Caps(110, 80))); - EXPECT_TRUE(registry.RegisterClient("c3", "127.0.0.1:1003", Caps(120, 70))); - - EXPECT_EQ(registry.ClientCount(), 3u); - EXPECT_TRUE(registry.IsClientAlive("c1")); - EXPECT_TRUE(registry.IsClientAlive("c2")); - EXPECT_TRUE(registry.IsClientAlive("c3")); -} - -TEST(ClientRegistryTest, GetAliveClientsReportsMembershipAndCapacities) { - ClientRegistry registry(ClientRegistryConfig{}); - EXPECT_TRUE(registry.RegisterClient("c1", "host-a:8080", Caps(80, 64))); - EXPECT_TRUE(registry.RegisterClient("c2", "host-b:8080", Caps(96, 32))); - - const auto clients = registry.GetAliveClients(); - ASSERT_EQ(clients.size(), 2u); - - const ClientRecord* c1 = FindClient(clients, "c1"); - const ClientRecord* c2 = FindClient(clients, "c2"); - ASSERT_NE(c1, nullptr); - ASSERT_NE(c2, nullptr); - - EXPECT_EQ(c1->node_address, "host-a:8080"); - EXPECT_EQ(c2->node_address, "host-b:8080"); - EXPECT_EQ(c1->status, ClientStatus::ALIVE); - EXPECT_EQ(c2->status, ClientStatus::ALIVE); - - // Master stores the peer-reported capacities verbatim. 
- ASSERT_TRUE(c1->tier_capacities.count(TierType::HBM) > 0); - ASSERT_TRUE(c2->tier_capacities.count(TierType::HBM) > 0); - EXPECT_EQ(c1->tier_capacities.at(TierType::HBM).total_bytes, 80u); - EXPECT_EQ(c1->tier_capacities.at(TierType::HBM).available_bytes, 64u); - EXPECT_EQ(c2->tier_capacities.at(TierType::HBM).available_bytes, 32u); -} - -TEST(ClientRegistryTest, ReRegisterAliveRejected) { - ClientRegistry registry(ClientRegistryConfig{}); - EXPECT_TRUE(registry.RegisterClient("c1", "addr-1", Caps(80, 64))); - // A live node may not silently take over its own id with a new address. - EXPECT_FALSE(registry.RegisterClient("c1", "addr-2", Caps(80, 32))); - - EXPECT_EQ(registry.ClientCount(), 1u); - const auto clients = registry.GetAliveClients(); - ASSERT_EQ(clients.size(), 1u); - EXPECT_EQ(clients[0].node_address, "addr-1"); // original record untouched -} - -TEST(ClientRegistryTest, ReRegisterExpiredAllowed) { - // No reaper here: the aged-out branch in RegisterClient (now - last_heartbeat - // > expiry) must accept the re-registration on its own. 
- ClientRegistry registry(FastExpiryConfig()); - EXPECT_TRUE(registry.RegisterClient("c1", "addr-1", Caps(80, 64))); - - const bool reregistered = - WaitUntil([®istry] { return registry.RegisterClient("c1", "addr-2", Caps(80, 32)); }, - std::chrono::seconds(5)); - EXPECT_TRUE(reregistered); - - EXPECT_EQ(registry.ClientCount(), 1u); - const auto clients = registry.GetAliveClients(); - ASSERT_EQ(clients.size(), 1u); - EXPECT_EQ(clients[0].node_address, "addr-2"); // new address wins - EXPECT_EQ(clients[0].status, ClientStatus::ALIVE); -} - -// --- Unregister -------------------------------------------------------------- - -TEST(ClientRegistryTest, UnregisterExisting) { - ClientRegistry registry(ClientRegistryConfig{}); - EXPECT_TRUE(registry.RegisterClient("c1", "addr", Caps(80, 64))); - - registry.UnregisterClient("c1"); - EXPECT_EQ(registry.ClientCount(), 0u); - EXPECT_FALSE(registry.IsClientAlive("c1")); -} - -TEST(ClientRegistryTest, UnregisterUnknownIsNoop) { - ClientRegistry registry(ClientRegistryConfig{}); - EXPECT_TRUE(registry.RegisterClient("c1", "addr", Caps(80, 64))); - - registry.UnregisterClient("nonexistent"); - EXPECT_EQ(registry.ClientCount(), 1u); - EXPECT_TRUE(registry.IsClientAlive("c1")); -} - -TEST(ClientRegistryTest, UnregisterTwiceIsSafe) { - ClientRegistry registry(ClientRegistryConfig{}); - EXPECT_TRUE(registry.RegisterClient("c1", "addr", Caps(80, 64))); - - registry.UnregisterClient("c1"); - registry.UnregisterClient("c1"); - EXPECT_EQ(registry.ClientCount(), 0u); -} - -// --- Heartbeat --------------------------------------------------------------- - -TEST(ClientRegistryTest, HeartbeatAliveReplacesCapacities) { - ClientRegistry registry(ClientRegistryConfig{}); - EXPECT_TRUE(registry.RegisterClient("c1", "addr", Caps(80, 64))); - - EXPECT_EQ(Beat(registry, "c1", Caps(80, 16)), ClientStatus::ALIVE); - EXPECT_TRUE(registry.IsClientAlive("c1")); - - const auto clients = registry.GetAliveClients(); - ASSERT_EQ(clients.size(), 1u); - 
ASSERT_TRUE(clients[0].tier_capacities.count(TierType::HBM) > 0); - // The most recent heartbeat's capacities replace the stored values. - EXPECT_EQ(clients[0].tier_capacities.at(TierType::HBM).available_bytes, 16u); -} - -TEST(ClientRegistryTest, HeartbeatUnknownReturnsUnknown) { - ClientRegistry registry(ClientRegistryConfig{}); - EXPECT_EQ(Beat(registry, "nonexistent", Caps(80, 48)), ClientStatus::UNKNOWN); -} - -// --- Reaper ------------------------------------------------------------------ - -TEST(ClientRegistryTest, ReaperExpiresIdleClient) { - ClientRegistry registry(FastExpiryConfig()); - EXPECT_TRUE(registry.RegisterClient("c1", "addr", Caps(80, 64))); - registry.StartReaper(); - - const bool reaped = - WaitUntil([®istry] { return registry.ClientCount() == 0; }, std::chrono::seconds(6)); - - registry.StopReaper(); - EXPECT_TRUE(reaped); - EXPECT_FALSE(registry.IsClientAlive("c1")); -} - -TEST(ClientRegistryTest, ReaperKeepsClientAliveWithHeartbeats) { - ClientRegistry registry(FastExpiryConfig()); - EXPECT_TRUE(registry.RegisterClient("c1", "addr", Caps(80, 64))); - registry.StartReaper(); - - const auto start = std::chrono::steady_clock::now(); - while (std::chrono::steady_clock::now() - start < std::chrono::seconds(3)) { - EXPECT_EQ(Beat(registry, "c1", Caps(80, 48)), ClientStatus::ALIVE); - std::this_thread::sleep_for(std::chrono::milliseconds(300)); - } - - registry.StopReaper(); - EXPECT_EQ(registry.ClientCount(), 1u); - EXPECT_TRUE(registry.IsClientAlive("c1")); -} - -TEST(ClientRegistryTest, ReaperSelectiveExpiry) { - ClientRegistry registry(FastExpiryConfig()); - EXPECT_TRUE(registry.RegisterClient("c1", "addr-1", Caps(80, 64))); - EXPECT_TRUE(registry.RegisterClient("c2", "addr-2", Caps(80, 64))); - registry.StartReaper(); - - // Keep c1 fed; let c2 go silent. c2 must be reaped while c1 survives. 
- const bool reached = WaitUntil( - [®istry] { - Beat(registry, "c1", Caps(80, 48)); - return registry.IsClientAlive("c1") && !registry.IsClientAlive("c2"); - }, - std::chrono::seconds(6), std::chrono::milliseconds(200)); - - registry.StopReaper(); - EXPECT_TRUE(reached); - EXPECT_TRUE(registry.IsClientAlive("c1")); - EXPECT_FALSE(registry.IsClientAlive("c2")); -} - -TEST(ClientRegistryTest, StopReaperWhenNeverStarted) { - ClientRegistry registry(ClientRegistryConfig{}); - registry.StopReaper(); // must not hang or crash - SUCCEED(); -} - -TEST(ClientRegistryTest, StartStopReaperMultiple) { - ClientRegistry registry(ClientRegistryConfig{}); - registry.StartReaper(); - registry.StopReaper(); - registry.StartReaper(); - registry.StopReaper(); - SUCCEED(); -} - -TEST(ClientRegistryTest, DestructorStopsRunningReaper) { - ClientRegistry registry(ClientRegistryConfig{}); - registry.StartReaper(); - EXPECT_TRUE(registry.RegisterClient("c1", "addr", Caps(80, 64))); - // Falling out of scope must join the reaper thread cleanly. -} - -} // namespace mori::umbp diff --git a/src/umbp/tests/test_client_registry_external_kv.cpp b/src/umbp/tests/test_client_registry_external_kv.cpp deleted file mode 100644 index e232fe58a..000000000 --- a/src/umbp/tests/test_client_registry_external_kv.cpp +++ /dev/null @@ -1,55 +0,0 @@ -// Copyright © Advanced Micro Devices, Inc. All rights reserved. 
-// -// MIT License -// -// Permission is hereby granted, free of charge, to any person obtaining a copy -// of this software and associated documentation files (the "Software"), to deal -// in the Software without restriction, including without limitation the rights -// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -// copies of the Software, and to permit persons to whom the Software is -// furnished to do so, subject to the following conditions: -// -// The above copyright notice and this permission notice shall be included in all -// copies or substantial portions of the Software. -// -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -// SOFTWARE. 
-#include - -#include "umbp/distributed/master/client_registry.h" -#include "umbp/distributed/master/external_kv_block_index.h" -#include "umbp/distributed/master/global_block_index.h" - -namespace mori::umbp { - -TEST(ClientRegistryExternalKv, UnregisterClientClearsBothIndices) { - GlobalBlockIndex global_index; - ExternalKvBlockIndex external_index; - ClientRegistry registry(ClientRegistryConfig{}, global_index, &external_index); - - ASSERT_TRUE(registry.RegisterClient("node-A", "127.0.0.1:9000", {}, "127.0.0.1:9001")); - ASSERT_EQ(global_index.ApplyEvents("node-A", - {KvEvent{KvEvent::Kind::ADD, "owned", TierType::DRAM, 128}}), - 1u); - ASSERT_EQ(external_index.Register("node-A", {"external"}, TierType::DRAM), 1u); - - registry.UnregisterClient("node-A"); - - EXPECT_TRUE(global_index.Lookup("owned").empty()); - EXPECT_TRUE(external_index.Match({"external"}).empty()); -} - -TEST(ClientRegistryExternalKv, UnregisterWithoutExternalIndexDoesNotCrash) { - GlobalBlockIndex global_index; - ClientRegistry registry(ClientRegistryConfig{}, global_index); - - ASSERT_TRUE(registry.RegisterClient("node-A", "127.0.0.1:9000", {}, "127.0.0.1:9001")); - EXPECT_NO_THROW(registry.UnregisterClient("node-A")); -} - -} // namespace mori::umbp diff --git a/src/umbp/tests/test_external_kv_block_index.cpp b/src/umbp/tests/test_external_kv_block_index.cpp deleted file mode 100644 index 18ee654d5..000000000 --- a/src/umbp/tests/test_external_kv_block_index.cpp +++ /dev/null @@ -1,103 +0,0 @@ -// Copyright © Advanced Micro Devices, Inc. All rights reserved. 
-// -// MIT License -// -// Permission is hereby granted, free of charge, to any person obtaining a copy -// of this software and associated documentation files (the "Software"), to deal -// in the Software without restriction, including without limitation the rights -// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -// copies of the Software, and to permit persons to whom the Software is -// furnished to do so, subject to the following conditions: -// -// The above copyright notice and this permission notice shall be included in all -// copies or substantial portions of the Software. -// -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -// SOFTWARE. 
-#include - -#include -#include -#include - -#include "umbp/distributed/master/external_kv_block_index.h" - -namespace mori::umbp { -namespace { - -const ExternalKvBlockIndex::NodeMatch* FindMatch( - const std::vector& matches, const std::string& node_id) { - for (const auto& match : matches) { - if (match.node_id == node_id) return &match; - } - return nullptr; -} - -std::vector Sorted(std::vector values) { - std::sort(values.begin(), values.end()); - return values; -} - -} // namespace - -TEST(ExternalKvBlockIndex, RegisterIsAdditiveAcrossTiersAndCountsMutations) { - ExternalKvBlockIndex index; - - EXPECT_EQ(index.Register("node-A", {"h1"}, TierType::HBM), 1u); - EXPECT_EQ(index.Register("node-A", {"h1"}, TierType::DRAM), 1u); - EXPECT_EQ(index.Register("node-A", {"h1"}, TierType::DRAM), 0u); - - auto matches = index.Match({"h1"}); - ASSERT_EQ(matches.size(), 1u); - EXPECT_EQ(matches[0].MatchedHashCount(), 1u); - EXPECT_EQ(matches[0].hashes_by_tier.at(TierType::HBM), std::vector({"h1"})); - EXPECT_EQ(matches[0].hashes_by_tier.at(TierType::DRAM), std::vector({"h1"})); - EXPECT_EQ(index.GetKvCount("node-A"), 1u); -} - -TEST(ExternalKvBlockIndex, UnregisterRemovesOnlyRequestedTier) { - ExternalKvBlockIndex index; - ASSERT_EQ(index.Register("node-A", {"h1", "h2"}, TierType::HBM), 2u); - ASSERT_EQ(index.Register("node-A", {"h1"}, TierType::DRAM), 1u); - - EXPECT_EQ(index.Unregister("node-A", {"h1", "missing"}, TierType::HBM), 1u); - EXPECT_EQ(index.Unregister("node-A", {"h1"}, TierType::HBM), 0u); - - auto matches = index.Match({"h1", "h2"}); - ASSERT_EQ(matches.size(), 1u); - const auto& match = matches[0]; - EXPECT_EQ(match.hashes_by_tier.at(TierType::DRAM), std::vector({"h1"})); - EXPECT_EQ(match.hashes_by_tier.at(TierType::HBM), std::vector({"h2"})); - EXPECT_EQ(index.GetKvCount("node-A"), 2u); -} - -TEST(ExternalKvBlockIndex, BulkUnregisterByTierAndNode) { - ExternalKvBlockIndex index; - ASSERT_EQ(index.Register("node-A", {"h1", "h2", "h3"}, TierType::DRAM), 3u); 
- ASSERT_EQ(index.Register("node-A", {"h1", "h2"}, TierType::SSD), 2u); - ASSERT_EQ(index.Register("node-B", {"h1"}, TierType::SSD), 1u); - - EXPECT_EQ(index.UnregisterByNodeAtTier("node-A", TierType::SSD), 2u); - auto matches = index.Match({"h1", "h2", "h3"}); - ASSERT_EQ(matches.size(), 2u); - const auto* node_a = FindMatch(matches, "node-A"); - ASSERT_NE(node_a, nullptr); - ASSERT_EQ(node_a->hashes_by_tier.size(), 1u); - EXPECT_EQ(Sorted(node_a->hashes_by_tier.at(TierType::DRAM)), - (std::vector{"h1", "h2", "h3"})); - const auto* node_b = FindMatch(matches, "node-B"); - ASSERT_NE(node_b, nullptr); - EXPECT_EQ(node_b->hashes_by_tier.at(TierType::SSD), std::vector({"h1"})); - - EXPECT_EQ(index.UnregisterByNode("node-A"), 3u); - matches = index.Match({"h1", "h2", "h3"}); - ASSERT_EQ(matches.size(), 1u); - EXPECT_EQ(matches[0].node_id, "node-B"); -} - -} // namespace mori::umbp diff --git a/src/umbp/tests/test_external_kv_hit_index.cpp b/src/umbp/tests/test_external_kv_hit_index.cpp deleted file mode 100644 index 20685f0b7..000000000 --- a/src/umbp/tests/test_external_kv_hit_index.cpp +++ /dev/null @@ -1,116 +0,0 @@ -// Copyright © Advanced Micro Devices, Inc. All rights reserved. -// -// MIT License -// -// Permission is hereby granted, free of charge, to any person obtaining a copy -// of this software and associated documentation files (the "Software"), to deal -// in the Software without restriction, including without limitation the rights -// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -// copies of the Software, and to permit persons to whom the Software is -// furnished to do so, subject to the following conditions: -// -// The above copyright notice and this permission notice shall be included in all -// copies or substantial portions of the Software. 
-// -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -// SOFTWARE. -#include - -#include -#include -#include -#include -#include -#include - -#include "umbp/distributed/master/external_kv_hit_index.h" - -namespace mori::umbp { -namespace { - -std::unordered_map LookupMap(ExternalKvHitIndex& index, - const std::vector& hashes) { - std::vector> entries; - index.Lookup(hashes, &entries); - std::unordered_map out; - for (const auto& [hash, total] : entries) out[hash] = total; - return out; -} - -TEST(ExternalKvHitIndexTest, IncrementAndLookup) { - ExternalKvHitIndex index; - index.IncrementHits({"h1", "h2"}, 100); - - auto counts = LookupMap(index, {"h1", "h2", "missing"}); - ASSERT_EQ(counts.size(), 2); - EXPECT_EQ(counts["h1"], 1); - EXPECT_EQ(counts["h2"], 1); -} - -TEST(ExternalKvHitIndexTest, RepeatedIncrementsAccumulate) { - ExternalKvHitIndex index; - for (int i = 0; i < 10; ++i) index.IncrementHits({"hot"}, 100 + i); - - auto counts = LookupMap(index, {"hot"}); - ASSERT_EQ(counts.size(), 1); - EXPECT_EQ(counts["hot"], 10); -} - -TEST(ExternalKvHitIndexTest, LookupSkipsMissingAndDedupesRequestHashes) { - ExternalKvHitIndex index; - index.IncrementHits({"h1"}, 100); - - std::vector> entries; - index.Lookup({"missing", "h1", "h1", "missing"}, &entries); - ASSERT_EQ(entries.size(), 1); - EXPECT_EQ(entries[0].first, "h1"); - EXPECT_EQ(entries[0].second, 1); -} - -TEST(ExternalKvHitIndexTest, GarbageCollectUsesLastSeenCutoff) { - ExternalKvHitIndex index; - index.IncrementHits({"old"}, 100); - index.IncrementHits({"fresh"}, 200); - - 
EXPECT_EQ(index.GarbageCollect(150), 1); - EXPECT_EQ(index.Size(), 1); - - auto counts = LookupMap(index, {"old", "fresh"}); - ASSERT_EQ(counts.size(), 1); - EXPECT_EQ(counts["fresh"], 1); -} - -TEST(ExternalKvHitIndexTest, ConcurrentCreationKeepsAllIncrements) { - ExternalKvHitIndex index; - constexpr int kThreads = 32; - constexpr int kIterations = 1000; - - std::atomic start{false}; - std::vector threads; - threads.reserve(kThreads); - for (int t = 0; t < kThreads; ++t) { - threads.emplace_back([&] { - while (!start.load(std::memory_order_acquire)) { - std::this_thread::yield(); - } - for (int i = 0; i < kIterations; ++i) { - index.IncrementHits({"shared"}, static_cast(100 + i)); - } - }); - } - - start.store(true, std::memory_order_release); - for (auto& thread : threads) thread.join(); - - auto counts = LookupMap(index, {"shared"}); - ASSERT_EQ(counts.size(), 1); - EXPECT_EQ(counts["shared"], static_cast(kThreads * kIterations)); -} - -} // namespace -} // namespace mori::umbp diff --git a/src/umbp/tests/test_global_block_index_events.cpp b/src/umbp/tests/test_global_block_index_events.cpp deleted file mode 100644 index 1b82e43ea..000000000 --- a/src/umbp/tests/test_global_block_index_events.cpp +++ /dev/null @@ -1,505 +0,0 @@ -// Copyright © Advanced Micro Devices, Inc. All rights reserved. -// -// MIT License -// -// Permission is hereby granted, free of charge, to any person obtaining a copy -// of this software and associated documentation files (the "Software"), to deal -// in the Software without restriction, including without limitation the rights -// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -// copies of the Software, and to permit persons to whom the Software is -// furnished to do so, subject to the following conditions: -// -// The above copyright notice and this permission notice shall be included in all -// copies or substantial portions of the Software. 
-// -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -// SOFTWARE. -#include - -#include -#include -#include -#include - -#include "umbp/distributed/master/client_registry.h" -#include "umbp/distributed/master/global_block_index.h" -#include "umbp/distributed/types.h" - -namespace mori::umbp { - -namespace { - -KvEvent Add(std::string key, TierType tier, uint64_t size) { - return KvEvent{KvEvent::Kind::ADD, std::move(key), tier, size}; -} - -KvEvent Remove(std::string key, TierType tier) { - return KvEvent{KvEvent::Kind::REMOVE, std::move(key), tier, 0}; -} - -EventBundle Bundle(uint64_t seq, std::vector events) { - return EventBundle{seq, std::move(events)}; -} - -bool HasLocation(const std::vector& locs, const std::string& node, TierType tier, - uint64_t size) { - for (const auto& l : locs) { - if (l.node_id == node && l.tier == tier && l.size == size) return true; - } - return false; -} - -} // namespace - -// ---- ApplyEvents: ADD/REMOVE round-trip ------------------------------------ - -TEST(GlobalBlockIndexEvents, ApplyAddInsertsLocation) { - GlobalBlockIndex idx; - ASSERT_EQ(idx.ApplyEvents("node-A", {Add("k1", TierType::DRAM, 1024)}), 1u); - auto locs = idx.Lookup("k1"); - ASSERT_EQ(locs.size(), 1u); - EXPECT_EQ(locs[0].node_id, "node-A"); - EXPECT_EQ(locs[0].tier, TierType::DRAM); - EXPECT_EQ(locs[0].size, 1024u); -} - -// Duplicate ADD keeps the first observed size; only REMOVE retires it. 
-TEST(GlobalBlockIndexEvents, ApplyAddSameNodeTierKeepsExistingSize) { - GlobalBlockIndex idx; - idx.ApplyEvents("node-A", {Add("k", TierType::DRAM, 1024)}); - idx.ApplyEvents("node-A", {Add("k", TierType::DRAM, 2048)}); - auto locs = idx.Lookup("k"); - ASSERT_EQ(locs.size(), 1u); - EXPECT_EQ(locs[0].size, 1024u); -} - -TEST(GlobalBlockIndexEvents, MultipleNodesCoexistForSameKey) { - GlobalBlockIndex idx; - idx.ApplyEvents("node-A", {Add("k", TierType::DRAM, 100)}); - idx.ApplyEvents("node-B", {Add("k", TierType::DRAM, 200)}); - idx.ApplyEvents("node-A", {Add("k", TierType::HBM, 300)}); // different tier on A - auto locs = idx.Lookup("k"); - EXPECT_EQ(locs.size(), 3u); - EXPECT_TRUE(HasLocation(locs, "node-A", TierType::DRAM, 100)); - EXPECT_TRUE(HasLocation(locs, "node-B", TierType::DRAM, 200)); - EXPECT_TRUE(HasLocation(locs, "node-A", TierType::HBM, 300)); -} - -TEST(GlobalBlockIndexEvents, RemoveErasesMatchingLocationOnly) { - GlobalBlockIndex idx; - idx.ApplyEvents("node-A", {Add("k", TierType::DRAM, 100)}); - idx.ApplyEvents("node-B", {Add("k", TierType::DRAM, 200)}); - - idx.ApplyEvents("node-A", {Remove("k", TierType::DRAM)}); - auto locs = idx.Lookup("k"); - ASSERT_EQ(locs.size(), 1u); - EXPECT_EQ(locs[0].node_id, "node-B"); -} - -TEST(GlobalBlockIndexEvents, RemoveLastLocationErasesEntry) { - GlobalBlockIndex idx; - idx.ApplyEvents("node-A", {Add("k", TierType::DRAM, 100)}); - idx.ApplyEvents("node-A", {Remove("k", TierType::DRAM)}); - EXPECT_TRUE(idx.Lookup("k").empty()); - EXPECT_FALSE(idx.GetMetrics("k").has_value()); -} - -TEST(GlobalBlockIndexEvents, RemoveUnknownIsNoop) { - GlobalBlockIndex idx; - EXPECT_EQ(idx.ApplyEvents("ghost", {Remove("ghost-key", TierType::DRAM)}), 0u); -} - -// A key mirrored on both DRAM and SSD of one node: a DRAM eviction -// (REMOVE DRAM) must drop only the DRAM bucket and leave the SSD location -// readable. 
This is the additive-index invariant the SSD tier relies -// on (DRAM evict never touches the SSD copy); no master code is exercised here -// beyond ApplyEvents. -TEST(GlobalBlockIndexEvents, RemoveDramKeepsSsdBucket) { - GlobalBlockIndex idx; - idx.ApplyEvents("node-A", {Add("k", TierType::DRAM, 100), Add("k", TierType::SSD, 100)}); - ASSERT_TRUE(HasLocation(idx.Lookup("k"), "node-A", TierType::DRAM, 100)); - ASSERT_TRUE(HasLocation(idx.Lookup("k"), "node-A", TierType::SSD, 100)); - - idx.ApplyEvents("node-A", {Remove("k", TierType::DRAM)}); - - auto locs = idx.Lookup("k"); - EXPECT_FALSE(HasLocation(locs, "node-A", TierType::DRAM, 100)); // DRAM bucket gone - EXPECT_TRUE(HasLocation(locs, "node-A", TierType::SSD, 100)); // SSD bucket retained -} - -TEST(GlobalBlockIndexEvents, ClearAtTierClearsOnlyTargetNodeTier) { - GlobalBlockIndex idx; - idx.ApplyEvents("node-A", {Add("k1", TierType::DRAM, 1), Add("k2", TierType::SSD, 2), - Add("k3", TierType::DRAM, 3)}); - idx.ApplyEvents("node-B", {Add("k1", TierType::DRAM, 10)}); - - EXPECT_EQ( - idx.ApplyEvents("node-A", {KvEvent{KvEvent::Kind::CLEAR_AT_TIER, "", TierType::DRAM, 0}}), - 2u); - - EXPECT_FALSE(HasLocation(idx.Lookup("k1"), "node-A", TierType::DRAM, 1)); - EXPECT_TRUE(HasLocation(idx.Lookup("k1"), "node-B", TierType::DRAM, 10)); - EXPECT_TRUE(HasLocation(idx.Lookup("k2"), "node-A", TierType::SSD, 2)); - EXPECT_TRUE(idx.Lookup("k3").empty()); -} - -// ---- ReplaceNodeLocations: full-sync recovery ------------------------------ - -TEST(GlobalBlockIndexEvents, ReplaceNodeLocationsClearsThenInserts) { - GlobalBlockIndex idx; - idx.ApplyEvents("node-A", {Add("k1", TierType::DRAM, 100), Add("k2", TierType::DRAM, 200)}); - idx.ApplyEvents("node-B", {Add("k1", TierType::DRAM, 999)}); // shared key, different node - - // Full-sync from node-A: k1 stays (different size), k2 is gone, new k3 appears. 
- idx.ReplaceNodeLocations("node-A", - {Add("k1", TierType::DRAM, 150), Add("k3", TierType::DRAM, 300)}); - - auto k1 = idx.Lookup("k1"); - EXPECT_TRUE(HasLocation(k1, "node-A", TierType::DRAM, 150)); - EXPECT_TRUE(HasLocation(k1, "node-B", TierType::DRAM, 999)); // node-B untouched - - EXPECT_TRUE(idx.Lookup("k2").empty()); // dropped — node-A's full-sync didn't include it - - auto k3 = idx.Lookup("k3"); - EXPECT_TRUE(HasLocation(k3, "node-A", TierType::DRAM, 300)); -} - -TEST(GlobalBlockIndexEvents, ReplaceNodeLocationsEmptyClearsAllForNode) { - // Used by ClientRegistry::UnregisterClient and the reaper to drop a - // dead node's index entries. - GlobalBlockIndex idx; - idx.ApplyEvents("node-A", {Add("k1", TierType::DRAM, 1), Add("k2", TierType::HBM, 2)}); - idx.ApplyEvents("node-B", {Add("k1", TierType::DRAM, 3)}); - - idx.ReplaceNodeLocations("node-A", {}); - EXPECT_EQ(idx.Lookup("k1").size(), 1u); // node-B still owns k1 - EXPECT_TRUE(idx.Lookup("k2").empty()); // node-A's only HBM location is gone -} - -TEST(GlobalBlockIndexEvents, ReplaceNodeLocationsIgnoresRemoveEntries) { - GlobalBlockIndex idx; - idx.ApplyEvents("node-A", {Add("k1", TierType::DRAM, 100)}); - // Snapshot full-sync conventionally carries only ADDs; sneaking - // a REMOVE in is silently skipped (the snapshot is the truth). - idx.ReplaceNodeLocations("node-A", - {Add("k2", TierType::DRAM, 200), Remove("k3", TierType::DRAM)}); - EXPECT_TRUE(idx.Lookup("k1").empty()); - EXPECT_FALSE(idx.Lookup("k2").empty()); - EXPECT_TRUE(idx.Lookup("k3").empty()); -} - -// ---- Reverse-index (node_to_keys_) invariants ------------------------------ - -TEST(GlobalBlockIndexEvents, ReplaceNodeLocationsAfterMultiTierRemoveKeepsKeyClean) { - GlobalBlockIndex idx; - idx.ApplyEvents("node-A", {Add("k", TierType::DRAM, 100), Add("k", TierType::HBM, 200)}); - idx.ApplyEvents("node-B", {Add("k", TierType::DRAM, 300)}); - - // A still owns (k, HBM): reverse index must keep k. 
- idx.ApplyEvents("node-A", {Remove("k", TierType::DRAM)}); - auto mid = idx.Lookup("k"); - ASSERT_EQ(mid.size(), 2u); - EXPECT_TRUE(HasLocation(mid, "node-A", TierType::HBM, 200)); - EXPECT_TRUE(HasLocation(mid, "node-B", TierType::DRAM, 300)); - - idx.ReplaceNodeLocations("node-A", {}); - auto after = idx.Lookup("k"); - ASSERT_EQ(after.size(), 1u); - EXPECT_EQ(after[0].node_id, "node-B"); - EXPECT_EQ(after[0].tier, TierType::DRAM); - EXPECT_EQ(after[0].size, 300u); -} - -TEST(GlobalBlockIndexEvents, ReplaceNodeLocationsLeavesOtherNodesIntact) { - GlobalBlockIndex idx; - idx.ApplyEvents("node-A", {Add("k1", TierType::DRAM, 1), Add("k2", TierType::DRAM, 2)}); - idx.ApplyEvents("node-B", {Add("k1", TierType::DRAM, 10), Add("k3", TierType::HBM, 30)}); - idx.ApplyEvents("node-C", {Add("k2", TierType::HBM, 200), Add("k4", TierType::DRAM, 400)}); - - idx.ReplaceNodeLocations("node-A", {Add("k_new", TierType::DRAM, 999)}); - - auto k1 = idx.Lookup("k1"); - ASSERT_EQ(k1.size(), 1u); - EXPECT_EQ(k1[0].node_id, "node-B"); - EXPECT_EQ(k1[0].size, 10u); - - auto k2 = idx.Lookup("k2"); - ASSERT_EQ(k2.size(), 1u); - EXPECT_EQ(k2[0].node_id, "node-C"); - EXPECT_EQ(k2[0].tier, TierType::HBM); - - EXPECT_TRUE(HasLocation(idx.Lookup("k_new"), "node-A", TierType::DRAM, 999)); - - auto k3 = idx.Lookup("k3"); - ASSERT_EQ(k3.size(), 1u); - EXPECT_EQ(k3[0].node_id, "node-B"); - EXPECT_EQ(k3[0].size, 30u); - - auto k4 = idx.Lookup("k4"); - ASSERT_EQ(k4.size(), 1u); - EXPECT_EQ(k4[0].node_id, "node-C"); - EXPECT_EQ(k4[0].size, 400u); -} - -// 2nd sync must see reverse index repopulated by 1st sync's replay. 
-TEST(GlobalBlockIndexEvents, ReplaceNodeLocationsTwiceRotatesKeys) { - GlobalBlockIndex idx; - idx.ApplyEvents("node-A", {Add("k_old", TierType::DRAM, 1)}); - - idx.ReplaceNodeLocations("node-A", - {Add("k_mid_a", TierType::DRAM, 2), Add("k_mid_b", TierType::HBM, 3)}); - EXPECT_TRUE(idx.Lookup("k_old").empty()); - EXPECT_FALSE(idx.Lookup("k_mid_a").empty()); - EXPECT_FALSE(idx.Lookup("k_mid_b").empty()); - - idx.ReplaceNodeLocations("node-A", {Add("k_final", TierType::DRAM, 4)}); - EXPECT_TRUE(idx.Lookup("k_mid_a").empty()); - EXPECT_TRUE(idx.Lookup("k_mid_b").empty()); - auto final_locs = idx.Lookup("k_final"); - ASSERT_EQ(final_locs.size(), 1u); - EXPECT_EQ(final_locs[0].node_id, "node-A"); - EXPECT_EQ(final_locs[0].size, 4u); -} - -// Reverse-index insert must run even when inserted==false. -TEST(GlobalBlockIndexEvents, DuplicateAddKeepsReverseConsistent) { - GlobalBlockIndex idx; - idx.ApplyEvents("node-A", {Add("dup", TierType::DRAM, 1024)}); - idx.ApplyEvents("node-A", {Add("dup", TierType::DRAM, 2048)}); - idx.ApplyEvents("node-A", {Add("dup", TierType::DRAM, 4096)}); - - auto locs = idx.Lookup("dup"); - ASSERT_EQ(locs.size(), 1u); - - idx.ReplaceNodeLocations("node-A", {}); - EXPECT_TRUE(idx.Lookup("dup").empty()); -} - -// No-op REMOVE must leave node_to_keys_ untouched on both sides. 
-TEST(GlobalBlockIndexEvents, RemoveNonMatchingTierLeavesReverseUntouched) { - GlobalBlockIndex idx; - idx.ApplyEvents("node-A", {Add("k", TierType::DRAM, 100)}); - - idx.ApplyEvents("node-A", {Remove("k", TierType::HBM)}); - auto mid = idx.Lookup("k"); - ASSERT_EQ(mid.size(), 1u); - EXPECT_EQ(mid[0].tier, TierType::DRAM); - - idx.ReplaceNodeLocations("node-A", {}); - EXPECT_TRUE(idx.Lookup("k").empty()); - - idx.ApplyEvents("node-A", {Add("k", TierType::DRAM, 100)}); - idx.ApplyEvents("node-B", {Remove("k", TierType::DRAM)}); - idx.ReplaceNodeLocations("node-B", {}); - auto after = idx.Lookup("k"); - ASSERT_EQ(after.size(), 1u); - EXPECT_EQ(after[0].node_id, "node-A"); -} - -// ---- ClientRegistry::Heartbeat applies events end-to-end -------------------- - -TEST(ClientRegistryHeartbeat, AppliesEventsAndAdvancesSeq) { - GlobalBlockIndex idx; - ClientRegistryConfig cfg; - ClientRegistry reg(cfg, idx); - - ASSERT_TRUE(reg.RegisterClient("node-A", "10.0.0.1:1", /*caps=*/{}, "10.0.0.1:2", {})); - - uint64_t acked = 0; - bool need_full = false; - auto status = reg.Heartbeat("node-A", /*caps=*/{}, {Bundle(1, {Add("k", TierType::DRAM, 42)})}, - /*is_full_sync=*/false, /*delta_seq_baseline=*/0, &acked, &need_full); - EXPECT_EQ(status, ClientStatus::ALIVE); - EXPECT_EQ(acked, 1u); - EXPECT_FALSE(need_full); - EXPECT_FALSE(idx.Lookup("k").empty()); -} - -TEST(ClientRegistryHeartbeat, SeqGapTriggersFullSyncRequest) { - GlobalBlockIndex idx; - ClientRegistryConfig cfg; - ClientRegistry reg(cfg, idx); - reg.RegisterClient("node-A", "10.0.0.1:1", {}, "10.0.0.1:2", {}); - - uint64_t acked = 0; - bool need_full = false; - // First heartbeat seq=1 — applied normally. - reg.Heartbeat("node-A", {}, {Bundle(1, {Add("k1", TierType::DRAM, 1)})}, - /*is_full_sync=*/false, 0, &acked, &need_full); - ASSERT_FALSE(need_full); - ASSERT_EQ(acked, 1u); - - // Second heartbeat skips seq=2: master detects the gap. 
- reg.Heartbeat("node-A", {}, {Bundle(3, {Add("k2", TierType::DRAM, 2)})}, - /*is_full_sync=*/false, 0, &acked, &need_full); - EXPECT_TRUE(need_full); - EXPECT_EQ(acked, 1u); // unchanged — no events applied from this batch - - // k2 is NOT in the index because the gap-batch was rejected. - EXPECT_TRUE(idx.Lookup("k2").empty()); - EXPECT_FALSE(idx.Lookup("k1").empty()); -} - -TEST(ClientRegistryHeartbeat, FullSyncReplacesNodeLocations) { - GlobalBlockIndex idx; - ClientRegistryConfig cfg; - ClientRegistry reg(cfg, idx); - reg.RegisterClient("node-A", "10.0.0.1:1", {}, "10.0.0.1:2", {}); - - uint64_t acked = 0; - bool need_full = false; - reg.Heartbeat("node-A", {}, - {Bundle(1, {Add("k1", TierType::DRAM, 1), Add("k2", TierType::DRAM, 2)})}, - /*is_full_sync=*/false, 0, &acked, &need_full); - ASSERT_FALSE(idx.Lookup("k1").empty()); - ASSERT_FALSE(idx.Lookup("k2").empty()); - - // Full-sync: only k1 + k3 should remain for node-A. - reg.Heartbeat("node-A", {}, - {Bundle(2, {Add("k1", TierType::DRAM, 10), Add("k3", TierType::DRAM, 30)})}, - /*is_full_sync=*/true, /*delta_seq_baseline=*/2, &acked, &need_full); - EXPECT_EQ(acked, 2u); - EXPECT_FALSE(need_full); - - auto k1 = idx.Lookup("k1"); - ASSERT_EQ(k1.size(), 1u); - EXPECT_EQ(k1[0].size, 10u); // updated via full-sync - EXPECT_TRUE(idx.Lookup("k2").empty()); - EXPECT_FALSE(idx.Lookup("k3").empty()); -} - -TEST(ClientRegistryHeartbeat, UnregisterClearsNodeFromIndex) { - GlobalBlockIndex idx; - ClientRegistryConfig cfg; - ClientRegistry reg(cfg, idx); - reg.RegisterClient("node-A", "10.0.0.1:1", {}, "10.0.0.1:2", {}); - - uint64_t acked = 0; - bool need_full = false; - reg.Heartbeat("node-A", {}, {Bundle(1, {Add("k1", TierType::DRAM, 1)})}, - /*is_full_sync=*/false, 0, &acked, &need_full); - ASSERT_FALSE(idx.Lookup("k1").empty()); - - reg.UnregisterClient("node-A"); - EXPECT_TRUE(idx.Lookup("k1").empty()); - EXPECT_FALSE(reg.IsClientAlive("node-A")); -} - -// ---- FindEvictionCandidates 
-------------------------------------------------- - -TEST(GlobalBlockIndexEvents, FindEvictionCandidatesFiltersByOverloadedNodeTier) { - GlobalBlockIndex idx; - idx.ApplyEvents("node-A", {Add("k1", TierType::DRAM, 100), Add("k2", TierType::HBM, 200)}); - idx.ApplyEvents("node-B", {Add("k1", TierType::DRAM, 100)}); - - std::set overloaded = { - {"node-A", TierType::DRAM}, - }; - auto candidates = idx.FindEvictionCandidates(overloaded); - // Only node-A's DRAM location of k1 is a candidate. - ASSERT_EQ(candidates.size(), 1u); - EXPECT_EQ(candidates[0].key, "k1"); - EXPECT_EQ(candidates[0].location.node_id, "node-A"); - EXPECT_EQ(candidates[0].location.tier, TierType::DRAM); -} - -// ---- BatchLookupForRouteGet ------------------------------------------------ - -TEST(GlobalBlockIndexEvents, BatchLookupForRouteGetEmptyInputReturnsEmpty) { - GlobalBlockIndex idx; - EXPECT_TRUE(idx.BatchLookupForRouteGet({}, {}, std::chrono::seconds{1}).empty()); -} - -TEST(GlobalBlockIndexEvents, BatchLookupForRouteGetMixedHitsAndMisses) { - GlobalBlockIndex idx; - idx.ApplyEvents("node-A", {Add("k1", TierType::DRAM, 100)}); - idx.ApplyEvents("node-B", {Add("k1", TierType::DRAM, 200), Add("k2", TierType::HBM, 300)}); - - auto ref_k1 = idx.Lookup("k1"); - auto ref_k2 = idx.Lookup("k2"); - auto before_k1 = idx.GetMetrics("k1"); - auto before_k2 = idx.GetMetrics("k2"); - ASSERT_TRUE(before_k1.has_value()); - ASSERT_TRUE(before_k2.has_value()); - - auto results = idx.BatchLookupForRouteGet({"k1", "ghost", "k2"}, {}, std::chrono::seconds{10}); - ASSERT_EQ(results.size(), 3u); - EXPECT_EQ(results[0], ref_k1); - EXPECT_TRUE(results[1].empty()); - EXPECT_EQ(results[2], ref_k2); - - auto after_k1 = idx.GetMetrics("k1"); - auto after_k2 = idx.GetMetrics("k2"); - ASSERT_TRUE(after_k1.has_value()); - ASSERT_TRUE(after_k2.has_value()); - EXPECT_EQ(after_k1->access_count, before_k1->access_count + 1); - EXPECT_EQ(after_k2->access_count, before_k2->access_count + 1); - 
EXPECT_FALSE(idx.GetMetrics("ghost").has_value()); -} - -TEST(GlobalBlockIndexEvents, BatchLookupForRouteGetGrantsLeaseForHitsOnly) { - GlobalBlockIndex idx; - idx.ApplyEvents("node-A", {Add("hit", TierType::DRAM, 100), Add("other", TierType::DRAM, 200)}); - - std::set overloaded{{"node-A", TierType::DRAM}}; - ASSERT_EQ(idx.FindEvictionCandidates(overloaded).size(), 2u); - - idx.BatchLookupForRouteGet({"hit", "ghost"}, {}, std::chrono::seconds{10}); - - auto candidates = idx.FindEvictionCandidates(overloaded); - ASSERT_EQ(candidates.size(), 1u); - EXPECT_EQ(candidates[0].key, "other"); -} - -// All replicas excluded -> slot empty, access_count NOT bumped, -// lease NOT granted. A key whose every replica is unreachable must -// not pollute LRU or block eviction. -TEST(GlobalBlockIndexEvents, BatchLookupForRouteGetSkipsSideEffectsWhenAllReplicasExcluded) { - GlobalBlockIndex idx; - idx.ApplyEvents("node-A", {Add("k", TierType::DRAM, 100)}); - idx.ApplyEvents("node-B", {Add("k", TierType::DRAM, 200)}); - - auto before = idx.GetMetrics("k"); - ASSERT_TRUE(before.has_value()); - std::set overloaded{{"node-A", TierType::DRAM}, - {"node-B", TierType::DRAM}}; - ASSERT_EQ(idx.FindEvictionCandidates(overloaded).size(), 2u); - - std::unordered_set excludes{"node-A", "node-B"}; - auto results = idx.BatchLookupForRouteGet({"k"}, excludes, std::chrono::seconds{10}); - ASSERT_EQ(results.size(), 1u); - EXPECT_TRUE(results[0].empty()); - - auto after = idx.GetMetrics("k"); - ASSERT_TRUE(after.has_value()); - EXPECT_EQ(after->access_count, before->access_count); - EXPECT_EQ(idx.FindEvictionCandidates(overloaded).size(), 2u); -} - -// Some replicas excluded but not all -> returned slot has only the -// survivors, access_count IS bumped, lease IS granted. 
-TEST(GlobalBlockIndexEvents, BatchLookupForRouteGetFiltersAndLeasesWhenSomeReplicasSurvive) { - GlobalBlockIndex idx; - idx.ApplyEvents("node-A", {Add("k", TierType::DRAM, 100)}); - idx.ApplyEvents("node-B", {Add("k", TierType::DRAM, 200)}); - - auto before = idx.GetMetrics("k"); - ASSERT_TRUE(before.has_value()); - - std::unordered_set excludes{"node-A"}; - auto results = idx.BatchLookupForRouteGet({"k"}, excludes, std::chrono::seconds{10}); - ASSERT_EQ(results.size(), 1u); - ASSERT_EQ(results[0].size(), 1u); - EXPECT_EQ(results[0][0].node_id, "node-B"); - - auto after = idx.GetMetrics("k"); - ASSERT_TRUE(after.has_value()); - EXPECT_EQ(after->access_count, before->access_count + 1); -} - -} // namespace mori::umbp diff --git a/src/umbp/tests/test_in_memory_master_metadata_store.cpp b/src/umbp/tests/test_in_memory_master_metadata_store.cpp deleted file mode 100644 index a8ca2f6a2..000000000 --- a/src/umbp/tests/test_in_memory_master_metadata_store.cpp +++ /dev/null @@ -1,693 +0,0 @@ -// Copyright © Advanced Micro Devices, Inc. All rights reserved. -// -// MIT License -// -// Permission is hereby granted, free of charge, to any person obtaining a copy -// of this software and associated documentation files (the "Software"), to deal -// in the Software without restriction, including without limitation the rights -// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -// copies of the Software, and to permit persons to whom the Software is -// furnished to do so, subject to the following conditions: -// -// The above copyright notice and this permission notice shall be included in all -// copies or substantial portions of the Software. -// -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE -// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -// SOFTWARE. - -// Phase 2 behavioral suite for InMemoryMasterMetadataStore (§6a). Written -// against IMasterMetadataStore& so the same cases validate the Redis backend -// later. Tests use injected system_clock times (no real-time sleeps) so they -// are deterministic in CI. -// -// State that the interface does not expose directly — lease_expiry and -// last_accessed_at on block entries — is observed through EnumerateLruForEviction: -// a leased entry is filtered out, and LRU ordering reflects last_accessed_at. - -#include - -#include -#include -#include -#include -#include -#include -#include - -#include "umbp/distributed/master/in_memory_master_metadata_store.h" -#include "umbp/distributed/types.h" - -namespace mori::umbp { -namespace { - -using namespace std::chrono_literals; -using Clock = std::chrono::system_clock; - -// Fixed, NTP-plausible base instant so offsets read cleanly. -const Clock::time_point kT0 = Clock::time_point(std::chrono::hours(24 * 365 * 50)); - -std::map Caps(uint64_t total = 1000, uint64_t available = 1000) { - return {{TierType::HBM, TierCapacity{total, available}}}; -} - -ClientRegistration MakeReg(const std::string& node_id) { - ClientRegistration reg; - reg.node_id = node_id; - reg.node_address = "addr:" + node_id; - reg.peer_address = "peer:" + node_id; - reg.tier_capacities = Caps(); - reg.tags = {"role=test"}; - return reg; -} - -KvEvent Add(const std::string& key, TierType tier, uint64_t size) { - return KvEvent{KvEvent::Kind::ADD, key, tier, size}; -} -KvEvent Remove(const std::string& key, TierType tier) { - return KvEvent{KvEvent::Kind::REMOVE, key, tier, 0}; -} - -// Register `node` ALIVE at `now`. 
-void RegisterAlive(IMasterMetadataStore& store, const std::string& node, - Clock::time_point now = kT0) { - ASSERT_TRUE(store.RegisterClient(MakeReg(node), now, 30s)); -} - -// Apply a delta heartbeat carrying `events` at sequence `seq`. -HeartbeatResult Beat(IMasterMetadataStore& store, const std::string& node, uint64_t seq, - std::vector events, Clock::time_point now) { - return store.ApplyHeartbeat(node, seq, now, Caps(), events, /*is_full_sync=*/false); -} - -// --------------------------------------------------------------------------- -// RegisterClient -// --------------------------------------------------------------------------- - -TEST(InMemoryStore, RegisterNewClient) { - InMemoryMasterMetadataStore store; - EXPECT_TRUE(store.RegisterClient(MakeReg("n1"), kT0, 30s)); - EXPECT_TRUE(store.IsClientAlive("n1")); - EXPECT_EQ(store.AliveClientCount(), 1u); - - auto rec = store.GetClient("n1"); - ASSERT_TRUE(rec.has_value()); - EXPECT_EQ(rec->status, ClientStatus::ALIVE); - EXPECT_EQ(rec->last_applied_seq, 0u); - EXPECT_EQ(rec->peer_address, "peer:n1"); - EXPECT_EQ(rec->last_heartbeat, kT0); - EXPECT_EQ(rec->registered_at, kT0); -} - -TEST(InMemoryStore, RejectReRegisterNonStaleAlive) { - InMemoryMasterMetadataStore store; - ASSERT_TRUE(store.RegisterClient(MakeReg("n1"), kT0, 30s)); - // Still well within stale_after window. - EXPECT_FALSE(store.RegisterClient(MakeReg("n1"), kT0 + 5s, 30s)); -} - -TEST(InMemoryStore, AcceptReRegisterStaleAlive) { - InMemoryMasterMetadataStore store; - ASSERT_TRUE(store.RegisterClient(MakeReg("n1"), kT0, 30s)); - // last_heartbeat is kT0; now - last_heartbeat > stale_after → re-register OK - // even though the reaper has not flipped the status yet (hazard #2). 
- EXPECT_TRUE(store.RegisterClient(MakeReg("n1"), kT0 + 31s, 30s)); - EXPECT_TRUE(store.IsClientAlive("n1")); -} - -TEST(InMemoryStore, AcceptReRegisterExpired) { - InMemoryMasterMetadataStore store; - ASSERT_TRUE(store.RegisterClient(MakeReg("n1"), kT0, 30s)); - ASSERT_EQ(store.ExpireStaleClients(kT0 + 1s).size(), 1u); - EXPECT_FALSE(store.IsClientAlive("n1")); - // Re-register an EXPIRED record at the same instant: accepted, back to ALIVE. - EXPECT_TRUE(store.RegisterClient(MakeReg("n1"), kT0 + 2s, 30s)); - EXPECT_TRUE(store.IsClientAlive("n1")); -} - -// --------------------------------------------------------------------------- -// UnregisterClient — cascade to block locations AND external KV -// --------------------------------------------------------------------------- - -TEST(InMemoryStore, UnregisterClientCascades) { - InMemoryMasterMetadataStore store; - RegisterAlive(store, "n1"); - ASSERT_EQ(Beat(store, "n1", 1, {Add("k1", TierType::HBM, 10)}, kT0).status, - HeartbeatResult::APPLIED); - ASSERT_TRUE(store.RegisterExternalKvIfAlive("n1", {"h1", "h2"}, TierType::HBM)); - - ASSERT_FALSE(store.LookupBlock("k1").empty()); - ASSERT_EQ(store.GetExternalKvCount("n1"), 2u); - - store.UnregisterClient("n1"); - - EXPECT_FALSE(store.GetClient("n1").has_value()); - EXPECT_TRUE(store.LookupBlock("k1").empty()); - EXPECT_EQ(store.GetExternalKvCount("n1"), 0u); - EXPECT_TRUE(store.MatchExternalKv({"h1", "h2"}, false, kT0).empty()); -} - -TEST(InMemoryStore, UnregisterUnknownIsNoOp) { - InMemoryMasterMetadataStore store; - store.UnregisterClient("ghost"); // must not crash - EXPECT_EQ(store.AliveClientCount(), 0u); -} - -// --------------------------------------------------------------------------- -// ApplyHeartbeat -// --------------------------------------------------------------------------- - -TEST(InMemoryStore, HeartbeatUnknownNode) { - InMemoryMasterMetadataStore store; - auto r = Beat(store, "ghost", 1, {}, kT0); - EXPECT_EQ(r.status, HeartbeatResult::UNKNOWN); 
-} - -TEST(InMemoryStore, HeartbeatCasSequence) { - InMemoryMasterMetadataStore store; - RegisterAlive(store, "n1"); - EXPECT_EQ(Beat(store, "n1", 1, {Add("k1", TierType::HBM, 10)}, kT0).status, - HeartbeatResult::APPLIED); - EXPECT_EQ(Beat(store, "n1", 2, {Add("k2", TierType::HBM, 20)}, kT0).status, - HeartbeatResult::APPLIED); - // Out-of-order seq → SEQ_GAP, acked echoes last applied (2). - auto gap = Beat(store, "n1", 4, {Add("k3", TierType::HBM, 30)}, kT0); - EXPECT_EQ(gap.status, HeartbeatResult::SEQ_GAP); - EXPECT_EQ(gap.acked_seq, 2u); - // k3 must not have been applied. - EXPECT_TRUE(store.LookupBlock("k3").empty()); -} - -TEST(InMemoryStore, SeqGapKeepsLivenessNotCapsOrSeq) { - InMemoryMasterMetadataStore store; - RegisterAlive(store, "n1"); - ASSERT_EQ(Beat(store, "n1", 1, {}, kT0).status, HeartbeatResult::APPLIED); - - // A gap heartbeat at a later time with different caps. - std::map new_caps = {{TierType::HBM, TierCapacity{9999, 9999}}}; - auto gap = store.ApplyHeartbeat("n1", 5, kT0 + 10s, new_caps, {}, /*is_full_sync=*/false); - ASSERT_EQ(gap.status, HeartbeatResult::SEQ_GAP); - - auto rec = store.GetClient("n1"); - ASSERT_TRUE(rec.has_value()); - EXPECT_EQ(rec->status, ClientStatus::ALIVE); // kept alive - EXPECT_EQ(rec->last_heartbeat, kT0 + 10s); // last_heartbeat bumped - EXPECT_EQ(rec->last_applied_seq, 1u); // seq NOT advanced - EXPECT_EQ(rec->tier_capacities.at(TierType::HBM).total_bytes, 1000u); // caps NOT replaced -} - -TEST(InMemoryStore, HeartbeatDeltaAddRemove) { - InMemoryMasterMetadataStore store; - RegisterAlive(store, "n1"); - ASSERT_EQ(Beat(store, "n1", 1, {Add("k1", TierType::HBM, 10)}, kT0).status, - HeartbeatResult::APPLIED); - ASSERT_EQ(store.LookupBlock("k1").size(), 1u); - - ASSERT_EQ(Beat(store, "n1", 2, {Remove("k1", TierType::HBM)}, kT0).status, - HeartbeatResult::APPLIED); - EXPECT_TRUE(store.LookupBlock("k1").empty()); -} - -TEST(InMemoryStore, HeartbeatFullSyncReplaces) { - InMemoryMasterMetadataStore store; - 
RegisterAlive(store, "n1"); - ASSERT_EQ(Beat(store, "n1", 1, {Add("k1", TierType::HBM, 10), Add("k2", TierType::HBM, 20)}, kT0) - .status, - HeartbeatResult::APPLIED); - - // full_sync wipes prior locations and installs only the ADDs carried here. - auto r = store.ApplyHeartbeat("n1", 7, kT0, Caps(), {Add("k3", TierType::HBM, 30)}, - /*is_full_sync=*/true); - EXPECT_EQ(r.status, HeartbeatResult::APPLIED); - EXPECT_EQ(r.acked_seq, 7u); - - EXPECT_TRUE(store.LookupBlock("k1").empty()); - EXPECT_TRUE(store.LookupBlock("k2").empty()); - EXPECT_EQ(store.LookupBlock("k3").size(), 1u); - - auto rec = store.GetClient("n1"); - ASSERT_TRUE(rec.has_value()); - EXPECT_EQ(rec->last_applied_seq, 7u); // full_sync re-baselines the seq -} - -// --------------------------------------------------------------------------- -// ExpireStaleClients — flip to EXPIRED, keep row, cascade, idempotent -// --------------------------------------------------------------------------- - -TEST(InMemoryStore, ExpireStaleFlipsKeepsRowAndCascades) { - InMemoryMasterMetadataStore store; - RegisterAlive(store, "n1", kT0); - RegisterAlive(store, "n2", kT0 + 20s); // fresher - ASSERT_EQ(Beat(store, "n1", 1, {Add("k1", TierType::HBM, 10)}, kT0).status, - HeartbeatResult::APPLIED); - ASSERT_TRUE(store.RegisterExternalKvIfAlive("n1", {"h1"}, TierType::HBM)); - - // Cutoff after n1's heartbeat but before n2's. - auto dead = store.ExpireStaleClients(kT0 + 10s); - ASSERT_EQ(dead.size(), 1u); - EXPECT_EQ(dead[0], "n1"); - - // Row KEPT but EXPIRED (hazard #3). - auto rec = store.GetClient("n1"); - ASSERT_TRUE(rec.has_value()); - EXPECT_EQ(rec->status, ClientStatus::EXPIRED); - EXPECT_FALSE(store.IsClientAlive("n1")); - - // Cascade dropped its blocks and external KV. - EXPECT_TRUE(store.LookupBlock("k1").empty()); - EXPECT_EQ(store.GetExternalKvCount("n1"), 0u); - - // n2 untouched. 
- EXPECT_TRUE(store.IsClientAlive("n2")); -} - -TEST(InMemoryStore, ExpireStaleIsIdempotent) { - InMemoryMasterMetadataStore store; - RegisterAlive(store, "n1", kT0); - ASSERT_EQ(store.ExpireStaleClients(kT0 + 10s).size(), 1u); - // Re-tick: already EXPIRED, nothing new to report. - EXPECT_TRUE(store.ExpireStaleClients(kT0 + 10s).empty()); -} - -TEST(InMemoryStore, ExpiredRowExcludedFromAliveAccounting) { - InMemoryMasterMetadataStore store; - RegisterAlive(store, "n1", kT0); - RegisterAlive(store, "n2", kT0); - ASSERT_EQ(store.AliveClientCount(), 2u); - ASSERT_EQ(store.ExpireStaleClients(kT0 + 10s).size(), 2u); - - EXPECT_EQ(store.AliveClientCount(), 0u); // not 2, even though rows remain - EXPECT_TRUE(store.ListAliveClients().empty()); - EXPECT_TRUE(store.GetClient("n1").has_value()); // row still present -} - -// --------------------------------------------------------------------------- -// Block reads — lease/access observed via EnumerateLruForEviction -// --------------------------------------------------------------------------- - -// Helper: budget large enough to take everything in one bucket. -std::map Budget(const std::string& node, TierType tier, uint64_t bytes) { - return {{NodeTierKey{node, tier}, bytes}}; -} - -TEST(InMemoryStore, LookupBlockHasNoLeaseOrAccessSideEffects) { - InMemoryMasterMetadataStore store; - RegisterAlive(store, "n1"); - ASSERT_EQ(Beat(store, "n1", 1, {Add("k1", TierType::HBM, 10)}, kT0).status, - HeartbeatResult::APPLIED); - - // Plain read twice. - EXPECT_EQ(store.LookupBlock("k1").size(), 1u); - EXPECT_EQ(store.LookupBlock("k1").size(), 1u); - - // Not leased → still an eviction candidate at kT0. 
- auto cands = store.EnumerateLruForEviction(Budget("n1", TierType::HBM, 1000), kT0); - ASSERT_EQ(cands.size(), 1u); - EXPECT_EQ(cands.begin()->second.size(), 1u); -} - -TEST(InMemoryStore, LookupBlockForRouteGetGrantsLeaseAndAccess) { - InMemoryMasterMetadataStore store; - RegisterAlive(store, "n1"); - ASSERT_EQ(Beat(store, "n1", 1, {Add("k1", TierType::HBM, 10)}, kT0).status, - HeartbeatResult::APPLIED); - - auto locs = store.LookupBlockForRouteGet("k1", {}, kT0, 60s); - ASSERT_EQ(locs.size(), 1u); - - // Leased until kT0+60s → filtered out of eviction at kT0+10s. - EXPECT_TRUE(store.EnumerateLruForEviction(Budget("n1", TierType::HBM, 1000), kT0 + 10s).empty()); - // After lease expiry it is a candidate again. - EXPECT_FALSE(store.EnumerateLruForEviction(Budget("n1", TierType::HBM, 1000), kT0 + 61s).empty()); -} - -TEST(InMemoryStore, RouteGetExcludeNodesNoLeaseWhenFullyExcluded) { - InMemoryMasterMetadataStore store; - RegisterAlive(store, "n1"); - ASSERT_EQ(Beat(store, "n1", 1, {Add("k1", TierType::HBM, 10)}, kT0).status, - HeartbeatResult::APPLIED); - - std::unordered_set exclude = {"n1"}; - auto locs = store.LookupBlockForRouteGet("k1", exclude, kT0, 60s); - EXPECT_TRUE(locs.empty()); // every location excluded - - // No lease granted (hazard #4) → still an eviction candidate immediately. 
- EXPECT_FALSE(store.EnumerateLruForEviction(Budget("n1", TierType::HBM, 1000), kT0).empty()); -} - -TEST(InMemoryStore, BatchLookupForRouteGetParallelToKeys) { - InMemoryMasterMetadataStore store; - RegisterAlive(store, "n1"); - ASSERT_EQ(Beat(store, "n1", 1, {Add("k1", TierType::HBM, 10), Add("k3", TierType::HBM, 30)}, kT0) - .status, - HeartbeatResult::APPLIED); - - auto out = store.BatchLookupBlockForRouteGet({"k1", "missing", "k3"}, {}, kT0, 60s); - ASSERT_EQ(out.size(), 3u); - EXPECT_EQ(out[0].size(), 1u); - EXPECT_TRUE(out[1].empty()); // missing key - EXPECT_EQ(out[2].size(), 1u); -} - -TEST(InMemoryStore, BatchExistsBlockNoSideEffects) { - InMemoryMasterMetadataStore store; - RegisterAlive(store, "n1"); - ASSERT_EQ(Beat(store, "n1", 1, {Add("k1", TierType::HBM, 10)}, kT0).status, - HeartbeatResult::APPLIED); - - auto exists = store.BatchExistsBlock({"k1", "missing"}); - ASSERT_EQ(exists.size(), 2u); - EXPECT_TRUE(exists[0]); - EXPECT_FALSE(exists[1]); - - // No lease granted by an existence check. - EXPECT_FALSE(store.EnumerateLruForEviction(Budget("n1", TierType::HBM, 1000), kT0).empty()); -} - -// --------------------------------------------------------------------------- -// EnumerateLruForEviction -// --------------------------------------------------------------------------- - -TEST(InMemoryStore, EvictionLruOrderAndBudget) { - InMemoryMasterMetadataStore store; - RegisterAlive(store, "n1"); - // Three keys, each 100 bytes, accessed at increasing times so LRU order is - // k_old < k_mid < k_new. - ASSERT_EQ(Beat(store, "n1", 1, {Add("k_old", TierType::HBM, 100)}, kT0).status, - HeartbeatResult::APPLIED); - ASSERT_EQ(Beat(store, "n1", 2, {Add("k_mid", TierType::HBM, 100)}, kT0 + 1s).status, - HeartbeatResult::APPLIED); - ASSERT_EQ(Beat(store, "n1", 3, {Add("k_new", TierType::HBM, 100)}, kT0 + 2s).status, - HeartbeatResult::APPLIED); - - // Budget 150 bytes → should take the two oldest (200 bytes ≥ 150 after second). 
- auto cands = store.EnumerateLruForEviction(Budget("n1", TierType::HBM, 150), kT0 + 10s); - ASSERT_EQ(cands.size(), 1u); - auto& bucket = cands.at(NodeTierKey{"n1", TierType::HBM}); - ASSERT_EQ(bucket.size(), 2u); - EXPECT_EQ(bucket[0].key, "k_old"); // oldest first - EXPECT_EQ(bucket[1].key, "k_mid"); -} - -TEST(InMemoryStore, EvictionSkipsLeased) { - InMemoryMasterMetadataStore store; - RegisterAlive(store, "n1"); - ASSERT_EQ(Beat(store, "n1", 1, {Add("k1", TierType::HBM, 100)}, kT0).status, - HeartbeatResult::APPLIED); - // Lease k1 well past the enumeration time. - store.LookupBlockForRouteGet("k1", {}, kT0, 1h); - EXPECT_TRUE(store.EnumerateLruForEviction(Budget("n1", TierType::HBM, 1000), kT0 + 1s).empty()); -} - -TEST(InMemoryStore, EvictionTieTimestampsAllSurvive) { - // §2d correctness claim: many candidates sharing one identical last_accessed_at - // (the common case, since a batch RouteGet stamps one `now` across all keys) - // must all be enumerable — none dropped by tie collisions. - InMemoryMasterMetadataStore store; - RegisterAlive(store, "n1"); - std::vector adds; - for (int i = 0; i < 50; ++i) { - adds.push_back(Add("k" + std::to_string(i), TierType::HBM, 10)); - } - // All keys created (and thus last_accessed) at the identical instant kT0. - ASSERT_EQ(Beat(store, "n1", 1, adds, kT0).status, HeartbeatResult::APPLIED); - - // Huge budget → take everything; all 50 tied-timestamp candidates must appear. - auto cands = store.EnumerateLruForEviction(Budget("n1", TierType::HBM, 100000), kT0 + 10s); - ASSERT_EQ(cands.size(), 1u); - EXPECT_EQ(cands.at(NodeTierKey{"n1", TierType::HBM}).size(), 50u); -} - -TEST(InMemoryStore, EvictionOnlyBudgetedBuckets) { - InMemoryMasterMetadataStore store; - RegisterAlive(store, "n1"); - ASSERT_EQ(Beat(store, "n1", 1, {Add("kh", TierType::HBM, 10), Add("kd", TierType::DRAM, 10)}, kT0) - .status, - HeartbeatResult::APPLIED); - // Only ask about the HBM bucket. 
- auto cands = store.EnumerateLruForEviction(Budget("n1", TierType::HBM, 1000), kT0 + 1s); - ASSERT_EQ(cands.size(), 1u); - EXPECT_EQ(cands.begin()->first.tier, TierType::HBM); -} - -// --------------------------------------------------------------------------- -// External KV -// --------------------------------------------------------------------------- - -TEST(InMemoryStore, RegisterExternalKvAliveGate) { - InMemoryMasterMetadataStore store; - // Dead/unknown node → rejected, nothing written. - EXPECT_FALSE(store.RegisterExternalKvIfAlive("ghost", {"h1"}, TierType::HBM)); - EXPECT_TRUE(store.MatchExternalKv({"h1"}, false, kT0).empty()); - - RegisterAlive(store, "n1"); - EXPECT_TRUE(store.RegisterExternalKvIfAlive("n1", {"h1"}, TierType::HBM)); - EXPECT_EQ(store.MatchExternalKv({"h1"}, false, kT0).size(), 1u); -} - -TEST(InMemoryStore, UnregisterExternalKvAndByTier) { - InMemoryMasterMetadataStore store; - RegisterAlive(store, "n1"); - ASSERT_TRUE(store.RegisterExternalKvIfAlive("n1", {"h1"}, TierType::HBM)); - ASSERT_TRUE(store.RegisterExternalKvIfAlive("n1", {"h1"}, TierType::DRAM)); - - // Remove only the HBM tier; DRAM remains. - store.UnregisterExternalKv("n1", {"h1"}, TierType::HBM); - auto m = store.MatchExternalKv({"h1"}, false, kT0); - ASSERT_EQ(m.size(), 1u); - EXPECT_EQ(m[0].hashes_by_tier.count(TierType::HBM), 0u); - EXPECT_EQ(m[0].hashes_by_tier.count(TierType::DRAM), 1u); - - // Whole-tier wipe of DRAM → entry gone. - store.UnregisterExternalKvByTier("n1", TierType::DRAM); - EXPECT_TRUE(store.MatchExternalKv({"h1"}, false, kT0).empty()); -} - -TEST(InMemoryStore, MatchCountsHitsWhenRequested) { - InMemoryMasterMetadataStore store; - RegisterAlive(store, "n1"); - ASSERT_TRUE(store.RegisterExternalKvIfAlive("n1", {"h1", "h2"}, TierType::HBM)); - - // count_as_hit=false: pure read, hit map untouched. 
- store.MatchExternalKv({"h1", "h2"}, /*count_as_hit=*/false, kT0); - EXPECT_TRUE(store.GetExternalKvHitCounts({"h1", "h2"}).empty()); - - // count_as_hit=true: increments accumulate across calls. - store.MatchExternalKv({"h1", "h2"}, /*count_as_hit=*/true, kT0); - store.MatchExternalKv({"h1"}, /*count_as_hit=*/true, kT0 + 1s); - - auto counts = store.GetExternalKvHitCounts({"h1", "h2"}); - std::map by_hash; - for (const auto& e : counts) by_hash[e.hash] = e.hit_count_total; - EXPECT_EQ(by_hash["h1"], 2u); - EXPECT_EQ(by_hash["h2"], 1u); -} - -TEST(InMemoryStore, MatchedHashCountAcrossTiers) { - // Preserves the NodeMatch::MatchedHashCount coverage from - // test_external_kv_block_index.cpp:57 — one hash mirrored across two tiers - // counts once. - InMemoryMasterMetadataStore store; - RegisterAlive(store, "n1"); - ASSERT_TRUE(store.RegisterExternalKvIfAlive("n1", {"h1"}, TierType::HBM)); - ASSERT_TRUE(store.RegisterExternalKvIfAlive("n1", {"h1"}, TierType::DRAM)); - - auto m = store.MatchExternalKv({"h1"}, false, kT0); - ASSERT_EQ(m.size(), 1u); - EXPECT_EQ(m[0].hashes_by_tier.size(), 2u); // appears in two tier buckets - EXPECT_EQ(m[0].MatchedHashCount(), 1u); // but is one unique hash -} - -TEST(InMemoryStore, GetExternalKvHitCountsDedupesAndSkipsMissing) { - InMemoryMasterMetadataStore store; - RegisterAlive(store, "n1"); - ASSERT_TRUE(store.RegisterExternalKvIfAlive("n1", {"h1"}, TierType::HBM)); - store.MatchExternalKv({"h1"}, true, kT0); - - auto counts = store.GetExternalKvHitCounts({"missing", "h1", "h1"}); - ASSERT_EQ(counts.size(), 1u); - EXPECT_EQ(counts[0].hash, "h1"); - EXPECT_EQ(counts[0].hit_count_total, 1u); -} - -TEST(InMemoryStore, GarbageCollectHitsByLastSeen) { - InMemoryMasterMetadataStore store; - RegisterAlive(store, "n1"); - ASSERT_TRUE(store.RegisterExternalKvIfAlive("n1", {"old", "fresh"}, TierType::HBM)); - store.MatchExternalKv({"old"}, true, kT0); - store.MatchExternalKv({"fresh"}, true, kT0 + 100s); - - // Drop entries last seen 
before kT0+50s → only "old" goes. - EXPECT_EQ(store.GarbageCollectHits(kT0 + 50s), 1u); - - auto counts = store.GetExternalKvHitCounts({"old", "fresh"}); - ASSERT_EQ(counts.size(), 1u); - EXPECT_EQ(counts[0].hash, "fresh"); -} - -TEST(InMemoryStore, UnregisterExternalKvByNodeWipesAllTiersOnly) { - // Whole-node external-KV wipe (backs RevokeAllExternalKvBlocksForNode). Unlike - // UnregisterClient, it must NOT touch the client record or block locations. - InMemoryMasterMetadataStore store; - RegisterAlive(store, "n1"); - ASSERT_EQ(Beat(store, "n1", 1, {Add("k1", TierType::HBM, 10)}, kT0).status, - HeartbeatResult::APPLIED); - ASSERT_TRUE(store.RegisterExternalKvIfAlive("n1", {"h1", "h2"}, TierType::HBM)); - ASSERT_TRUE(store.RegisterExternalKvIfAlive("n1", {"h1"}, TierType::DRAM)); - ASSERT_EQ(store.GetExternalKvCount("n1"), 2u); - - store.UnregisterExternalKvByNode("n1"); - - // External KV gone across every tier. - EXPECT_EQ(store.GetExternalKvCount("n1"), 0u); - EXPECT_TRUE(store.MatchExternalKv({"h1", "h2"}, false, kT0).empty()); - - // Client record and block locations untouched (distinguishes from UnregisterClient). - EXPECT_TRUE(store.IsClientAlive("n1")); - EXPECT_EQ(store.LookupBlock("k1").size(), 1u); -} - -TEST(InMemoryStore, UnregisterExternalKvByNodeUnknownIsNoOp) { - InMemoryMasterMetadataStore store; - store.UnregisterExternalKvByNode("ghost"); // must not crash - EXPECT_EQ(store.GetExternalKvCount("ghost"), 0u); -} - -// --------------------------------------------------------------------------- -// Client reads — GetPeerAddress, GetClientTags, ListAliveClients content -// --------------------------------------------------------------------------- - -TEST(InMemoryStore, GetPeerAddressAliveExpiredAndUnknown) { - InMemoryMasterMetadataStore store; - RegisterAlive(store, "n1"); - - // ALIVE → peer surfaced (MakeReg sets peer:). 
- auto alive = store.GetPeerAddress("n1"); - ASSERT_TRUE(alive.has_value()); - EXPECT_EQ(*alive, "peer:n1"); - - // EXPIRED rows still surface their peer_address (contract: the row is kept). - ASSERT_EQ(store.ExpireStaleClients(kT0 + 10s).size(), 1u); - auto expired = store.GetPeerAddress("n1"); - ASSERT_TRUE(expired.has_value()); - EXPECT_EQ(*expired, "peer:n1"); - - // Unknown node → nullopt. - EXPECT_FALSE(store.GetPeerAddress("ghost").has_value()); -} - -TEST(InMemoryStore, GetClientTagsReturnsRegisteredTagsAndEmptyForUnknown) { - InMemoryMasterMetadataStore store; - RegisterAlive(store, "n1"); // MakeReg sets tags = {"role=test"} - - auto tags = store.GetClientTags("n1"); - ASSERT_EQ(tags.size(), 1u); - EXPECT_EQ(tags[0], "role=test"); - - EXPECT_TRUE(store.GetClientTags("ghost").empty()); -} - -TEST(InMemoryStore, ListAliveClientsReturnsAliveRecordsExcludingExpired) { - InMemoryMasterMetadataStore store; - RegisterAlive(store, "n1", kT0); - RegisterAlive(store, "n2", kT0 + 20s); // fresher, survives the cutoff below - - // Expire only n1. - ASSERT_EQ(store.ExpireStaleClients(kT0 + 10s).size(), 1u); - - auto alive = store.ListAliveClients(); - ASSERT_EQ(alive.size(), 1u); // n1 excluded even though its row still exists - EXPECT_EQ(alive[0].node_id, "n2"); - EXPECT_EQ(alive[0].status, ClientStatus::ALIVE); - EXPECT_EQ(alive[0].peer_address, "peer:n2"); -} - -// --------------------------------------------------------------------------- -// Concurrency -// --------------------------------------------------------------------------- - -TEST(InMemoryStore, ConcurrentHeartbeatCasExactlyOneApplied) { - InMemoryMasterMetadataStore store; - RegisterAlive(store, "n1"); - - std::atomic applied{0}; - std::atomic gap{0}; - std::atomic start{false}; - std::vector threads; - for (int t = 0; t < 2; ++t) { - threads.emplace_back([&] { - while (!start.load(std::memory_order_acquire)) std::this_thread::yield(); - // Both race to apply seq=1 (last_applied starts at 0). 
- auto r = store.ApplyHeartbeat("n1", 1, kT0, Caps(), {}, /*is_full_sync=*/false); - if (r.status == HeartbeatResult::APPLIED) { - applied.fetch_add(1); - } else if (r.status == HeartbeatResult::SEQ_GAP) { - gap.fetch_add(1); - } - }); - } - start.store(true, std::memory_order_release); - for (auto& th : threads) th.join(); - - EXPECT_EQ(applied.load(), 1); - EXPECT_EQ(gap.load(), 1); - EXPECT_EQ(store.GetClient("n1")->last_applied_seq, 1u); -} - -// ThreadSanitizer safety net for collapsing four lock domains into one: a mixed -// read/write workload across the shared/unique split must be race-free. -TEST(InMemoryStore, MixedWorkloadIsRaceFree) { - InMemoryMasterMetadataStore store; - RegisterAlive(store, "n1"); - for (int i = 0; i < 100; ++i) { - store.ApplyHeartbeat("n1", i + 1, kT0, Caps(), - {Add("k" + std::to_string(i), TierType::HBM, 10)}, - /*is_full_sync=*/false); - } - ASSERT_TRUE(store.RegisterExternalKvIfAlive("n1", {"h1", "h2", "h3"}, TierType::HBM)); - - std::atomic start{false}; - std::vector threads; - - // RouteGet readers (shared-lock path with atomic lease/access mutation). - for (int r = 0; r < 4; ++r) { - threads.emplace_back([&] { - while (!start.load(std::memory_order_acquire)) std::this_thread::yield(); - for (int i = 0; i < 500; ++i) { - store.BatchLookupBlockForRouteGet({"k1", "k50", "k99"}, {}, kT0 + std::chrono::seconds(i), - 30s); - store.BatchExistsBlock({"k1", "k2"}); - } - }); - } - // Hit writers (the formerly-shared path that becomes exclusive). - threads.emplace_back([&] { - while (!start.load(std::memory_order_acquire)) std::this_thread::yield(); - for (int i = 0; i < 500; ++i) { - store.MatchExternalKv({"h1", "h2", "h3"}, /*count_as_hit=*/true, - kT0 + std::chrono::seconds(i)); - } - }); - // Eviction-enumeration reader. 
- threads.emplace_back([&] { - while (!start.load(std::memory_order_acquire)) std::this_thread::yield(); - for (int i = 0; i < 500; ++i) { - store.EnumerateLruForEviction(Budget("n1", TierType::HBM, 50), kT0 + std::chrono::seconds(i)); - } - }); - - start.store(true, std::memory_order_release); - for (auto& th : threads) th.join(); - - // After the storm, hit counts reflect exactly the 500 hit-writer iterations. - auto counts = store.GetExternalKvHitCounts({"h1"}); - ASSERT_EQ(counts.size(), 1u); - EXPECT_EQ(counts[0].hit_count_total, 500u); -} - -} // namespace -} // namespace mori::umbp diff --git a/src/umbp/tests/test_master_metadata_store_interface.cpp b/src/umbp/tests/test_master_metadata_store_interface.cpp deleted file mode 100644 index ed82a3d2a..000000000 --- a/src/umbp/tests/test_master_metadata_store_interface.cpp +++ /dev/null @@ -1,133 +0,0 @@ -// Copyright © Advanced Micro Devices, Inc. All rights reserved. -// -// MIT License -// -// Permission is hereby granted, free of charge, to any person obtaining a copy -// of this software and associated documentation files (the "Software"), to deal -// in the Software without restriction, including without limitation the rights -// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -// copies of the Software, and to permit persons to whom the Software is -// furnished to do so, subject to the following conditions: -// -// The above copyright notice and this permission notice shall be included in all -// copies or substantial portions of the Software. -// -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE -// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -// SOFTWARE. - -// Phase 1 compile/instantiation gate for IMasterMetadataStore. -// -// The interface is abstract with no implementation yet, so there is no runtime -// behavior to exercise. The bar for Phase 1 is that the contract is well-formed -// and instantiable: -// 1. MockMasterMetadataStore overrides every pure-virtual (a missing or -// ill-typed override makes the mock abstract → fails to instantiate). -// 2. A MockMasterMetadataStore is usable through an IMasterMetadataStore&, -// proving the override set is complete. -// Behavioral assertions arrive with InMemoryMasterMetadataStore in Phase 2. - -#include - -#include - -#include "mock_master_metadata_store.h" -#include "umbp/distributed/master/master_metadata_store.h" - -namespace mori::umbp { -namespace { - -// Instantiation gate: if the interface had an orphaned or ill-typed pure -// virtual, MockMasterMetadataStore would stay abstract and this would not -// compile. -TEST(MasterMetadataStoreInterface, MockIsInstantiableThroughInterface) { - MockMasterMetadataStore mock; - IMasterMetadataStore& store = mock; - (void)store; - SUCCEED(); -} - -// Signature-completeness spot check: name every interface method once through -// the base-class pointer with a default ON_CALL, mirroring the §1b delta table -// plus the two added hit-count methods (GetExternalKvHitCounts, -// GarbageCollectHits) and the `now` parameter on MatchExternalKv. This guards -// against silently dropping the live GetExternalKvHitCounts RPC path. 
-TEST(MasterMetadataStoreInterface, EveryMethodIsCallableThroughInterface) { - using ::testing::_; - using ::testing::NiceMock; - using ::testing::Return; - using namespace std::chrono_literals; - - // NiceMock: these are default-action calls, not behavior under test, so the - // "uninteresting call" warnings would just be noise. - NiceMock mock; - const auto now = std::chrono::system_clock::now(); - - ON_CALL(mock, RegisterClient(_, _, _)).WillByDefault(Return(true)); - ON_CALL(mock, ApplyHeartbeat(_, _, _, _, _, _)) - .WillByDefault(Return(HeartbeatResult{HeartbeatResult::APPLIED, 0})); - ON_CALL(mock, ExpireStaleClients(_)).WillByDefault(Return(std::vector{})); - ON_CALL(mock, RegisterExternalKvIfAlive(_, _, _)).WillByDefault(Return(true)); - ON_CALL(mock, GarbageCollectHits(_)).WillByDefault(Return(0)); - ON_CALL(mock, LookupBlock(_)).WillByDefault(Return(std::vector{})); - ON_CALL(mock, LookupBlockForRouteGet(_, _, _, _)).WillByDefault(Return(std::vector{})); - ON_CALL(mock, BatchLookupBlockForRouteGet(_, _, _, _)) - .WillByDefault(Return(std::vector>{})); - ON_CALL(mock, BatchExistsBlock(_)).WillByDefault(Return(std::vector{})); - ON_CALL(mock, EnumerateLruForEviction(_, _)) - .WillByDefault(Return(std::map>{})); - ON_CALL(mock, GetClient(_)).WillByDefault(Return(std::nullopt)); - ON_CALL(mock, IsClientAlive(_)).WillByDefault(Return(false)); - ON_CALL(mock, GetPeerAddress(_)).WillByDefault(Return(std::nullopt)); - ON_CALL(mock, ListAliveClients()).WillByDefault(Return(std::vector{})); - ON_CALL(mock, AliveClientCount()).WillByDefault(Return(0)); - ON_CALL(mock, GetClientTags(_)).WillByDefault(Return(std::vector{})); - ON_CALL(mock, MatchExternalKv(_, _, _)).WillByDefault(Return(std::vector{})); - ON_CALL(mock, GetExternalKvHitCounts(_)) - .WillByDefault(Return(std::vector{})); - ON_CALL(mock, GetExternalKvCount(_)).WillByDefault(Return(0)); - - IMasterMetadataStore& store = mock; - - // Cross-store writes. 
- ClientRegistration reg; - reg.node_id = "node-a"; - EXPECT_TRUE(store.RegisterClient(reg, now, 30s)); - store.UnregisterClient("node-a"); - EXPECT_EQ(store.ApplyHeartbeat("node-a", 1, now, {}, {}, false).status, HeartbeatResult::APPLIED); - EXPECT_TRUE(store.ExpireStaleClients(now).empty()); - - // External-KV writes. - EXPECT_TRUE(store.RegisterExternalKvIfAlive("node-a", {"h0"}, TierType::HBM)); - store.UnregisterExternalKv("node-a", {"h0"}, TierType::HBM); - store.UnregisterExternalKvByTier("node-a", TierType::HBM); - store.UnregisterExternalKvByNode("node-a"); - EXPECT_EQ(store.GarbageCollectHits(now), 0u); - - // Block reads. - EXPECT_TRUE(store.LookupBlock("k0").empty()); - EXPECT_TRUE(store.LookupBlockForRouteGet("k0", {}, now, 5s).empty()); - EXPECT_TRUE(store.BatchLookupBlockForRouteGet({"k0"}, {}, now, 5s).empty()); - EXPECT_TRUE(store.BatchExistsBlock({"k0"}).empty()); - EXPECT_TRUE(store.EnumerateLruForEviction({}, now).empty()); - - // Client reads. - EXPECT_FALSE(store.GetClient("node-a").has_value()); - EXPECT_FALSE(store.IsClientAlive("node-a")); - EXPECT_FALSE(store.GetPeerAddress("node-a").has_value()); - EXPECT_TRUE(store.ListAliveClients().empty()); - EXPECT_EQ(store.AliveClientCount(), 0u); - EXPECT_TRUE(store.GetClientTags("node-a").empty()); - - // External-KV reads, incl. the two added hit-count methods + `now` param. - EXPECT_TRUE(store.MatchExternalKv({"h0"}, /*count_as_hit=*/true, now).empty()); - EXPECT_TRUE(store.GetExternalKvHitCounts({"h0"}).empty()); - EXPECT_EQ(store.GetExternalKvCount("node-a"), 0u); -} - -} // namespace -} // namespace mori::umbp diff --git a/src/umbp/tests/test_peer_dram_allocator.cpp b/src/umbp/tests/test_peer_dram_allocator.cpp deleted file mode 100644 index f76b43a5c..000000000 --- a/src/umbp/tests/test_peer_dram_allocator.cpp +++ /dev/null @@ -1,899 +0,0 @@ -// Copyright © Advanced Micro Devices, Inc. All rights reserved. 
-// -// MIT License -// -// Permission is hereby granted, free of charge, to any person obtaining a copy -// of this software and associated documentation files (the "Software"), to deal -// in the Software without restriction, including without limitation the rights -// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -// copies of the Software, and to permit persons to whom the Software is -// furnished to do so, subject to the following conditions: -// -// The above copyright notice and this permission notice shall be included in all -// copies or substantial portions of the Software. -// -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -// SOFTWARE. -#include - -#include -#include -#include -#include -#include -#include -#include - -#include "umbp/distributed/peer/peer_dram_allocator.h" - -namespace mori::umbp { - -namespace { - -// 3 buffers x 4 pages of 1 KiB = 12 KiB total DRAM. 
-constexpr uint64_t kPageSize = 1024; - -PeerDramAllocator::TierConfig MakeDramCfg() { - PeerDramAllocator::TierConfig cfg; - cfg.buffer_sizes = {kPageSize * 4, kPageSize * 4, kPageSize * 4}; - cfg.buffer_descs = {{0xA0, 0xA1}, {0xB0, 0xB1}, {0xC0, 0xC1}}; - return cfg; -} - -PeerDramAllocator::TierConfig EmptyCfg() { return {}; } - -std::unique_ptr MakeAllocator( - std::chrono::milliseconds pending_ttl = std::chrono::milliseconds{5000}, - std::chrono::milliseconds read_lease_ttl = std::chrono::milliseconds{500}) { - return std::make_unique(kPageSize, MakeDramCfg(), EmptyCfg(), pending_ttl, - read_lease_ttl); -} - -// Strip AllocateResult down to its slot for tests that don't exercise -// the dedup outcome. -std::optional AllocateOk(PeerDramAllocator& a, - const std::string& key, uint64_t size, - TierType tier) { - return a.Allocate(key, size, tier).slot; -} - -} // namespace - -// ---- Allocate / Commit / Resolve happy path --------------------------------- - -TEST(PeerDramAllocator, CommitMakesKeyResolvable) { - auto a = MakeAllocator(); - auto pending = AllocateOk(*a, "key-1", kPageSize, TierType::DRAM); - ASSERT_TRUE(pending.has_value()); - EXPECT_EQ(pending->size, kPageSize); - EXPECT_EQ(pending->pages.size(), 1u); - - uint64_t committed_bytes = 0; - ASSERT_TRUE(a->Commit(pending->slot_id, "key-1", committed_bytes)); - EXPECT_EQ(committed_bytes, pending->size); - auto r = a->Resolve("key-1"); - EXPECT_TRUE(r.found); - EXPECT_EQ(r.size, kPageSize); - EXPECT_EQ(r.tier, TierType::DRAM); - EXPECT_EQ(r.pages, pending->pages); - - auto events = a->DrainPendingEvents(); - ASSERT_EQ(events.size(), 1u); - EXPECT_EQ(events[0].kind, KvEvent::Kind::ADD); - EXPECT_EQ(events[0].key, "key-1"); - EXPECT_EQ(events[0].size, kPageSize); - EXPECT_EQ(events[0].tier, TierType::DRAM); -} - -// ---- Allocate-side dedup ---------------------------------------------------- -// Defensive layer for master-index lag (primary dedup is at BatchRoutePut). 
- -TEST(PeerDramAllocator, AllocateRejectsAlreadyOwnedKey) { - auto a = MakeAllocator(); - - auto first = AllocateOk(*a, "A", kPageSize, TierType::DRAM); - ASSERT_TRUE(first.has_value()); - uint64_t committed_bytes = 0; - ASSERT_TRUE(a->Commit(first->slot_id, "A", committed_bytes)); - a->DrainPendingEvents(); - - const auto cap_after_commit = a->TierCapacitiesSnapshot()[TierType::DRAM]; - - auto second = a->Allocate("A", kPageSize, TierType::DRAM); - EXPECT_EQ(second.outcome, PeerDramAllocator::Outcome::kSuccessAlreadyExists); - EXPECT_FALSE(second.slot.has_value()); - - // No pages reserved -> capacity unchanged. - const auto cap_after_dedup = a->TierCapacitiesSnapshot()[TierType::DRAM]; - EXPECT_EQ(cap_after_dedup.available_bytes, cap_after_commit.available_bytes); -} - -TEST(PeerDramAllocator, AllocateAllowsDifferentKey) { - auto a = MakeAllocator(); - - auto first = AllocateOk(*a, "A", kPageSize, TierType::DRAM); - ASSERT_TRUE(first.has_value()); - uint64_t committed_bytes = 0; - ASSERT_TRUE(a->Commit(first->slot_id, "A", committed_bytes)); - - auto second = a->Allocate("B", kPageSize, TierType::DRAM); - EXPECT_EQ(second.outcome, PeerDramAllocator::Outcome::kSuccessAllocated); - ASSERT_TRUE(second.slot.has_value()); - EXPECT_TRUE(a->Commit(second.slot->slot_id, "B", committed_bytes)); -} - -// Lax mode: pending_ not checked. Two same-key Allocates before any -// Commit both succeed; race absorbed by Commit() (see -// DuplicateCommitIsIdempotentAndKeepsFirst). 
-TEST(PeerDramAllocator, AllocateDoesNotRejectOnPendingDuplicate) { - auto a = MakeAllocator(); - - auto first = a->Allocate("A", kPageSize, TierType::DRAM); - EXPECT_EQ(first.outcome, PeerDramAllocator::Outcome::kSuccessAllocated); - ASSERT_TRUE(first.slot.has_value()); - - auto second = a->Allocate("A", kPageSize, TierType::DRAM); - EXPECT_EQ(second.outcome, PeerDramAllocator::Outcome::kSuccessAllocated); - ASSERT_TRUE(second.slot.has_value()); - ASSERT_NE(second.slot->slot_id, first.slot->slot_id); -} - -// ---- Duplicate Commit idempotency ------------------------------------------- -// Race-window safety net. Both Allocates must happen BEFORE either -// Commit — once owned_["dup-key"] is set, the new owned_-check in -// Allocate would reject the second slot before it could reach Commit. - -TEST(PeerDramAllocator, DuplicateCommitIsIdempotentAndKeepsFirst) { - auto a = MakeAllocator(); - - auto first = AllocateOk(*a, "dup-key", kPageSize, TierType::DRAM); - ASSERT_TRUE(first.has_value()); - auto second = AllocateOk(*a, "dup-key", kPageSize, TierType::DRAM); - ASSERT_TRUE(second.has_value()); - ASSERT_NE(second->slot_id, first->slot_id); - - const auto first_pages = first->pages; - - uint64_t committed_bytes = 0; - ASSERT_TRUE(a->Commit(first->slot_id, "dup-key", committed_bytes)); - EXPECT_EQ(committed_bytes, kPageSize); - - auto events = a->DrainPendingEvents(); - ASSERT_EQ(events.size(), 1u); - EXPECT_EQ(events[0].kind, KvEvent::Kind::ADD); - EXPECT_EQ(events[0].key, "dup-key"); - - // First owned (1 page) + second still pending (1 page) = 2 occupied. - const auto cap_after_first_commit = a->TierCapacitiesSnapshot()[TierType::DRAM]; - EXPECT_EQ(cap_after_first_commit.available_bytes, - cap_after_first_commit.total_bytes - 2 * kPageSize); - - // Duplicate Commit: idempotent success, consumes the second pending - // (caller never needs to Abort it), prior owned slot unchanged. 
- committed_bytes = 0; - ASSERT_TRUE(a->Commit(second->slot_id, "dup-key", committed_bytes)); - EXPECT_EQ(committed_bytes, kPageSize); - - // Master's view unchanged: no REMOVE, no second ADD. - EXPECT_TRUE(a->DrainPendingEvents().empty()); - - // Resolve still returns the first commit's pages. - auto r = a->Resolve("dup-key"); - ASSERT_TRUE(r.found); - EXPECT_EQ(r.pages, first_pages); - EXPECT_EQ(r.size, kPageSize); - - // Second slot's pages freed -> only first occupies (1 page). - const auto cap_after_dup = a->TierCapacitiesSnapshot()[TierType::DRAM]; - EXPECT_EQ(cap_after_dup.available_bytes, cap_after_dup.total_bytes - kPageSize); - EXPECT_EQ(cap_after_dup.total_bytes, cap_after_first_commit.total_bytes); - - // Second slot_id no longer pending; idempotent Abort returns true. - EXPECT_TRUE(a->Abort(second->slot_id)); - EXPECT_TRUE(a->DrainPendingEvents().empty()); -} - -// ---- ENOSPC ----------------------------------------------------------------- - -TEST(PeerDramAllocator, AllocateReturnsNulloptWhenFull) { - auto a = MakeAllocator(); - std::vector slot_ids; - for (int i = 0; i < 12; ++i) { - auto p = AllocateOk(*a, "k-" + std::to_string(i), kPageSize, TierType::DRAM); - ASSERT_TRUE(p.has_value()) << "i=" << i; - slot_ids.push_back(p->slot_id); - } - EXPECT_FALSE(AllocateOk(*a, "k-overflow", kPageSize, TierType::DRAM).has_value()); - - EXPECT_TRUE(a->Abort(slot_ids.back())); - EXPECT_TRUE(AllocateOk(*a, "k-recovered", kPageSize, TierType::DRAM).has_value()); -} - -TEST(PeerDramAllocator, UnconfiguredTierReturnsNullopt) { - auto a = MakeAllocator(); - EXPECT_FALSE(AllocateOk(*a, "k", kPageSize, TierType::HBM).has_value()); -} - -// ---- Pending TTL ------------------------------------------------------------ - -TEST(PeerDramAllocator, PendingSlotExpiresAfterTtl) { - auto a = std::make_unique(kPageSize, MakeDramCfg(), EmptyCfg(), - /*pending_ttl=*/std::chrono::milliseconds{1}); - auto pending = AllocateOk(*a, "key-late", kPageSize, TierType::DRAM); - 
ASSERT_TRUE(pending.has_value()); - - std::this_thread::sleep_for(std::chrono::milliseconds{20}); - a->RunReaperOnceForTest(); - - uint64_t committed_bytes = 0; - EXPECT_FALSE(a->Commit(pending->slot_id, "key-late", committed_bytes)); - EXPECT_EQ(committed_bytes, 0u); - EXPECT_TRUE(a->DrainPendingEvents().empty()); - - auto cap = a->TierCapacitiesSnapshot(); - EXPECT_EQ(cap[TierType::DRAM].available_bytes, cap[TierType::DRAM].total_bytes); -} - -// ---- Abort idempotency ------------------------------------------------------ - -TEST(PeerDramAllocator, AbortIsIdempotent) { - auto a = MakeAllocator(); - auto pending = AllocateOk(*a, "k", kPageSize, TierType::DRAM); - ASSERT_TRUE(pending.has_value()); - EXPECT_TRUE(a->Abort(pending->slot_id)); - EXPECT_TRUE(a->Abort(pending->slot_id)); - EXPECT_TRUE(a->Abort(999999)); - EXPECT_TRUE(a->DrainPendingEvents().empty()); -} - -// ---- Evict idempotency + REMOVE event --------------------------------------- - -TEST(PeerDramAllocator, EvictRemovesKeyAndQueuesEvent) { - auto a = MakeAllocator(); - auto p = AllocateOk(*a, "k", kPageSize, TierType::DRAM); - uint64_t committed_bytes = 0; - ASSERT_TRUE(a->Commit(p->slot_id, "k", committed_bytes)); - EXPECT_EQ(committed_bytes, p->size); - a->DrainPendingEvents(); - - auto results = a->Evict({"k", "ghost"}); - ASSERT_EQ(results.size(), 2u); - EXPECT_EQ(results[0].key, "k"); - EXPECT_EQ(results[0].bytes_freed, kPageSize); - EXPECT_EQ(results[1].key, "ghost"); - EXPECT_EQ(results[1].bytes_freed, 0u); - - auto events = a->DrainPendingEvents(); - ASSERT_EQ(events.size(), 1u); - EXPECT_EQ(events[0].kind, KvEvent::Kind::REMOVE); - EXPECT_EQ(events[0].key, "k"); - - EXPECT_FALSE(a->Resolve("k").found); - - results = a->Evict({"k"}); - EXPECT_EQ(results[0].bytes_freed, 0u); - EXPECT_TRUE(a->DrainPendingEvents().empty()); -} - -// ---- Resolve-during-Evict race --------------------------------------------- - -TEST(PeerDramAllocator, EvictDefersWhenReadLeaseActive) { - auto a = 
std::make_unique(kPageSize, MakeDramCfg(), EmptyCfg(), - /*pending_ttl=*/std::chrono::milliseconds{5000}, - /*read_lease_ttl=*/std::chrono::milliseconds{200}); - auto p = AllocateOk(*a, "k", kPageSize, TierType::DRAM); - uint64_t committed_bytes = 0; - ASSERT_TRUE(a->Commit(p->slot_id, "k", committed_bytes)); - EXPECT_EQ(committed_bytes, p->size); - a->DrainPendingEvents(); - - auto r = a->Resolve("k"); - ASSERT_TRUE(r.found); - - auto results = a->Evict({"k"}); - EXPECT_EQ(results[0].bytes_freed, 0u); - EXPECT_TRUE(a->Resolve("k").found); - EXPECT_TRUE(a->DrainPendingEvents().empty()); - - std::this_thread::sleep_for(std::chrono::milliseconds{300}); - a->RunReaperOnceForTest(); - results = a->Evict({"k"}); - EXPECT_EQ(results[0].bytes_freed, kPageSize); - auto events = a->DrainPendingEvents(); - ASSERT_EQ(events.size(), 1u); - EXPECT_EQ(events[0].kind, KvEvent::Kind::REMOVE); -} - -// ---- Full-sync snapshot ----------------------------------------------------- - -TEST(PeerDramAllocator, SnapshotOwnedKeysReturnsEveryAdd) { - auto a = MakeAllocator(); - for (int i = 0; i < 5; ++i) { - const std::string k = "k-" + std::to_string(i); - auto p = AllocateOk(*a, k, kPageSize, TierType::DRAM); - ASSERT_TRUE(p.has_value()); - uint64_t committed_bytes = 0; - ASSERT_TRUE(a->Commit(p->slot_id, k, committed_bytes)); - EXPECT_EQ(committed_bytes, p->size); - } - a->DrainPendingEvents(); - - auto snap = a->SnapshotOwnedKeys(); - ASSERT_EQ(snap.size(), 5u); - for (const auto& ev : snap) { - EXPECT_EQ(ev.kind, KvEvent::Kind::ADD); - EXPECT_EQ(ev.size, kPageSize); - EXPECT_EQ(ev.tier, TierType::DRAM); - } - EXPECT_TRUE(a->DrainPendingEvents().empty()); -} - -// ---- Buffer descs filtered to the page set --------------------------------- - -TEST(PeerDramAllocator, BufferDescsForPagesDedupAndOrder) { - auto a = MakeAllocator(); - auto p = AllocateOk(*a, "k", kPageSize * 5, TierType::DRAM); - ASSERT_TRUE(p.has_value()); - ASSERT_EQ(p->pages.size(), 5u); - - auto descs = 
a->BufferDescsForPages(TierType::DRAM, p->pages); - ASSERT_EQ(descs.size(), 2u); - EXPECT_EQ(descs[0].buffer_index, 0u); - EXPECT_EQ(descs[1].buffer_index, 1u); - EXPECT_EQ(descs[0].desc_bytes, std::vector({0xA0, 0xA1})); - EXPECT_EQ(descs[1].desc_bytes, std::vector({0xB0, 0xB1})); -} - -// ---- BatchAllocate / BatchCommit / BatchAbort ------------------------------- - -TEST(PeerDramAllocator, BatchAllocateEmptyInputReturnsEmpty) { - auto a = MakeAllocator(); - EXPECT_TRUE(a->BatchAllocate({}).empty()); -} - -TEST(PeerDramAllocator, BatchAllocateMixedOutcomesAndDescs) { - auto a = MakeAllocator(); - auto owned = AllocateOk(*a, "owned", kPageSize, TierType::DRAM); - ASSERT_TRUE(owned.has_value()); - uint64_t committed_bytes = 0; - ASSERT_TRUE(a->Commit(owned->slot_id, "owned", committed_bytes)); - a->DrainPendingEvents(); - - std::vector requests; - requests.push_back({"owned", kPageSize, TierType::DRAM}); - requests.push_back({"ok", kPageSize * 5, TierType::DRAM}); - requests.push_back({"bad-tier", kPageSize, TierType::HBM}); - requests.push_back({"zero", 0, TierType::DRAM}); - requests.push_back({"too-big", kPageSize * 20, TierType::DRAM}); - - auto results = a->BatchAllocate(requests); - ASSERT_EQ(results.size(), requests.size()); - - EXPECT_EQ(results[0].outcome, PeerDramAllocator::Outcome::kSuccessAlreadyExists); - EXPECT_FALSE(results[0].slot.has_value()); - EXPECT_TRUE(results[0].descs.empty()); - - EXPECT_EQ(results[1].outcome, PeerDramAllocator::Outcome::kSuccessAllocated); - ASSERT_TRUE(results[1].slot.has_value()); - EXPECT_EQ(results[1].slot->size, kPageSize * 5); - EXPECT_EQ(results[1].slot->pages.size(), 5u); - ASSERT_EQ(results[1].descs.size(), 2u); - EXPECT_EQ(results[1].descs[0].buffer_index, 0u); - EXPECT_EQ(results[1].descs[1].buffer_index, 1u); - - EXPECT_EQ(results[2].outcome, PeerDramAllocator::Outcome::kFailed); - EXPECT_FALSE(results[2].slot.has_value()); - EXPECT_EQ(results[3].outcome, PeerDramAllocator::Outcome::kFailed); - 
EXPECT_FALSE(results[3].slot.has_value()); - EXPECT_EQ(results[4].outcome, PeerDramAllocator::Outcome::kFailedNoSpace); - EXPECT_FALSE(results[4].slot.has_value()); -} - -TEST(PeerDramAllocator, BatchCommitMixedSuccessAndFailure) { - auto a = MakeAllocator(); - auto allocated = a->BatchAllocate({ - {"dup", kPageSize, TierType::DRAM}, - {"dup", kPageSize * 2, TierType::DRAM}, - {"unique", kPageSize, TierType::DRAM}, - }); - ASSERT_EQ(allocated.size(), 3u); - ASSERT_TRUE(allocated[0].slot.has_value()); - ASSERT_TRUE(allocated[1].slot.has_value()); - ASSERT_TRUE(allocated[2].slot.has_value()); - - auto committed = a->BatchCommit({ - {allocated[0].slot->slot_id, "dup"}, - {999999, "missing"}, - {allocated[1].slot->slot_id, "dup"}, - {allocated[2].slot->slot_id, "unique"}, - }); - ASSERT_EQ(committed.size(), 4u); - EXPECT_TRUE(committed[0].success); - EXPECT_EQ(committed[0].bytes_committed, kPageSize); - EXPECT_FALSE(committed[1].success); - EXPECT_EQ(committed[1].bytes_committed, 0u); - EXPECT_TRUE(committed[2].success); - EXPECT_EQ(committed[2].bytes_committed, kPageSize); - EXPECT_TRUE(committed[3].success); - EXPECT_EQ(committed[3].bytes_committed, kPageSize); - - auto dup = a->Resolve("dup"); - ASSERT_TRUE(dup.found); - EXPECT_EQ(dup.pages, allocated[0].slot->pages); - EXPECT_EQ(dup.size, kPageSize); - auto unique = a->Resolve("unique"); - ASSERT_TRUE(unique.found); - EXPECT_EQ(unique.size, kPageSize); - - auto events = a->DrainPendingEvents(); - ASSERT_EQ(events.size(), 2u); - EXPECT_EQ(events[0].kind, KvEvent::Kind::ADD); - EXPECT_EQ(events[0].key, "dup"); - EXPECT_EQ(events[1].kind, KvEvent::Kind::ADD); - EXPECT_EQ(events[1].key, "unique"); -} - -TEST(PeerDramAllocator, BatchAbortMixedSlotsIsIdempotent) { - auto a = MakeAllocator(); - auto allocated = a->BatchAllocate({ - {"drop", kPageSize, TierType::DRAM}, - {"keep", kPageSize, TierType::DRAM}, - }); - ASSERT_EQ(allocated.size(), 2u); - ASSERT_TRUE(allocated[0].slot.has_value()); - 
ASSERT_TRUE(allocated[1].slot.has_value()); - - auto aborted = a->BatchAbort({allocated[0].slot->slot_id, 999999}); - ASSERT_EQ(aborted.size(), 2u); - EXPECT_TRUE(aborted[0]); - EXPECT_TRUE(aborted[1]); - - uint64_t committed_bytes = 0; - EXPECT_FALSE(a->Commit(allocated[0].slot->slot_id, "drop", committed_bytes)); - EXPECT_TRUE(a->Commit(allocated[1].slot->slot_id, "keep", committed_bytes)); - EXPECT_EQ(committed_bytes, kPageSize); - EXPECT_TRUE(a->Resolve("keep").found); -} - -// ---- BatchResolve ---------------------------------------------------------- - -TEST(PeerDramAllocator, BatchResolveEmptyInputReturnsEmpty) { - auto a = MakeAllocator(); - EXPECT_TRUE(a->BatchResolve({}).empty()); -} - -TEST(PeerDramAllocator, BatchResolveMixedHitsAndMisses) { - auto a = MakeAllocator(); - // 5 pages over 4-pages-per-buffer config -> exercises dedup'd descs. - auto p_hit = AllocateOk(*a, "hit", kPageSize * 5, TierType::DRAM); - ASSERT_TRUE(p_hit.has_value()); - uint64_t committed_bytes = 0; - ASSERT_TRUE(a->Commit(p_hit->slot_id, "hit", committed_bytes)); - auto p_small = AllocateOk(*a, "small", kPageSize, TierType::DRAM); - ASSERT_TRUE(p_small.has_value()); - ASSERT_TRUE(a->Commit(p_small->slot_id, "small", committed_bytes)); - a->DrainPendingEvents(); - - auto ref_hit = a->Resolve("hit"); - auto ref_descs_hit = a->BufferDescsForPages(ref_hit.tier, ref_hit.pages); - auto ref_small = a->Resolve("small"); - auto ref_descs_small = a->BufferDescsForPages(ref_small.tier, ref_small.pages); - ASSERT_TRUE(ref_hit.found); - ASSERT_TRUE(ref_small.found); - - auto results = a->BatchResolve({"hit", "ghost-a", "small", "ghost-b"}); - ASSERT_EQ(results.size(), 4u); - - EXPECT_TRUE(results[0].found); - EXPECT_EQ(results[0].tier, ref_hit.tier); - EXPECT_EQ(results[0].pages, ref_hit.pages); - EXPECT_EQ(results[0].size, ref_hit.size); - ASSERT_EQ(results[0].descs.size(), ref_descs_hit.size()); - for (size_t i = 0; i < ref_descs_hit.size(); ++i) { - 
EXPECT_EQ(results[0].descs[i].buffer_index, ref_descs_hit[i].buffer_index); - EXPECT_EQ(results[0].descs[i].desc_bytes, ref_descs_hit[i].desc_bytes); - } - - EXPECT_FALSE(results[1].found); - EXPECT_TRUE(results[1].pages.empty()); - EXPECT_EQ(results[1].size, 0u); - EXPECT_TRUE(results[1].descs.empty()); - - EXPECT_TRUE(results[2].found); - EXPECT_EQ(results[2].tier, ref_small.tier); - EXPECT_EQ(results[2].pages, ref_small.pages); - EXPECT_EQ(results[2].size, ref_small.size); - ASSERT_EQ(results[2].descs.size(), ref_descs_small.size()); - for (size_t i = 0; i < ref_descs_small.size(); ++i) { - EXPECT_EQ(results[2].descs[i].buffer_index, ref_descs_small[i].buffer_index); - EXPECT_EQ(results[2].descs[i].desc_bytes, ref_descs_small[i].desc_bytes); - } - - EXPECT_FALSE(results[3].found); -} - -TEST(PeerDramAllocator, BatchResolveExtendsLeaseForHitsOnly) { - auto a = std::make_unique(kPageSize, MakeDramCfg(), EmptyCfg(), - /*pending_ttl=*/std::chrono::milliseconds{5000}, - /*read_lease_ttl=*/std::chrono::milliseconds{500}); - auto p_x = AllocateOk(*a, "x", kPageSize, TierType::DRAM); - ASSERT_TRUE(p_x.has_value()); - uint64_t committed_bytes = 0; - ASSERT_TRUE(a->Commit(p_x->slot_id, "x", committed_bytes)); - auto p_y = AllocateOk(*a, "y", kPageSize, TierType::DRAM); - ASSERT_TRUE(p_y.has_value()); - ASSERT_TRUE(a->Commit(p_y->slot_id, "y", committed_bytes)); - a->DrainPendingEvents(); - - auto results = a->BatchResolve({"x", "missing", "y"}); - ASSERT_EQ(results.size(), 3u); - ASSERT_TRUE(results[0].found); - ASSERT_FALSE(results[1].found); - ASSERT_TRUE(results[2].found); - - auto evict = a->Evict({"x", "y"}); - ASSERT_EQ(evict.size(), 2u); - EXPECT_EQ(evict[0].bytes_freed, 0u); - EXPECT_EQ(evict[1].bytes_freed, 0u); - EXPECT_TRUE(a->Resolve("x").found); - EXPECT_TRUE(a->Resolve("y").found); - EXPECT_TRUE(a->DrainPendingEvents().empty()); - - // Miss must not poison read_lease_until_: a subsequent - // Allocate+Commit+Evict on the same key must free as if never 
touched. - auto p_miss = AllocateOk(*a, "missing", kPageSize, TierType::DRAM); - ASSERT_TRUE(p_miss.has_value()); - ASSERT_TRUE(a->Commit(p_miss->slot_id, "missing", committed_bytes)); - a->DrainPendingEvents(); - auto evict_missing = a->Evict({"missing"}); - ASSERT_EQ(evict_missing.size(), 1u); - EXPECT_EQ(evict_missing[0].bytes_freed, kPageSize); -} - -TEST(PeerDramAllocator, BatchResolveLeaseExpiresLikeSingleKeyResolve) { - auto a = std::make_unique(kPageSize, MakeDramCfg(), EmptyCfg(), - /*pending_ttl=*/std::chrono::milliseconds{5000}, - /*read_lease_ttl=*/std::chrono::milliseconds{50}); - auto p = AllocateOk(*a, "k", kPageSize, TierType::DRAM); - ASSERT_TRUE(p.has_value()); - uint64_t committed_bytes = 0; - ASSERT_TRUE(a->Commit(p->slot_id, "k", committed_bytes)); - a->DrainPendingEvents(); - - auto results = a->BatchResolve({"k"}); - ASSERT_EQ(results.size(), 1u); - ASSERT_TRUE(results[0].found); - - EXPECT_EQ(a->Evict({"k"})[0].bytes_freed, 0u); - - std::this_thread::sleep_for(std::chrono::milliseconds{100}); - auto evicted = a->Evict({"k"}); - ASSERT_EQ(evicted.size(), 1u); - EXPECT_EQ(evicted[0].bytes_freed, kPageSize); -} - -// ---- Capacities snapshot ---------------------------------------------------- - -TEST(PeerDramAllocator, TierCapacitiesReflectAllocations) { - auto a = MakeAllocator(); - auto cap0 = a->TierCapacitiesSnapshot(); - ASSERT_EQ(cap0.count(TierType::DRAM), 1u); - const uint64_t total = cap0[TierType::DRAM].total_bytes; - EXPECT_EQ(cap0[TierType::DRAM].available_bytes, total); - - auto p = AllocateOk(*a, "k", kPageSize * 3, TierType::DRAM); - ASSERT_TRUE(p.has_value()); - auto cap1 = a->TierCapacitiesSnapshot(); - EXPECT_EQ(cap1[TierType::DRAM].available_bytes, total - 3 * kPageSize); - - uint64_t committed_bytes = 0; - ASSERT_TRUE(a->Commit(p->slot_id, "k", committed_bytes)); - EXPECT_EQ(committed_bytes, p->size); - auto cap2 = a->TierCapacitiesSnapshot(); - EXPECT_EQ(cap2[TierType::DRAM].available_bytes, total - 3 * kPageSize); - - 
ASSERT_EQ(a->Evict({"k"})[0].bytes_freed, 3 * kPageSize); - auto cap3 = a->TierCapacitiesSnapshot(); - EXPECT_EQ(cap3[TierType::DRAM].available_bytes, total); -} - -// ---- Commit after reap ------------------------------------------------------ - -TEST(PeerDramAllocator, CommitAfterReapReturnsFalse) { - auto a = std::make_unique(kPageSize, MakeDramCfg(), EmptyCfg(), - std::chrono::milliseconds{1}); - auto p = AllocateOk(*a, "doomed", kPageSize, TierType::DRAM); - ASSERT_TRUE(p.has_value()); - std::this_thread::sleep_for(std::chrono::milliseconds{20}); - a->RunReaperOnceForTest(); - uint64_t committed_bytes = 0; - EXPECT_FALSE(a->Commit(p->slot_id, "doomed", committed_bytes)); - EXPECT_EQ(committed_bytes, 0u); - EXPECT_TRUE(a->DrainPendingEvents().empty()); -} - -// ---- Distributed Clear ------------------------------------------------------ - -TEST(PeerDramAllocator, ClearLocalReleasesOwnedAndCancelsPending) { - auto a = MakeAllocator(); - - auto pA = AllocateOk(*a, "A", kPageSize, TierType::DRAM); - ASSERT_TRUE(pA.has_value()); - uint64_t committed_bytes = 0; - ASSERT_TRUE(a->Commit(pA->slot_id, "A", committed_bytes)); - - auto pB = AllocateOk(*a, "B", kPageSize * 2, TierType::DRAM); - ASSERT_TRUE(pB.has_value()); - a->DrainPendingEvents(); // discard the A ADD - - const auto cap_before = a->TierCapacitiesSnapshot()[TierType::DRAM]; - ASSERT_LT(cap_before.available_bytes, cap_before.total_bytes); - - a->ClearLocal(); - - EXPECT_TRUE(a->IsClearFullSyncPending()); - EXPECT_FALSE(a->Resolve("A").found); - EXPECT_TRUE(a->SnapshotOwnedKeys().empty()); - EXPECT_TRUE(a->DrainPendingEvents().empty()); - - // Owned pages (A) returned immediately; pending pages (B) still held. - auto cap_after_clear = a->TierCapacitiesSnapshot()[TierType::DRAM]; - EXPECT_EQ(cap_after_clear.available_bytes, cap_before.total_bytes - 2 * kPageSize); - - // Committing the cancelled pending fails AND releases its pages - // without emitting an ADD. 
- EXPECT_FALSE(a->Commit(pB->slot_id, "B", committed_bytes)); - EXPECT_EQ(committed_bytes, 0u); - EXPECT_TRUE(a->DrainPendingEvents().empty()); - EXPECT_FALSE(a->Resolve("B").found); - - auto cap_final = a->TierCapacitiesSnapshot()[TierType::DRAM]; - EXPECT_EQ(cap_final.available_bytes, cap_final.total_bytes); -} - -TEST(PeerDramAllocator, ClearLocalGatesAllocateUntilAcked) { - auto a = MakeAllocator(); - - a->ClearLocal(); - EXPECT_FALSE(AllocateOk(*a, "blocked", kPageSize, TierType::DRAM).has_value()); - - a->ClearFullSyncAcked(); - EXPECT_FALSE(a->IsClearFullSyncPending()); - EXPECT_TRUE(AllocateOk(*a, "ok-after-ack", kPageSize, TierType::DRAM).has_value()); -} - -TEST(PeerDramAllocator, ClearLocalDropsQueuedAdds) { - auto a = MakeAllocator(); - auto p = AllocateOk(*a, "k", kPageSize, TierType::DRAM); - ASSERT_TRUE(p.has_value()); - uint64_t committed_bytes = 0; - ASSERT_TRUE(a->Commit(p->slot_id, "k", committed_bytes)); - // ADD is sitting in the outbox, not yet drained. - - a->ClearLocal(); - - EXPECT_TRUE(a->DrainPendingEvents().empty()); - EXPECT_TRUE(a->SnapshotOwnedKeys().empty()); -} - -TEST(PeerDramAllocator, AbortReleasesCancelledPending) { - auto a = MakeAllocator(); - auto p = AllocateOk(*a, "p1", kPageSize, TierType::DRAM); - ASSERT_TRUE(p.has_value()); - - a->ClearLocal(); - // Abort on a cancelled pending is idempotent and frees the pages. - EXPECT_TRUE(a->Abort(p->slot_id)); - a->ClearFullSyncAcked(); - - auto cap = a->TierCapacitiesSnapshot()[TierType::DRAM]; - EXPECT_EQ(cap.available_bytes, cap.total_bytes); -} - -// Pre-clear pending Commit fails; post-ack new Allocate+Commit succeeds. 
-TEST(PeerDramAllocator, PendingGenerationRejectsPreClearCommit) { - auto a = MakeAllocator(); - - auto pB = AllocateOk(*a, "B", kPageSize * 2, TierType::DRAM); - ASSERT_TRUE(pB.has_value()); - const auto cap_before = a->TierCapacitiesSnapshot()[TierType::DRAM]; - EXPECT_EQ(cap_before.available_bytes, cap_before.total_bytes - 2 * kPageSize); - - a->ClearLocal(); - - uint64_t committed_bytes = 0; - EXPECT_FALSE(a->Commit(pB->slot_id, "B", committed_bytes)); - EXPECT_EQ(committed_bytes, 0u); - EXPECT_TRUE(a->DrainPendingEvents().empty()); - auto cap_after_reject = a->TierCapacitiesSnapshot()[TierType::DRAM]; - EXPECT_EQ(cap_after_reject.available_bytes, cap_after_reject.total_bytes); - - a->ClearFullSyncAcked(); - - auto pC = AllocateOk(*a, "C", kPageSize, TierType::DRAM); - ASSERT_TRUE(pC.has_value()); - ASSERT_TRUE(a->Commit(pC->slot_id, "C", committed_bytes)); - EXPECT_EQ(committed_bytes, kPageSize); - EXPECT_TRUE(a->Resolve("C").found); -} - -// Repeated Clears still reject the original pre-clear pending Commit. -TEST(PeerDramAllocator, PendingGenerationSurvivesDoubleClear) { - auto a = MakeAllocator(); - - auto pB = AllocateOk(*a, "B", kPageSize, TierType::DRAM); - ASSERT_TRUE(pB.has_value()); - - a->ClearLocal(); - a->ClearLocal(); - - uint64_t committed_bytes = 0; - EXPECT_FALSE(a->Commit(pB->slot_id, "B", committed_bytes)); - auto cap_after_reject = a->TierCapacitiesSnapshot()[TierType::DRAM]; - EXPECT_EQ(cap_after_reject.available_bytes, cap_after_reject.total_bytes); - - a->ClearFullSyncAcked(); - auto pC = AllocateOk(*a, "C", kPageSize, TierType::DRAM); - ASSERT_TRUE(pC.has_value()); - EXPECT_TRUE(a->Commit(pC->slot_id, "C", committed_bytes)); -} - -// Leased owned key: logically gone at Clear, pages freed by reaper after -// the lease expires. 
-TEST(PeerDramAllocator, ClearLocalDefersLeasedOwnedPages) { - auto a = MakeAllocator(/*pending_ttl=*/std::chrono::milliseconds{5000}, - /*read_lease_ttl=*/std::chrono::milliseconds{200}); - - auto p = AllocateOk(*a, "A", kPageSize, TierType::DRAM); - ASSERT_TRUE(p.has_value()); - uint64_t committed_bytes = 0; - ASSERT_TRUE(a->Commit(p->slot_id, "A", committed_bytes)); - a->DrainPendingEvents(); - - const auto cap_committed = a->TierCapacitiesSnapshot()[TierType::DRAM]; - EXPECT_EQ(cap_committed.available_bytes, cap_committed.total_bytes - kPageSize); - - ASSERT_TRUE(a->Resolve("A").found); // lease. - - a->ClearLocal(); - - EXPECT_FALSE(a->Resolve("A").found); - EXPECT_TRUE(a->SnapshotOwnedKeys().empty()); - - auto cap_after_clear = a->TierCapacitiesSnapshot()[TierType::DRAM]; - EXPECT_EQ(cap_after_clear.available_bytes, cap_committed.total_bytes - kPageSize); - - // Pre-TTL sweep: no-op. - a->RunReaperOnceForTest(); - auto cap_no_op_sweep = a->TierCapacitiesSnapshot()[TierType::DRAM]; - EXPECT_EQ(cap_no_op_sweep.available_bytes, cap_committed.total_bytes - kPageSize); - - // Past TTL: pages return to bitmap. - std::this_thread::sleep_for(std::chrono::milliseconds{300}); - a->RunReaperOnceForTest(); - auto cap_swept = a->TierCapacitiesSnapshot()[TierType::DRAM]; - EXPECT_EQ(cap_swept.available_bytes, cap_swept.total_bytes); -} - -// Leased owned A defers; pending B rejects via generation. -TEST(PeerDramAllocator, ClearLocalMixedPendingAndLeased) { - auto a = MakeAllocator(/*pending_ttl=*/std::chrono::milliseconds{5000}, - /*read_lease_ttl=*/std::chrono::milliseconds{200}); - - auto pA = AllocateOk(*a, "A", kPageSize, TierType::DRAM); - ASSERT_TRUE(pA.has_value()); - uint64_t committed_bytes = 0; - ASSERT_TRUE(a->Commit(pA->slot_id, "A", committed_bytes)); - ASSERT_TRUE(a->Resolve("A").found); // lease. 
- - auto pB = AllocateOk(*a, "B", kPageSize * 2, TierType::DRAM); - ASSERT_TRUE(pB.has_value()); - a->DrainPendingEvents(); - - const auto total = a->TierCapacitiesSnapshot()[TierType::DRAM].total_bytes; - - a->ClearLocal(); - - EXPECT_FALSE(a->Resolve("A").found); - EXPECT_TRUE(a->SnapshotOwnedKeys().empty()); - - // A deferred + B pending: 3 pages occupied. - auto cap_after_clear = a->TierCapacitiesSnapshot()[TierType::DRAM]; - EXPECT_EQ(cap_after_clear.available_bytes, total - 3 * kPageSize); - - // Commit(B) fails on generation mismatch, releases B. - EXPECT_FALSE(a->Commit(pB->slot_id, "B", committed_bytes)); - auto cap_after_reject = a->TierCapacitiesSnapshot()[TierType::DRAM]; - EXPECT_EQ(cap_after_reject.available_bytes, total - kPageSize); // only A. - - // Past lease + sweep: A released. - std::this_thread::sleep_for(std::chrono::milliseconds{300}); - a->RunReaperOnceForTest(); - auto cap_final = a->TierCapacitiesSnapshot()[TierType::DRAM]; - EXPECT_EQ(cap_final.available_bytes, total); -} - -// Sweeps are no-ops while the deferred lease is still active. -TEST(PeerDramAllocator, ClearLocalSweepRespectsTtl) { - auto a = MakeAllocator(/*pending_ttl=*/std::chrono::milliseconds{5000}, - /*read_lease_ttl=*/std::chrono::milliseconds{10000}); - - auto p = AllocateOk(*a, "A", kPageSize, TierType::DRAM); - ASSERT_TRUE(p.has_value()); - uint64_t committed_bytes = 0; - ASSERT_TRUE(a->Commit(p->slot_id, "A", committed_bytes)); - ASSERT_TRUE(a->Resolve("A").found); - - const auto cap_committed = a->TierCapacitiesSnapshot()[TierType::DRAM]; - - a->ClearLocal(); - - // Lease still live: every sweep is a no-op. 
- for (int i = 0; i < 3; ++i) { - a->RunReaperOnceForTest(); - auto cap = a->TierCapacitiesSnapshot()[TierType::DRAM]; - EXPECT_EQ(cap.available_bytes, cap_committed.available_bytes) << "sweep i=" << i; - } -} - -// ---- OwnedKeyCountByTier ---------------------------------------------------- - -TEST(PeerDramAllocator, OwnedKeyCountByTierTracksCommitsAndEvicts) { - auto a = MakeAllocator(); - - auto counts0 = a->OwnedKeyCountByTier(); - EXPECT_EQ(counts0[TierType::DRAM], 0u); - EXPECT_EQ(counts0[TierType::HBM], 0u); - EXPECT_EQ(counts0[TierType::SSD], 0u); - - for (int i = 0; i < 3; ++i) { - const std::string k = "key-dram-" + std::to_string(i); - auto p = AllocateOk(*a, k, kPageSize, TierType::DRAM); - ASSERT_TRUE(p.has_value()) << "i=" << i; - uint64_t committed_bytes = 0; - ASSERT_TRUE(a->Commit(p->slot_id, k, committed_bytes)); - } - auto counts1 = a->OwnedKeyCountByTier(); - EXPECT_EQ(counts1[TierType::DRAM], 3u); - EXPECT_EQ(counts1[TierType::HBM], 0u); - EXPECT_EQ(counts1[TierType::SSD], 0u); - - a->Evict({"key-dram-0"}); - auto counts2 = a->OwnedKeyCountByTier(); - EXPECT_EQ(counts2[TierType::DRAM], 2u); - EXPECT_EQ(counts2[TierType::HBM], 0u); -} - -TEST(PeerDramAllocator, OwnedKeyCountByTierMultiTier) { - PeerDramAllocator::TierConfig hbm_cfg; - hbm_cfg.buffer_sizes = {kPageSize * 4}; - hbm_cfg.buffer_descs = {{0xD0, 0xD1}}; - auto a = std::make_unique(kPageSize, MakeDramCfg(), hbm_cfg, - std::chrono::milliseconds{5000}); - - for (int i = 0; i < 2; ++i) { - const std::string k = "d-" + std::to_string(i); - auto p = AllocateOk(*a, k, kPageSize, TierType::DRAM); - ASSERT_TRUE(p.has_value()); - uint64_t committed_bytes = 0; - ASSERT_TRUE(a->Commit(p->slot_id, k, committed_bytes)); - } - { - auto p = AllocateOk(*a, "h-0", kPageSize, TierType::HBM); - ASSERT_TRUE(p.has_value()); - uint64_t committed_bytes = 0; - ASSERT_TRUE(a->Commit(p->slot_id, "h-0", committed_bytes)); - } - - auto counts = a->OwnedKeyCountByTier(); - EXPECT_EQ(counts[TierType::DRAM], 2u); - 
EXPECT_EQ(counts[TierType::HBM], 1u); - EXPECT_EQ(counts[TierType::SSD], 0u); -} - -} // namespace mori::umbp diff --git a/src/umbp/tests/test_peer_ssd_eviction.cpp b/src/umbp/tests/test_peer_ssd_eviction.cpp deleted file mode 100644 index 591d3b1d3..000000000 --- a/src/umbp/tests/test_peer_ssd_eviction.cpp +++ /dev/null @@ -1,406 +0,0 @@ -// Copyright © Advanced Micro Devices, Inc. All rights reserved. -// -// MIT License -// -// Permission is hereby granted, free of charge, to any person obtaining a copy -// of this software and associated documentation files (the "Software"), to deal -// in the Software without restriction, including without limitation the rights -// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -// copies of the Software, and to permit persons to whom the Software is -// furnished to do so, subject to the following conditions: -// -// The above copyright notice and this permission notice shall be included in all -// copies or substantial portions of the Software. -// -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -// SOFTWARE. -// -// SSD local capacity management + eviction. Drives PeerSsdManager -// through a controllable in-memory TierBackend (the test-only constructor) so -// LRU ordering, watermark eviction, the in-flight-read guard, idempotent Write, -// backend-evict failure, concurrent eviction, and physical Clear are all -// deterministic without real disk IO. 
-#include - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include "umbp/distributed/peer/peer_ssd_manager.h" -#include "umbp/local/tiers/tier_backend.h" - -namespace mori::umbp { -namespace { - -// In-memory TierBackend with test hooks: blockable reads (to hold the in-flight -// guard), forced evict failure, and call counters. -class FakeBackend : public TierBackend { - public: - explicit FakeBackend(size_t capacity) - : TierBackend(StorageTier::LOCAL_SSD), capacity_(capacity) {} - - bool Write(const std::string& key, const void* data, size_t size) override { - std::lock_guard lk(mu_); - ++write_calls_; - auto it = store_.find(key); - size_t prev = (it == store_.end()) ? 0 : it->second.size(); - if (used_ - prev + size > capacity_) return false; // ENOSPC - store_[key].assign(static_cast(data), static_cast(data) + size); - used_ = used_ - prev + size; - return true; - } - - bool ReadIntoPtr(const std::string& key, uintptr_t dst, size_t size) override { - { - std::unique_lock lk(gate_mu_); - ++reads_started_; - started_cv_.notify_all(); - gate_cv_.wait(lk, [this] { return !read_blocked_; }); - } - std::lock_guard lk(mu_); - auto it = store_.find(key); - if (it == store_.end() || it->second.size() != size) return false; - std::memcpy(reinterpret_cast(dst), it->second.data(), size); - return true; - } - - bool Exists(const std::string& key) const override { - std::lock_guard lk(mu_); - return store_.count(key) != 0; - } - - bool Evict(const std::string& key) override { - std::lock_guard lk(mu_); - if (fail_evict_) return false; - auto it = store_.find(key); - if (it == store_.end()) return false; - used_ -= it->second.size(); - store_.erase(it); - return true; - } - - std::pair Capacity() const override { - std::lock_guard lk(mu_); - return {used_, capacity_}; - } - - void Clear() override { - std::lock_guard lk(mu_); - ++clear_calls_; - store_.clear(); - used_ = 0; - } - - // --- test controls --- - void 
BlockReads() { - std::lock_guard lk(gate_mu_); - read_blocked_ = true; - } - void UnblockReads() { - { - std::lock_guard lk(gate_mu_); - read_blocked_ = false; - } - gate_cv_.notify_all(); - } - void WaitReadsStarted(int n) { - std::unique_lock lk(gate_mu_); - started_cv_.wait(lk, [&] { return reads_started_ >= n; }); - } - void SetFailEvict(bool f) { - std::lock_guard lk(mu_); - fail_evict_ = f; - } - int write_calls() const { - std::lock_guard lk(mu_); - return write_calls_; - } - int clear_calls() const { - std::lock_guard lk(mu_); - return clear_calls_; - } - - private: - mutable std::mutex mu_; - std::unordered_map> store_; - size_t used_ = 0; - size_t capacity_; - int write_calls_ = 0; - int clear_calls_ = 0; - bool fail_evict_ = false; - - std::mutex gate_mu_; - std::condition_variable gate_cv_; - std::condition_variable started_cv_; - bool read_blocked_ = false; - int reads_started_ = 0; -}; - -std::vector> OneSeg(const std::string& s) { - return {{s.data(), s.size()}}; -} - -// Manager owning a FakeBackend we keep a raw pointer to for test inspection. 
-struct Harness { - FakeBackend* backend; - std::unique_ptr mgr; -}; - -Harness MakeHarness(size_t capacity, double high = 0.9, double low = 0.7) { - auto be = std::make_unique(capacity); - FakeBackend* raw = be.get(); - return Harness{raw, std::make_unique(std::move(be), high, low)}; -} - -int CountKind(const std::vector& events, KvEvent::Kind kind) { - int n = 0; - for (const auto& e : events) { - if (e.kind == kind && e.tier == TierType::SSD) ++n; - } - return n; -} - -bool HasRemove(const std::vector& events, const std::string& key) { - for (const auto& e : events) { - if (e.kind == KvEvent::Kind::REMOVE && e.tier == TierType::SSD && e.key == key) return true; - } - return false; -} - -// --------------------------------------------------------------------------- - -TEST(PeerSsdEviction, WriteAndPrepareReadRefreshLru) { - auto h = MakeHarness(/*capacity=*/1'000'000); - ASSERT_TRUE(h.mgr->Write("A", OneSeg("aaaa"), 4)); - ASSERT_TRUE(h.mgr->Write("B", OneSeg("bbbb"), 4)); - ASSERT_TRUE(h.mgr->Write("C", OneSeg("cccc"), 4)); - - // LRU now (oldest->newest): A, B, C. Reading A promotes it to MRU, so the - // oldest becomes B and SelectVictims must pick B first (not the just-read A). - std::vector buf(4); - auto out = h.mgr->PrepareRead("A", buf.data(), buf.size()); - ASSERT_EQ(out.status, SsdReadStatus::kOk); - EXPECT_EQ(std::string(buf.data(), out.size), "aaaa"); - - auto victims = h.mgr->SelectVictims(/*bytes_to_free=*/1); - ASSERT_FALSE(victims.empty()); - EXPECT_EQ(victims.front(), "B"); - EXPECT_NE(victims.front(), "A"); -} - -TEST(PeerSsdEviction, WatermarkTriggersEvictionDownToLow) { - // capacity 1000, high 0.9 (=>900), low 0.7 (=>700); 100-byte values. - auto h = MakeHarness(/*capacity=*/1000, /*high=*/0.9, /*low=*/0.7); - std::string val(100, 'x'); - for (int i = 1; i <= 9; ++i) { - ASSERT_TRUE(h.mgr->Write("k" + std::to_string(i), OneSeg(val), val.size())); - } - // After k9: used hit 900 >= high -> evict oldest down to <= 700. 
- auto [used, total] = h.mgr->Capacity(); - EXPECT_EQ(total, 1000u); - EXPECT_LE(used, 700u); - - // Oldest (k1, k2) evicted first; newest still present. - EXPECT_FALSE(h.mgr->Exists("k1")); - EXPECT_FALSE(h.mgr->Exists("k2")); - EXPECT_TRUE(h.mgr->Exists("k9")); - - auto events = h.mgr->DrainPendingEvents(); - EXPECT_TRUE(HasRemove(events, "k1")); - EXPECT_TRUE(HasRemove(events, "k2")); - EXPECT_EQ(CountKind(events, KvEvent::Kind::REMOVE), 2); -} - -TEST(PeerSsdEviction, EnospcTriggersEvictThenRetry) { - // Fill to 800/1000 (below the 0.9 high watermark, so no watermark eviction - // fires during the fill), then write a 300-byte value that overflows the - // device: backend Write -> ENOSPC -> one evict round (frees the oldest down - // to the 0.5 low watermark) -> retry succeeds. After the retry used is 800, - // still below high, so no second round disturbs the just-written key. - auto h = MakeHarness(/*capacity=*/1000, /*high=*/0.9, /*low=*/0.5); - for (int i = 1; i <= 8; ++i) { - ASSERT_TRUE(h.mgr->Write("k" + std::to_string(i), OneSeg(std::string(100, 'a')), 100)); - } - ASSERT_TRUE(h.mgr->Write("big", OneSeg(std::string(300, 'c')), 300)); - EXPECT_TRUE(h.mgr->Exists("big")); - EXPECT_FALSE(h.mgr->Exists("k1")); // oldest reclaimed to make room - EXPECT_LE(h.mgr->Capacity().first, 1000u); -} - -TEST(PeerSsdEviction, InFlightReadIsNotEvicted) { - auto h = MakeHarness(/*capacity=*/1'000'000); - const std::string val = "payload-payload"; - ASSERT_TRUE(h.mgr->Write("K", OneSeg(val), val.size())); - - h.backend->BlockReads(); - std::vector buf(val.size()); - SsdReadOutcome out{}; - std::thread reader([&] { out = h.mgr->PrepareRead("K", buf.data(), buf.size()); }); - h.backend->WaitReadsStarted(1); // PrepareRead has marked K in-flight and is blocked in the read - - // Eviction must skip a key that is being read. 
- EXPECT_FALSE(h.mgr->Evict("K")); - EXPECT_TRUE(h.mgr->SelectVictims(1'000'000).empty()); - EXPECT_TRUE(h.mgr->Exists("K")); - - h.backend->UnblockReads(); - reader.join(); - EXPECT_EQ(out.status, SsdReadStatus::kOk); - EXPECT_EQ(std::string(buf.data(), out.size), val); - - // Once the read finished, the key can be evicted. - EXPECT_TRUE(h.mgr->Evict("K")); - EXPECT_FALSE(h.mgr->Exists("K")); -} - -TEST(PeerSsdEviction, StaleRouteReadAfterEvictIsNotFound) { - auto h = MakeHarness(/*capacity=*/1'000'000); - ASSERT_TRUE(h.mgr->Write("K", OneSeg("data"), 4)); - ASSERT_TRUE(h.mgr->Evict("K")); - - std::vector buf(4); - auto out = h.mgr->PrepareRead("K", buf.data(), buf.size()); - EXPECT_EQ(out.status, SsdReadStatus::kNotFound); -} - -TEST(PeerSsdEviction, DuplicateWriteIsIdempotent) { - auto h = MakeHarness(/*capacity=*/1'000'000); - ASSERT_TRUE(h.mgr->Write("K", OneSeg("data"), 4)); - ASSERT_TRUE(h.mgr->Write("K", OneSeg("data"), 4)); // same content-addressed key - - EXPECT_EQ(h.backend->write_calls(), 1); // no second backend write - auto events = h.mgr->DrainPendingEvents(); - EXPECT_EQ(CountKind(events, KvEvent::Kind::ADD), 1); // no duplicate ADD SSD -} - -TEST(PeerSsdEviction, BackendEvictFailureKeepsMetadata) { - auto h = MakeHarness(/*capacity=*/1'000'000); - ASSERT_TRUE(h.mgr->Write("K", OneSeg("data"), 4)); - h.mgr->DrainPendingEvents(); // discard ADD - - h.backend->SetFailEvict(true); - EXPECT_FALSE(h.mgr->Evict("K")); - EXPECT_TRUE(h.mgr->Exists("K")); // kept for retry - EXPECT_TRUE(h.mgr->DrainPendingEvents().empty()); // no REMOVE emitted - - h.backend->SetFailEvict(false); - EXPECT_TRUE(h.mgr->Evict("K")); // retry succeeds - EXPECT_FALSE(h.mgr->Exists("K")); - auto events = h.mgr->DrainPendingEvents(); - EXPECT_EQ(CountKind(events, KvEvent::Kind::REMOVE), 1); -} - -TEST(PeerSsdEviction, ConcurrentEvictOfSameKeyRemovesOnce) { - auto h = MakeHarness(/*capacity=*/1'000'000); - ASSERT_TRUE(h.mgr->Write("K", OneSeg("data"), 4)); - 
h.mgr->DrainPendingEvents(); - - std::atomic wins{0}; - std::vector threads; - for (int i = 0; i < 4; ++i) { - threads.emplace_back([&] { - if (h.mgr->Evict("K")) wins.fetch_add(1); - }); - } - for (auto& t : threads) t.join(); - - EXPECT_EQ(wins.load(), 1); // exactly one evictor wins - EXPECT_FALSE(h.mgr->Exists("K")); - auto events = h.mgr->DrainPendingEvents(); - EXPECT_EQ(CountKind(events, KvEvent::Kind::REMOVE), 1); // no double REMOVE -} - -TEST(PeerSsdEviction, ClearLocalWipesPhysicalBytes) { - auto h = MakeHarness(/*capacity=*/1'000'000); - ASSERT_TRUE(h.mgr->Write("a", OneSeg("1111"), 4)); - ASSERT_TRUE(h.mgr->Write("b", OneSeg("2222"), 4)); - - h.mgr->ClearLocal(); - - EXPECT_EQ(h.backend->clear_calls(), 1); // physical wipe happened - EXPECT_FALSE(h.mgr->Exists("a")); - EXPECT_FALSE(h.mgr->Exists("b")); - EXPECT_TRUE(h.mgr->SnapshotOwnedKeys().empty()); - auto [used, total] = h.mgr->Capacity(); - EXPECT_EQ(used, 0u); -} - -TEST(PeerSsdEviction, ClearLocalWaitsForInFlightRead) { - auto h = MakeHarness(/*capacity=*/1'000'000); - const std::string val = "read-priority"; - ASSERT_TRUE(h.mgr->Write("K", OneSeg(val), val.size())); - - h.backend->BlockReads(); - std::vector buf(val.size()); - SsdReadOutcome out{}; - std::thread reader([&] { out = h.mgr->PrepareRead("K", buf.data(), buf.size()); }); - h.backend->WaitReadsStarted(1); - - std::thread clearer([&] { h.mgr->ClearLocal(); }); - - // The read is in flight; let it complete, then ClearLocal may wipe. If - // ClearLocal had wiped the backend before the read finished, the read would - // return kError instead of the correct bytes — so kOk proves read priority. 
- h.backend->UnblockReads(); - reader.join(); - clearer.join(); - - EXPECT_EQ(out.status, SsdReadStatus::kOk); - EXPECT_EQ(std::string(buf.data(), out.size), val); - EXPECT_EQ(h.backend->clear_calls(), 1); - EXPECT_FALSE(h.mgr->Exists("K")); -} - -TEST(PeerSsdEviction, InvalidWatermarksThrow) { - // low >= high, and high > 1 are both rejected (fail-fast, no silent clamp). - EXPECT_THROW(PeerSsdManager(std::make_unique(1000), 0.5, 0.7), std::runtime_error); - EXPECT_THROW(PeerSsdManager(std::make_unique(1000), 1.5, 0.7), std::runtime_error); - EXPECT_THROW(PeerSsdManager(std::make_unique(1000), 0.9, 0.0), std::runtime_error); -} - -TEST(PeerSsdEviction, SelectVictimsBoundaries) { - auto h = MakeHarness(/*capacity=*/1'000'000); - ASSERT_TRUE(h.mgr->Write("K", OneSeg("data"), 4)); - - EXPECT_TRUE(h.mgr->SelectVictims(0).empty()); // nothing to free - - // All candidates in flight -> no victim, no spin. - h.backend->BlockReads(); - std::vector buf(4); - SsdReadOutcome out{}; - std::thread reader([&] { out = h.mgr->PrepareRead("K", buf.data(), buf.size()); }); - h.backend->WaitReadsStarted(1); - EXPECT_TRUE(h.mgr->SelectVictims(1'000'000).empty()); - h.backend->UnblockReads(); - reader.join(); - EXPECT_EQ(out.status, SsdReadStatus::kOk); -} - -TEST(PeerSsdEviction, DisabledManagerIsInert) { - PeerSsdConfig cfg; - cfg.enabled = false; - PeerSsdManager mgr(cfg); - - EXPECT_FALSE(mgr.Write("K", OneSeg("data"), 4)); - EXPECT_FALSE(mgr.Evict("K")); - EXPECT_TRUE(mgr.SelectVictims(100).empty()); - std::vector buf(4); - EXPECT_EQ(mgr.PrepareRead("K", buf.data(), buf.size()).status, SsdReadStatus::kNotFound); - mgr.ClearLocal(); // no backend -> no crash, no-op - EXPECT_TRUE(mgr.SnapshotOwnedKeys().empty()); -} - -} // namespace -} // namespace mori::umbp diff --git a/src/umbp/tests/test_peer_ssd_manager.cpp b/src/umbp/tests/test_peer_ssd_manager.cpp deleted file mode 100644 index 0fd2aa861..000000000 --- a/src/umbp/tests/test_peer_ssd_manager.cpp +++ /dev/null @@ -1,233 +0,0 
@@ -// Copyright © Advanced Micro Devices, Inc. All rights reserved. -// -// MIT License -// -// Permission is hereby granted, free of charge, to any person obtaining a copy -// of this software and associated documentation files (the "Software"), to deal -// in the Software without restriction, including without limitation the rights -// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -// copies of the Software, and to permit persons to whom the Software is -// furnished to do so, subject to the following conditions: -// -// The above copyright notice and this permission notice shall be included in all -// copies or substantial portions of the Software. -// -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -// SOFTWARE. -#include -#include - -#include -#include -#include -#include -#include -#include -#include - -#include "umbp/distributed/peer/owned_location_source.h" -#include "umbp/distributed/peer/peer_ssd_manager.h" - -namespace mori::umbp { -namespace { - -namespace fs = std::filesystem; - -// Unique temp dir per fixture instance; backend uses Posix I/O to avoid -// io_uring availability differences inside the build container. 
-class PeerSsdManagerTest : public ::testing::Test { - protected: - void SetUp() override { - static std::atomic counter{0}; - dir_ = fs::temp_directory_path() / ("umbp_ssd_test_" + std::to_string(::getpid()) + "_" + - std::to_string(counter.fetch_add(1))); - fs::remove_all(dir_); - } - - void TearDown() override { - std::error_code ec; - fs::remove_all(dir_, ec); - } - - PeerSsdConfig MakeConfig(size_t capacity = 64ULL * 1024 * 1024) const { - PeerSsdConfig cfg; - cfg.enabled = true; - cfg.ssd.enabled = true; - cfg.ssd.storage_dir = dir_.string(); - cfg.ssd.capacity_bytes = capacity; - cfg.ssd.io.backend = UMBPIoBackend::Posix; // avoid io_uring container flakiness - return cfg; - } - - static std::vector> OneSegment(const std::string& s) { - return {{s.data(), s.size()}}; - } - - fs::path dir_; -}; - -TEST_F(PeerSsdManagerTest, WriteRecordsOwnershipAndQueuesAddEvent) { - PeerSsdManager mgr(MakeConfig()); - const std::string key = "key-1"; - const std::string value = "hello-ssd-payload"; - - ASSERT_TRUE(mgr.Write(key, OneSegment(value), value.size())); - EXPECT_TRUE(mgr.Exists(key)); - - auto events = mgr.DrainPendingEvents(); - ASSERT_EQ(events.size(), 1u); - EXPECT_EQ(events[0].kind, KvEvent::Kind::ADD); - EXPECT_EQ(events[0].key, key); - EXPECT_EQ(events[0].tier, TierType::SSD); - EXPECT_EQ(events[0].size, value.size()); - - // Drain is destructive. 
- EXPECT_TRUE(mgr.DrainPendingEvents().empty()); - - auto snap = mgr.SnapshotOwnedKeys(); - ASSERT_EQ(snap.size(), 1u); - EXPECT_EQ(snap[0].key, key); - EXPECT_EQ(snap[0].tier, TierType::SSD); - EXPECT_EQ(snap[0].size, value.size()); -} - -TEST_F(PeerSsdManagerTest, WriteAssemblesNonContiguousSegments) { - PeerSsdManager mgr(MakeConfig()); - const std::string a = "abc"; - const std::string b = "defgh"; - std::vector> segs = {{a.data(), a.size()}, {b.data(), b.size()}}; - - ASSERT_TRUE(mgr.Write("multi", segs, a.size() + b.size())); - EXPECT_TRUE(mgr.Exists("multi")); - auto snap = mgr.SnapshotOwnedKeys(); - ASSERT_EQ(snap.size(), 1u); - EXPECT_EQ(snap[0].size, a.size() + b.size()); -} - -TEST_F(PeerSsdManagerTest, CapacityReportsTotalAndGrowsWithWrites) { - const size_t cap = 32ULL * 1024 * 1024; - PeerSsdManager mgr(MakeConfig(cap)); - - auto [used_before, total_before] = mgr.Capacity(); - EXPECT_EQ(total_before, cap); - - std::string value(4096, 'x'); - ASSERT_TRUE(mgr.Write("big", OneSegment(value), value.size())); - - auto [used_after, total_after] = mgr.Capacity(); - EXPECT_EQ(total_after, cap); - EXPECT_GE(used_after, used_before); -} - -TEST_F(PeerSsdManagerTest, EvictRemovesOwnershipAndQueuesRemoveEvent) { - PeerSsdManager mgr(MakeConfig()); - const std::string key = "key-evict"; - const std::string value = "payload"; - ASSERT_TRUE(mgr.Write(key, OneSegment(value), value.size())); - mgr.DrainPendingEvents(); // discard the ADD - - EXPECT_TRUE(mgr.Evict(key)); - EXPECT_FALSE(mgr.Exists(key)); - - auto events = mgr.DrainPendingEvents(); - ASSERT_EQ(events.size(), 1u); - EXPECT_EQ(events[0].kind, KvEvent::Kind::REMOVE); - EXPECT_EQ(events[0].key, key); - EXPECT_EQ(events[0].tier, TierType::SSD); - - // Evicting an unknown key is a no-op (no event, returns false). 
- EXPECT_FALSE(mgr.Evict("never-written")); - EXPECT_TRUE(mgr.DrainPendingEvents().empty()); -} - -TEST_F(PeerSsdManagerTest, PrepareReadReturnsBytesForOwnedKey) { - PeerSsdManager mgr(MakeConfig()); - const std::string key = "key-read"; - const std::string value = "hello-ssd-read-path"; - ASSERT_TRUE(mgr.Write(key, OneSegment(value), value.size())); - - std::vector staging(value.size()); - auto out = mgr.PrepareRead(key, staging.data(), staging.size()); - EXPECT_EQ(out.status, SsdReadStatus::kOk); - EXPECT_EQ(out.size, value.size()); - EXPECT_EQ(std::string(staging.data(), out.size), value); -} - -TEST_F(PeerSsdManagerTest, PrepareReadUnknownKeyIsNotFound) { - PeerSsdManager mgr(MakeConfig()); - std::vector staging(64); - auto out = mgr.PrepareRead("never-written", staging.data(), staging.size()); - EXPECT_EQ(out.status, SsdReadStatus::kNotFound); -} - -TEST_F(PeerSsdManagerTest, PrepareReadRejectsOverCapBeforeIo) { - PeerSsdManager mgr(MakeConfig()); - const std::string key = "key-big"; - const std::string value(4096, 'z'); - ASSERT_TRUE(mgr.Write(key, OneSegment(value), value.size())); - - // Capacity smaller than the actual size must be rejected as kSizeTooLarge - // (and the reported size is the real size) without reading into the buffer. - std::vector staging(value.size() / 2); - auto out = mgr.PrepareRead(key, staging.data(), staging.size()); - EXPECT_EQ(out.status, SsdReadStatus::kSizeTooLarge); - EXPECT_EQ(out.size, value.size()); -} - -// ---- Unified owned-location source aggregation ------------------------------ - -// Minimal OwnedLocationSource that replays a fixed event list, used to verify -// MasterClient's multi-source concat logic without a live master. 
-class FakeSource : public OwnedLocationSource { - public: - explicit FakeSource(std::vector events) : events_(std::move(events)) {} - std::vector DrainPendingEvents() override { - auto out = events_; - drained_ = true; - return out; - } - std::vector SnapshotOwnedKeys() const override { return events_; } - bool drained_ = false; - - private: - std::vector events_; -}; - -TEST(OwnedLocationSourceAgg, DrainAndSnapshotConcatAcrossSourcesInOrder) { - FakeSource dram({{KvEvent::Kind::ADD, "d1", TierType::DRAM, 10}, - {KvEvent::Kind::ADD, "d2", TierType::DRAM, 20}}); - FakeSource ssd({{KvEvent::Kind::ADD, "s1", TierType::SSD, 30}}); - - std::vector sources = {&dram, &ssd}; - - auto drained = DrainAllSources(sources); - ASSERT_EQ(drained.size(), 3u); - EXPECT_EQ(drained[0].key, "d1"); - EXPECT_EQ(drained[0].tier, TierType::DRAM); - EXPECT_EQ(drained[1].key, "d2"); - EXPECT_EQ(drained[2].key, "s1"); - EXPECT_EQ(drained[2].tier, TierType::SSD); - EXPECT_TRUE(dram.drained_); - EXPECT_TRUE(ssd.drained_); - - auto snap = SnapshotAllSources(sources); - ASSERT_EQ(snap.size(), 3u); - EXPECT_EQ(snap[2].tier, TierType::SSD); -} - -TEST(OwnedLocationSourceAgg, NullSourcesAreSkipped) { - FakeSource only({{KvEvent::Kind::ADD, "x", TierType::SSD, 1}}); - std::vector sources = {nullptr, &only, nullptr}; - auto drained = DrainAllSources(sources); - ASSERT_EQ(drained.size(), 1u); - EXPECT_EQ(drained[0].key, "x"); - EXPECT_TRUE(SnapshotAllSources({nullptr}).empty()); -} - -} // namespace -} // namespace mori::umbp diff --git a/src/umbp/tests/test_peer_ssd_read_rpc.cpp b/src/umbp/tests/test_peer_ssd_read_rpc.cpp deleted file mode 100644 index 949996cdb..000000000 --- a/src/umbp/tests/test_peer_ssd_read_rpc.cpp +++ /dev/null @@ -1,258 +0,0 @@ -// Copyright © Advanced Micro Devices, Inc. All rights reserved. 
-// -// MIT License -// -// Permission is hereby granted, free of charge, to any person obtaining a copy -// of this software and associated documentation files (the "Software"), to deal -// in the Software without restriction, including without limitation the rights -// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -// copies of the Software, and to permit persons to whom the Software is -// furnished to do so, subject to the following conditions: -// -// The above copyright notice and this permission notice shall be included in all -// copies or substantial portions of the Software. -// -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -// SOFTWARE. -// -// RPC-level integration test for the SSD read path: a real PeerServiceServer -// (backed by a real PeerSsdManager / POSIX SSDTier) served over a gRPC loopback -// channel. It exercises prepare -> read-from-staging -> release / TTL and, -// crucially, asserts that OK / NOT_FOUND / NO_SLOT / SIZE_TOO_LARGE are each -// reported as distinct statuses so a transient failure is never collapsed into -// a NOT_FOUND miss. RDMA is intentionally out of scope here (the staging buffer -// is read directly); the full BatchGet -> RDMA path needs a live cluster. 
-#include -#include -#include - -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include "umbp/distributed/config.h" -#include "umbp/distributed/peer/peer_service.h" -#include "umbp/distributed/peer/peer_ssd_manager.h" -#include "umbp_peer.grpc.pb.h" - -namespace mori::umbp { -namespace { - -namespace fs = std::filesystem; - -constexpr size_t kStagingSize = 4096; -constexpr int kNumReadSlots = 4; // -> 1024 B per slot -constexpr int kLeaseTimeoutS = 2; - -uint16_t AllocPort() { - static std::atomic next{51300}; - return next.fetch_add(1); -} - -class PeerSsdReadRpcTest : public ::testing::Test { - protected: - void SetUp() override { - staging_buffer_ = std::malloc(kStagingSize); - ASSERT_NE(staging_buffer_, nullptr); - std::memset(staging_buffer_, 0, kStagingSize); - - dir_ = fs::temp_directory_path() / - ("umbp_ssd_rpc_" + std::to_string(::getpid()) + "_" + std::to_string(AllocPort())); - fs::remove_all(dir_); - - PeerSsdConfig cfg; - cfg.enabled = true; - cfg.ssd.enabled = true; - cfg.ssd.storage_dir = dir_.string(); - cfg.ssd.capacity_bytes = 1 << 20; - cfg.ssd.io.backend = UMBPIoBackend::Posix; // avoid io_uring container flakiness - peer_ssd_ = std::make_unique(cfg); - - // Fake staging MemoryDesc bytes — GetPeerInfo just echoes them; this test - // reads the staging buffer directly rather than RDMA-ing it. 
- staging_desc_ = {0xAB, 0xCD}; - - port_ = AllocPort(); - server_ = std::make_unique( - /*dram_alloc=*/nullptr, peer_ssd_.get(), staging_buffer_, kStagingSize, staging_desc_, - kNumReadSlots, kLeaseTimeoutS); - ASSERT_TRUE(server_->Start(port_)); - std::this_thread::sleep_for(std::chrono::milliseconds(150)); - - auto channel = grpc::CreateChannel("localhost:" + std::to_string(port_), - grpc::InsecureChannelCredentials()); - stub_ = ::umbp::UMBPPeer::NewStub(channel); - } - - void TearDown() override { - server_->Stop(); - server_.reset(); - peer_ssd_.reset(); - std::free(staging_buffer_); - fs::remove_all(dir_); - } - - void WriteSsd(const std::string& key, const std::string& data) { - ASSERT_TRUE(peer_ssd_->Write(key, {{data.data(), data.size()}}, data.size())); - } - - ::umbp::PrepareSsdReadResponse Prepare(const std::string& key, uint64_t max_size) { - ::umbp::PrepareSsdReadRequest req; - req.set_key(key); - req.set_max_size(max_size); - ::umbp::PrepareSsdReadResponse resp; - grpc::ClientContext ctx; - EXPECT_TRUE(stub_->PrepareSsdRead(&ctx, req, &resp).ok()); - return resp; - } - - void* staging_buffer_ = nullptr; - fs::path dir_; - std::vector staging_desc_; - uint16_t port_ = 0; - std::unique_ptr peer_ssd_; - std::unique_ptr server_; - std::unique_ptr<::umbp::UMBPPeer::Stub> stub_; -}; - -TEST_F(PeerSsdReadRpcTest, OkReadsBytesIntoStaging) { - const std::string data = "ssd-read-rpc-ok"; - WriteSsd("k-ok", data); - - auto resp = Prepare("k-ok", data.size()); - ASSERT_EQ(resp.status(), ::umbp::SSD_READ_OK); - EXPECT_EQ(resp.size(), data.size()); - EXPECT_LT(resp.staging_offset(), kStagingSize); - EXPECT_GT(resp.lease_id(), 0u); - std::string loaded(static_cast(staging_buffer_) + resp.staging_offset(), - resp.size()); - EXPECT_EQ(loaded, data); -} - -TEST_F(PeerSsdReadRpcTest, NotFoundIsADistinctMiss) { - auto resp = Prepare("absent", 64); - EXPECT_EQ(resp.status(), ::umbp::SSD_READ_NOT_FOUND); -} - -TEST_F(PeerSsdReadRpcTest, SizeTooLargeIsDistinct) { - // A 
key bigger than one slot (1024 B) must report SIZE_TOO_LARGE, not OK and - // not NOT_FOUND. - const std::string big(2048, 'q'); - WriteSsd("k-big", big); - auto resp = Prepare("k-big", kStagingSize); - EXPECT_EQ(resp.status(), ::umbp::SSD_READ_SIZE_TOO_LARGE); -} - -// The key assertion the review asked for: slot exhaustion is NO_SLOT -// (retryable), never collapsed into NOT_FOUND. A present key and an absent key -// under exhaustion are distinguishable. -TEST_F(PeerSsdReadRpcTest, NoSlotIsDistinctFromNotFound) { - std::vector<::umbp::PrepareSsdReadResponse> held; - for (int i = 0; i < kNumReadSlots; ++i) { - const std::string key = "hold-" + std::to_string(i); - WriteSsd(key, "payload"); - auto resp = Prepare(key, 64); - ASSERT_EQ(resp.status(), ::umbp::SSD_READ_OK); - held.push_back(resp); // keep the lease held (no release) so slots stay busy - } - - // A present key with all slots busy -> NO_SLOT (retryable), NOT a miss. - WriteSsd("present-extra", "payload"); - EXPECT_EQ(Prepare("present-extra", 64).status(), ::umbp::SSD_READ_NO_SLOT); - - // An absent key under the same exhaustion is still NO_SLOT (slot check - // precedes the key lookup), so the caller cannot mistake exhaustion for a - // definitive miss. - EXPECT_EQ(Prepare("absent-extra", 64).status(), ::umbp::SSD_READ_NO_SLOT); -} - -// Many concurrent readers contend for a fixed pool of staging slots: at most -// kNumReadSlots win OK, the rest get NO_SLOT (a retryable transient), and NEVER -// NOT_FOUND for a present key. Also exercises the staging observability -// (slot_full_rejects counter + the in-use gauge accessor). 
-TEST_F(PeerSsdReadRpcTest, ConcurrentReadersExhaustSlotsWithoutFalseMiss) { - const std::string data = "concurrent-payload"; - for (int i = 0; i < 32; ++i) WriteSsd("ck-" + std::to_string(i), data); - - const uint64_t slot_full_before = server_->Metrics().slot_full_rejects.load(); - - constexpr int kReaders = 24; // >> kNumReadSlots, leases held (never released) - std::atomic ok{0}, no_slot{0}, other{0}; - std::vector threads; - for (int i = 0; i < kReaders; ++i) { - threads.emplace_back([&, i] { - auto resp = Prepare("ck-" + std::to_string(i), data.size()); - switch (resp.status()) { - case ::umbp::SSD_READ_OK: - ok.fetch_add(1); - break; - case ::umbp::SSD_READ_NO_SLOT: - no_slot.fetch_add(1); - break; - default: - other.fetch_add(1); // NOT_FOUND / SIZE_TOO_LARGE / ERROR must never happen here - break; - } - }); - } - for (auto& t : threads) t.join(); - - EXPECT_EQ(other.load(), 0) << "present keys never report a false miss under contention"; - EXPECT_LE(ok.load(), kNumReadSlots) << "at most one OK per staging slot"; - EXPECT_EQ(ok.load() + no_slot.load(), kReaders); - EXPECT_GT(no_slot.load(), 0) << "with more readers than slots, some must see NO_SLOT"; - - // The NO_SLOT rejections were counted, and the gauge sees the held leases. - EXPECT_GE(server_->Metrics().slot_full_rejects.load() - slot_full_before, - static_cast(no_slot.load())); - EXPECT_EQ(server_->SnapshotReadSlotsInUse(), static_cast(ok.load())); -} - -// A best-effort release frees the slot; double release reports false. 
-TEST_F(PeerSsdReadRpcTest, ReleaseFreesSlotAndIsBestEffort) { - WriteSsd("k-rel", "payload"); - auto resp = Prepare("k-rel", 64); - ASSERT_EQ(resp.status(), ::umbp::SSD_READ_OK); - - ::umbp::ReleaseSsdLeaseRequest rel; - rel.set_lease_id(resp.lease_id()); - ::umbp::ReleaseSsdLeaseResponse rel_resp; - grpc::ClientContext ctx; - ASSERT_TRUE(stub_->ReleaseSsdLease(&ctx, rel, &rel_resp).ok()); - EXPECT_TRUE(rel_resp.success()); - - ::umbp::ReleaseSsdLeaseResponse rel_resp2; - grpc::ClientContext ctx2; - ASSERT_TRUE(stub_->ReleaseSsdLease(&ctx2, rel, &rel_resp2).ok()); - EXPECT_FALSE(rel_resp2.success()) << "double release is a no-op"; -} - -// Leased slots are reclaimed by TTL even without a release, so a fresh prepare -// succeeds after the lease elapses (slot lifecycle: Leased -> reclaimed). -TEST_F(PeerSsdReadRpcTest, LeasedSlotsReclaimedByTtl) { - for (int i = 0; i < kNumReadSlots; ++i) { - const std::string key = "ttl-" + std::to_string(i); - WriteSsd(key, "payload"); - ASSERT_EQ(Prepare(key, 64).status(), ::umbp::SSD_READ_OK); // never released - } - EXPECT_EQ(Prepare("ttl-0", 64).status(), ::umbp::SSD_READ_NO_SLOT); // all busy - - std::this_thread::sleep_for(std::chrono::seconds(kLeaseTimeoutS + 1)); - - WriteSsd("ttl-after", "payload"); - EXPECT_EQ(Prepare("ttl-after", 64).status(), ::umbp::SSD_READ_OK) << "TTL should reclaim a slot"; -} - -} // namespace -} // namespace mori::umbp diff --git a/src/umbp/tests/test_router_dedup.cpp b/src/umbp/tests/test_router_dedup.cpp deleted file mode 100644 index 6de571fd1..000000000 --- a/src/umbp/tests/test_router_dedup.cpp +++ /dev/null @@ -1,126 +0,0 @@ -// Copyright © Advanced Micro Devices, Inc. All rights reserved. 
-// -// MIT License -// -// Permission is hereby granted, free of charge, to any person obtaining a copy -// of this software and associated documentation files (the "Software"), to deal -// in the Software without restriction, including without limitation the rights -// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -// copies of the Software, and to permit persons to whom the Software is -// furnished to do so, subject to the following conditions: -// -// The above copyright notice and this permission notice shall be included in all -// copies or substantial portions of the Software. -// -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -// SOFTWARE. -// -// Master-side dedup for Router::BatchRoutePut: indexed keys come back -// with already_exists=true and bypass node selection. 
-#include - -#include -#include -#include -#include -#include -#include - -#include "umbp/distributed/master/in_memory_master_metadata_store.h" -#include "umbp/distributed/routing/router.h" -#include "umbp/distributed/types.h" - -namespace mori::umbp { - -namespace { - -constexpr uint64_t kGB = 1024ULL * 1024 * 1024; - -std::map MakeDramCaps(uint64_t total = 8 * kGB) { - std::map caps; - caps[TierType::DRAM] = {total, total}; - return caps; -} - -ClientRegistration MakeRegistration(const std::string& node_id, const std::string& node_address, - const std::string& peer_address) { - ClientRegistration reg; - reg.node_id = node_id; - reg.node_address = node_address; - reg.tier_capacities = MakeDramCaps(); - reg.peer_address = peer_address; - return reg; -} - -// Register `node_id` ALIVE and apply one ADD event for `key` so it has a block -// location in the store. Under the merged store a location can only be created -// through an ApplyHeartbeat from a registered (alive) node — locations no -// longer exist independently of a client record the way the old -// GlobalBlockIndex allowed. -void RegisterWithKey(InMemoryMasterMetadataStore& store, const std::string& node_id, - const std::string& key, std::chrono::system_clock::time_point now) { - ASSERT_TRUE(store.RegisterClient(MakeRegistration(node_id, node_id + ":1", node_id + ":peer"), - now, std::chrono::seconds{30})); - auto hb = store.ApplyHeartbeat(node_id, /*seq=*/1, now, MakeDramCaps(), - {KvEvent{KvEvent::Kind::ADD, key, TierType::DRAM, 4096}}, - /*is_full_sync=*/false); - ASSERT_EQ(hb.status, HeartbeatResult::APPLIED); -} - -} // namespace - -// Indexed keys are marked already_exists; unknown keys still routed. 
-TEST(RouterDedup, BatchRoutePutMarksAlreadyExistsForIndexedKey) { - const auto now = std::chrono::system_clock::now(); - InMemoryMasterMetadataStore store; - Router router(store); - - RegisterWithKey(store, "node-a", "key-X", now); - - std::vector keys{"key-X", "key-Y"}; - std::vector sizes{4096, 4096}; - std::unordered_set excludes; - - auto results = router.BatchRoutePut(keys, "requester", sizes, excludes); - ASSERT_EQ(results.size(), 2u); - - ASSERT_TRUE(results[0].has_value()); - EXPECT_EQ(results[0]->outcome, RoutePutOutcome::kAlreadyExists); - EXPECT_TRUE(results[0]->node_id.empty()); - - ASSERT_TRUE(results[1].has_value()); - EXPECT_EQ(results[1]->outcome, RoutePutOutcome::kRouted); - EXPECT_EQ(results[1]->node_id, "node-a"); -} - -// already_exists wins over an unroutable Put: an existing key is marked -// kAlreadyExists even when no node can accept the write. In the old design -// "no node" meant an empty registry while a foreign node owned the key; under -// the merged store a location can't outlive its alive owner, so the -// unroutable condition is expressed by excluding the only candidate node. The -// property under test is unchanged: dedup wins over node selection. 
-TEST(RouterDedup, BatchRoutePutAlreadyExistsBypassesUnroutablePut) { - const auto now = std::chrono::system_clock::now(); - InMemoryMasterMetadataStore store; - Router router(store); - - RegisterWithKey(store, "node-a", "key-X", now); - - std::vector keys{"key-X", "key-Y"}; - std::vector sizes{4096, 4096}; - std::unordered_set excludes{"node-a"}; // no routable target left - - auto results = router.BatchRoutePut(keys, "requester", sizes, excludes); - ASSERT_EQ(results.size(), 2u); - - ASSERT_TRUE(results[0].has_value()); - EXPECT_EQ(results[0]->outcome, RoutePutOutcome::kAlreadyExists); - EXPECT_FALSE(results[1].has_value()); // distinct from kAlreadyExists -} - -} // namespace mori::umbp diff --git a/src/umbp/tests/test_ssd_copy_pipeline.cpp b/src/umbp/tests/test_ssd_copy_pipeline.cpp deleted file mode 100644 index 1d144cbb6..000000000 --- a/src/umbp/tests/test_ssd_copy_pipeline.cpp +++ /dev/null @@ -1,341 +0,0 @@ -// Copyright © Advanced Micro Devices, Inc. All rights reserved. -// -// MIT License -// -// Permission is hereby granted, free of charge, to any person obtaining a copy -// of this software and associated documentation files (the "Software"), to deal -// in the Software without restriction, including without limitation the rights -// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -// copies of the Software, and to permit persons to whom the Software is -// furnished to do so, subject to the following conditions: -// -// The above copyright notice and this permission notice shall be included in all -// copies or substantial portions of the Software. -// -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE -// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -// SOFTWARE. -#include -#include - -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include "umbp/distributed/peer/peer_dram_allocator.h" -#include "umbp/distributed/peer/peer_ssd_manager.h" -#include "umbp/distributed/peer/ssd_copy_pipeline.h" - -namespace mori::umbp { -namespace { - -namespace fs = std::filesystem; - -constexpr uint64_t kPageSize = 1024; - -// Concatenate a pin's segments into one buffer for content comparison. -std::string Concat(const PeerDramAllocator::DramCopyPin& pin) { - std::string out; - for (const auto& [ptr, len] : pin.segments) { - out.append(static_cast(ptr), len); - } - return out; -} - -// ---- DramCopyPin unit tests (direct on PeerDramAllocator) ------------------- - -class DramCopyPinTest : public ::testing::Test { - protected: - void SetUp() override { - backing_.assign(kPageSize * 8, 0); - PeerDramAllocator::TierConfig dram; - dram.buffer_sizes = {kPageSize * 8}; - dram.buffer_descs = {{0x01, 0x02}}; - dram.buffer_bases = {backing_.data()}; - dram_ = std::make_unique(kPageSize, std::move(dram), - PeerDramAllocator::TierConfig{}, - /*pending_ttl=*/std::chrono::milliseconds{5000}, - /*read_lease_ttl=*/std::chrono::milliseconds{0}); - } - - // Allocate, write `value` into its pages, commit. Backing memory is owned - // by the test, so we resolve pages -> offset exactly like the real writer. 
- void PutLocal(const std::string& key, const std::string& value) { - auto res = dram_->Allocate(key, value.size(), TierType::DRAM); - ASSERT_EQ(res.outcome, PeerDramAllocator::Outcome::kSuccessAllocated); - const auto& slot = *res.slot; - size_t off = 0; - for (const auto& p : slot.pages) { - const size_t bytes = std::min(kPageSize, value.size() - off); - std::memcpy(backing_.data() + static_cast(p.page_index) * kPageSize, - value.data() + off, bytes); - off += bytes; - } - uint64_t committed = 0; - ASSERT_TRUE(dram_->Commit(slot.slot_id, key, committed)); - ASSERT_EQ(committed, value.size()); - } - - std::vector backing_; - std::unique_ptr dram_; -}; - -TEST_F(DramCopyPinTest, AcquireResolvesSegmentsToCommittedBytes) { - const std::string value(kPageSize + 17, 'Z'); // spans 2 pages, last partial - PutLocal("k", value); - - auto pin = dram_->AcquireDramCopyPin("k"); - ASSERT_TRUE(pin.has_value()); - EXPECT_EQ(pin->total_size, value.size()); - EXPECT_EQ(Concat(*pin), value); - dram_->ReleaseDramCopyPin("k", pin->pin_token); -} - -TEST_F(DramCopyPinTest, AcquireMissingKeyReturnsNullopt) { - EXPECT_FALSE(dram_->AcquireDramCopyPin("never").has_value()); -} - -TEST_F(DramCopyPinTest, DuplicatePinReturnsNullopt) { - PutLocal("k", "payload"); - auto first = dram_->AcquireDramCopyPin("k"); - ASSERT_TRUE(first.has_value()); - EXPECT_FALSE(dram_->AcquireDramCopyPin("k").has_value()); // already pinned - dram_->ReleaseDramCopyPin("k", first->pin_token); -} - -TEST_F(DramCopyPinTest, EvictBlockedWhilePinnedThenAllowedAfterRelease) { - PutLocal("k", "payload"); - dram_->DrainPendingEvents(); // discard the commit's ADD DRAM event - auto pin = dram_->AcquireDramCopyPin("k"); - ASSERT_TRUE(pin.has_value()); - - // Pinned: Evict must not free, not emit REMOVE, keep ownership. 
- auto evicted = dram_->Evict({"k"}); - ASSERT_EQ(evicted.size(), 1u); - EXPECT_EQ(evicted[0].bytes_freed, 0u); - EXPECT_TRUE(dram_->DrainPendingEvents().empty()); // no REMOVE DRAM - ASSERT_EQ(dram_->SnapshotOwnedKeys().size(), 1u); // still owned - - // Release -> next Evict frees and emits REMOVE. - dram_->ReleaseDramCopyPin("k", pin->pin_token); - auto evicted2 = dram_->Evict({"k"}); - ASSERT_EQ(evicted2.size(), 1u); - EXPECT_GT(evicted2[0].bytes_freed, 0u); - auto events = dram_->DrainPendingEvents(); - ASSERT_EQ(events.size(), 1u); - EXPECT_EQ(events[0].kind, KvEvent::Kind::REMOVE); - EXPECT_EQ(events[0].tier, TierType::DRAM); -} - -TEST(DramCopyPinNonContiguous, SegmentsSpanMultipleBuffers) { - // Two 1-page buffers force a cross-buffer page set for a 2-page key. - std::vector b0(kPageSize, 0), b1(kPageSize, 0); - PeerDramAllocator::TierConfig dram; - dram.buffer_sizes = {kPageSize, kPageSize}; - dram.buffer_descs = {{0x01}, {0x02}}; - dram.buffer_bases = {b0.data(), b1.data()}; - PeerDramAllocator alloc(kPageSize, std::move(dram), PeerDramAllocator::TierConfig{}, - std::chrono::milliseconds{5000}, std::chrono::milliseconds{0}); - - const std::string value(kPageSize + 5, 'Q'); - auto res = alloc.Allocate("k", value.size(), TierType::DRAM); - ASSERT_EQ(res.outcome, PeerDramAllocator::Outcome::kSuccessAllocated); - const auto& slot = *res.slot; - ASSERT_EQ(slot.pages.size(), 2u); - std::vector bases = {b0.data(), b1.data()}; - size_t off = 0; - for (const auto& p : slot.pages) { - const size_t bytes = std::min(kPageSize, value.size() - off); - std::memcpy(bases[p.buffer_index] + static_cast(p.page_index) * kPageSize, - value.data() + off, bytes); - off += bytes; - } - uint64_t committed = 0; - ASSERT_TRUE(alloc.Commit(slot.slot_id, "k", committed)); - - auto pin = alloc.AcquireDramCopyPin("k"); - ASSERT_TRUE(pin.has_value()); - EXPECT_EQ(pin->segments.size(), 2u); - EXPECT_EQ(Concat(*pin), value); - alloc.ReleaseDramCopyPin("k", pin->pin_token); -} - -// ---- 
Pipeline integration tests (allocator + SSD manager + pipeline) -------- - -class SsdCopyPipelineTest : public ::testing::Test { - protected: - void SetUp() override { - static std::atomic counter{0}; - dir_ = fs::temp_directory_path() / ("umbp_copy_test_" + std::to_string(::getpid()) + "_" + - std::to_string(counter.fetch_add(1))); - fs::remove_all(dir_); - - backing_.assign(kPageSize * 16, 0); - PeerDramAllocator::TierConfig dram; - dram.buffer_sizes = {kPageSize * 16}; - dram.buffer_descs = {{0x01}}; - dram.buffer_bases = {backing_.data()}; - dram_ = std::make_unique( - kPageSize, std::move(dram), PeerDramAllocator::TierConfig{}, - std::chrono::milliseconds{5000}, std::chrono::milliseconds{0}); - - PeerSsdConfig ssd_cfg; - ssd_cfg.enabled = true; - ssd_cfg.ssd.enabled = true; - ssd_cfg.ssd.storage_dir = dir_.string(); - ssd_cfg.ssd.capacity_bytes = 64ULL * 1024 * 1024; - ssd_cfg.ssd.io.backend = UMBPIoBackend::Posix; // avoid io_uring container flakiness - ssd_ = std::make_unique(ssd_cfg); - } - - void TearDown() override { - std::error_code ec; - fs::remove_all(dir_, ec); - } - - void PutLocal(const std::string& key, const std::string& value) { - auto res = dram_->Allocate(key, value.size(), TierType::DRAM); - ASSERT_EQ(res.outcome, PeerDramAllocator::Outcome::kSuccessAllocated); - const auto& slot = *res.slot; - size_t off = 0; - for (const auto& p : slot.pages) { - const size_t bytes = std::min(kPageSize, value.size() - off); - std::memcpy(backing_.data() + static_cast(p.page_index) * kPageSize, - value.data() + off, bytes); - off += bytes; - } - uint64_t committed = 0; - ASSERT_TRUE(dram_->Commit(slot.slot_id, key, committed)); - } - - bool WaitForSsd(const std::string& key, std::chrono::milliseconds timeout) { - const auto deadline = std::chrono::steady_clock::now() + timeout; - while (std::chrono::steady_clock::now() < deadline) { - if (ssd_->Exists(key)) return true; - std::this_thread::sleep_for(std::chrono::milliseconds(2)); - } - return 
ssd_->Exists(key); - } - - fs::path dir_; - std::vector backing_; - std::unique_ptr dram_; - std::unique_ptr ssd_; -}; - -TEST_F(SsdCopyPipelineTest, CommitCopiesToSsdAndEmitsAddEvent) { - SsdCopyPipeline pipeline(dram_.get(), ssd_.get()); - pipeline.Start(); - - PutLocal("k", "hello-ssd-copy-on-commit"); - ASSERT_TRUE(pipeline.Enqueue(SsdCopyTask{"k", TierType::DRAM, 24})); - - ASSERT_TRUE(WaitForSsd("k", std::chrono::seconds(2))); - EXPECT_GE(pipeline.CopiedOk(), 1u); - EXPECT_GE(pipeline.Enqueued(), 1u); // observability: task was accepted - EXPECT_EQ(pipeline.Failed(), 0u); - - auto events = ssd_->DrainPendingEvents(); - ASSERT_EQ(events.size(), 1u); - EXPECT_EQ(events[0].kind, KvEvent::Kind::ADD); - EXPECT_EQ(events[0].tier, TierType::SSD); - EXPECT_EQ(events[0].key, "k"); - - pipeline.Stop(); -} - -TEST_F(SsdCopyPipelineTest, QueuedTaskForEvictedKeyIsDropped) { - SsdCopyPipeline pipeline(dram_.get(), ssd_.get()); - - PutLocal("gone", "data"); - // Evict before the copy ever runs (no pin held) -> key removed from owned_. - auto ev = dram_->Evict({"gone"}); - ASSERT_EQ(ev.size(), 1u); - EXPECT_GT(ev[0].bytes_freed, 0u); - - // Now start draining: the worker's AcquireDramCopyPin returns nullopt -> drop. - ASSERT_TRUE(pipeline.Enqueue(SsdCopyTask{"gone", TierType::DRAM, 4})); - pipeline.Start(); - - std::this_thread::sleep_for(std::chrono::milliseconds(100)); - EXPECT_FALSE(ssd_->Exists("gone")); - EXPECT_EQ(pipeline.CopiedOk(), 0u); - pipeline.Stop(); -} - -TEST_F(SsdCopyPipelineTest, FullQueueDropsWithoutBlocking) { - // queue_depth=2, no workers started -> nothing drains, so the 3rd+ enqueue - // overflows and is dropped (and returns immediately). 
- SsdCopyPipeline pipeline(dram_.get(), ssd_.get(), /*queue_depth=*/2, /*workers=*/1); - EXPECT_TRUE(pipeline.Enqueue(SsdCopyTask{"a", TierType::DRAM, 1})); - EXPECT_TRUE(pipeline.Enqueue(SsdCopyTask{"b", TierType::DRAM, 1})); - EXPECT_FALSE(pipeline.Enqueue(SsdCopyTask{"c", TierType::DRAM, 1})); // full -> drop - EXPECT_FALSE(pipeline.Enqueue(SsdCopyTask{"d", TierType::DRAM, 1})); - EXPECT_EQ(pipeline.Dropped(), 2u); - EXPECT_EQ(pipeline.Enqueued(), 2u); // only the two accepted tasks - EXPECT_EQ(pipeline.DroppedStopped(), 0u); // these are queue-full, not stopped, drops -} - -TEST_F(SsdCopyPipelineTest, EnqueueRejectedWhileStopped) { - SsdCopyPipeline pipeline(dram_.get(), ssd_.get()); - pipeline.Start(); - pipeline.Stop(); - EXPECT_FALSE(pipeline.Enqueue(SsdCopyTask{"k", TierType::DRAM, 1})); - EXPECT_EQ(pipeline.Dropped(), 0u); // stopped path is not counted as a full-drop - EXPECT_EQ(pipeline.DroppedStopped(), 1u); // counted under the stopped reason instead -} - -TEST_F(SsdCopyPipelineTest, StopAfterCopyIsCleanAndReleasesPin) { - SsdCopyPipeline pipeline(dram_.get(), ssd_.get()); - pipeline.Start(); - - const std::string value(8 * 1024, 'X'); - PutLocal("big", value); - ASSERT_TRUE(pipeline.Enqueue(SsdCopyTask{"big", TierType::DRAM, value.size()})); - - // Let the copy run, then Stop(). Stop() joins the worker; the RAII pin guard - // guarantees the pin is released before the worker exits (Stop() never - // force-frees an in-flight pin — that join is the in-flight-wait guarantee). - ASSERT_TRUE(WaitForSsd("big", std::chrono::seconds(2))); - pipeline.Stop(); - - EXPECT_EQ(pipeline.CopiedOk(), 1u); - // Pin released -> the key is now evictable (no copy holding its pages). 
- auto ev = dram_->Evict({"big"}); - ASSERT_EQ(ev.size(), 1u); - EXPECT_GT(ev[0].bytes_freed, 0u); -} - -TEST_F(SsdCopyPipelineTest, QuiesceThenClearLeavesNoStaleSsdState) { - SsdCopyPipeline pipeline(dram_.get(), ssd_.get()); - pipeline.Start(); - - PutLocal("k", "payload"); - ASSERT_TRUE(pipeline.Enqueue(SsdCopyTask{"k", TierType::DRAM, 7})); - ASSERT_TRUE(WaitForSsd("k", std::chrono::seconds(2))); - - // Clear path: quiesce (drain in-flight) then clear both tiers. - pipeline.Quiesce(); - dram_->ClearLocal(); - ssd_->ClearLocal(); - pipeline.Resume(); - - EXPECT_FALSE(ssd_->Exists("k")); - EXPECT_TRUE(ssd_->SnapshotOwnedKeys().empty()); - EXPECT_TRUE(ssd_->DrainPendingEvents().empty()); - - pipeline.Stop(); -} - -} // namespace -} // namespace mori::umbp diff --git a/src/umbp/tests/test_ssd_read_lease_gating.cpp b/src/umbp/tests/test_ssd_read_lease_gating.cpp deleted file mode 100644 index 1b105d0d1..000000000 --- a/src/umbp/tests/test_ssd_read_lease_gating.cpp +++ /dev/null @@ -1,97 +0,0 @@ -// Copyright © Advanced Micro Devices, Inc. All rights reserved. -// -// MIT License -// -// Permission is hereby granted, free of charge, to any person obtaining a copy -// of this software and associated documentation files (the "Software"), to deal -// in the Software without restriction, including without limitation the rights -// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -// copies of the Software, and to permit persons to whom the Software is -// furnished to do so, subject to the following conditions: -// -// The above copyright notice and this permission notice shall be included in all -// copies or substantial portions of the Software. -// -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE -// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -// SOFTWARE. -// -// Pure-logic unit tests for the reader-side remote SSD read lease gating -// (umbp/distributed/ssd_read_lease.h). These cover the decision policy without -// a cluster / RDMA: the full PrepareSsdRead -> RDMA path is exercised at the -// RPC level in test_peer_ssd_read_rpc.cpp. Retryable outcomes are NO_SLOT and -// a reader-local lease expiry; rpc failures are not-served (RPC-test covered). -#include - -#include - -#include "umbp/distributed/ssd_read_lease.h" - -namespace mori::umbp::ssd_read_lease { -namespace { - -using std::chrono::milliseconds; -using std::chrono::steady_clock; - -// ---- LeaseExpired ---- - -TEST(SsdReadLeaseGating, NotExpiredBeforeDeadline) { - const auto t_send = steady_clock::now(); - EXPECT_FALSE(LeaseExpired(t_send, /*lease_ttl_ms=*/1000, t_send + milliseconds(500))); -} - -TEST(SsdReadLeaseGating, ExpiredAfterDeadline) { - const auto t_send = steady_clock::now(); - EXPECT_TRUE(LeaseExpired(t_send, /*lease_ttl_ms=*/1000, t_send + milliseconds(1001))); -} - -TEST(SsdReadLeaseGating, ExactlyAtDeadlineIsNotExpired) { - // Boundary: now == t_send + ttl uses '>' so it is still valid. - const auto t_send = steady_clock::now(); - EXPECT_FALSE(LeaseExpired(t_send, /*lease_ttl_ms=*/1000, t_send + milliseconds(1000))); -} - -TEST(SsdReadLeaseGating, ZeroTtlIsBornExpired) { - const auto t_send = steady_clock::now(); - EXPECT_FALSE(LeaseExpired(t_send, /*lease_ttl_ms=*/0, t_send)); // exactly t_send: valid - EXPECT_TRUE(LeaseExpired(t_send, /*lease_ttl_ms=*/0, t_send + milliseconds(1))); -} - -// ---- DecideSsdReadOutcome ---- -// Situation A (not expired): a good RDMA serves + releases; a failed RDMA is a -// hard error but still releases (the lease is still ours). 
-// Situation B (expired): always a transient retry, and NEVER release (the slot -// is left for the peer's TTL reclaim), regardless of whether the RDMA "worked". - -TEST(SsdReadLeaseGating, ValidAndRdmaOk_ServesAndReleases) { - const auto d = DecideSsdReadOutcome(/*expired=*/false, /*rdma_ok=*/true); - EXPECT_EQ(d.outcome, GateOutcome::kSuccess); - EXPECT_TRUE(d.release); -} - -TEST(SsdReadLeaseGating, ValidAndRdmaFailed_ErrorButReleases) { - const auto d = DecideSsdReadOutcome(/*expired=*/false, /*rdma_ok=*/false); - EXPECT_EQ(d.outcome, GateOutcome::kError); - EXPECT_TRUE(d.release); -} - -TEST(SsdReadLeaseGating, ExpiredWithRdmaOk_RetryNoRelease) { - // The dangerous case: RDMA "succeeded" but the lease elapsed, so the bytes - // are untrusted (the peer may have recycled the slot). Must NOT be success. - const auto d = DecideSsdReadOutcome(/*expired=*/true, /*rdma_ok=*/true); - EXPECT_EQ(d.outcome, GateOutcome::kRetry); - EXPECT_FALSE(d.release); -} - -TEST(SsdReadLeaseGating, ExpiredWithRdmaFailed_RetryNoRelease) { - const auto d = DecideSsdReadOutcome(/*expired=*/true, /*rdma_ok=*/false); - EXPECT_EQ(d.outcome, GateOutcome::kRetry); - EXPECT_FALSE(d.release); -} - -} // namespace -} // namespace mori::umbp::ssd_read_lease diff --git a/src/umbp/tests/test_ssd_reliability.cpp b/src/umbp/tests/test_ssd_reliability.cpp deleted file mode 100644 index e7f1a37ac..000000000 --- a/src/umbp/tests/test_ssd_reliability.cpp +++ /dev/null @@ -1,345 +0,0 @@ -// Copyright © Advanced Micro Devices, Inc. All rights reserved. 
-// -// MIT License -// -// Permission is hereby granted, free of charge, to any person obtaining a copy -// of this software and associated documentation files (the "Software"), to deal -// in the Software without restriction, including without limitation the rights -// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -// copies of the Software, and to permit persons to whom the Software is -// furnished to do so, subject to the following conditions: -// -// The above copyright notice and this permission notice shall be included in all -// copies or substantial portions of the Software. -// -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -// SOFTWARE. -// -// Cross-component reliability tests: combinations no single-component test -// covers. All deterministic, no real disk / RDMA / master RPC: -// * the unified owned-location event source merges DRAM + SSD into one -// snapshot/delta (so a heartbeat full-sync ships SSD owned keys too); -// * a local SSD eviction's REMOVE SSD event converges the master -// GlobalBlockIndex while leaving the DRAM bucket intact; -// * tier-priority RouteGet over the real index picks DRAM, then SSD once the -// DRAM replica is removed; -// * crash-restart leftover is discarded at startup; -// * the SSD observability counters increment at the right events. -// -// (copy-pin vs DRAM evict is covered by test_ssd_copy_pipeline's -// EvictBlockedWhilePinnedThenAllowedAfterRelease; seq-gap -> full-sync by -// test_global_block_index_events' ClientRegistryHeartbeat.SeqGap*.) 
-#include - -#include -#include -#include -#include -#include -#include -#include - -#include "umbp/distributed/master/global_block_index.h" -#include "umbp/distributed/peer/owned_location_source.h" -#include "umbp/distributed/peer/peer_ssd_manager.h" -#include "umbp/distributed/routing/route_get_strategy.h" -#include "umbp/distributed/types.h" -#include "umbp/local/tiers/tier_backend.h" - -namespace mori::umbp { -namespace { - -// Minimal in-memory TierBackend (mirrors the one in test_peer_ssd_eviction) so -// PeerSsdManager runs without real disk IO. Exposes a forced evict failure for -// the backend-failure counter and lets the test pre-seed bytes (crash leftover). -class FakeBackend : public TierBackend { - public: - explicit FakeBackend(size_t capacity) - : TierBackend(StorageTier::LOCAL_SSD), capacity_(capacity) {} - - bool Write(const std::string& key, const void* data, size_t size) override { - std::lock_guard lk(mu_); - auto it = store_.find(key); - size_t prev = (it == store_.end()) ? 
0 : it->second.size(); - if (used_ - prev + size > capacity_) return false; - store_[key].assign(static_cast(data), static_cast(data) + size); - used_ = used_ - prev + size; - return true; - } - bool ReadIntoPtr(const std::string& key, uintptr_t dst, size_t size) override { - std::lock_guard lk(mu_); - auto it = store_.find(key); - if (it == store_.end() || it->second.size() != size) return false; - std::memcpy(reinterpret_cast(dst), it->second.data(), size); - return true; - } - bool Exists(const std::string& key) const override { - std::lock_guard lk(mu_); - return store_.count(key) != 0; - } - bool Evict(const std::string& key) override { - std::lock_guard lk(mu_); - if (fail_evict_) return false; - auto it = store_.find(key); - if (it == store_.end()) return false; - used_ -= it->second.size(); - store_.erase(it); - return true; - } - std::pair Capacity() const override { - std::lock_guard lk(mu_); - return {used_, capacity_}; - } - void Clear() override { - std::lock_guard lk(mu_); - ++clear_calls_; - store_.clear(); - used_ = 0; - } - void SetFailEvict(bool f) { - std::lock_guard lk(mu_); - fail_evict_ = f; - } - int clear_calls() const { - std::lock_guard lk(mu_); - return clear_calls_; - } - - private: - mutable std::mutex mu_; - std::unordered_map> store_; - size_t used_ = 0; - size_t capacity_; - bool fail_evict_ = false; - int clear_calls_ = 0; -}; - -std::vector> OneSeg(const std::string& s) { - return {{s.data(), s.size()}}; -} - -bool HasLoc(const std::vector& locs, const std::string& node, TierType tier) { - for (const auto& l : locs) { - if (l.node_id == node && l.tier == tier) return true; - } - return false; -} - -int CountTier(const std::vector& events, KvEvent::Kind kind, TierType tier) { - int n = 0; - for (const auto& e : events) { - if (e.kind == kind && e.tier == tier) ++n; - } - return n; -} - -// A canned owned-location source standing in for PeerDramAllocator so the -// aggregation can be tested without standing up a DRAM allocator. 
-class FakeOwnedSource : public OwnedLocationSource { - public: - std::vector delta; - std::vector snapshot; - std::vector DrainPendingEvents() override { - std::vector out; - out.swap(delta); - return out; - } - std::vector SnapshotOwnedKeys() const override { return snapshot; } -}; - -// --------------------------------------------------------------------------- -// Unified owned-location source: DRAM + SSD merge into one bundle. -// --------------------------------------------------------------------------- - -// A heartbeat full-sync snapshots ALL sources; SSD owned keys must be present -// alongside DRAM in the merged snapshot (otherwise master would drop the SSD -// tier on a seq-gap recovery). -TEST(SsdReliability, FullSyncSnapshotMergesDramAndSsdOwnedKeys) { - FakeOwnedSource dram; - dram.snapshot = {KvEvent{KvEvent::Kind::ADD, "d-key", TierType::DRAM, 10}}; - - auto be = std::make_unique(1'000'000); - PeerSsdManager ssd(std::move(be), 0.9, 0.7); - ASSERT_TRUE(ssd.Write("s-key", OneSeg("ssddata"), 7)); - ssd.DrainPendingEvents(); // the ADD SSD delta; snapshot is independent - - std::vector sources = {&dram, &ssd}; - auto snap = SnapshotAllSources(sources); - - EXPECT_EQ(CountTier(snap, KvEvent::Kind::ADD, TierType::DRAM), 1); - EXPECT_EQ(CountTier(snap, KvEvent::Kind::ADD, TierType::SSD), 1); -} - -// A delta heartbeat drains ALL sources and concatenates into one list. 
-TEST(SsdReliability, DeltaDrainMergesDramAndSsdEvents) { - FakeOwnedSource dram; - dram.delta = {KvEvent{KvEvent::Kind::REMOVE, "d-key", TierType::DRAM, 0}}; - - auto be = std::make_unique(1'000'000); - PeerSsdManager ssd(std::move(be), 0.9, 0.7); - ASSERT_TRUE(ssd.Write("s-key", OneSeg("ssddata"), 7)); // queues ADD SSD delta - - std::vector sources = {&dram, &ssd}; - auto merged = DrainAllSources(sources); - - EXPECT_EQ(CountTier(merged, KvEvent::Kind::REMOVE, TierType::DRAM), 1); - EXPECT_EQ(CountTier(merged, KvEvent::Kind::ADD, TierType::SSD), 1); - // Draining again yields nothing (outbox cleared on both sources). - EXPECT_TRUE(DrainAllSources(sources).empty()); -} - -// --------------------------------------------------------------------------- -// SSD local eviction -> REMOVE SSD -> master GlobalBlockIndex converges. -// --------------------------------------------------------------------------- - -// A key mirrored on DRAM + SSD of one owner: a local SSD eviction emits -// REMOVE SSD, and applying that to the master index drops only the SSD bucket -// (the DRAM replica, owned independently, stays routable). -TEST(SsdReliability, LocalSsdEvictionRemoveConvergesMasterIndex) { - GlobalBlockIndex idx; - - auto be = std::make_unique(1'000'000); - PeerSsdManager ssd(std::move(be), 0.9, 0.7); - - // DRAM replica added independently (a DRAM owner would emit this). - idx.ApplyEvents("owner", {KvEvent{KvEvent::Kind::ADD, "k", TierType::DRAM, 100}}); - // SSD copy lands -> ADD SSD drained into the index. - ASSERT_TRUE(ssd.Write("k", OneSeg(std::string(100, 'x')), 100)); - idx.ApplyEvents("owner", ssd.DrainPendingEvents()); - - auto both = idx.Lookup("k"); - ASSERT_TRUE(HasLoc(both, "owner", TierType::DRAM)); - ASSERT_TRUE(HasLoc(both, "owner", TierType::SSD)); - - // Local SSD eviction -> REMOVE SSD -> index drops only the SSD bucket. 
- ASSERT_TRUE(ssd.Evict("k")); - auto ssd_events = ssd.DrainPendingEvents(); - EXPECT_EQ(CountTier(ssd_events, KvEvent::Kind::REMOVE, TierType::SSD), 1); - idx.ApplyEvents("owner", ssd_events); - - auto after = idx.Lookup("k"); - EXPECT_TRUE(HasLoc(after, "owner", TierType::DRAM)); // DRAM replica still routable - EXPECT_FALSE(HasLoc(after, "owner", TierType::SSD)); // SSD bucket converged away -} - -// --------------------------------------------------------------------------- -// Tier-priority RouteGet over the real index: DRAM first, SSD after evict. -// --------------------------------------------------------------------------- - -TEST(SsdReliability, TierPriorityRoutesDramThenSsdAfterDramRemoved) { - GlobalBlockIndex idx; - idx.ApplyEvents("owner", {KvEvent{KvEvent::Kind::ADD, "k", TierType::DRAM, 100}, - KvEvent{KvEvent::Kind::ADD, "k", TierType::SSD, 100}}); - - TierPriorityRouteGetStrategy strategy; - - auto locs = idx.BatchLookupForRouteGet({"k"}, {}, std::chrono::seconds{10}); - ASSERT_EQ(locs.size(), 1u); - auto dram_pick = strategy.Select(locs[0], "reader"); - EXPECT_EQ(dram_pick.tier, TierType::DRAM) << "prefers the fast DRAM replica"; - - // DRAM evicted -> only the SSD bucket remains -> RouteGet must serve from SSD. - idx.ApplyEvents("owner", {KvEvent{KvEvent::Kind::REMOVE, "k", TierType::DRAM, 0}}); - auto locs2 = idx.BatchLookupForRouteGet({"k"}, {}, std::chrono::seconds{10}); - ASSERT_EQ(locs2.size(), 1u); - auto ssd_pick = strategy.Select(locs2[0], "reader"); - EXPECT_EQ(ssd_pick.tier, TierType::SSD) << "falls back to the surviving SSD replica"; - EXPECT_EQ(ssd_pick.node_id, "owner"); -} - -// --------------------------------------------------------------------------- -// Crash-restart leftover handling (discard). -// --------------------------------------------------------------------------- - -// After a crash owned_ is empty but the backend still holds bytes from the -// previous run. 
DiscardLeftoverOnStartup wipes them so used capacity starts -// at 0 (no divergence between the empty owned_ map and the physical device). -TEST(SsdReliability, StartupDiscardWipesLeftoverBytes) { - auto be = std::make_unique(1'000'000); - FakeBackend* raw = be.get(); - // Simulate a previous process's bytes left on the device. - ASSERT_TRUE(raw->Write("orphan-1", "leftover-a", 10)); - ASSERT_TRUE(raw->Write("orphan-2", "leftover-b", 10)); - ASSERT_GT(raw->Capacity().first, 0u); - - // Fresh manager: owned_ is empty, but the backend reports used > 0. - PeerSsdManager ssd(std::move(be), 0.9, 0.7); - EXPECT_TRUE(ssd.SnapshotOwnedKeys().empty()); - ASSERT_GT(ssd.Capacity().first, 0u); - - ssd.DiscardLeftoverOnStartup(); - - EXPECT_EQ(raw->clear_calls(), 1); - EXPECT_EQ(ssd.Capacity().first, 0u); // leftover gone -> consistent with empty owned_ -} - -TEST(SsdReliability, StartupDiscardOnCleanTierIsNoop) { - auto be = std::make_unique(1'000'000); - FakeBackend* raw = be.get(); - PeerSsdManager ssd(std::move(be), 0.9, 0.7); - - ssd.DiscardLeftoverOnStartup(); // used == 0 -> skip the wipe entirely - EXPECT_EQ(raw->clear_calls(), 0); -} - -// --------------------------------------------------------------------------- -// Observability counters increment at the right events. 
-// --------------------------------------------------------------------------- - -TEST(SsdReliability, ReadCountersTrackOutcomes) { - auto be = std::make_unique(1'000'000); - PeerSsdManager ssd(std::move(be), 0.9, 0.7); - ASSERT_TRUE(ssd.Write("k", OneSeg("0123456789"), 10)); - - std::vector buf(10); - EXPECT_EQ(ssd.PrepareRead("k", buf.data(), buf.size()).status, SsdReadStatus::kOk); - EXPECT_EQ(ssd.PrepareRead("absent", buf.data(), buf.size()).status, SsdReadStatus::kNotFound); - EXPECT_EQ(ssd.PrepareRead("k", buf.data(), /*cap=*/1).status, SsdReadStatus::kSizeTooLarge); - - EXPECT_EQ(ssd.ReadOk(), 1u); - EXPECT_EQ(ssd.ReadNotFound(), 1u); - EXPECT_EQ(ssd.ReadSizeTooLarge(), 1u); - EXPECT_EQ(ssd.ReadError(), 0u); -} - -TEST(SsdReliability, EvictionCountersTrackVictimsBytesAndBackendFailures) { - auto be = std::make_unique(1'000'000); - FakeBackend* raw = be.get(); - PeerSsdManager ssd(std::move(be), 0.9, 0.7); - ASSERT_TRUE(ssd.Write("a", OneSeg(std::string(40, 'a')), 40)); - ASSERT_TRUE(ssd.Write("b", OneSeg(std::string(60, 'b')), 60)); - - ASSERT_TRUE(ssd.Evict("a")); - EXPECT_EQ(ssd.EvictionVictims(), 1u); - EXPECT_EQ(ssd.EvictionBytesFreed(), 40u); - EXPECT_EQ(ssd.EvictionBackendFailures(), 0u); - - // Backend refuses the next evict -> the failure is counted, the key kept. - raw->SetFailEvict(true); - EXPECT_FALSE(ssd.Evict("b")); - EXPECT_EQ(ssd.EvictionBackendFailures(), 1u); - EXPECT_EQ(ssd.EvictionVictims(), 1u); // unchanged - EXPECT_TRUE(ssd.Exists("b")); -} - -TEST(SsdReliability, WatermarkEvictionCountsARound) { - // capacity 1000, high 0.9 (=>900), low 0.5 (=>500); 100-byte values. After - // the 9th write used hits 900 -> one eviction round runs. 
- auto be = std::make_unique(1000); - PeerSsdManager ssd(std::move(be), 0.9, 0.5); - std::string val(100, 'x'); - for (int i = 1; i <= 9; ++i) { - ASSERT_TRUE(ssd.Write("k" + std::to_string(i), OneSeg(val), val.size())); - } - EXPECT_GE(ssd.EvictionRounds(), 1u); - EXPECT_GE(ssd.EvictionVictims(), 1u); - EXPECT_LE(ssd.Capacity().first, 500u); -} - -} // namespace -} // namespace mori::umbp diff --git a/src/umbp/tests/test_tier_priority_route_get.cpp b/src/umbp/tests/test_tier_priority_route_get.cpp deleted file mode 100644 index b6b2abe2d..000000000 --- a/src/umbp/tests/test_tier_priority_route_get.cpp +++ /dev/null @@ -1,112 +0,0 @@ -// Copyright © Advanced Micro Devices, Inc. All rights reserved. -// -// MIT License -// -// Permission is hereby granted, free of charge, to any person obtaining a copy -// of this software and associated documentation files (the "Software"), to deal -// in the Software without restriction, including without limitation the rights -// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -// copies of the Software, and to permit persons to whom the Software is -// furnished to do so, subject to the following conditions: -// -// The above copyright notice and this permission notice shall be included in all -// copies or substantial portions of the Software. -// -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -// SOFTWARE. 
-#include - -#include -#include -#include - -#include "umbp/distributed/routing/route_get_strategy.h" - -namespace mori::umbp { -namespace { - -Location MakeLoc(const std::string& node_id, TierType tier) { - Location loc; - loc.node_id = node_id; - loc.size = 4096; - loc.tier = tier; - return loc; -} - -// With a DRAM (or HBM) replica present alongside an SSD one, the strategy must -// never route to the slow SSD tier. -TEST(TierPriorityRouteGetStrategyTest, PrefersDramOverSsd) { - TierPriorityRouteGetStrategy strategy; - std::vector locations = { - MakeLoc("ssd-node", TierType::SSD), - MakeLoc("dram-node", TierType::DRAM), - }; - for (int i = 0; i < 100; ++i) { - auto selected = strategy.Select(locations, "requester"); - EXPECT_EQ(selected.tier, TierType::DRAM); - EXPECT_EQ(selected.node_id, "dram-node"); - } -} - -// HBM beats both DRAM and SSD. -TEST(TierPriorityRouteGetStrategyTest, PrefersHbmOverDramAndSsd) { - TierPriorityRouteGetStrategy strategy; - std::vector locations = { - MakeLoc("ssd-node", TierType::SSD), - MakeLoc("dram-node", TierType::DRAM), - MakeLoc("hbm-node", TierType::HBM), - }; - for (int i = 0; i < 100; ++i) { - auto selected = strategy.Select(locations, "requester"); - EXPECT_EQ(selected.tier, TierType::HBM); - } -} - -// When SSD is the only tier present it is selected (read-from-SSD is valid). -TEST(TierPriorityRouteGetStrategyTest, FallsBackToSsdWhenOnlyTier) { - TierPriorityRouteGetStrategy strategy; - std::vector locations = { - MakeLoc("ssd-a", TierType::SSD), - MakeLoc("ssd-b", TierType::SSD), - }; - for (int i = 0; i < 50; ++i) { - auto selected = strategy.Select(locations, "requester"); - EXPECT_EQ(selected.tier, TierType::SSD); - } -} - -// Within the winning tier, selection spreads across all replicas on that tier -// and never leaks to a lower tier. 
-TEST(TierPriorityRouteGetStrategyTest, RandomWithinBestTierOnly) { - TierPriorityRouteGetStrategy strategy; - std::vector locations = { - MakeLoc("dram-a", TierType::DRAM), - MakeLoc("dram-b", TierType::DRAM), - MakeLoc("dram-c", TierType::DRAM), - MakeLoc("ssd-x", TierType::SSD), - }; - std::set seen; - for (int i = 0; i < 2000; ++i) { - auto selected = strategy.Select(locations, "requester"); - ASSERT_EQ(selected.tier, TierType::DRAM) << "must never pick the SSD replica"; - seen.insert(selected.node_id); - } - EXPECT_EQ(seen.size(), 3u) << "all three DRAM replicas should be reachable"; - EXPECT_EQ(seen.count("ssd-x"), 0u); -} - -TEST(TierPriorityRouteGetStrategyTest, EmptyReturnsDefault) { - TierPriorityRouteGetStrategy strategy; - std::vector locations; - auto selected = strategy.Select(locations, "requester"); - EXPECT_EQ(selected.tier, TierType::UNKNOWN); - EXPECT_TRUE(selected.node_id.empty()); -} - -} // namespace -} // namespace mori::umbp