diff --git a/.gitattributes b/.gitattributes new file mode 100644 index 0000000..bcd0fbc --- /dev/null +++ b/.gitattributes @@ -0,0 +1,10 @@ +* text=auto eol=lf + +*.png binary +*.jpg binary +*.jpeg binary +*.gif binary +*.pdf binary +*.so binary +*.dll binary +*.exe binary diff --git a/.gitignore b/.gitignore index 1722478..a376bd0 100644 --- a/.gitignore +++ b/.gitignore @@ -4,4 +4,8 @@ **/.idea/* .cache/ bench/ -experiment/ \ No newline at end of file +experiment/ +**/results +**.pyc +**/__pycache__ +.artifacts/ diff --git a/CMakeLists.txt b/CMakeLists.txt index 0b98cda..8851525 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -6,6 +6,19 @@ project(dsr include(GNUInstallDirs) +# -DTSAN=ON enables ThreadSanitizer across the whole build (library + tests/benchmarks). +# Must be applied before any add_subdirectory so every TU is instrumented. +# Incompatible with SANITIZER (ASan/UBSan) — enforce that here. +option(TSAN "Enable ThreadSanitizer" OFF) +if (TSAN) + if (SANITIZER) + message(FATAL_ERROR "TSAN and SANITIZER (ASan+UBSan) are mutually exclusive") + endif() + message(STATUS "ThreadSanitizer enabled") + add_compile_options(-fsanitize=thread -fno-omit-frame-pointer) + add_link_options(-fsanitize=thread) +endif() + add_definitions(-I/usr/include/x86_64-linux-gnu/qt6/QtOpenGLWidgets/) include_directories(/home/robocomp/robocomp/classes) @@ -27,3 +40,9 @@ if (WITH_TESTS) add_subdirectory(tests) endif() + +if (WITH_BENCHMARKS) + +add_subdirectory(benchmarks) + +endif() diff --git a/api/dsr_api.cpp b/api/dsr_api.cpp index 42207a9..a97a144 100644 --- a/api/dsr_api.cpp +++ b/api/dsr_api.cpp @@ -75,14 +75,14 @@ DSRGraph::DSRGraph(GraphSettings settings) : // RTPS Initialize publisher with general topic - auto [res, pub, writer] = dsrpub_node.init(participant_handle, dsrparticipant.getNodeTopic()); - auto [res2, pub2, writer2] = dsrpub_node_attrs.init(participant_handle, dsrparticipant.getAttNodeTopic()); + auto [res, pub, writer] = 
dsrpub_node.init(participant_handle, dsrparticipant.getNodeTopic(), dsrparticipant.get_domain_id()); + auto [res2, pub2, writer2] = dsrpub_node_attrs.init(participant_handle, dsrparticipant.getAttNodeTopic(), dsrparticipant.get_domain_id()); - auto [res3, pub3, writer3] = dsrpub_edge.init(participant_handle, dsrparticipant.getEdgeTopic()); - auto [res4, pub4, writer4] = dsrpub_edge_attrs.init(participant_handle, dsrparticipant.getAttEdgeTopic()); + auto [res3, pub3, writer3] = dsrpub_edge.init(participant_handle, dsrparticipant.getEdgeTopic(), dsrparticipant.get_domain_id()); + auto [res4, pub4, writer4] = dsrpub_edge_attrs.init(participant_handle, dsrparticipant.getAttEdgeTopic(), dsrparticipant.get_domain_id()); - auto [res5, pub5, writer5] = dsrpub_graph_request.init(participant_handle, dsrparticipant.getGraphRequestTopic()); - auto [res6, pub6, writer6] = dsrpub_request_answer.init(participant_handle, dsrparticipant.getGraphTopic()); + auto [res5, pub5, writer5] = dsrpub_graph_request.init(participant_handle, dsrparticipant.getGraphRequestTopic(), dsrparticipant.get_domain_id()); + auto [res6, pub6, writer6] = dsrpub_request_answer.init(participant_handle, dsrparticipant.getGraphTopic(), dsrparticipant.get_domain_id()); dsrparticipant.add_publisher(dsrparticipant.getNodeTopic()->get_name(), {pub, writer}); dsrparticipant.add_publisher(dsrparticipant.getAttNodeTopic()->get_name(), {pub2, writer2}); @@ -272,21 +272,18 @@ std::tuple>> DSRGraph::updat if (!deleted.contains(node.id())) { - if (nodes.contains(node.id()) and !nodes.at(node.id()).empty()) + auto nit = nodes.find(node.id()); + if (nit != nodes.end() && !nit->second.empty()) { - std::vector atts_deltas; - auto &iter = nodes.at(node.id()).read_reg().attrs(); + auto &iter = nit->second.read_reg().attrs(); //New attributes and updates. 
for (auto &[k, att]: node.attrs()) { - if (!iter.contains(k)) { - iter.emplace(k, mvreg()); - } - if (iter.at(k).empty() or att.read_reg() != iter.at(k).read_reg()) { - auto delta = iter.at(k).write(std::move(att.read_reg())); + auto &attr_reg = iter.try_emplace(k, mvreg()).first->second; + if (attr_reg.empty() or att.read_reg() != attr_reg.read_reg()) { + auto delta = attr_reg.write(std::move(att.read_reg())); atts_deltas.emplace_back( CRDTNodeAttr_to_IDL(agent_id, node.id(), node.id(), k, delta)); - } } //Remove old attributes. @@ -296,7 +293,7 @@ std::tuple>> DSRGraph::updat if (ignored_attributes.contains(k)) { it_a = iter.erase(it_a); } else if (!node.attrs().contains(k)) { - auto delta = iter.at(k).reset(); + auto delta = it_a->second.reset(); atts_deltas.emplace_back( CRDTNodeAttr_to_IDL(node.agent_id(), node.id(), node.id(), k, delta)); it_a = iter.erase(it_a); @@ -378,27 +375,23 @@ DSRGraph::delete_node_(uint64_t id) { // Get remove delta. auto delta = nodes[id].reset(); IDL::MvregNode delta_remove = CRDTNode_to_IDL(agent_id, id, delta); - //search and remove edges. - //For each node check if there is an edge to remove. - //TODO: use to_edges. - for (auto &[k, v] : nodes) + // Search and remove incoming edges using to_edges cache: O(k) instead of O(n). 
{ - std::shared_lock lck_cache(_mutex_cache_maps); - if (!edges.contains({k, id})) continue; - // Remove all edges between them - auto &visited_node = v.read_reg(); - auto keys = deleted_edges.size(); - for (const auto &key : edges.at({k, id})) + decltype(to_edges)::mapped_type incoming; { - deleted_edges.emplace_back(visited_node.fano().at({id, key}).read_reg()); - auto delta_fano = visited_node.fano().at({id, key}).reset(); - delta_vec.emplace_back(CRDTEdge_to_IDL(agent_id, k, id, key, delta_fano)); - visited_node.fano().erase({id, key}); + std::shared_lock lck_cache(_mutex_cache_maps); + if (to_edges.contains(id)) + incoming = to_edges.at(id); } - lck_cache.unlock(); - //Remove all from cache - for (auto i = keys; i < deleted_edges.size(); i++) { - update_maps_edge_delete(k, id, deleted_edges[i].type()); + for (const auto &[from, type] : incoming) + { + if (!nodes.contains(from)) continue; + auto &visited_node = nodes.at(from).read_reg(); + deleted_edges.emplace_back(visited_node.fano().at({id, type}).read_reg()); + auto delta_fano = visited_node.fano().at({id, type}).reset(); + delta_vec.emplace_back(CRDTEdge_to_IDL(agent_id, from, id, type, delta_fano)); + visited_node.fano().erase({id, type}); + update_maps_edge_delete(from, id, type); } } update_maps_node_delete(id, node.value()); @@ -495,6 +488,7 @@ std::vector DSRGraph::get_nodes_by_type(const std::string &type) std::vector nodes_; if (nodeType.contains(type)) { + nodes_.reserve(nodeType.at(type).size()); for (auto &id: nodeType.at(type)) { std::optional n = get_(id); @@ -527,6 +521,12 @@ std::vector DSRGraph::get_nodes_by_types(const std::vector lck(_mutex_cache_maps); std::vector nodes_; + { + size_t total = 0; + for (const auto &type : types) + if (nodeType.contains(type)) total += nodeType.at(type).size(); + nodes_.reserve(total); + } for (auto &type : types) { if (nodeType.contains(type)) @@ -547,17 +547,17 @@ std::vector DSRGraph::get_nodes_by_types(const std::vector DSRGraph::get_edge_(uint64_t 
from, uint64_t to, const std::string &key) { - //std::shared_lock lock(_mutex); - if (nodes.contains(from) && nodes.contains(to)) - { - auto n = get_(from); - if (n.has_value()) { - auto edge = n.value().fano().find({to, key}); - if (edge != n.value().fano().end()) { - return edge->second.read_reg(); - } - } + auto from_it = nodes.find(from); + if (from_it == nodes.end() || from_it->second.empty() || !nodes.contains(to)) { + return {}; + } + + auto& fano = from_it->second.read_reg().fano(); + auto edge = fano.find({to, key}); + if (edge != fano.end() && !edge->second.empty()) { + return edge->second.read_reg(); } + return {}; } @@ -587,18 +587,17 @@ std::optional DSRGraph::get_edge(const Node &n, const std::string &to, con std::optional id_to = get_id_from_name(to); if (id_to.has_value()) { - return (n.fano().contains({id_to.value(), key})) ? - std::make_optional(n.fano().find({id_to.value(), key})->second) : - std::nullopt; + auto it = n.fano().find({id_to.value(), key}); + if (it != n.fano().end()) return it->second; } return {}; } std::optional DSRGraph::get_edge(const Node &n, uint64_t to, const std::string &key) { - return (n.fano().contains({to, key})) ? - std::make_optional(n.fano().find({to, key})->second) : - std::nullopt; + auto it = n.fano().find({to, key}); + if (it != n.fano().end()) return it->second; + return {}; } @@ -613,43 +612,33 @@ DSRGraph::insert_or_assign_edge_(CRDTEdge &&attrs, uint64_t from, uint64_t to) { auto &node = nodes.at(from).read_reg(); //check if we are creating an edge or we are updating it. 
- //Update - if (node.fano().contains({to, attrs.type()})) + auto fano_it = node.fano().find({to, attrs.type()}); + if (fano_it != node.fano().end()) { - auto iter = nodes.at(from).read_reg().fano().find({attrs.to(), attrs.type()}); - auto end = nodes.at(from).read_reg().fano().end(); - if (iter != end) { - std::vector atts_deltas; - auto &iter_edge = iter->second.read_reg().attrs(); - for (auto &[k, att]: attrs.attrs()) { - //comparar igualdad o inexistencia - if (!iter_edge.contains(k)) { - iter_edge.emplace(k, mvreg()); - } - if (iter_edge.at(k).empty() or - att.read_reg() != - iter_edge.at(k).read_reg()) { - auto delta = iter_edge.at(k).write(std::move(att.read_reg())); - atts_deltas.emplace_back( - CRDTEdgeAttr_to_IDL(agent_id, from, from, to, attrs.type(), k, delta)); - - } + //Update + std::vector atts_deltas; + auto &iter_edge = fano_it->second.read_reg().attrs(); + for (auto &[k, att]: attrs.attrs()) { + auto &attr_reg = iter_edge.try_emplace(k, mvreg()).first->second; + if (attr_reg.empty() or att.read_reg() != attr_reg.read_reg()) { + auto delta = attr_reg.write(std::move(att.read_reg())); + atts_deltas.emplace_back( + CRDTEdgeAttr_to_IDL(agent_id, from, from, to, attrs.type(), k, delta)); } - auto it = iter_edge.begin(); - while (it != iter_edge.end()) { - if (!attrs.attrs().contains(it->first)) { - std::string att = it->first; - auto delta = iter_edge.at(it->first).reset(); - it = iter_edge.erase(it); - atts_deltas.emplace_back( - CRDTEdgeAttr_to_IDL(agent_id, from, from, to, attrs.type(), att, delta)); - - } else { - ++it; - } + } + auto it = iter_edge.begin(); + while (it != iter_edge.end()) { + if (!attrs.attrs().contains(it->first)) { + std::string att = it->first; + auto delta = it->second.reset(); + it = iter_edge.erase(it); + atts_deltas.emplace_back( + CRDTEdgeAttr_to_IDL(agent_id, from, from, to, attrs.type(), att, delta)); + } else { + ++it; } - return {true, {}, std::move(atts_deltas)}; } + return {true, {}, std::move(atts_deltas)}; } else { 
// Insert //node.fano().insert({{to, attrs.type()}, mvreg()}); @@ -799,10 +788,18 @@ std::vector DSRGraph::get_edges_by_type(const std::string &type) std::shared_lock lock_cache(_mutex_cache_maps); std::vector edges_; if (edgeType.contains(type)) { + edges_.reserve(edgeType.at(type).size()); for (auto &[from, to] : edgeType.at(type)) { - auto n = get_edge_(from, to, type); - if (n.has_value()) - edges_.emplace_back(std::move(n.value())); + auto node_it = nodes.find(from); + if (node_it == nodes.end() || node_it->second.empty()) { + continue; + } + + auto &fano = node_it->second.read_reg().fano(); + auto edge_it = fano.find({to, type}); + if (edge_it != fano.end()) { + edges_.emplace_back(edge_it->second.read_reg()); + } } } return edges_; @@ -814,6 +811,7 @@ std::vector DSRGraph::get_edges_to_id(uint64_t id) std::shared_lock lock_cache(_mutex_cache_maps); std::vector edges_; if (to_edges.contains(id)) { + edges_.reserve(to_edges.at(id).size()); for (const auto &[k, v] : to_edges.at(id)) { auto n = get_edge_(k, id, v); if (n.has_value()) @@ -826,12 +824,16 @@ std::vector DSRGraph::get_edges_to_id(uint64_t id) std::optional, DSR::Edge>> DSRGraph::get_edges(uint64_t id) { std::shared_lock lock(_mutex); - std::optional n = get_node(id); - if (n.has_value()) - { - return n->fano(); + auto node_it = nodes.find(id); + if (node_it == nodes.end() || node_it->second.empty()) { + return std::nullopt; } - return std::nullopt; + + std::map, DSR::Edge> edges_; + for (const auto &[key, edge_reg] : node_it->second.read_reg().fano()) { + edges_.emplace(key, DSR::Edge(edge_reg.read_reg())); + } + return edges_; } @@ -952,7 +954,7 @@ inline void DSRGraph::update_maps_edge_delete(uint64_t from, uint64_t to, const std::unique_lock lck(_mutex_cache_maps); if (const auto tuple = std::pair{from, to}; edges.contains(tuple)) { edges.at(tuple).erase(key); - edges.erase({from, to}); + if (edges.at(tuple).empty()) edges.erase(tuple); } if (to_edges.contains(to)) { @@ -1102,11 +1104,20 @@ void 
DSRGraph::join_delta_node(IDL::MvregNode &&mvreg) }; std::optional,hash_tuple>> cache_map_to_edges = {}; + // Snapshot the data needed for signal emission while the lock is held. + // nodes.at(id) must NOT be accessed after the lock is released: a concurrent + // insert_node_/update_node call on the same id runs nodes[id].write() which + // calls dk.rmv() (clears dk.ds) followed by dk.add(), leaving a window where + // read_reg()'s assert(dk.ds.size() >= 1) would fire. + std::string node_type_snapshot; + std::vector> from_edges_snapshot; { std::unique_lock lock(_mutex); if (!deleted.contains(id)) { joined = true; - maybe_deleted_node = (nodes[id].empty()) ? std::nullopt : std::make_optional(nodes.at(id).read_reg()); + if (auto it = nodes.find(id); it != nodes.end() && !it->second.empty()) { + maybe_deleted_node = it->second.read_reg(); + } nodes[id].join(std::move(crdt_delta)); if (nodes.at(id).empty() or d_empty) { nodes.erase(id); @@ -1115,8 +1126,14 @@ void DSRGraph::join_delta_node(IDL::MvregNode &&mvreg) delete_unprocessed_deltas(); } else { signal = true; - update_maps_node_insert(id, nodes.at(id).read_reg()); + const auto& reg = nodes.at(id).read_reg(); + update_maps_node_insert(id, reg); consume_unprocessed_deltas(); + // Snapshot type and outgoing edges before the lock is released. 
+ node_type_snapshot = reg.type(); + for (const auto &[k, v] : reg.fano()) { + from_edges_snapshot.emplace_back(k.first, k.second); + } } } else { delete_unprocessed_deltas(); @@ -1125,11 +1142,11 @@ void DSRGraph::join_delta_node(IDL::MvregNode &&mvreg) if (joined) { if (signal) { - DSR_LOG_DEBUG("[JOIN_NODE] node inserted/updated:", id, nodes.at(id).read_reg().type()); - emitter.update_node_signal(id, nodes.at(id).read_reg().type(), SignalInfo{ mvreg.agent_id() }); - for (const auto &[k, v] : nodes.at(id).read_reg().fano()) { - DSR_LOG_DEBUG("[JOIN_NODE] add edge FROM:", id, k.first, k.second); - emitter.update_edge_signal(id, k.first, k.second, SignalInfo{ mvreg.agent_id() }); + DSR_LOG_DEBUG("[JOIN_NODE] node inserted/updated:", id, node_type_snapshot); + emitter.update_node_signal(id, node_type_snapshot, SignalInfo{ mvreg.agent_id() }); + for (const auto &[to_id, edge_type] : from_edges_snapshot) { + DSR_LOG_DEBUG("[JOIN_NODE] add edge FROM:", id, to_id, edge_type); + emitter.update_edge_signal(id, to_id, edge_type, SignalInfo{ mvreg.agent_id() }); } for (const auto &[k, v]: map_new_to_edges) @@ -1452,7 +1469,10 @@ std::optional DSRGraph::join_delta_edge_attr(IDL::MvregEdgeAttr &&m void DSRGraph::join_full_graph(IDL::OrMap &&full_graph) { - std::vector>> updates; + // 5th element: post-join node snapshot captured inside the lock, used for + // signal emission after the lock is released to avoid racing with + // insert_node_/update_node (same pattern as join_delta_node). + std::vector, std::optional>> updates; uint64_t id{0}, timestamp{0}; uint32_t agent_id_ch{0}; @@ -1539,30 +1559,37 @@ void DSRGraph::join_full_graph(IDL::OrMap &&full_graph) auto mv = IDLNode_to_CRDT(std::move(val)); bool mv_empty = mv.empty(); agent_id_ch = val.agent_id(); - std::optional nd = (nodes[k].empty()) ? std::nullopt : std::make_optional(nodes[k].read_reg()); + auto it = nodes.find(k); + std::optional nd = + (it != nodes.end() and !it->second.empty()) ? 
std::make_optional(it->second.read_reg()) : std::nullopt; id = k; if (!deleted.contains(k)) { - nodes[k].join(std::move(mv)); - if (mv_empty or nodes.at(k).empty()) { + if (it == nodes.end()) { + it = nodes.emplace(k, mvreg{}).first; + } + it->second.join(std::move(mv)); + if (mv_empty or it->second.empty()) { update_maps_node_delete(k, nd); - updates.emplace_back(false, k, "", std::nullopt); + updates.emplace_back(false, k, "", std::nullopt, std::nullopt); delete_unprocessed_deltas(); } else { - update_maps_node_insert(k, nodes.at(k).read_reg()); - updates.emplace_back(true, k, nodes.at(k).read_reg().type(), nd); + const auto& reg = it->second.read_reg(); + update_maps_node_insert(k, reg); + updates.emplace_back(true, k, reg.type(), nd, reg); consume_unprocessed_deltas(); } } } } - for (auto &[signal, id, type, nd] : updates) + for (auto &[signal, id, type, nd, current_nd] : updates) if (signal) { - //check what change is joined - if (!nd.has_value() || nd->attrs() != nodes[id].read_reg().attrs()) { - emitter.update_node_signal(id, nodes[id].read_reg().type(), SignalInfo{ agent_id_ch }); - } else if (nd.value() != nodes[id].read_reg()) { - auto iter = nodes[id].read_reg().fano(); + //check what change is joined — use the snapshot captured inside the lock, + //not nodes[id], which races with concurrent insert_node_/update_node calls. 
+ if (!nd.has_value() || nd->attrs() != current_nd->attrs()) { + emitter.update_node_signal(id, type, SignalInfo{ agent_id_ch }); + } else if (nd.value() != *current_nd) { + const auto& iter = current_nd->fano(); for (const auto &[k, v] : nd->fano()) { if (!iter.contains(k)) { emitter.del_edge_signal(id, k.first, k.second, SignalInfo{ agent_id_ch }); @@ -1681,7 +1708,7 @@ void DSRGraph::node_subscription_thread() catch (const std::exception &ex) { std::cerr << ex.what() << std::endl; } }; dsrpub_call_node = NewMessageFunctor(this, lambda_general_topic); - auto [res, sub, reader] = dsrsub_node.init(dsrparticipant.getParticipant(), dsrparticipant.getNodeTopic(), dsrpub_call_node, mtx_entity_creation); + auto [res, sub, reader] = dsrsub_node.init(dsrparticipant.getParticipant(), dsrparticipant.getNodeTopic(), dsrparticipant.get_domain_id(), dsrpub_call_node, mtx_entity_creation); dsrparticipant.add_subscriber(dsrparticipant.getNodeTopic()->get_name(), {sub, reader}); } @@ -1715,7 +1742,7 @@ void DSRGraph::edge_subscription_thread() catch (const std::exception &ex) { std::cerr << ex.what() << std::endl; } }; dsrpub_call_edge = NewMessageFunctor(this, lambda_general_topic); - auto [res, sub, reader] = dsrsub_edge.init(dsrparticipant.getParticipant(), dsrparticipant.getEdgeTopic(), dsrpub_call_edge, mtx_entity_creation); + auto [res, sub, reader] = dsrsub_edge.init(dsrparticipant.getParticipant(), dsrparticipant.getEdgeTopic(), dsrparticipant.get_domain_id(), dsrpub_call_edge, mtx_entity_creation); dsrparticipant.add_subscriber(dsrparticipant.getEdgeTopic()->get_name(), {sub, reader}); } @@ -1781,7 +1808,7 @@ void DSRGraph::edge_attrs_subscription_thread() }; dsrpub_call_edge_attrs = NewMessageFunctor(this, lambda_general_topic); - auto [res, sub, reader] = dsrsub_edge_attrs.init(dsrparticipant.getParticipant(), dsrparticipant.getAttEdgeTopic(), + auto [res, sub, reader] = dsrsub_edge_attrs.init(dsrparticipant.getParticipant(), dsrparticipant.getAttEdgeTopic(), 
dsrparticipant.get_domain_id(), dsrpub_call_edge_attrs, mtx_entity_creation); dsrparticipant.add_subscriber(dsrparticipant.getAttEdgeTopic()->get_name(), {sub, reader}); //dsrsub_edge_attrs_stream.init(dsrparticipant.getParticipant(), "DSR_EDGE_ATTRS_STREAM", dsrparticipant.getEdgeAttrTopicName(), @@ -1850,7 +1877,7 @@ void DSRGraph::node_attrs_subscription_thread() }; dsrpub_call_node_attrs = NewMessageFunctor(this, lambda_general_topic); - auto [res, sub, reader] = dsrsub_node_attrs.init(dsrparticipant.getParticipant(), dsrparticipant.getAttNodeTopic(), + auto [res, sub, reader] = dsrsub_node_attrs.init(dsrparticipant.getParticipant(), dsrparticipant.getAttNodeTopic(), dsrparticipant.get_domain_id(), dsrpub_call_node_attrs, mtx_entity_creation); dsrparticipant.add_subscriber(dsrparticipant.getAttNodeTopic()->get_name(), {sub, reader}); @@ -1904,7 +1931,7 @@ void DSRGraph::fullgraph_server_thread() } }; dsrpub_graph_request_call = NewMessageFunctor(this, lambda_graph_request); - auto [res, sub, reader] = dsrsub_graph_request.init(dsrparticipant.getParticipant(), dsrparticipant.getGraphRequestTopic(), + auto [res, sub, reader] = dsrsub_graph_request.init(dsrparticipant.getParticipant(), dsrparticipant.getGraphRequestTopic(), dsrparticipant.get_domain_id(), dsrpub_graph_request_call, mtx_entity_creation); dsrparticipant.add_subscriber(dsrparticipant.getGraphRequestTopic()->get_name(), {sub, reader}); @@ -1912,8 +1939,8 @@ void DSRGraph::fullgraph_server_thread() std::pair DSRGraph::fullgraph_request_thread() { - bool sync = false; - bool repeated = false; + std::atomic sync{false}; + std::atomic repeated{false}; auto lambda_request_answer = [&](eprosima::fastdds::dds::DataReader *reader, DSR::DSRGraph *graph) { while (true) @@ -1946,7 +1973,7 @@ std::pair DSRGraph::fullgraph_request_thread() }; dsrpub_request_answer_call = NewMessageFunctor(this, lambda_request_answer); - auto [res, sub, reader] = dsrsub_request_answer.init(dsrparticipant.getParticipant(), 
dsrparticipant.getGraphTopic(), + auto [res, sub, reader] = dsrsub_request_answer.init(dsrparticipant.getParticipant(), dsrparticipant.getGraphTopic(), dsrparticipant.get_domain_id(), dsrpub_request_answer_call, mtx_entity_creation); dsrparticipant.add_subscriber(dsrparticipant.getGraphTopic()->get_name(), {sub, reader}); diff --git a/api/dsr_inner_eigen_api.cpp b/api/dsr_inner_eigen_api.cpp index 97e8c0c..d75052e 100644 --- a/api/dsr_inner_eigen_api.cpp +++ b/api/dsr_inner_eigen_api.cpp @@ -123,10 +123,8 @@ std::optional InnerEigenAPI::get_transformation_matrix(const std::st } } // update node cache reference - uint64_t dst_id = G->get_node(dest).value().id(); - node_map[dst_id].push_back(key); - uint64_t orig_id = G->get_node(orig).value().id(); - node_map[orig_id].push_back(key); + node_map[bn.value().id()].push_back(key); + node_map[an.value().id()].push_back(key); // update cache auto ret = btotal.inverse() * atotal; @@ -212,7 +210,6 @@ std::optional InnerEigenAPI::transform_axis(const std::string &de std::optional InnerEigenAPI::transform_axis( const std::string &dest, const std::string & orig, std::uint64_t timestamp) { - Mat::Vector6d v; return transform_axis(dest, Mat::Vector6d::Zero(), orig, timestamp); } diff --git a/api/include/dsr/api/dsr_api.h b/api/include/dsr/api/dsr_api.h index d2c2ea6..6fbc18e 100644 --- a/api/include/dsr/api/dsr_api.h +++ b/api/include/dsr/api/dsr_api.h @@ -57,6 +57,7 @@ namespace DSR class DSRGraph : public QObject { friend RT_API; + friend class DSRGraphTestAccess; public: size_t size() const; @@ -584,7 +585,6 @@ namespace DSR const bool copy; std::unique_ptr utils; std::unordered_set ignored_attributes; - ThreadPool tp, tp_delta_attr; bool same_host; id_generator generator; GraphSettings::LOGLEVEL log_level; @@ -677,6 +677,11 @@ namespace DSR std::unordered_multimap, uint64_t>> unprocessed_delta_edge_to; std::unordered_multimap, std::tuple, uint64_t>, hash_tuple> unprocessed_delta_edge_att; + // ThreadPools are declared after 
all data they access so that their + // destructors (which join worker threads) run before the data members + // are destroyed, preventing use-after-free data races on shutdown. + ThreadPool tp, tp_delta_attr; + //Custom function for each rtps topic class NewMessageFunctor { public: diff --git a/benchmarks/CMakeLists.txt b/benchmarks/CMakeLists.txt new file mode 100644 index 0000000..a72bfd7 --- /dev/null +++ b/benchmarks/CMakeLists.txt @@ -0,0 +1,160 @@ +cmake_minimum_required(VERSION 3.10) +project(dsr_benchmarks + VERSION 2024.12.01 + DESCRIPTION "DSR Benchmarking Suite" + LANGUAGES CXX) + +# Fetch Catch2 if not already available +Include(FetchContent) + +FetchContent_Declare( + Catch2 + GIT_REPOSITORY https://github.com/catchorg/Catch2.git + GIT_TAG v3.8.0 +) + +FetchContent_Declare( + nanobench + GIT_REPOSITORY https://github.com/martinus/nanobench.git + GIT_TAG v4.3.11 +) + +FetchContent_MakeAvailable(Catch2 nanobench) + +# Find required packages +find_package(Boost REQUIRED) +find_package(Qt6 COMPONENTS Core REQUIRED) +find_package(Eigen3 3.3 REQUIRED NO_MODULE) + +# Collect source files +set(BENCHMARK_SOURCES + benchmark_main.cpp + + # Latency benchmarks + latency/delta_propagation_bench.cpp + latency/signal_latency_bench.cpp + latency/crdt_join_bench.cpp + + # Throughput benchmarks + throughput/single_agent_ops_bench.cpp + throughput/concurrent_writers_bench.cpp + throughput/single_agent_ops_with_latency_bench.cpp + throughput/query_ops_bench.cpp + + # Scalability benchmarks + scalability/multi_agent_sync_bench.cpp + scalability/graph_size_impact_bench.cpp + scalability/thread_scaling_bench.cpp + scalability/graph_size_scaling_bench.cpp + scalability/agent_scaling_bench.cpp + + # Consistency benchmarks + consistency/convergence_time_bench.cpp + consistency/conflict_rate_bench.cpp +) + +# Header files for IDE integration +set(BENCHMARK_HEADERS + core/benchmark_config.h + core/timing_utils.h + core/metrics_collector.h + core/nanobench_adapter.h + 
core/report_generator.h + fixtures/multi_agent_fixture.h + fixtures/graph_generator.h +) + +# Create benchmark executable +add_executable(dsr_benchmarks + ${BENCHMARK_SOURCES} + ${BENCHMARK_HEADERS} +) + +# Set C++ standard +set_target_properties(dsr_benchmarks PROPERTIES + CXX_STANDARD 23 + CXX_STANDARD_REQUIRED ON + CXX_EXTENSIONS ON +) + +target_compile_options(dsr_benchmarks PUBLIC -g -std=c++23) + +# -DTSAN=ON enables ThreadSanitizer. Requires the dsr_api/dsr_core libraries +# to also be built with TSAN=ON (via the root CMakeLists.txt), otherwise TSan +# will report false positives from uninstrumented library code. +option(TSAN "Enable ThreadSanitizer" OFF) +if (TSAN) + message(STATUS "ThreadSanitizer enabled for benchmarks") + target_compile_options(dsr_benchmarks PRIVATE -fsanitize=thread -fno-omit-frame-pointer) + target_link_options(dsr_benchmarks PRIVATE -fsanitize=thread) +endif() + +# Include directories +target_include_directories(dsr_benchmarks PRIVATE + ${CMAKE_CURRENT_SOURCE_DIR} + ${CMAKE_CURRENT_SOURCE_DIR}/core + ${CMAKE_CURRENT_SOURCE_DIR}/fixtures +) + +# Link libraries +target_link_libraries(dsr_benchmarks PRIVATE + Catch2::Catch2 + nanobench + dsr_api + dsr_core + Qt6::Core + Eigen3::Eigen + fastdds + fastcdr +) + +# Create results directory +file(MAKE_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/results) + +# Copy results directory structure +add_custom_command(TARGET dsr_benchmarks POST_BUILD + COMMAND ${CMAKE_COMMAND} -E make_directory $<TARGET_FILE_DIR:dsr_benchmarks>/results + COMMENT "Creating results directory" +) + +# Flamegraph target — generates one SVG per benchmark test case via perf. +# Requires: perf, and FlameGraph scripts (flamegraph.pl + stackcollapse-perf.pl). +# Set FG_DIR to the FlameGraph checkout if the scripts aren't on PATH, e.g.: +# cmake --build . --target flamegraph -j1 -- FG_DIR=/opt/FlameGraph +# Or pass a Catch2 filter to profile a subset: +# cmake --build . 
--target flamegraph -j1 -- BENCH_FILTER=[LATENCY] +set(FLAMEGRAPH_SCRIPT ${CMAKE_CURRENT_SOURCE_DIR}/flamegraph.sh) +set(FLAMEGRAPH_OUTDIR ${CMAKE_CURRENT_BINARY_DIR}/results/flamegraphs) +add_custom_target(flamegraph + COMMAND ${CMAKE_COMMAND} -E make_directory ${FLAMEGRAPH_OUTDIR} + COMMAND env + FG_DIR=$ENV{FG_DIR} + ${FLAMEGRAPH_SCRIPT} + -b $ + -o ${FLAMEGRAPH_OUTDIR} + $ENV{BENCH_FILTER} + DEPENDS dsr_benchmarks + WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR} + COMMENT "Generating per-benchmark flamegraphs in ${FLAMEGRAPH_OUTDIR}" + USES_TERMINAL +) + +# Register tests with CTest (optional) +# Disabled auto-discovery as it requires running the binary at build time +# which may fail if libraries are not in LD_LIBRARY_PATH +# include(Catch) +# catch_discover_tests(dsr_benchmarks) + +# Installation (optional) +install(TARGETS dsr_benchmarks + RUNTIME DESTINATION bin +) + +# Print configuration summary +message(STATUS "") +message(STATUS "DSR Benchmarks Configuration:") +message(STATUS " Build type: ${CMAKE_BUILD_TYPE}") +message(STATUS " C++ Standard: C++23") +message(STATUS " Catch2 version: 3.8.0") +message(STATUS " nanobench version: 4.3.11") +message(STATUS "") diff --git a/benchmarks/benchmark_main.cpp b/benchmarks/benchmark_main.cpp new file mode 100644 index 0000000..7321fcc --- /dev/null +++ b/benchmarks/benchmark_main.cpp @@ -0,0 +1,131 @@ +// DSR Benchmarking Suite +// Main entry point using Catch2 + +#define CATCH_CONFIG_RUNNER +#include +#include +#include +#include +#include + +// Custom Qt message handler to filter debug output during benchmarks +static bool g_verbose = false; + +namespace { + +bool hasCliFlag(int argc, char* argv[], const char* flag) { + for (int i = 1; i < argc; ++i) { + if (std::string(argv[i]) == flag) { + return true; + } + } + return false; +} + +bool shouldPrintBenchmarkPreamble(int argc, char* argv[]) { + return !hasCliFlag(argc, argv, "--help") + && !hasCliFlag(argc, argv, "-?") + && !hasCliFlag(argc, argv, "--list-tests") 
+ && !hasCliFlag(argc, argv, "--list-tags") + && !hasCliFlag(argc, argv, "--list-reporters") + && !hasCliFlag(argc, argv, "--list-listeners"); +} + +} // namespace + +void benchmarkMessageHandler(QtMsgType type, const QMessageLogContext& context, const QString& msg) { + // In non-verbose mode, only show warnings and above + if (!g_verbose) { + switch (type) { + case QtDebugMsg: + case QtInfoMsg: + return; // Suppress debug and info messages + default: + break; + } + } + + // Format and output remaining messages + QByteArray localMsg = msg.toLocal8Bit(); + switch (type) { + case QtDebugMsg: + std::cout << "[DEBUG] " << localMsg.constData() << std::endl; + break; + case QtInfoMsg: + std::cout << "[INFO] " << localMsg.constData() << std::endl; + break; + case QtWarningMsg: + std::cout << "[WARNING] " << localMsg.constData() << std::endl; + break; + case QtCriticalMsg: + std::cout << "[CRITICAL] " << localMsg.constData() << std::endl; + break; + case QtFatalMsg: + std::cout << "[FATAL] " << localMsg.constData() << std::endl; + // Throw instead of abort() so the fixture's try/catch can catch it, + // mark the test as failed, and let Catch2 continue to the next test. 
+ throw std::runtime_error(localMsg.constData()); + } +} + +int main(int argc, char* argv[]) { + // Install custom message handler before QCoreApplication + qInstallMessageHandler(benchmarkMessageHandler); + + // Check for verbose flag + for (int i = 1; i < argc; ++i) { + if (std::string(argv[i]) == "--verbose" || std::string(argv[i]) == "-v") { + g_verbose = true; + break; + } + } + + // Initialize Qt (required for signals/slots) + QCoreApplication app(argc, argv); + // Initialize Catch2 + Catch::Session session; + + // Set default reporter to console with colors + session.configData().showDurations = Catch::ShowDurations::Always; + + // Apply command line arguments + int returnCode = session.applyCommandLine(argc, argv); + if (returnCode != 0) { + return returnCode; + } + + if (shouldPrintBenchmarkPreamble(argc, argv)) { + std::cout << "=================================\n"; + std::cout << " DSR Benchmarking Suite\n"; + std::cout << "=================================\n\n"; + std::cout << "Available benchmark categories:\n"; + std::cout << " [BASELINE] - Curated low-noise regression baseline\n"; + std::cout << " [EXTENDED] - Slower supplementary baseline coverage\n"; + std::cout << " [LATENCY] - Signal emission, CRDT operations\n"; + std::cout << " [THROUGHPUT] - Single agent insert/read/update/delete, concurrent writers\n"; + std::cout << " [CRDT] - mvreg and dot_context micro-benchmarks\n"; + std::cout << " [SCALABILITY] - Thread scaling, graph size impact\n"; + std::cout << " [CONSISTENCY] - Convergence time, conflict rates\n"; + std::cout << " [PROFILE] - Expensive profiling-focused cases\n"; + std::cout << " [LOAD] - Work-under-load and concurrency-heavy cases\n"; + std::cout << " [MULTIAGENT] - Multi-agent synchronization/consistency cases\n"; + std::cout << "\n"; + std::cout << "Usage examples:\n"; + std::cout << " ./dsr_benchmarks # Run all non-hidden benchmarks\n"; + std::cout << " ./dsr_benchmarks \"[BASELINE]\" # Run curated baseline benchmarks\n"; + 
std::cout << " ./dsr_benchmarks \"[EXTENDED]\" # Run slower supplementary coverage\n"; + std::cout << " ./dsr_benchmarks \"[LATENCY]\" # Run latency benchmarks\n"; + std::cout << " ./dsr_benchmarks \"[THROUGHPUT]\" # Run throughput benchmarks\n"; + std::cout << " ./dsr_benchmarks \"[CRDT]\" # Run CRDT micro-benchmarks\n"; + std::cout << " ./dsr_benchmarks \"[PROFILE][LOAD]\" # Run long load-heavy cases\n"; + std::cout << " ./dsr_benchmarks \"[PROFILE][MULTIAGENT]\" # Run multi-agent profiling cases\n"; + std::cout << " ./dsr_benchmarks \"[.multi]\" # Run multi-agent tests (may timeout)\n"; + std::cout << " ./dsr_benchmarks -r json::out=x.json # Export to JSON\n"; + std::cout << " ./dsr_benchmarks --verbose # Show Qt debug messages\n"; + std::cout << "\n"; + std::cout << "Note: [.multi] and [.extended] tests are hidden by default.\n"; + std::cout << "\n"; + } + + return session.run(); +} diff --git a/benchmarks/consistency/conflict_rate_bench.cpp b/benchmarks/consistency/conflict_rate_bench.cpp new file mode 100644 index 0000000..fd76cb7 --- /dev/null +++ b/benchmarks/consistency/conflict_rate_bench.cpp @@ -0,0 +1,354 @@ +#include +#include +#include +#include +#include + +#include "../core/timing_utils.h" +#include "../core/metrics_collector.h" +#include "../core/report_generator.h" +#include "../fixtures/multi_agent_fixture.h" +#include "../fixtures/graph_generator.h" + +using namespace DSR; +using namespace DSR::Benchmark; + +TEST_CASE("Conflict rate benchmarks", "[CONSISTENCY][conflict][.multi][PROFILE][MULTIAGENT]") { + GraphGenerator generator; + MetricsCollector collector("conflict_rate"); + + SECTION("Concurrent attribute updates - same node") { + MultiAgentFixture fixture; + auto config_file = generator.generate_empty_graph(); + REQUIRE(fixture.create_agents(4, config_file)); + fixture.wait_for_sync(); + + // Create shared node and capture actual ID + auto* agent_0 = fixture.get_agent(0); + auto shared_node = GraphGenerator::create_test_node( + 0, 
agent_0->get_agent_id(), "conflict_test"); + auto insert_result = agent_0->insert_node(shared_node); + REQUIRE(insert_result.has_value()); + uint64_t shared_node_id = insert_result.value(); + + fixture.wait_for_sync(); + REQUIRE(fixture.verify_convergence()); + + constexpr int NUM_ROUNDS = 50; + constexpr int UPDATES_PER_AGENT = 10; + constexpr size_t NUM_AGENTS = 4; + + std::atomic total_updates{0}; + uint64_t conflicts_detected = 0; + + std::barrier sync_point(NUM_AGENTS); + + for (int round = 0; round < NUM_ROUNDS; ++round) { + std::vector threads; + threads.reserve(NUM_AGENTS); + + // Record initial values before concurrent updates + std::vector expected_values(NUM_AGENTS); + for (size_t i = 0; i < NUM_AGENTS; ++i) { + expected_values[i] = static_cast(round * 1000 + i * 100); + } + + for (size_t agent_idx = 0; agent_idx < NUM_AGENTS; ++agent_idx) { + threads.emplace_back([&, agent_idx, node_id = shared_node_id]() { + auto* agent = fixture.get_agent(agent_idx); + sync_point.arrive_and_wait(); + + for (int i = 0; i < UPDATES_PER_AGENT; ++i) { + auto node = agent->get_node(node_id); + if (node) { + int32_t value = static_cast( + round * 1000 + agent_idx * 100 + i); + agent->add_or_modify_attrib_local(*node, value); + agent->update_node(*node); + total_updates.fetch_add(1, std::memory_order_relaxed); + } + } + }); + } + + for (auto& t : threads) { + t.join(); + } + + // Wait for convergence + fixture.wait_for_sync(std::chrono::milliseconds(500)); + + // Check if all agents converged to the same value + std::set final_values; + for (size_t i = 0; i < NUM_AGENTS; ++i) { + auto* agent = fixture.get_agent(i); + auto node = agent->get_node(shared_node_id); + if (node) { + auto attr = agent->get_attrib_by_name(*node); + if (attr.has_value()) { + final_values.insert(attr.value()); + } + } + } + + // If agents have different values, conflict resolution may still be in progress + // or there was a conflict that resolved differently + if (final_values.size() > 1) { + 
conflicts_detected++; + } + } + + double conflict_rate = static_cast(conflicts_detected) / + static_cast(NUM_ROUNDS) * 100.0; + + collector.record_consistency("concurrent_update_conflict_rate", + conflict_rate, "%", + {{"num_agents", std::to_string(NUM_AGENTS)}, + {"updates_per_round", std::to_string(UPDATES_PER_AGENT * NUM_AGENTS)}}); + + INFO("Conflict rate: " << conflict_rate << "% (" << conflicts_detected + << "/" << NUM_ROUNDS << " rounds)"); + INFO("Total updates: " << total_updates.load()); + + // Verify final convergence + fixture.wait_for_sync(std::chrono::milliseconds(1000)); + CHECK(fixture.verify_convergence()); + } + + SECTION("Concurrent node creations - potential ID conflicts") { + // This tests CRDT behavior when multiple agents create nodes + MultiAgentFixture fixture; + auto config_file = generator.generate_empty_graph(); + REQUIRE(fixture.create_agents(4, config_file)); + fixture.wait_for_sync(); + + constexpr int NODES_PER_AGENT = 100; + constexpr size_t NUM_AGENTS = 4; + + std::atomic total_created{0}; + std::atomic creation_failures{0}; + + std::barrier sync_point(NUM_AGENTS); + std::vector threads; + threads.reserve(NUM_AGENTS); + + for (size_t agent_idx = 0; agent_idx < NUM_AGENTS; ++agent_idx) { + threads.emplace_back([&, agent_idx]() { + auto* agent = fixture.get_agent(agent_idx); + sync_point.arrive_and_wait(); + + for (int i = 0; i < NODES_PER_AGENT; ++i) { + // Each agent uses unique IDs in its range + uint64_t node_id = 8500000 + agent_idx * 10000 + i; + auto node = GraphGenerator::create_test_node( + node_id, agent->get_agent_id(), + "agent" + std::to_string(agent_idx) + "_node" + std::to_string(i)); + + auto result = agent->insert_node(node); + if (result.has_value()) { + total_created.fetch_add(1, std::memory_order_relaxed); + } else { + creation_failures.fetch_add(1, std::memory_order_relaxed); + } + } + }); + } + + for (auto& t : threads) { + t.join(); + } + + // Wait for convergence + 
fixture.wait_for_sync(std::chrono::milliseconds(2000)); + + // Verify all agents have the same nodes + auto* agent_0 = fixture.get_agent(0); + size_t expected_node_count = agent_0->get_nodes().size(); + + bool all_match = true; + for (size_t i = 1; i < NUM_AGENTS; ++i) { + auto* agent = fixture.get_agent(i); + if (agent->get_nodes().size() != expected_node_count) { + all_match = false; + } + } + + collector.record_consistency("node_creation_success_rate", + static_cast(total_created.load()) / + static_cast(NODES_PER_AGENT * NUM_AGENTS) * 100.0, "%"); + + collector.record_consistency("final_convergence", + all_match ? 100.0 : 0.0, "%"); + + INFO("Created: " << total_created.load() << "/" << NODES_PER_AGENT * NUM_AGENTS); + INFO("Failures: " << creation_failures.load()); + INFO("All agents converged: " << (all_match ? "yes" : "no")); + + CHECK(fixture.verify_convergence()); + } + + SECTION("Edge conflict resolution") { + MultiAgentFixture fixture; + auto config_file = generator.generate_empty_graph(); + REQUIRE(fixture.create_agents(2, config_file)); + fixture.wait_for_sync(); + + auto* agent_a = fixture.get_agent(0); + auto* agent_b = fixture.get_agent(1); + + // Create shared nodes and capture actual IDs + auto node1 = GraphGenerator::create_test_node(0, agent_a->get_agent_id(), "edge_node_1"); + auto node2 = GraphGenerator::create_test_node(0, agent_a->get_agent_id(), "edge_node_2"); + auto result1 = agent_a->insert_node(node1); + auto result2 = agent_a->insert_node(node2); + REQUIRE(result1.has_value()); + REQUIRE(result2.has_value()); + uint64_t node1_id = result1.value(); + uint64_t node2_id = result2.value(); + + fixture.wait_for_sync(); + REQUIRE(fixture.verify_convergence()); + + uint64_t conflicts = 0; + constexpr int NUM_ROUNDS = 50; + + for (int round = 0; round < NUM_ROUNDS; ++round) { + // Both agents try to create the same edge simultaneously + auto edge_a = GraphGenerator::create_test_edge( + node1_id, node2_id, agent_a->get_agent_id(), "test_edge"); 
+ auto edge_b = GraphGenerator::create_test_edge( + node1_id, node2_id, agent_b->get_agent_id(), "test_edge"); + + std::thread ta([&]() { agent_a->insert_or_assign_edge(edge_a); }); + std::thread tb([&]() { agent_b->insert_or_assign_edge(edge_b); }); + + ta.join(); + tb.join(); + + fixture.wait_for_sync(std::chrono::milliseconds(200)); + + // Check both agents see the edge + auto edge_on_a = agent_a->get_edge(node1_id, node2_id, "test_edge"); + auto edge_on_b = agent_b->get_edge(node1_id, node2_id, "test_edge"); + + if (!edge_on_a.has_value() || !edge_on_b.has_value()) { + conflicts++; + } + + // Delete edge for next round + agent_a->delete_edge(node1_id, node2_id, "test_edge"); + fixture.wait_for_sync(std::chrono::milliseconds(100)); + } + + double conflict_rate = static_cast(conflicts) / + static_cast(NUM_ROUNDS) * 100.0; + + collector.record_consistency("edge_conflict_rate", + conflict_rate, "%"); + + INFO("Edge conflict rate: " << conflict_rate << "%"); + } + + auto result = collector.finalize(); + ReportGenerator reporter("results"); + reporter.export_all(result, "conflict_rate"); +} + +TEST_CASE("CRDT eventual consistency verification", "[CONSISTENCY][eventual][.multi][PROFILE][MULTIAGENT]") { + GraphGenerator generator; + MetricsCollector collector("eventual_consistency"); + + MultiAgentFixture fixture; + auto config_file = generator.generate_empty_graph(); + REQUIRE(fixture.create_agents(4, config_file)); + fixture.wait_for_sync(); + + SECTION("All agents eventually converge after chaos") { + constexpr size_t NUM_AGENTS = 4; + constexpr int OPS_PER_AGENT = 50; + + std::barrier sync_point(NUM_AGENTS); + std::atomic stop_flag{false}; + + // Each agent performs random operations + std::vector threads; + for (size_t agent_idx = 0; agent_idx < NUM_AGENTS; ++agent_idx) { + threads.emplace_back([&, agent_idx]() { + auto* agent = fixture.get_agent(agent_idx); + uint64_t base_id = 8700000 + agent_idx * 10000; + + sync_point.arrive_and_wait(); + + for (int i = 0; i < 
OPS_PER_AGENT && !stop_flag.load(); ++i) { + int op = i % 3; + + if (op == 0) { + // Insert node + auto node = GraphGenerator::create_test_node( + base_id + i, agent->get_agent_id()); + agent->insert_node(node); + } else if (op == 1) { + // Update existing node + auto node = agent->get_node(base_id + (i % (std::max(1, i / 2)))); + if (node) { + agent->add_or_modify_attrib_local( + *node, static_cast(i)); + agent->update_node(*node); + } + } else { + // Insert edge + auto root = agent->get_node_root(); + if (root) { + auto existing = agent->get_node(base_id + (i % (std::max(1, i / 2)))); + if (existing) { + auto edge = GraphGenerator::create_test_edge( + root->id(), existing->id(), agent->get_agent_id()); + agent->insert_or_assign_edge(edge); + } + } + } + + // Small delay between operations + std::this_thread::sleep_for(std::chrono::milliseconds(1)); + } + }); + } + + for (auto& t : threads) { + t.join(); + } + + // Wait for eventual consistency + INFO("Waiting for eventual consistency..."); + + auto start = std::chrono::steady_clock::now(); + bool converged = fixture.verify_convergence(std::chrono::seconds(30)); + auto duration = std::chrono::duration_cast( + std::chrono::steady_clock::now() - start); + + collector.record_consistency("eventual_consistency_achieved", + converged ? 100.0 : 0.0, "%"); + collector.record_consistency("convergence_duration_after_chaos", + static_cast(duration.count()), "ms"); + + INFO("Convergence " << (converged ? 
"achieved" : "FAILED") + << " in " << duration.count() << " ms"); + + CHECK(converged); + + if (converged) { + // Verify all agents have same node count + auto* agent_0 = fixture.get_agent(0); + size_t node_count = agent_0->get_nodes().size(); + + for (size_t i = 1; i < NUM_AGENTS; ++i) { + auto* agent = fixture.get_agent(i); + CHECK(agent->get_nodes().size() == node_count); + } + } + } + + auto result = collector.finalize(); + ReportGenerator reporter("results"); + reporter.export_all(result, "eventual_consistency"); +} diff --git a/benchmarks/consistency/convergence_time_bench.cpp b/benchmarks/consistency/convergence_time_bench.cpp new file mode 100644 index 0000000..fa77784 --- /dev/null +++ b/benchmarks/consistency/convergence_time_bench.cpp @@ -0,0 +1,253 @@ +#include +#include +#include +#include +#include + +#include "../core/timing_utils.h" +#include "../core/metrics_collector.h" +#include "../core/report_generator.h" +#include "../fixtures/multi_agent_fixture.h" +#include "../fixtures/graph_generator.h" + +using namespace DSR; +using namespace DSR::Benchmark; + +TEST_CASE("Convergence time benchmarks", "[CONSISTENCY][convergence][.multi][PROFILE][MULTIAGENT]") { + GraphGenerator generator; + MetricsCollector collector("convergence_time"); + + SECTION("Single update convergence") { + MultiAgentFixture fixture; + auto config_file = generator.generate_empty_graph(); + REQUIRE(fixture.create_agents(2, config_file)); + fixture.wait_for_sync(); + + auto* agent_a = fixture.get_agent(0); + auto* agent_b = fixture.get_agent(1); + REQUIRE(agent_a != nullptr); + REQUIRE(agent_b != nullptr); + + LatencyTracker tracker(100); + + for (int i = 0; i < 100; ++i) { + auto node = GraphGenerator::create_test_node( + 0, agent_a->get_agent_id(), + "conv_node_" + std::to_string(i)); + + uint64_t start = get_unix_timestamp(); + auto result = agent_a->insert_node(node); + if (!result.has_value()) continue; + uint64_t node_id = result.value(); + + // Poll until agent B sees the 
node + auto poll_start = std::chrono::steady_clock::now(); + while (std::chrono::steady_clock::now() - poll_start < std::chrono::seconds(5)) { + fixture.process_events(1); + auto b_node = agent_b->get_node(node_id); + if (b_node.has_value()) { + uint64_t conv_time = get_unix_timestamp() - start; + tracker.record(conv_time); + break; + } + } + } + + auto stats = tracker.stats(); + collector.record_latency_stats("single_node_convergence", stats); + collector.record_consistency("convergence_success_rate", + (static_cast(tracker.count()) / 100.0) * 100, "%"); + + INFO("Single node convergence - Mean: " << stats.mean_us() << " us, " + << "P99: " << stats.p99_us() << " us"); + INFO("Success rate: " << tracker.count() << "/100"); + } + + SECTION("Batch convergence time") { + MultiAgentFixture fixture; + auto config_file = generator.generate_empty_graph(); + REQUIRE(fixture.create_agents(2, config_file)); + fixture.wait_for_sync(); + + auto* agent_a = fixture.get_agent(0); + auto* agent_b = fixture.get_agent(1); + + LatencyTracker tracker(20); + + for (int batch = 0; batch < 20; ++batch) { + // Insert batch of 10 nodes and capture actual IDs + std::vector node_ids; + node_ids.reserve(10); + + uint64_t start = get_unix_timestamp(); + + for (int i = 0; i < 10; ++i) { + auto node = GraphGenerator::create_test_node( + 0, agent_a->get_agent_id()); + auto result = agent_a->insert_node(node); + if (result.has_value()) { + node_ids.push_back(result.value()); + } + } + + // Wait for all nodes to converge + auto poll_start = std::chrono::steady_clock::now(); + while (std::chrono::steady_clock::now() - poll_start < std::chrono::seconds(10)) { + fixture.process_events(1); + + bool all_converged = true; + for (auto id : node_ids) { + if (!agent_b->get_node(id).has_value()) { + all_converged = false; + break; + } + } + + if (all_converged) { + uint64_t conv_time = get_unix_timestamp() - start; + tracker.record(conv_time); + break; + } + } + } + + auto stats = tracker.stats(); + 
collector.record_latency_stats("batch_convergence_10_nodes", stats); + + INFO("Batch convergence (10 nodes) - Mean: " << stats.mean_ms() << " ms"); + } + + SECTION("Convergence under concurrent updates") { + MultiAgentFixture fixture; + auto config_file = generator.generate_empty_graph(); + REQUIRE(fixture.create_agents(4, config_file)); + fixture.wait_for_sync(); + + LatencyTracker tracker(50); + + // Each agent creates nodes concurrently + for (int round = 0; round < 50; ++round) { + std::vector all_node_ids; + std::mutex ids_mutex; + + uint64_t start = get_unix_timestamp(); + + // Each agent creates 5 nodes in parallel + std::vector threads; + for (size_t agent_idx = 0; agent_idx < 4; ++agent_idx) { + threads.emplace_back([&, agent_idx]() { + auto* agent = fixture.get_agent(agent_idx); + for (int i = 0; i < 5; ++i) { + auto node = GraphGenerator::create_test_node( + 0, agent->get_agent_id()); + auto result = agent->insert_node(node); + if (result.has_value()) { + std::lock_guard lock(ids_mutex); + all_node_ids.push_back(result.value()); + } + } + }); + } + for (auto& t : threads) t.join(); + + // Wait for all agents to see all nodes + auto poll_start = std::chrono::steady_clock::now(); + while (std::chrono::steady_clock::now() - poll_start < std::chrono::seconds(15)) { + fixture.process_events(5); + + bool all_converged = true; + for (size_t agent_idx = 0; agent_idx < 4 && all_converged; ++agent_idx) { + auto* agent = fixture.get_agent(agent_idx); + for (auto id : all_node_ids) { + if (!agent->get_node(id).has_value()) { + all_converged = false; + break; + } + } + } + + if (all_converged) { + uint64_t conv_time = get_unix_timestamp() - start; + tracker.record(conv_time); + break; + } + } + } + + auto stats = tracker.stats(); + collector.record_latency_stats("concurrent_convergence_4_agents", stats); + + INFO("Concurrent convergence (4 agents) - Mean: " << stats.mean_ms() << " ms, " + << "P99: " << stats.p99_ms() << " ms"); + + // Check against timeout + 
CHECK(stats.p99_ms() < 1000); // Should converge within 1 second p99 + } + + auto result = collector.finalize(); + ReportGenerator reporter("results"); + reporter.export_all(result, "convergence_time"); +} + +TEST_CASE("Attribute convergence", "[CONSISTENCY][convergence][attributes][.multi][PROFILE][MULTIAGENT]") { + GraphGenerator generator; + MetricsCollector collector("attribute_convergence"); + + MultiAgentFixture fixture; + auto config_file = generator.generate_empty_graph(); + REQUIRE(fixture.create_agents(2, config_file)); + fixture.wait_for_sync(); + + auto* agent_a = fixture.get_agent(0); + auto* agent_b = fixture.get_agent(1); + + // Create shared test node and capture actual ID + auto test_node = GraphGenerator::create_test_node( + 0, agent_a->get_agent_id(), "attr_conv_test"); + auto insert_result = agent_a->insert_node(test_node); + REQUIRE(insert_result.has_value()); + uint64_t shared_node_id = insert_result.value(); + + fixture.wait_for_sync(); + REQUIRE(fixture.verify_convergence()); + + SECTION("Attribute update convergence") { + LatencyTracker tracker(100); + + for (int i = 0; i < 100; ++i) { + auto node = agent_a->get_node(shared_node_id); + REQUIRE(node.has_value()); + + int32_t new_value = 1000 + i; + agent_a->add_or_modify_attrib_local(*node, new_value); + + uint64_t start = get_unix_timestamp(); + agent_a->update_node(*node); + + // Wait for attribute to converge + auto poll_start = std::chrono::steady_clock::now(); + while (std::chrono::steady_clock::now() - poll_start < std::chrono::seconds(5)) { + fixture.process_events(1); + + auto b_node = agent_b->get_node(shared_node_id); + if (b_node.has_value()) { + auto attr = agent_b->get_attrib_by_name(*b_node); + if (attr.has_value() && attr.value() == new_value) { + uint64_t conv_time = get_unix_timestamp() - start; + tracker.record(conv_time); + break; + } + } + } + } + + auto stats = tracker.stats(); + collector.record_latency_stats("attribute_update_convergence", stats); + + INFO("Attribute 
convergence - Mean: " << stats.mean_us() << " us"); + } + + auto result = collector.finalize(); + ReportGenerator reporter("results"); + reporter.export_all(result, "attribute_convergence"); +} diff --git a/benchmarks/core/benchmark_config.h b/benchmarks/core/benchmark_config.h new file mode 100644 index 0000000..0734131 --- /dev/null +++ b/benchmarks/core/benchmark_config.h @@ -0,0 +1,55 @@ +#ifndef DSR_BENCHMARK_CONFIG_H +#define DSR_BENCHMARK_CONFIG_H + +#include +#include +#include + +namespace DSR::Benchmark { + +struct BenchmarkConfig { + // Timing configuration + uint32_t warmup_iterations = 10; + uint32_t measurement_iterations = 100; + std::chrono::milliseconds sync_wait_time{200}; + std::chrono::seconds max_convergence_timeout{10}; + + // Multi-agent configuration + uint32_t default_agent_count = 2; + uint32_t max_agent_count = 16; + + // Graph generation + uint32_t small_graph_nodes = 100; + uint32_t medium_graph_nodes = 1000; + uint32_t large_graph_nodes = 10000; + + // Throughput settings + uint32_t throughput_duration_seconds = 5; + uint32_t concurrent_writer_threads = 4; + + // Output settings + std::string results_directory = "results"; + bool export_json = true; + bool export_csv = true; + bool verbose = false; +}; + +// Default configuration singleton +inline BenchmarkConfig& default_config() { + static BenchmarkConfig config; + return config; +} + +// Percentile levels for latency statistics +constexpr double PERCENTILE_P50 = 0.50; +constexpr double PERCENTILE_P90 = 0.90; +constexpr double PERCENTILE_P95 = 0.95; +constexpr double PERCENTILE_P99 = 0.99; + +// Threshold constants for validation +constexpr uint64_t MAX_EXPECTED_LATENCY_NS = 100'000'000; // 100ms +constexpr uint64_t MIN_EXPECTED_THROUGHPUT_OPS = 1000; // 1000 ops/sec + +} // namespace DSR::Benchmark + +#endif // DSR_BENCHMARK_CONFIG_H diff --git a/benchmarks/core/metrics_collector.h b/benchmarks/core/metrics_collector.h new file mode 100644 index 0000000..cf08f60 --- /dev/null +++ 
b/benchmarks/core/metrics_collector.h @@ -0,0 +1,239 @@ +#ifndef DSR_METRICS_COLLECTOR_H +#define DSR_METRICS_COLLECTOR_H + +#include +#include +#include +#include +#include +#include +#include "timing_utils.h" +#include "benchmark_config.h" + +namespace DSR::Benchmark { + +// Categories of benchmark metrics +enum class MetricCategory { + Latency, + Throughput, + Scalability, + Consistency +}; + +inline std::string to_string(MetricCategory cat) { + switch (cat) { + case MetricCategory::Latency: return "latency"; + case MetricCategory::Throughput: return "throughput"; + case MetricCategory::Scalability: return "scalability"; + case MetricCategory::Consistency: return "consistency"; + } + return "unknown"; +} + + +// Individual metric measurement +struct Metric { + std::string name; + MetricCategory category; + std::string unit; + double value; + std::map additional_values; // For percentiles, etc. + std::map tags; // For categorization +}; + + +// Result of a complete benchmark run +struct BenchmarkResult { + std::string benchmark_name; + std::string timestamp; + std::chrono::milliseconds total_duration; + std::vector metrics; + std::map metadata; +}; + + +// Thread-safe collector for benchmark metrics +class MetricsCollector { +public: + MetricsCollector() = default; + + explicit MetricsCollector(std::string benchmark_name) + : benchmark_name_(std::move(benchmark_name)) + , start_time_(std::chrono::steady_clock::now()) + {} + + // Set benchmark name + void set_benchmark_name(const std::string& name) { + std::lock_guard lock(mutex_); + benchmark_name_ = name; + } + + // Add metadata + void add_metadata(const std::string& key, const std::string& value) { + std::lock_guard lock(mutex_); + metadata_[key] = value; + } + + // Record a simple metric + void record(const std::string& name, MetricCategory category, + double value, const std::string& unit = "") { + Metric m; + m.name = name; + m.category = category; + m.value = value; + m.unit = unit; + + std::lock_guard 
lock(mutex_); + metrics_.push_back(std::move(m)); + } + + // Record a metric with tags + void record(const std::string& name, MetricCategory category, + double value, const std::string& unit, + const std::map& tags) { + Metric m; + m.name = name; + m.category = category; + m.value = value; + m.unit = unit; + m.tags = tags; + + std::lock_guard lock(mutex_); + metrics_.push_back(std::move(m)); + } + + // Record latency statistics from a LatencyTracker + void record_latency_stats(const std::string& name, LatencyStats stats, + const std::map& tags = {}) { + Metric m; + m.name = name; + m.category = MetricCategory::Latency; + m.value = stats.mean_ns; + m.unit = "ns"; + m.tags = tags; + m.additional_values["count"] = static_cast(stats.count); + m.additional_values["mean_ns"] = stats.mean_ns; + m.additional_values["stddev_ns"] = stats.stddev_ns; + m.additional_values["min_ns"] = static_cast(stats.min_ns); + m.additional_values["max_ns"] = static_cast(stats.max_ns); + m.additional_values["p50_ns"] = static_cast(stats.p50_ns); + m.additional_values["p90_ns"] = static_cast(stats.p90_ns); + m.additional_values["p95_ns"] = static_cast(stats.p95_ns); + m.additional_values["p99_ns"] = static_cast(stats.p99_ns); + + std::lock_guard lock(mutex_); + metrics_.push_back(std::move(m)); + } + + // Record throughput + void record_throughput(const std::string& name, uint64_t operations, + std::chrono::milliseconds duration, + const std::map& tags = {}) { + double ops_per_sec = static_cast(operations) / + (static_cast(duration.count()) / 1000.0); + + Metric m; + m.name = name; + m.category = MetricCategory::Throughput; + m.value = ops_per_sec; + m.unit = "ops/sec"; + m.tags = tags; + m.additional_values["total_operations"] = static_cast(operations); + m.additional_values["duration_ms"] = static_cast(duration.count()); + + std::lock_guard lock(mutex_); + metrics_.push_back(std::move(m)); + } + + // Record scalability metric + void record_scalability(const std::string& name, uint32_t 
scale_factor, + double metric_value, const std::string& unit, + const std::map& tags = {}) { + Metric m; + m.name = name; + m.category = MetricCategory::Scalability; + m.value = metric_value; + m.unit = unit; + m.tags = tags; + m.additional_values["scale_factor"] = static_cast(scale_factor); + + std::lock_guard lock(mutex_); + metrics_.push_back(std::move(m)); + } + + // Record consistency metric + void record_consistency(const std::string& name, double value, + const std::string& unit, + const std::map& tags = {}) { + Metric m; + m.name = name; + m.category = MetricCategory::Consistency; + m.value = value; + m.unit = unit; + m.tags = tags; + + std::lock_guard lock(mutex_); + metrics_.push_back(std::move(m)); + } + + // Get all metrics by category + [[nodiscard]] std::vector get_metrics(MetricCategory category) const { + std::lock_guard lock(mutex_); + std::vector result; + for (const auto& m : metrics_) { + if (m.category == category) { + result.push_back(m); + } + } + return result; + } + + // Get all metrics + [[nodiscard]] std::vector get_all_metrics() const { + std::lock_guard lock(mutex_); + return metrics_; + } + + // Generate final result + [[nodiscard]] BenchmarkResult finalize() { + auto end_time = std::chrono::steady_clock::now(); + auto duration = std::chrono::duration_cast( + end_time - start_time_); + + // Generate timestamp + auto now = std::chrono::system_clock::now(); + auto time_t_now = std::chrono::system_clock::to_time_t(now); + char timestamp_buf[64]; + std::strftime(timestamp_buf, sizeof(timestamp_buf), "%Y-%m-%dT%H:%M:%S", + std::localtime(&time_t_now)); + + std::lock_guard lock(mutex_); + BenchmarkResult result; + result.benchmark_name = benchmark_name_; + result.timestamp = timestamp_buf; + result.total_duration = duration; + result.metrics = metrics_; + result.metadata = metadata_; + + return result; + } + + // Clear all collected metrics + void clear() { + std::lock_guard lock(mutex_); + metrics_.clear(); + metadata_.clear(); + 
start_time_ = std::chrono::steady_clock::now(); + } + +private: + mutable std::mutex mutex_; + std::string benchmark_name_; + std::chrono::steady_clock::time_point start_time_; + std::vector metrics_; + std::map metadata_; +}; + +} // namespace DSR::Benchmark + +#endif // DSR_METRICS_COLLECTOR_H diff --git a/benchmarks/core/nanobench_adapter.h b/benchmarks/core/nanobench_adapter.h new file mode 100644 index 0000000..e7ff1a4 --- /dev/null +++ b/benchmarks/core/nanobench_adapter.h @@ -0,0 +1,176 @@ +#ifndef DSR_NANOBENCH_ADAPTER_H +#define DSR_NANOBENCH_ADAPTER_H + +// Bridge between ankerl::nanobench and the MetricsCollector/LatencyStats pipeline. +// +// Usage pattern: +// +// auto bench = make_latency_bench(1000); // 1000 samples, 100 warmup +// bench.run("op_name", [&] { +// auto result = graph->some_op(); +// ankerl::nanobench::doNotOptimizeAway(result); +// }); +// collector.record_latency_stats("op_name", nb_to_stats(bench)); +// collector.record("op_name", MetricCategory::Throughput, +// nb_throughput(bench), "ops/sec", tags); +// +// make_latency_bench() is intended for steady-state operations and allows the +// call sites to raise minEpochIterations() for very fast paths. For destructive +// or state-mutating workloads, use make_single_op_latency_bench() so each epoch +// stays a single operation and the benchmarked state does not drift with +// nanobench's adaptive iteration counts. + +#include +#include +#include +#include +#include +#include +#include +#include "timing_utils.h" // LatencyStats + +namespace DSR::Benchmark { + +// --------------------------------------------------------------------------- +// TeeBuf / nb_report_stream +// +// Writes nanobench table output to both stdout and results/nanobench_report.md +// so the full table is available for offline inspection. +// The file is created/truncated once on the first call; all test cases in the +// same process run append naturally via the shared static ofstream. 
+// ---------------------------------------------------------------------------
+// Duplicates every write to two underlying stream buffers (stdout + report
+// file). A write is considered successful only if BOTH sinks accept it.
+class TeeBuf : public std::streambuf {
+public:
+    TeeBuf(std::streambuf* a, std::streambuf* b) : a_(a), b_(b) {}
+protected:
+    // Single-character overflow path: push to both sinks, fail if either fails.
+    int overflow(int c) override {
+        if (c == traits_type::eof()) return traits_type::not_eof(c);
+        if (a_->sputc(static_cast<char_type>(c)) == traits_type::eof()) return traits_type::eof();
+        if (b_->sputc(static_cast<char_type>(c)) == traits_type::eof()) return traits_type::eof();
+        return c;
+    }
+    // Bulk write path. Report the smaller of the two counts so a short write
+    // on either sink is visible to the caller instead of being silently
+    // swallowed (the original returned only b_'s count).
+    std::streamsize xsputn(const char* s, std::streamsize n) override {
+        const std::streamsize wrote_a = a_->sputn(s, n);
+        const std::streamsize wrote_b = b_->sputn(s, n);
+        return std::min(wrote_a, wrote_b);
+    }
+    // Forward flush requests (std::flush / std::endl) to both sinks; without
+    // this override the default no-op sync() means the report file is only
+    // flushed when the static ofstream in nb_report_stream() is destroyed.
+    int sync() override {
+        const int ra = a_->pubsync();
+        const int rb = b_->pubsync();
+        return (ra == 0 && rb == 0) ? 0 : -1;
+    }
+private:
+    std::streambuf *a_, *b_;
+};
+
+inline std::ostream& nb_report_stream() {
+    static std::ofstream file = []() {
+        std::filesystem::create_directories("results");
+        return std::ofstream("results/nanobench_report.md");
+    }();
+    static TeeBuf tee(std::cout.rdbuf(), file.rdbuf());
+    static std::ostream stream(&tee);
+    return stream;
+}
+
+// ---------------------------------------------------------------------------
+// nb_to_stats
+//
+// Extracts per-epoch elapsed times from the last benchmark run, sorts them,
+// and returns a LatencyStats compatible with MetricsCollector::record_latency_stats().
+// Note: nanobench stores elapsed as average time per iteration within each
+// epoch, not total epoch time. If a benchmark uses minEpochIterations() > 1,
+// the returned distribution is still useful for steady-state throughput/latency
+// summaries, but it is not a raw single-operation percentile distribution.
+// --------------------------------------------------------------------------- +inline LatencyStats nb_to_stats(const ankerl::nanobench::Bench& bench) { + using Measure = ankerl::nanobench::Result::Measure; + + if (bench.results().empty()) return {}; + + const auto& r = bench.results().back(); + const size_t n = r.size(); + if (n == 0) return {}; + + // Collect per-epoch elapsed times in nanoseconds + std::vector ns(n); + for (size_t i = 0; i < n; ++i) + ns[i] = r.get(i, Measure::elapsed) * 1e9; + + std::sort(ns.begin(), ns.end()); + + // Percentile helper: nearest-rank + auto pct = [&](double p) -> uint64_t { + const size_t idx = static_cast(p / 100.0 * static_cast(n - 1) + 0.5); + return static_cast(ns[std::min(idx, n - 1)]); + }; + + double sum = 0.0; + for (double v : ns) sum += v; + const double mean = sum / static_cast(n); + + double var = 0.0; + for (double v : ns) var += (v - mean) * (v - mean); + + LatencyStats s{}; + s.count = n; + s.mean_ns = mean; + s.stddev_ns = (n > 1) ? std::sqrt(var / static_cast(n - 1)) : 0.0; + s.min_ns = static_cast(ns.front()); + s.max_ns = static_cast(ns.back()); + s.p50_ns = pct(50); + s.p90_ns = pct(90); + s.p95_ns = pct(95); + s.p99_ns = pct(99); + return s; +} + +// --------------------------------------------------------------------------- +// nb_throughput +// +// Derives single-operation throughput (ops/sec) from the mean latency of the +// last benchmark run. +// --------------------------------------------------------------------------- +inline double nb_throughput(const ankerl::nanobench::Bench& bench) { + if (bench.results().empty()) return 0.0; + using Measure = ankerl::nanobench::Result::Measure; + const double mean_s = bench.results().back().average(Measure::elapsed); + return (mean_s > 0.0) ? 
1.0 / mean_s : 0.0; +} + +// --------------------------------------------------------------------------- +// make_latency_bench +// +// Returns a Bench pre-configured for single-operation latency measurement: +// epochIterations(1) — one sample per epoch → full percentile resolution +// epochs(n_samples) — total independent latency samples to collect +// warmup(n_warmup) — thrown-away warm-up iterations before measurement +// output(stream) — tee to stdout + results/nanobench_report.md +// --------------------------------------------------------------------------- +inline ankerl::nanobench::Bench make_latency_bench( + size_t n_samples = 1000, + size_t n_warmup = 100) +{ + ankerl::nanobench::Bench b; + b.warmup(n_warmup) + .epochs(n_samples) + .minEpochIterations(1) + .minEpochTime(std::chrono::milliseconds(10)) + .performanceCounters(false) + .output(&nb_report_stream()); + return b; +} + +// Returns a Bench that keeps the measured workload fixed at one operation per +// epoch. This is for destructive or stateful benchmarks where adaptive +// iteration counts would otherwise change the graph shape during the run. 
+inline ankerl::nanobench::Bench make_single_op_latency_bench( + size_t n_samples = 1000, + size_t n_warmup = 100) +{ + ankerl::nanobench::Bench b; + b.warmup(n_warmup) + .epochs(n_samples) + .epochIterations(1) + .performanceCounters(false) + .output(&nb_report_stream()); + return b; +} + +} // namespace DSR::Benchmark + +#endif // DSR_NANOBENCH_ADAPTER_H diff --git a/benchmarks/core/report_generator.h b/benchmarks/core/report_generator.h new file mode 100644 index 0000000..6831f2a --- /dev/null +++ b/benchmarks/core/report_generator.h @@ -0,0 +1,255 @@ +#ifndef DSR_REPORT_GENERATOR_H +#define DSR_REPORT_GENERATOR_H + +#include +#include +#include +#include +#include +#include "metrics_collector.h" + +namespace DSR::Benchmark { + +class ReportGenerator { +public: + explicit ReportGenerator(std::string output_directory = "results") + : output_directory_(std::move(output_directory)) + {} + + // Export benchmark result to JSON + bool export_json(const BenchmarkResult& result, const std::string& filename = "") { + std::string filepath = generate_filepath(result, filename, ".json"); + std::ofstream out(filepath); + if (!out.is_open()) { + return false; + } + + out << "{\n"; + out << " \"benchmark_name\": " << quote(result.benchmark_name) << ",\n"; + out << " \"timestamp\": " << quote(result.timestamp) << ",\n"; + out << " \"total_duration_ms\": " << result.total_duration.count() << ",\n"; + + // Metadata + out << " \"metadata\": {\n"; + bool first = true; + for (const auto& [key, value] : result.metadata) { + if (!first) out << ",\n"; + out << " " << quote(key) << ": " << quote(value); + first = false; + } + out << "\n },\n"; + + // Metrics + out << " \"metrics\": [\n"; + for (size_t i = 0; i < result.metrics.size(); ++i) { + const auto& m = result.metrics[i]; + out << " {\n"; + out << " \"name\": " << quote(m.name) << ",\n"; + out << " \"category\": " << quote(to_string(m.category)) << ",\n"; + out << " \"value\": " << format_double(m.value) << ",\n"; + out << " 
\"unit\": " << quote(m.unit); + + if (!m.additional_values.empty()) { + out << ",\n \"additional\": {\n"; + bool first_add = true; + for (const auto& [key, value] : m.additional_values) { + if (!first_add) out << ",\n"; + out << " " << quote(key) << ": " << format_double(value); + first_add = false; + } + out << "\n }"; + } + + if (!m.tags.empty()) { + out << ",\n \"tags\": {\n"; + bool first_tag = true; + for (const auto& [key, value] : m.tags) { + if (!first_tag) out << ",\n"; + out << " " << quote(key) << ": " << quote(value); + first_tag = false; + } + out << "\n }"; + } + + out << "\n }"; + if (i < result.metrics.size() - 1) out << ","; + out << "\n"; + } + out << " ]\n"; + out << "}\n"; + + out.close(); + last_json_path_ = filepath; + return true; + } + + // Export benchmark result to CSV + bool export_csv(const BenchmarkResult& result, const std::string& filename = "") { + std::string filepath = generate_filepath(result, filename, ".csv"); + std::ofstream out(filepath); + if (!out.is_open()) { + return false; + } + + // Header + out << "benchmark_name,timestamp,metric_name,category,value,unit," + << "mean_ns,stddev_ns,min_ns,max_ns,p50_ns,p90_ns,p95_ns,p99_ns,count\n"; + + // Data rows + for (const auto& m : result.metrics) { + out << quote_csv(result.benchmark_name) << "," + << quote_csv(result.timestamp) << "," + << quote_csv(m.name) << "," + << quote_csv(to_string(m.category)) << "," + << format_double(m.value) << "," + << quote_csv(m.unit) << ","; + + // Additional values (latency-specific) + auto get_add = [&m](const std::string& key) -> std::string { + auto it = m.additional_values.find(key); + if (it != m.additional_values.end()) { + return format_double(it->second); + } + return ""; + }; + + out << get_add("mean_ns") << "," + << get_add("stddev_ns") << "," + << get_add("min_ns") << "," + << get_add("max_ns") << "," + << get_add("p50_ns") << "," + << get_add("p90_ns") << "," + << get_add("p95_ns") << "," + << get_add("p99_ns") << "," + << 
get_add("count") << "\n"; + } + + out.close(); + last_csv_path_ = filepath; + return true; + } + + // Export both JSON and CSV + bool export_all(const BenchmarkResult& result, const std::string& base_filename = "") { + bool json_ok = export_json(result, base_filename); + bool csv_ok = export_csv(result, base_filename); + return json_ok && csv_ok; + } + + // Compare with baseline and generate comparison report + bool compare_with_baseline(const BenchmarkResult& current, + const std::string& baseline_json_path, + double regression_threshold_percent = 10.0) { + // Read baseline JSON (simplified parsing) + std::ifstream baseline_file(baseline_json_path); + if (!baseline_file.is_open()) { + return false; + } + + // For now, just note that comparison is requested + // Full JSON parsing would require nlohmann/json + comparison_requested_ = true; + baseline_path_ = baseline_json_path; + regression_threshold_ = regression_threshold_percent; + + return true; + } + + // Get last generated file paths + [[nodiscard]] const std::string& last_json_path() const { return last_json_path_; } + [[nodiscard]] const std::string& last_csv_path() const { return last_csv_path_; } + + // Set output directory + void set_output_directory(const std::string& dir) { + output_directory_ = dir; + } + +private: + std::string generate_filepath(const BenchmarkResult& result, + const std::string& filename, + const std::string& extension) { + // Ensure directory exists + std::filesystem::create_directories(output_directory_); + + std::string name = filename; + if (name.empty()) { + // Generate filename from benchmark name and timestamp + name = "benchmark_" + sanitize_filename(result.benchmark_name) + + "_" + sanitize_filename(result.timestamp); + } + + // Remove extension if present + if (name.size() > extension.size() && + name.substr(name.size() - extension.size()) == extension) { + name = name.substr(0, name.size() - extension.size()); + } + + return output_directory_ + "/" + name + extension; + } 
+ + static std::string sanitize_filename(const std::string& name) { + std::string result; + for (char c : name) { + if (std::isalnum(c) || c == '_' || c == '-') { + result += c; + } else if (c == ' ' || c == ':' || c == '/') { + result += '_'; + } + } + return result; + } + + static std::string quote(const std::string& s) { + std::string result = "\""; + for (char c : s) { + if (c == '"') result += "\\\""; + else if (c == '\\') result += "\\\\"; + else if (c == '\n') result += "\\n"; + else result += c; + } + result += "\""; + return result; + } + + static std::string quote_csv(const std::string& s) { + if (s.find(',') != std::string::npos || + s.find('"') != std::string::npos || + s.find('\n') != std::string::npos) { + std::string escaped; + for (char c : s) { + if (c == '"') escaped += "\"\""; + else escaped += c; + } + return "\"" + escaped + "\""; + } + return s; + } + + static std::string format_double(double value) { + std::ostringstream oss; + oss << std::setprecision(6) << std::fixed << value; + std::string str = oss.str(); + // Remove trailing zeros + size_t dot_pos = str.find('.'); + if (dot_pos != std::string::npos) { + size_t last_non_zero = str.find_last_not_of('0'); + if (last_non_zero > dot_pos) { + str = str.substr(0, last_non_zero + 1); + } else { + str = str.substr(0, dot_pos); + } + } + return str; + } + + std::string output_directory_; + std::string last_json_path_; + std::string last_csv_path_; + bool comparison_requested_ = false; + std::string baseline_path_; + double regression_threshold_ = 10.0; +}; + +} // namespace DSR::Benchmark + +#endif // DSR_REPORT_GENERATOR_H diff --git a/benchmarks/core/timing_utils.h b/benchmarks/core/timing_utils.h new file mode 100644 index 0000000..51d904e --- /dev/null +++ b/benchmarks/core/timing_utils.h @@ -0,0 +1,301 @@ +#ifndef DSR_TIMING_UTILS_H +#define DSR_TIMING_UTILS_H + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace DSR::Benchmark { + +// 
Monotonic nanosecond counter for benchmark measurements.
+// Uses steady_clock (CLOCK_MONOTONIC on Linux) instead of system_clock so
+// that NTP adjustments and settimeofday() cannot produce negative intervals
+// or artificially inflate latency samples.
+// NOTE(review): treats steady_clock ticks as nanoseconds — true for
+// libstdc++/libc++ on Linux, but steady_clock::period is implementation
+// defined; confirm if this ever runs on another platform.
+inline uint64_t bench_now() noexcept {
+    return static_cast<uint64_t>(
+        std::chrono::steady_clock::now().time_since_epoch().count());
+}
+
+// RAII timer that calls a callback with elapsed nanoseconds on destruction
+class ScopedTimer {
+public:
+    using Callback = std::function<void(uint64_t)>;
+
+    explicit ScopedTimer(Callback on_complete)
+        : callback_(std::move(on_complete))
+        , start_time_(bench_now())
+    {}
+
+    ~ScopedTimer() {
+        if (callback_) {
+            uint64_t elapsed = bench_now() - start_time_;
+            callback_(elapsed);
+        }
+    }
+
+    // Disable copy
+    ScopedTimer(const ScopedTimer&) = delete;
+    ScopedTimer& operator=(const ScopedTimer&) = delete;
+
+    // Allow move; the moved-from timer is disarmed so the callback fires once.
+    ScopedTimer(ScopedTimer&& other) noexcept
+        : callback_(std::move(other.callback_))
+        , start_time_(other.start_time_)
+    {
+        other.callback_ = nullptr;
+    }
+
+    // NOTE(review): move-assignment discards any pending callback on the
+    // target without invoking it — confirm that is intended at call sites.
+    ScopedTimer& operator=(ScopedTimer&& other) noexcept {
+        if (this != &other) {
+            callback_ = std::move(other.callback_);
+            start_time_ = other.start_time_;
+            other.callback_ = nullptr;
+        }
+        return *this;
+    }
+
+    // Get elapsed time without stopping
+    [[nodiscard]] uint64_t elapsed_ns() const {
+        return bench_now() - start_time_;
+    }
+
+    // Cancel the callback
+    void cancel() {
+        callback_ = nullptr;
+    }
+
+private:
+    Callback callback_;
+    uint64_t start_time_;
+};
+
+
+// Statistics from latency measurements (all base values in nanoseconds)
+struct LatencyStats {
+    uint64_t count = 0;
+    double mean_ns = 0.0;
+    double stddev_ns = 0.0;
+    uint64_t min_ns = 0;
+    uint64_t max_ns = 0;
+    uint64_t p50_ns = 0;
+    uint64_t p90_ns = 0;
+    uint64_t p95_ns = 0;
+    uint64_t p99_ns = 0;
+
+    // Convenience accessors in microseconds / milliseconds
+    [[nodiscard]] double mean_us() const { return mean_ns / 1000.0; }
+    [[nodiscard]] double mean_ms() const { return mean_ns / 1'000'000.0; }
+    [[nodiscard]] double stddev_us() const { return stddev_ns / 1000.0; }
+    [[nodiscard]] double stddev_ms() const { return stddev_ns / 1'000'000.0; }
+    [[nodiscard]] double min_us() const { return min_ns / 1000.0; }
+    [[nodiscard]] double max_us() const { return max_ns / 1000.0; }
+    [[nodiscard]] double p50_us() const { return p50_ns / 1000.0; }
+    [[nodiscard]] double p90_us() const { return p90_ns / 1000.0; }
+    [[nodiscard]] double p95_us() const { return p95_ns / 1000.0; }
+    [[nodiscard]] double p99_us() const { return p99_ns / 1000.0; }
+    [[nodiscard]] double min_ms() const { return min_ns / 1'000'000.0; }
+    [[nodiscard]] double max_ms() const { return max_ns / 1'000'000.0; }
+    [[nodiscard]] double p50_ms() const { return p50_ns / 1'000'000.0; }
+    [[nodiscard]] double p90_ms() const { return p90_ns / 1'000'000.0; }
+    [[nodiscard]] double p95_ms() const { return p95_ns / 1'000'000.0; }
+    [[nodiscard]] double p99_ms() const { return p99_ns / 1'000'000.0; }
+};
+
+
+// Collects latency samples and computes statistics on demand.
+// Stats are cached until the next record()/clear(). Not thread-safe.
+class LatencyTracker {
+public:
+    LatencyTracker() = default;
+
+    // Reserve space for expected samples
+    explicit LatencyTracker(size_t expected_samples) {
+        samples_.reserve(expected_samples);
+    }
+
+    // Record a latency sample in nanoseconds
+    void record(uint64_t latency_ns) {
+        samples_.push_back(latency_ns);
+        stats_valid_ = false;
+    }
+
+    // Record using ScopedTimer callback pattern
+    [[nodiscard]] auto recorder() {
+        return [this](uint64_t latency_ns) {
+            this->record(latency_ns);
+        };
+    }
+
+    // Create a ScopedTimer that records to this tracker
+    [[nodiscard]] ScopedTimer scoped_record() {
+        return ScopedTimer(recorder());
+    }
+
+    // Get number of recorded samples
+    [[nodiscard]] size_t count() const {
+        return samples_.size();
+    }
+
+    // Check if tracker has samples
+    [[nodiscard]] bool empty() const {
+        return samples_.empty();
+    }
+
+    // Clear all samples
+    void clear() {
+        samples_.clear();
+        stats_valid_ = false;
+    }
+
+    // Get raw samples (for export)
+    [[nodiscard]] const std::vector<uint64_t>& samples() const {
+        return samples_;
+    }
+
+    // Compute and return statistics
+    [[nodiscard]] LatencyStats stats() {
+        if (stats_valid_) {
+            return cached_stats_;
+        }
+
+        if (samples_.empty()) {
+            return LatencyStats{};
+        }
+
+        // Sort a copy for percentiles; samples_ keeps insertion order.
+        std::vector<uint64_t> sorted = samples_;
+        std::sort(sorted.begin(), sorted.end());
+
+        LatencyStats result;
+        result.count = sorted.size();
+        result.min_ns = sorted.front();
+        result.max_ns = sorted.back();
+
+        // Calculate mean
+        double sum = std::accumulate(sorted.begin(), sorted.end(), 0.0);
+        result.mean_ns = sum / static_cast<double>(result.count);
+
+        // Calculate standard deviation — sample (n-1) form, consistent with
+        // nb_to_stats() in the nanobench adapter.
+        double sq_sum = std::accumulate(sorted.begin(), sorted.end(), 0.0,
+            [mean = result.mean_ns](double acc, uint64_t val) {
+                double diff = static_cast<double>(val) - mean;
+                return acc + diff * diff;
+            });
+        result.stddev_ns = (result.count > 1)
+            ? std::sqrt(sq_sum / static_cast<double>(result.count - 1))
+            : 0.0;
+
+        // Calculate percentiles
+        result.p50_ns = percentile(sorted, 0.50);
+        result.p90_ns = percentile(sorted, 0.90);
+        result.p95_ns = percentile(sorted, 0.95);
+        result.p99_ns = percentile(sorted, 0.99);
+
+        cached_stats_ = result;
+        stats_valid_ = true;
+        return result;
+    }
+
+private:
+    // Linear-interpolated percentile over a sorted vector, p in [0, 1].
+    static uint64_t percentile(const std::vector<uint64_t>& sorted, double p) {
+        if (sorted.empty()) return 0;
+        if (sorted.size() == 1) return sorted[0];
+
+        double index = p * static_cast<double>(sorted.size() - 1);
+        size_t lower = static_cast<size_t>(std::floor(index));
+        size_t upper = static_cast<size_t>(std::ceil(index));
+
+        if (lower == upper) {
+            return sorted[lower];
+        }
+
+        double fraction = index - static_cast<double>(lower);
+        return static_cast<uint64_t>(
+            static_cast<double>(sorted[lower]) * (1.0 - fraction) +
+            static_cast<double>(sorted[upper]) * fraction
+        );
+    }
+
+    std::vector<uint64_t> samples_;
+    LatencyStats cached_stats_;
+    bool stats_valid_ = false;
+};
+
+
+// Utility function to measure a single operation
+template <typename Func>
+uint64_t measure_ns(Func&& func) {
+    uint64_t start = 
bench_now(); + std::forward(func)(); + return bench_now() - start; +} + +// Utility function to run warmup iterations +template +void warmup(Func&& func, uint32_t iterations) { + for (uint32_t i = 0; i < iterations; ++i) { + std::forward(func)(); + } +} + +struct SampledBenchmarkResult { + LatencyStats latency; + std::chrono::milliseconds wall_time{0}; +}; + +template +SampledBenchmarkResult run_sampled_benchmark( + size_t warmup_iterations, + size_t measurement_iterations, + MeasureFunc&& measure_func, + MaintenanceFunc&& maintenance_func, + size_t maintenance_period = 1) +{ + auto maybe_maintain = [&](size_t iteration) { + if constexpr (std::is_invocable_v) { + if (maintenance_period != 0 && ((iteration + 1) % maintenance_period) == 0) { + maintenance_func(); + } + } + }; + + for (size_t i = 0; i < warmup_iterations; ++i) { + measure_func(); + maybe_maintain(i); + } + + LatencyTracker tracker(measurement_iterations); + auto wall_start = std::chrono::steady_clock::now(); + + for (size_t i = 0; i < measurement_iterations; ++i) { + tracker.record(measure_ns(measure_func)); + maybe_maintain(i); + } + + auto wall_end = std::chrono::steady_clock::now(); + return { + .latency = tracker.stats(), + .wall_time = std::chrono::duration_cast(wall_end - wall_start), + }; +} + +template +SampledBenchmarkResult run_sampled_benchmark( + size_t warmup_iterations, + size_t measurement_iterations, + MeasureFunc&& measure_func) +{ + return run_sampled_benchmark( + warmup_iterations, + measurement_iterations, + std::forward(measure_func), + [] {}, + 0); +} + +} // namespace DSR::Benchmark + +#endif // DSR_TIMING_UTILS_H diff --git a/benchmarks/fixtures/graph_generator.h b/benchmarks/fixtures/graph_generator.h new file mode 100644 index 0000000..f0e6ec5 --- /dev/null +++ b/benchmarks/fixtures/graph_generator.h @@ -0,0 +1,357 @@ +#ifndef DSR_GRAPH_GENERATOR_H +#define DSR_GRAPH_GENERATOR_H + +#include +#include +#include +#include +#include +#include +#include +#include + +namespace 
DSR::Benchmark { + +// Graph topology types +enum class GraphTopology { + Linear, // Chain of nodes + Star, // Hub with spokes + Tree, // Hierarchical tree + FullMesh, // Every node connected to every other + Random // Random connections +}; + + +// Configuration for synthetic graph generation +struct GraphGeneratorConfig { + uint32_t num_nodes = 100; + uint32_t edges_per_node = 2; + GraphTopology topology = GraphTopology::Tree; + std::string node_type = "test_node"; + std::string edge_type = "test_edge"; + bool include_rt_edges = false; + bool include_attributes = true; + uint32_t attributes_per_node = 3; +}; + + +class GraphGenerator { +public: + static constexpr unsigned int DEFAULT_SEED = 0x5A17B3C1u; + + explicit GraphGenerator(unsigned int seed = DEFAULT_SEED) + : rng_(seed) + { + // Ensure test types are registered (safe to call multiple times) + register_test_types(); + } + + // Register test node/edge types - call this before using any DSR operations + static void register_test_types() { + static bool registered = false; + if (!registered) { + node_types::register_type("test_node"); + edge_types::register_type("test_edge"); + registered = true; + } + } + + // Generate a config file with synthetic graph + std::string generate_config_file(const GraphGeneratorConfig& config) { + std::string filename = temp_filename(); + std::ofstream out(filename); + if (!out.is_open()) { + return ""; + } + + out << "{\n"; + out << " \"DSRModel\": {\n"; + out << " \"symbols\": {\n"; + + // Generate root node + out << generate_root_node(); + + // Generate additional nodes based on topology + auto node_ids = generate_node_ids(config.num_nodes); + + for (size_t i = 0; i < node_ids.size(); ++i) { + out << ",\n"; + out << generate_node(node_ids[i], config, i); + } + + out << "\n }\n"; + out << " }\n"; + out << "}\n"; + + out.close(); + return filename; + } + + // Generate small graph (100 nodes) + std::string generate_small_graph() { + GraphGeneratorConfig config; + 
config.num_nodes = 100; + config.topology = GraphTopology::Tree; + return generate_config_file(config); + } + + // Generate medium graph (1000 nodes) + std::string generate_medium_graph() { + GraphGeneratorConfig config; + config.num_nodes = 1000; + config.topology = GraphTopology::Tree; + return generate_config_file(config); + } + + // Generate large graph (10000 nodes) + std::string generate_large_graph() { + GraphGeneratorConfig config; + config.num_nodes = 10000; + config.topology = GraphTopology::Tree; + config.include_attributes = false; // Reduce size + return generate_config_file(config); + } + + // Generate empty config (just root) + std::string generate_empty_graph() { + std::string filename = temp_filename(); + std::ofstream out(filename); + if (!out.is_open()) { + return ""; + } + + out << "{\n"; + out << " \"DSRModel\": {\n"; + out << " \"symbols\": {\n"; + out << generate_root_node(); + out << "\n }\n"; + out << " }\n"; + out << "}\n"; + + out.close(); + return filename; + } + + // Add nodes directly to an existing graph + void populate_graph(DSRGraph& graph, uint32_t num_nodes, + const std::string& node_type = "test_node") { + uint64_t base_id = 1000; + auto root = graph.get_node_root(); + uint64_t parent_id = root ? 
root->id() : 100; + + for (uint32_t i = 0; i < num_nodes; ++i) { + DSR::Node node; + node.id(base_id + i); + node.name("bench_node_" + std::to_string(i)); + node.type(node_type); + node.agent_id(graph.get_agent_id()); + + // Add some attributes + graph.add_attrib_local(node, static_cast(i % 10)); + + graph.insert_node(node); + + // Add edge from parent + if (i > 0 && (i % 10) == 0) { + parent_id = base_id + i - 1; + } + + DSR::Edge edge; + edge.from(parent_id); + edge.to(node.id()); + edge.type("test_edge"); + edge.agent_id(graph.get_agent_id()); + graph.insert_or_assign_edge(edge); + } + } + + // Create a node for insertion benchmarks + static DSR::Node create_test_node(uint64_t id, uint32_t agent_id, + const std::string& name = "") { + DSR::Node node; + node.id(id); + node.name(name.empty() ? "test_node_" + std::to_string(id) : name); + node.type("test_node"); + node.agent_id(agent_id); + return node; + } + + // Create an edge for insertion benchmarks + static DSR::Edge create_test_edge(uint64_t from, uint64_t to, + uint32_t agent_id, + const std::string& type = "test_edge") { + DSR::Edge edge; + edge.from(from); + edge.to(to); + edge.type(type); + edge.agent_id(agent_id); + return edge; + } + +private: + std::string temp_filename() { + static std::atomic next_id{0}; + return "/tmp/dsr_bench_" + std::to_string(getpid()) + "_" + + std::to_string(next_id.fetch_add(1, std::memory_order_relaxed)) + ".json"; + } + + std::vector generate_node_ids(uint32_t count) { + std::vector ids; + ids.reserve(count); + for (uint32_t i = 0; i < count; ++i) { + ids.push_back(1000 + i); // Start from 1000 to avoid conflicts + } + return ids; + } + + std::string generate_root_node() { + return R"( "100": { + "attribute": { + "level": { + "type": 1, + "value": 0 + } + }, + "id": "100", + "links": [], + "name": "root", + "type": "root" + })"; + } + + std::string generate_node(uint64_t id, const GraphGeneratorConfig& config, + size_t index) { + std::ostringstream oss; + oss << " \"" << id 
<< "\": {\n"; + + // Attributes + oss << " \"attribute\": {\n"; + oss << " \"level\": {\n"; + oss << " \"type\": 1,\n"; + oss << " \"value\": " << (index % 10 + 1) << "\n"; + oss << " }"; + + if (config.include_attributes) { + for (uint32_t a = 0; a < config.attributes_per_node; ++a) { + oss << ",\n \"attr_" << a << "\": {\n"; + oss << " \"type\": 1,\n"; + oss << " \"value\": " << (rng_() % 1000) << "\n"; + oss << " }"; + } + } + + oss << "\n },\n"; + + // ID and name + oss << " \"id\": \"" << id << "\",\n"; + + // Links (edges) + oss << " \"links\": ["; + auto links = generate_links(id, config, index); + for (size_t i = 0; i < links.size(); ++i) { + if (i > 0) oss << ", "; + oss << "\n" << links[i]; + } + if (!links.empty()) oss << "\n "; + oss << "],\n"; + + // Name and type + oss << " \"name\": \"node_" << id << "\",\n"; + oss << " \"type\": \"" << config.node_type << "\"\n"; + oss << " }"; + + return oss.str(); + } + + std::vector generate_links(uint64_t from_id, + const GraphGeneratorConfig& config, + size_t index) { + std::vector links; + + // Always link back to root for tree topology + if (config.topology == GraphTopology::Tree && index == 0) { + links.push_back(generate_link(from_id, 100, config.edge_type, config.include_rt_edges)); + } + + // Generate additional links based on topology + switch (config.topology) { + case GraphTopology::Linear: + if (index > 0) { + links.push_back(generate_link(from_id, 1000 + index - 1, + config.edge_type, config.include_rt_edges)); + } else { + links.push_back(generate_link(from_id, 100, + config.edge_type, config.include_rt_edges)); + } + break; + + case GraphTopology::Star: + links.push_back(generate_link(from_id, 100, + config.edge_type, config.include_rt_edges)); + break; + + case GraphTopology::Tree: { + // Each node links to its parent in tree + uint64_t parent_id = (index == 0) ? 
100 : (1000 + (index - 1) / 2); + links.push_back(generate_link(from_id, parent_id, + config.edge_type, config.include_rt_edges)); + break; + } + + case GraphTopology::FullMesh: + // Limited to avoid explosion + for (uint64_t target = 1000; target < from_id && links.size() < 5; ++target) { + links.push_back(generate_link(from_id, target, + config.edge_type, config.include_rt_edges)); + } + break; + + case GraphTopology::Random: { + std::uniform_int_distribution count_dist(1, config.edges_per_node); + std::uniform_int_distribution id_dist(100, 1000 + index - 1); + uint32_t num_links = (index == 0) ? 1 : count_dist(rng_); + for (uint32_t i = 0; i < num_links; ++i) { + uint64_t target = (index == 0) ? 100 : id_dist(rng_); + links.push_back(generate_link(from_id, target, + config.edge_type, config.include_rt_edges)); + } + break; + } + } + + return links; + } + + std::string generate_link(uint64_t from, uint64_t to, + const std::string& type, bool include_rt) { + std::ostringstream oss; + oss << " {\n"; + oss << " \"dst\": \"" << to << "\",\n"; + oss << " \"label\": \"" << type << "\",\n"; + oss << " \"linkAttribute\": {"; + + if (include_rt && type == "RT") { + oss << R"( + "rt_rotation_euler_xyz": { + "type": 3, + "value": [0, 0, 0] + }, + "rt_translation": { + "type": 3, + "value": [0, 0, 0] + })"; + } + + oss << "},\n"; + oss << " \"src\": \"" << from << "\"\n"; + oss << " }"; + + return oss.str(); + } + + std::mt19937 rng_; +}; + +} // namespace DSR::Benchmark + +#endif // DSR_GRAPH_GENERATOR_H diff --git a/benchmarks/fixtures/multi_agent_fixture.h b/benchmarks/fixtures/multi_agent_fixture.h new file mode 100644 index 0000000..46e9f32 --- /dev/null +++ b/benchmarks/fixtures/multi_agent_fixture.h @@ -0,0 +1,275 @@ +#ifndef DSR_MULTI_AGENT_FIXTURE_H +#define DSR_MULTI_AGENT_FIXTURE_H + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "../core/benchmark_config.h" +#include 
"../core/timing_utils.h" + +namespace DSR::Benchmark { + +// Agent info for tracking +struct AgentInfo { + uint32_t id; + std::string name; + std::unique_ptr graph; + std::atomic participants_matched{0}; +}; + + +// Forward declaration for type registration +class GraphGenerator; + +// Reusable multi-agent test fixture +class MultiAgentFixture { +public: + explicit MultiAgentFixture(const BenchmarkConfig& config = default_config()) + : config_(config) + { + // Ensure test types are registered before any DSR operations + register_benchmark_types(); + } + + // Register node/edge types needed by benchmarks + static void register_benchmark_types() { + static bool registered = false; + if (!registered) { + node_types::register_type("test_node"); + edge_types::register_type("test_edge"); + registered = true; + } + } + + ~MultiAgentFixture() { + cleanup(); + } + + // Disable copy + MultiAgentFixture(const MultiAgentFixture&) = delete; + MultiAgentFixture& operator=(const MultiAgentFixture&) = delete; + + // Create N agent instances with DSRGraph + // First agent loads from config_file, others sync via DDS + bool create_agents(uint32_t num_agents, const std::string& config_file) { + if (num_agents == 0 || num_agents > config_.max_agent_count) { + qWarning("Can't create agents"); + return false; + } + if (config_file.empty()) { + qWarning("create_agents: config_file is empty — graph generator likely failed to write to /tmp (check permissions)"); + return false; + } + + // Keep agent IDs deterministic while remaining disjoint across fixture + // instances in the same process. 
+ static std::atomic next_base_agent_id{1000}; + base_agent_id_ = next_base_agent_id.fetch_add(config_.max_agent_count + 1, + std::memory_order_relaxed); + + agents_.clear(); + agents_.reserve(num_agents); + + // Create first agent with config file (it defines the initial graph) + { + auto agent = std::make_unique(); + agent->id = base_agent_id_; + agent->name = "bench_agent_0"; + + try { + agent->graph = std::make_unique( + agent->name, + agent->id, + config_file, + true + ); + agents_.push_back(std::move(agent)); + } catch (const std::exception& e) { + qWarning("Failed to create primary agent: %s", e.what()); + return false; + } + } + + // Small delay for DDS to initialize primary agent + process_events(50); + + // Create additional agents WITHOUT config file - they sync via DDS + for (uint32_t i = 1; i < num_agents; ++i) { + auto agent = std::make_unique(); + agent->id = base_agent_id_ + i; + agent->name = "bench_agent_" + std::to_string(i); + + try { + // No config file - agent receives graph from DDS + agent->graph = std::make_unique( + agent->name, + agent->id, + std::string{}, + true + ); + agents_.push_back(std::move(agent)); + } catch (const std::exception& e) { + qWarning("Failed to create agent %u: %s", i, e.what()); + return false; + } + + // Process events after each agent creation + process_events(20); + } + + return true; + } + + // Wait for DDS synchronization between agents + // Actively processes events while waiting + void wait_for_sync(std::chrono::milliseconds wait_time = std::chrono::milliseconds{0}) { + if (wait_time.count() == 0) { + wait_time = config_.sync_wait_time; + } + + auto start = std::chrono::steady_clock::now(); + while (std::chrono::steady_clock::now() - start < wait_time) { + process_events(10); + } + } + + // Verify all agents have converged to same state + bool verify_convergence(std::chrono::seconds timeout = std::chrono::seconds{0}) { + if (timeout.count() == 0) { + timeout = config_.max_convergence_timeout; + } + + if 
(agents_.size() < 2) { + return true; // Single agent is always converged + } + + auto start = std::chrono::steady_clock::now(); + + while (std::chrono::steady_clock::now() - start < timeout) { + if (check_node_convergence()) { + return true; + } + std::this_thread::sleep_for(std::chrono::milliseconds(10)); + process_events(); + } + + return false; + } + + // Measure time to convergence + std::chrono::milliseconds measure_convergence_time() { + auto start = std::chrono::steady_clock::now(); + + while (!check_node_convergence()) { + std::this_thread::sleep_for(std::chrono::milliseconds(1)); + process_events(); + + auto elapsed = std::chrono::steady_clock::now() - start; + if (elapsed > config_.max_convergence_timeout) { + return std::chrono::milliseconds{-1}; // Timeout + } + } + + return std::chrono::duration_cast( + std::chrono::steady_clock::now() - start); + } + + // Get agent by index + DSRGraph* get_agent(size_t index) { + if (index < agents_.size()) { + return agents_[index]->graph.get(); + } + return nullptr; + } + + // Get agent info by index + AgentInfo* get_agent_info(size_t index) { + if (index < agents_.size()) { + return agents_[index].get(); + } + return nullptr; + } + + // Get number of agents + [[nodiscard]] size_t agent_count() const { + return agents_.size(); + } + + // Connect signal handler to all agents + template + void connect_all(Signal signal, Slot slot) { + for (auto& agent : agents_) { + QObject::connect(agent->graph.get(), signal, slot, Qt::QueuedConnection); + } + } + + // Process Qt events (for signal delivery) + void process_events(int timeout_ms = 10) { + auto* app = QCoreApplication::instance(); + if (app) { + app->processEvents(QEventLoop::AllEvents, timeout_ms); + } + } + + // Run event loop for specified duration + void run_event_loop(std::chrono::milliseconds duration) { + auto* app = QCoreApplication::instance(); + if (!app) return; + + QEventLoop loop; + QTimer::singleShot(duration.count(), &loop, &QEventLoop::quit); + 
loop.exec(); + } + + // Cleanup all agents + void cleanup() { + agents_.clear(); + } + + // Get number of agents + [[nodiscard]] size_t size() const { + return agents_.size(); + } + +private: + bool check_node_convergence() { + if (agents_.size() < 2) return true; + + auto& first_graph = agents_[0]->graph; + auto first_nodes = first_graph->get_nodes(); + + for (size_t i = 1; i < agents_.size(); ++i) { + auto nodes = agents_[i]->graph->get_nodes(); + if (nodes.size() != first_nodes.size()) { + return false; + } + + // Check each node exists in the other graph + for (const auto& node : first_nodes) { + auto other_node = agents_[i]->graph->get_node(node.id()); + if (!other_node.has_value()) { + return false; + } + } + } + + return true; + } + + BenchmarkConfig config_; + uint32_t base_agent_id_ = 0; + std::vector> agents_; +}; + +} // namespace DSR::Benchmark + +#endif // DSR_MULTI_AGENT_FIXTURE_H diff --git a/benchmarks/flamegraph.sh b/benchmarks/flamegraph.sh new file mode 100755 index 0000000..6067101 --- /dev/null +++ b/benchmarks/flamegraph.sh @@ -0,0 +1,223 @@ +#!/usr/bin/env bash +# flamegraph.sh - generate a per-benchmark flamegraph SVG using perf. 
+# +# Usage: +# ./flamegraph.sh [OPTIONS] [FILTER] +# +# Options: +# -b BINARY Path to dsr_benchmarks (default: ./build/dsr_benchmarks) +# -o OUTPUT Output root directory for run subdirectories +# (default: ./results/flamegraphs) +# -F FREQ perf sampling frequency in Hz (default: 999) +# -k Keep raw perf.data files (deleted by default) +# -l List matching profile targets and exit +# -p PRESET Built-in preset: load, multiagent, profile +# -r RUN_ID Run directory name under OUTPUT +# (default: flamegraph-YYYYMMDD-HHMMSS) +# -h Show this help +# +# FILTER is forwarded to Catch2 as a tag expression or exact test name, e.g.: +# ./flamegraph.sh "Signal emission under load" +# ./flamegraph.sh "[PROFILE][LOAD]" +# ./flamegraph.sh -p multiagent +# ./flamegraph.sh -l -p profile +# +# The script intentionally does not default to "all benchmarks". Pass an exact +# benchmark name, a Catch2 tag expression, or a preset for scoped profiling. +# +# Requirements: +# 1. perf +# sudo apt install linux-tools-common linux-tools-$(uname -r) +# +# 2. FlameGraph scripts (flamegraph.pl + stackcollapse-perf.pl) +# git clone https://github.com/brendangregg/FlameGraph /opt/FlameGraph +# export FG_DIR=/opt/FlameGraph +# Either add them to PATH or set FG_DIR before running this script. +# +# 3. perf_event_paranoia - perf needs read access to kernel symbols. +# If perf says "Permission denied" or produces empty stacks, lower the +# paranoia level (resets on reboot): +# echo 1 | sudo tee /proc/sys/kernel/perf_event_paranoia +# To make it permanent: +# echo 'kernel.perf_event_paranoia = 1' | sudo tee /etc/sysctl.d/99-perf.conf +# sudo sysctl --system +# +# 4. Debug symbols - for meaningful stack frames the binary should be built +# with frame pointers or DWARF info. The CMake target already passes -g. 
+ +set -euo pipefail + +BINARY="./build/dsr_benchmarks" +OUTROOT="./results/flamegraphs" +FREQ=999 +KEEP_DATA=0 +LIST_ONLY=0 +PRESET="" +FILTER="" +RUN_ID="flamegraph-$(date +%Y%m%d-%H%M%S)" + +while getopts "b:o:F:klp:r:h" opt; do + case "$opt" in + b) BINARY="$OPTARG" ;; + o) OUTROOT="$OPTARG" ;; + F) FREQ="$OPTARG" ;; + k) KEEP_DATA=1 ;; + l) LIST_ONLY=1 ;; + p) PRESET="$OPTARG" ;; + r) RUN_ID="$OPTARG" ;; + h) + sed -n '2,/^set -/p' "$0" | grep '^#' | sed 's/^# \{0,1\}//' + exit 0 + ;; + *) echo "Unknown option: -$OPTARG" >&2; exit 1 ;; + esac +done +shift $((OPTIND - 1)) +FILTER="${1:-}" + +preset_to_filter() { + case "$1" in + load) echo "[PROFILE][LOAD]" ;; + multiagent) echo "[PROFILE][MULTIAGENT]" ;; + profile) echo "[PROFILE]" ;; + *) + echo "ERROR: unknown preset '$1' (expected: load, multiagent, profile)" >&2 + exit 1 + ;; + esac +} + +find_tool() { + local name="$1" + + if [[ -n "${FG_DIR:-}" && -x "${FG_DIR}/${name}" ]]; then + echo "${FG_DIR}/${name}" + return + fi + + if command -v "$name" >/dev/null 2>&1; then + command -v "$name" + return + fi + + for p in /usr/share/FlameGraph /opt/FlameGraph "$HOME/FlameGraph"; do + if [[ -x "${p}/${name}" ]]; then + echo "${p}/${name}" + return + fi + done + + echo "" +} + +if [[ -n "$PRESET" && -n "$FILTER" ]]; then + echo "ERROR: use either -p PRESET or a FILTER argument, not both" >&2 + exit 1 +fi + +if [[ -n "$PRESET" ]]; then + FILTER="$(preset_to_filter "$PRESET")" +fi + +if [[ -z "$FILTER" ]]; then + cat >&2 <<'EOF' +ERROR: a benchmark filter is required. 
+Examples: + ./flamegraph.sh "Signal emission under load" + ./flamegraph.sh "[PROFILE][LOAD]" + ./flamegraph.sh -p multiagent + ./flamegraph.sh -l -p profile +EOF + exit 1 +fi + +[[ -x "$BINARY" ]] || { echo "ERROR: binary not found or not executable: $BINARY" >&2; exit 1; } +command -v perf >/dev/null 2>&1 || { echo "ERROR: perf not found" >&2; exit 1; } + +OUTDIR="${OUTROOT}/${RUN_ID}" +mkdir -p "$OUTDIR" + +mapfile -t TEST_NAMES < <( + "$BINARY" --list-tests --verbosity quiet "$FILTER" 2>/dev/null \ + | sed 's/\r$//' \ + | grep -v '^[[:space:]]' \ + | grep -v '^All available test cases:' \ + | grep -v '^[0-9][0-9]* test cases$' \ + | grep -v '^$' +) + +if [[ ${#TEST_NAMES[@]} -eq 0 ]]; then + echo "No tests matched filter: '${FILTER}'" >&2 + echo "Run '$BINARY --list-tests' to see available tests." >&2 + exit 1 +fi + +if [[ $LIST_ONLY -eq 1 ]]; then + printf '%s\n' "${TEST_NAMES[@]}" + exit 0 +fi + +COLLAPSE="$(find_tool stackcollapse-perf.pl)" +FLAMEGRAPH="$(find_tool flamegraph.pl)" + +if [[ -z "$COLLAPSE" || -z "$FLAMEGRAPH" ]]; then + cat >&2 <<'EOF' +ERROR: FlameGraph tools not found. +Install Brendan Gregg's FlameGraph scripts: + git clone https://github.com/brendangregg/FlameGraph /opt/FlameGraph + export FG_DIR=/opt/FlameGraph +or set FG_DIR to the directory containing flamegraph.pl and stackcollapse-perf.pl. +EOF + exit 1 +fi + +echo "Found ${#TEST_NAMES[@]} test(s) to profile." 
+echo "Output: $OUTDIR" +echo + +PASS=0 +FAIL=0 + +for name in "${TEST_NAMES[@]}"; do + safe="$(echo "$name" | tr -cs 'A-Za-z0-9_-' '_' | sed 's/_\+/_/g; s/^_//; s/_$//')" + + perf_data="${OUTDIR}/${safe}.perf.data" + svg_out="${OUTDIR}/${safe}.svg" + perf_tmp="${perf_data}.tmp.$$" + svg_tmp="${svg_out}.tmp.$$" + + echo "-- $name" + + if perf record \ + -F "$FREQ" \ + -g \ + --call-graph dwarf \ + -o "$perf_tmp" \ + -- "$BINARY" "$name" 2>/dev/null; then + + perf script -i "$perf_tmp" 2>/dev/null \ + | perl "$COLLAPSE" --inline \ + | perl "$FLAMEGRAPH" --title "$name" \ + > "$svg_tmp" + + mv -f "$svg_tmp" "$svg_out" + + echo " -> $svg_out" + ((PASS++)) || true + else + echo " x perf record failed" >&2 + rm -f "$perf_tmp" "$svg_tmp" + ((FAIL++)) || true + continue + fi + + if [[ $KEEP_DATA -eq 0 && -f "$perf_tmp" ]]; then + rm -f "$perf_tmp" + elif [[ -f "$perf_tmp" ]]; then + mv -f "$perf_tmp" "$perf_data" + fi +done + +echo +echo "Done: $PASS succeeded, $FAIL failed." +[[ $FAIL -eq 0 ]] diff --git a/benchmarks/latency/crdt_join_bench.cpp b/benchmarks/latency/crdt_join_bench.cpp new file mode 100644 index 0000000..8d7e6fc --- /dev/null +++ b/benchmarks/latency/crdt_join_bench.cpp @@ -0,0 +1,217 @@ +#include +#include + +#include +#include +#include "../core/nanobench_adapter.h" +#include "../core/metrics_collector.h" +#include "../core/report_generator.h" + +using namespace DSR::Benchmark; + +// Create a test attribute +static DSR::CRDTAttribute make_test_attribute(uint32_t agent_id, int32_t value) { + DSR::CRDTAttribute attr; + attr.value(value); + attr.timestamp(bench_now()); + attr.agent_id(agent_id); + return attr; +} + +// All four mvreg operations in a single TEST_CASE so they export together +// to one JSON file. 
+TEST_CASE("CRDT mvreg operations", "[CRDT][mvreg][BASELINE]") { + MetricsCollector collector("crdt_mvreg"); + collector.add_metadata("profile", "baseline"); + + // ── mvreg write ─────────────────────────────────────────────────────────── + { + mvreg reg; + reg.id = 100; + int i = 0; + + auto bench = make_latency_bench(); + bench.run("mvreg_write", [&] { + auto attr = make_test_attribute(100, i++); + auto delta = reg.write(attr); + ankerl::nanobench::doNotOptimizeAway(delta); + }); + collector.record_latency_stats("mvreg_write", nb_to_stats(bench)); + } + + // ── mvreg join (same agent) ─────────────────────────────────────────────── + { + mvreg reg; + reg.id = 100; + auto init_attr = make_test_attribute(100, 0); + reg.write(init_attr); + int i = 0; + + auto bench = make_latency_bench(); + bench.run("mvreg_join_same_agent", [&] { + mvreg delta_reg; + delta_reg.id = 100; + auto new_attr = make_test_attribute(100, i++); + auto delta = delta_reg.write(new_attr); + reg.join(std::move(delta)); + ankerl::nanobench::doNotOptimizeAway(reg); + }); + collector.record_latency_stats("mvreg_join_same_agent", nb_to_stats(bench)); + } + + // ── mvreg join (different agents) ──────────────────────────────────────── + { + int i = 0; + + auto bench = make_latency_bench(); + bench.run("mvreg_join_different_agent", [&] { + mvreg reg; + reg.id = 100; + auto attr = make_test_attribute(100, 0); + auto delta = reg.write(attr); + + uint32_t other_agent = 200 + (i % 10); + mvreg delta_reg; + delta_reg.id = other_agent; + delta_reg.join(std::move(delta)); + auto new_attr = make_test_attribute(other_agent, i * 2); + delta = delta_reg.write(new_attr); + + reg.join(std::move(delta)); + ankerl::nanobench::doNotOptimizeAway(reg); + ++i; + }); + collector.record_latency_stats("mvreg_join_different_agent", nb_to_stats(bench)); + } + + // ── mvreg read ──────────────────────────────────────────────────────────── + { + mvreg reg; + reg.id = 100; + auto attr = make_test_attribute(100, 42); + 
reg.write(attr); + + // Read is pure — no warmup needed (cache already warm after write) + auto bench = make_latency_bench(1000, 0); + bench.minEpochIterations(10); + bench.run("mvreg_read", [&] { + const auto& value = reg.read_reg(); + ankerl::nanobench::doNotOptimizeAway(value); + }); + collector.record_latency_stats("mvreg_read", nb_to_stats(bench)); + } + + auto result = collector.finalize(); + ReportGenerator reporter("results"); + reporter.export_all(result, "crdt_mvreg"); +} + +TEST_CASE("CRDT dot_context operations", "[CRDT][dot_context][BASELINE]") { + MetricsCollector collector("crdt_dot_context"); + collector.add_metadata("profile", "baseline"); + + // ── makedot ─────────────────────────────────────────────────────────────── + { + dot_context ctx; + int i = 0; + + auto bench = make_latency_bench(); + bench.minEpochIterations(10); + bench.run("dot_context_makedot", [&] { + auto dot = ctx.makedot(100 + (i++ % 10)); + ankerl::nanobench::doNotOptimizeAway(dot); + }); + collector.record_latency_stats("dot_context_makedot", nb_to_stats(bench)); + } + + // ── dotin ───────────────────────────────────────────────────────────────── + { + dot_context ctx; + for (int i = 0; i < 100; ++i) ctx.makedot(100 + (i % 10)); + int i = 0; + + auto bench = make_latency_bench(1000, 0); + bench.minEpochIterations(10); + bench.run("dot_context_dotin", [&] { + std::pair dot{100 + (i++ % 10), i % 50}; + bool r = ctx.dotin(dot); + ankerl::nanobench::doNotOptimizeAway(r); + }); + collector.record_latency_stats("dot_context_dotin", nb_to_stats(bench)); + } + + // ── join ────────────────────────────────────────────────────────────────── + { + auto bench = make_latency_bench(); + bench.run("dot_context_join", [&] { + dot_context ctx1; + dot_context ctx2; + for (int j = 0; j < 10; ++j) { + ctx1.makedot(100); + ctx2.makedot(200); + } + ctx1.join(ctx2); + ankerl::nanobench::doNotOptimizeAway(ctx1); + }); + collector.record_latency_stats("dot_context_join", nb_to_stats(bench)); + } + + 
// ── compact ─────────────────────────────────────────────────────────────── + { + auto bench = make_latency_bench(); + bench.run("dot_context_compact", [&] { + dot_context ctx; + for (int j = 0; j < 50; ++j) ctx.insertdot({100, j * 2}, false); + ctx.compact(); + ankerl::nanobench::doNotOptimizeAway(ctx); + }); + collector.record_latency_stats("dot_context_compact", nb_to_stats(bench)); + } + + auto result = collector.finalize(); + ReportGenerator reporter("results"); + reporter.export_all(result, "crdt_dot_context"); +} + +// Catch2 BENCHMARK macros — kept hidden; run with [!benchmark] to activate. +TEST_CASE("CRDT micro-benchmarks (Catch2 BENCHMARK)", "[.][crdt][!benchmark]") { + + BENCHMARK("mvreg write") { + mvreg reg; + reg.id = 100; + auto attr = make_test_attribute(100, 42); + return reg.write(attr); + }; + + BENCHMARK("mvreg join") { + mvreg reg; + reg.id = 100; + auto attr1 = make_test_attribute(100, 1); + auto delta = reg.write(attr1); + + mvreg delta_reg; + delta_reg.id = 200; + delta_reg.join(std::move(delta)); + auto attr2 = make_test_attribute(200, 2); + delta = delta_reg.write(attr2); + + reg.join(std::move(delta)); + return reg.read_reg(); + }; + + BENCHMARK("dot_context makedot") { + dot_context ctx; + return ctx.makedot(100); + }; + + BENCHMARK("dot_context join") { + dot_context ctx1; + dot_context ctx2; + for (int i = 0; i < 10; ++i) { + ctx1.makedot(100); + ctx2.makedot(200); + } + ctx1.join(ctx2); + return ctx1.cc.size(); + }; +} diff --git a/benchmarks/latency/delta_propagation_bench.cpp b/benchmarks/latency/delta_propagation_bench.cpp new file mode 100644 index 0000000..b301352 --- /dev/null +++ b/benchmarks/latency/delta_propagation_bench.cpp @@ -0,0 +1,338 @@ +#include +#include +#include +#include + +#include "../core/timing_utils.h" +#include "../core/metrics_collector.h" +#include "../core/report_generator.h" +#include "../fixtures/multi_agent_fixture.h" +#include "../fixtures/graph_generator.h" + +using namespace DSR; +using namespace 
DSR::Benchmark; + +// Multi-agent tests require working DDS synchronization +// Skip these by default - run with "[delta]" tag explicitly to test +TEST_CASE("Delta propagation latency between agents", "[LATENCY][delta][.multi][PROFILE][MULTIAGENT]") { + // Setup + MultiAgentFixture fixture; + GraphGenerator generator; + MetricsCollector collector("delta_propagation"); + + auto config_file = generator.generate_empty_graph(); + REQUIRE(fixture.create_agents(2, config_file)); + + // Wait for DDS discovery and initial sync + fixture.wait_for_sync(std::chrono::milliseconds(500)); + REQUIRE(fixture.verify_convergence(std::chrono::seconds(10))); + + auto* agent_a = fixture.get_agent(0); + auto* agent_b = fixture.get_agent(1); + REQUIRE(agent_a != nullptr); + REQUIRE(agent_b != nullptr); + + SECTION("Node insertion propagation latency") { + LatencyTracker tracker(100); + std::atomic receive_time{0}; + std::atomic received{false}; + std::atomic expected_node_id{0}; + + // Connect to agent B's signal + QObject::connect(agent_b, &DSR::DSRGraph::update_node_signal, agent_b, + [&](uint64_t id, const std::string& type, DSR::SignalInfo) { + if (id == expected_node_id.load(std::memory_order_acquire)) { + receive_time.store(get_unix_timestamp()); + received.store(true); + } + }, Qt::DirectConnection); + + // Warmup + for (int i = 0; i < 10; ++i) { + auto node = GraphGenerator::create_test_node( + 2000 + i, agent_a->get_agent_id(), "warmup_" + std::to_string(i)); + agent_a->insert_node(node); + fixture.wait_for_sync(std::chrono::milliseconds(50)); + } + + // Measurement iterations + for (int i = 0; i < 100; ++i) { + received.store(false); + + auto node = GraphGenerator::create_test_node( + expected_node_id, agent_a->get_agent_id(), + "bench_node_" + std::to_string(i)); + + uint64_t send_time = get_unix_timestamp(); + auto ins_result = agent_a->insert_node(node); + REQUIRE(ins_result.has_value()); + expected_node_id.store(ins_result.value(), std::memory_order_release); + + // Wait 
for signal with timeout + auto start = std::chrono::steady_clock::now(); + while (!received.load()) { + fixture.process_events(1); + if (std::chrono::steady_clock::now() - start > std::chrono::seconds(5)) { + FAIL("Timeout waiting for node propagation"); + } + } + + uint64_t latency = receive_time.load() - send_time; + tracker.record(latency); + } + + auto stats = tracker.stats(); + collector.record_latency_stats("node_propagation", stats); + + INFO("Node propagation latency - Mean: " << stats.mean_us() << " us, " + << "P99: " << stats.p99_us() << " us"); + + // Validation + CHECK(stats.p99_ns < MAX_EXPECTED_LATENCY_NS); + } + + SECTION("Edge insertion propagation latency") { + LatencyTracker tracker(100); + std::atomic receive_time{0}; + std::atomic received{false}; + + // First create nodes on agent A + auto root = agent_a->get_node_root(); + REQUIRE(root.has_value()); + + std::vector node_to_ids = {}; + + for (int i = 0; i < 110; ++i) { + auto node = GraphGenerator::create_test_node( + 4000 + i, agent_a->get_agent_id(), "edge_node_" + std::to_string(i)); + auto ins = agent_a->insert_node(node); + REQUIRE(ins.has_value()); + node_to_ids.push_back(ins.value()); + } + + // Wait for all nodes to sync to agent B before creating edges + fixture.wait_for_sync(std::chrono::milliseconds(500)); + REQUIRE(fixture.verify_convergence(std::chrono::seconds(10))); + + // Connect to agent B's edge signal + std::atomic expected_from{0}; + std::atomic expected_to{0}; + QObject::connect(agent_b, &DSR::DSRGraph::update_edge_signal, agent_b, + [&](uint64_t from, uint64_t to, const std::string& type, DSR::SignalInfo) { + if (from == expected_from.load(std::memory_order_acquire) && + to == expected_to.load(std::memory_order_acquire)) { + receive_time.store(get_unix_timestamp()); + received.store(true); + } + }, Qt::DirectConnection); + + // Warmup + for (int i = 0; i < 10; ++i) { + auto edge = GraphGenerator::create_test_edge( + root->id(), node_to_ids[i], agent_a->get_agent_id()); + 
agent_a->insert_or_assign_edge(edge); + fixture.wait_for_sync(std::chrono::milliseconds(50)); + } + + // Measurement iterations + for (int i = 10; i < 110; ++i) { + expected_from.store(root->id(), std::memory_order_release); + expected_to.store(node_to_ids[i], std::memory_order_release); + received.store(false); + + auto edge = GraphGenerator::create_test_edge( + expected_from, expected_to, agent_a->get_agent_id()); + + uint64_t send_time = get_unix_timestamp(); + agent_a->insert_or_assign_edge(edge); + + // Wait for signal with timeout + auto start = std::chrono::steady_clock::now(); + while (!received.load()) { + fixture.process_events(1); + if (std::chrono::steady_clock::now() - start > std::chrono::seconds(5)) { + FAIL("Timeout waiting for edge propagation"); + } + } + + uint64_t latency = receive_time.load() - send_time; + tracker.record(latency); + } + + auto stats = tracker.stats(); + collector.record_latency_stats("edge_propagation", stats); + + INFO("Edge propagation latency - Mean: " << stats.mean_us() << " us, " + << "P99: " << stats.p99_us() << " us"); + + CHECK(stats.p99_ns < MAX_EXPECTED_LATENCY_NS); + } + + SECTION("Attribute update propagation latency") { + LatencyTracker tracker(100); + std::atomic receive_time{0}; + std::atomic received{false}; + + // Create a node for attribute updates + auto test_node = GraphGenerator::create_test_node( + 5000, agent_a->get_agent_id(), "attr_test_node"); + auto insert_result = agent_a->insert_node(test_node); + REQUIRE(insert_result.has_value()); + + // Wait for sync to agent B + fixture.wait_for_sync(std::chrono::milliseconds(500)); + REQUIRE(fixture.verify_convergence(std::chrono::seconds(10))); + + // Verify node exists on agent A + auto check_node = agent_a->get_node(*insert_result); + REQUIRE(check_node.has_value()); + + // Connect to agent B's attribute signal + QObject::connect(agent_b, &DSR::DSRGraph::update_node_attr_signal, agent_b, + [&](uint64_t id, const std::vector& att_names, DSR::SignalInfo) { + 
if (id == *insert_result) { + receive_time.store(get_unix_timestamp()); + received.store(true); + } + }, Qt::DirectConnection); + + // Warmup + for (int i = 0; i < 10; ++i) { + auto node = agent_a->get_node(*insert_result); + if (node) { + agent_a->add_or_modify_attrib_local(*node, static_cast(i)); + agent_a->update_node(*node); + } + fixture.wait_for_sync(std::chrono::milliseconds(50)); + } + + // Measurement iterations + for (int i = 0; i < 100; ++i) { + received.store(false); + + auto node = agent_a->get_node(*insert_result); + REQUIRE(node.has_value()); + + agent_a->add_or_modify_attrib_local(*node, static_cast(1000 + i)); + + uint64_t send_time = get_unix_timestamp(); + agent_a->update_node(*node); + + // Wait for signal with timeout + auto start = std::chrono::steady_clock::now(); + while (!received.load()) { + fixture.process_events(1); + if (std::chrono::steady_clock::now() - start > std::chrono::seconds(5)) { + FAIL("Timeout waiting for attribute propagation"); + } + } + + uint64_t latency = receive_time.load() - send_time; + tracker.record(latency); + } + + auto stats = tracker.stats(); + collector.record_latency_stats("attribute_propagation", stats); + + INFO("Attribute propagation latency - Mean: " << stats.mean_us() << " us, " + << "P99: " << stats.p99_us() << " us"); + + CHECK(stats.p99_ns < MAX_EXPECTED_LATENCY_NS); + } + + // Export results + auto result = collector.finalize(); + ReportGenerator reporter("results"); + reporter.export_all(result, "delta_propagation"); +} + +TEST_CASE("Delta propagation with varying agent counts", "[LATENCY][delta][scalability][.multi][PROFILE][MULTIAGENT]") { + MetricsCollector collector("delta_propagation_scaling"); + GraphGenerator generator; + + for (uint32_t num_agents : {2, 4, 8}) { + SECTION("With " + std::to_string(num_agents) + " agents") { + MultiAgentFixture fixture; + auto config_file = generator.generate_empty_graph(); + + if (!fixture.create_agents(num_agents, config_file)) { + WARN("Could not create " 
<< num_agents << " agents, skipping"); + continue; + } + + // Wait for DDS discovery with all agents + fixture.wait_for_sync(std::chrono::milliseconds(500 * num_agents)); + if (!fixture.verify_convergence(std::chrono::seconds(15))) { + WARN("Agents failed to converge, skipping"); + continue; + } + + auto* sender = fixture.get_agent(0); + REQUIRE(sender != nullptr); + + LatencyTracker tracker(50); + + // Track reception across all other agents + std::atomic received_count{0}; + std::vector> receive_times(num_agents - 1); + std::atomic current_expected_id{0}; + + for (size_t i = 1; i < num_agents; ++i) { + auto* receiver = fixture.get_agent(i); + QObject::connect(receiver, &DSR::DSRGraph::update_node_signal, receiver, + [&, idx = i - 1](uint64_t id, const std::string& type, DSR::SignalInfo) { + if (id == current_expected_id.load()) { + receive_times[idx].store(get_unix_timestamp()); + received_count.fetch_add(1); + } + }, Qt::DirectConnection); + } + + // Measurement + for (int i = 0; i < 50; ++i) { + received_count.store(0); + for (auto& rt : receive_times) rt.store(0); + + auto node = GraphGenerator::create_test_node( + 0, sender->get_agent_id(), + "scale_node_" + std::to_string(i)); + + uint64_t send_time = get_unix_timestamp(); + auto result = sender->insert_node(node); + REQUIRE(result.has_value()); + current_expected_id.store(result.value()); + + // Wait for all receivers + auto start = std::chrono::steady_clock::now(); + while (received_count.load() < num_agents - 1) { + fixture.process_events(1); + if (std::chrono::steady_clock::now() - start > std::chrono::seconds(10)) { + break; + } + } + + // Record max latency (time for all to receive) + uint64_t max_receive = 0; + for (const auto& rt : receive_times) { + max_receive = std::max(max_receive, rt.load()); + } + if (max_receive > 0) { + tracker.record(max_receive - send_time); + } + } + + auto stats = tracker.stats(); + collector.record_latency_stats( + "propagation_" + std::to_string(num_agents) + "_agents", 
+ stats, + {{"num_agents", std::to_string(num_agents)}}); + + INFO(num_agents << " agents - Mean: " << stats.mean_us() << " us, " + << "P99: " << stats.p99_us() << " us"); + } + } + + auto result = collector.finalize(); + ReportGenerator reporter("results"); + reporter.export_all(result, "delta_propagation_scaling"); +} diff --git a/benchmarks/latency/signal_latency_bench.cpp b/benchmarks/latency/signal_latency_bench.cpp new file mode 100644 index 0000000..0f95b47 --- /dev/null +++ b/benchmarks/latency/signal_latency_bench.cpp @@ -0,0 +1,261 @@ +#include +#include + +#include "../core/nanobench_adapter.h" +#include "../core/timing_utils.h" +#include "../core/metrics_collector.h" +#include "../core/report_generator.h" +#include "../fixtures/multi_agent_fixture.h" +#include "../fixtures/graph_generator.h" + +using namespace DSR; +using namespace DSR::Benchmark; + +// For Qt::DirectConnection cases the signal fires synchronously within the +// graph operation, so nanobench's elapsed time equals the dispatch latency. +// For Qt::QueuedConnection the callback fires asynchronously via the Qt event +// loop — manual bench_now() timing with fixture.process_events() is required. 
+ +TEST_CASE("Node signal direct latency", "[LATENCY][signal][EXTENDED][.extended]") { + MultiAgentFixture fixture; + GraphGenerator generator; + MetricsCollector collector("signal_latency"); + collector.add_metadata("profile", "extended"); + + auto config_file = generator.generate_empty_graph(); + REQUIRE(fixture.create_agents(1, config_file)); + auto* graph = fixture.get_agent(0); + REQUIRE(graph != nullptr); + + std::atomic callback_called{false}; + + QObject::connect(graph, &DSR::DSRGraph::update_node_signal, graph, + [&](uint64_t, const std::string&, DSR::SignalInfo) { + callback_called.store(true); + }, Qt::DirectConnection); + + // ~40µs/op: 300 iters/epoch × 100 epochs ≈ 1.2 s + auto bench = make_latency_bench(100, 50); + bench.minEpochIterations(300); + bench.run("node_signal_direct", [&] { + callback_called.store(false); + auto node = GraphGenerator::create_test_node(0, graph->get_agent_id()); + graph->insert_node(node); + REQUIRE(callback_called.load()); + ankerl::nanobench::doNotOptimizeAway(node); + }); + + auto stats = nb_to_stats(bench); + collector.record_latency_stats("node_signal_direct", stats); + INFO("Node signal (direct) - Mean: " << stats.mean_us() << " us, p99: " << stats.p99_us() << " us"); + + auto result = collector.finalize(); + ReportGenerator reporter("results"); + reporter.export_all(result, "signal_node_direct"); +} + +TEST_CASE("Edge signal direct latency", "[LATENCY][signal][EXTENDED][.extended]") { + MultiAgentFixture fixture; + GraphGenerator generator; + MetricsCollector collector("signal_latency"); + collector.add_metadata("profile", "extended"); + + auto config_file = generator.generate_empty_graph(); + REQUIRE(fixture.create_agents(1, config_file)); + auto* graph = fixture.get_agent(0); + REQUIRE(graph != nullptr); + + auto root = graph->get_node_root(); + REQUIRE(root.has_value()); + + // Pre-create enough nodes for warmup(50) + epochs(1000) = 1050 + std::vector node_ids; + node_ids.reserve(1060); + for (int i = 0; i < 1060; 
++i) { + auto node = GraphGenerator::create_test_node(0, graph->get_agent_id()); + auto id = graph->insert_node(node); + REQUIRE(id.has_value()); + node_ids.push_back(*id); + } + + std::atomic callback_called{false}; + std::atomic target_to{0}; + + QObject::connect(graph, &DSR::DSRGraph::update_edge_signal, graph, + [&](uint64_t, uint64_t to, const std::string&, DSR::SignalInfo) { + if (to == target_to.load()) { + callback_called.store(true); + } + }, Qt::DirectConnection); + + // ~14µs/op: 600 iters/epoch × 100 epochs ≈ 0.84 s + size_t idx = 0; + auto bench = make_latency_bench(100, 50); + bench.minEpochIterations(600); + bench.run("edge_signal_direct", [&] { + uint64_t target = node_ids[idx++ % node_ids.size()]; + target_to.store(target); + callback_called.store(false); + auto edge = GraphGenerator::create_test_edge( + root->id(), target, graph->get_agent_id()); + graph->insert_or_assign_edge(edge); + REQUIRE(callback_called.load()); + ankerl::nanobench::doNotOptimizeAway(edge); + }); + + auto stats = nb_to_stats(bench); + collector.record_latency_stats("edge_signal_direct", stats); + INFO("Edge signal (direct) - Mean: " << stats.mean_us() << " us, p99: " << stats.p99_us() << " us"); + + auto result = collector.finalize(); + ReportGenerator reporter("results"); + reporter.export_all(result, "signal_edge_direct"); +} + +TEST_CASE("Attribute signal direct latency", "[LATENCY][signal][EXTENDED][.extended]") { + MultiAgentFixture fixture; + GraphGenerator generator; + MetricsCollector collector("signal_latency"); + collector.add_metadata("profile", "extended"); + + auto config_file = generator.generate_empty_graph(); + REQUIRE(fixture.create_agents(1, config_file)); + auto* graph = fixture.get_agent(0); + REQUIRE(graph != nullptr); + + auto test_node = GraphGenerator::create_test_node(0, graph->get_agent_id(), "attr_signal_test"); + auto node_id = graph->insert_node(test_node); + REQUIRE(node_id.has_value()); + + std::atomic callback_called{false}; + + 
QObject::connect(graph, &DSR::DSRGraph::update_node_attr_signal, graph, + [&](uint64_t id, const std::vector&, DSR::SignalInfo) { + if (id == *node_id) { + callback_called.store(true); + } + }, Qt::DirectConnection); + + uint64_t counter = 0; + auto bench = make_latency_bench(1000, 50); + bench.run("attr_signal_direct", [&] { + callback_called.store(false); + auto node = graph->get_node(*node_id); + REQUIRE(node.has_value()); + graph->add_or_modify_attrib_local( + *node, static_cast(100 + counter++)); + graph->update_node(*node); + REQUIRE(callback_called.load()); + ankerl::nanobench::doNotOptimizeAway(node); + }); + + auto stats = nb_to_stats(bench); + collector.record_latency_stats("attr_signal_direct", stats); + INFO("Attr signal (direct) - Mean: " << stats.mean_us() << " us, p99: " << stats.p99_us() << " us"); + + auto result = collector.finalize(); + ReportGenerator reporter("results"); + reporter.export_all(result, "signal_attr_direct"); +} + +TEST_CASE("Node signal queued latency", "[LATENCY][signal]") { + // Qt::QueuedConnection dispatches via the event loop, so the callback + // fires asynchronously. nanobench cannot model the poll-wait pattern; + // manual bench_now() + fixture.process_events() is used instead. 
+ MultiAgentFixture fixture; + GraphGenerator generator; + MetricsCollector collector("signal_latency"); + + auto config_file = generator.generate_empty_graph(); + REQUIRE(fixture.create_agents(1, config_file)); + auto* graph = fixture.get_agent(0); + REQUIRE(graph != nullptr); + + LatencyTracker tracker(1000); + std::atomic callback_time{0}; + std::atomic callback_called{false}; + + QObject::connect(graph, &DSR::DSRGraph::update_node_signal, graph, + [&](uint64_t, const std::string&, DSR::SignalInfo) { + callback_time.store(bench_now()); + callback_called.store(true); + }, Qt::QueuedConnection); + + // Warmup + for (int i = 0; i < 50; ++i) { + auto node = GraphGenerator::create_test_node(0, graph->get_agent_id()); + auto res = graph->insert_node(node); + REQUIRE(res.has_value()); + fixture.process_events(); + } + + // Measurement + for (int i = 0; i < 1000; ++i) { + callback_called.store(false); + auto node = GraphGenerator::create_test_node(0, graph->get_agent_id()); + uint64_t pre_insert = bench_now(); + auto res = graph->insert_node(node); + REQUIRE(res.has_value()); + + auto deadline = std::chrono::steady_clock::now() + std::chrono::milliseconds(100); + while (!callback_called.load() && std::chrono::steady_clock::now() < deadline) { + fixture.process_events(1); + } + + if (callback_called.load()) { + tracker.record(callback_time.load() - pre_insert); + } + } + + auto stats = tracker.stats(); + collector.record_latency_stats("node_signal_queued", stats); + INFO("Node signal (queued) - Mean: " << stats.mean_us() << " us, p99: " << stats.p99_us() << " us"); + + auto result = collector.finalize(); + ReportGenerator reporter("results"); + reporter.export_all(result, "signal_node_queued"); +} + +TEST_CASE("Signal emission under load", "[LATENCY][signal][stress][PROFILE][LOAD]") { + MultiAgentFixture fixture; + GraphGenerator generator; + MetricsCollector collector("signal_latency_stress"); + + auto config_file = generator.generate_empty_graph(); + 
REQUIRE(fixture.create_agents(1, config_file)); + auto* graph = fixture.get_agent(0); + REQUIRE(graph != nullptr); + + // Pre-populate graph with 1000 nodes + for (int i = 0; i < 1000; ++i) { + auto node = GraphGenerator::create_test_node(0, graph->get_agent_id()); + auto res = graph->insert_node(node); + REQUIRE(res.has_value()); + } + fixture.process_events(); + + std::atomic callback_called{false}; + + QObject::connect(graph, &DSR::DSRGraph::update_node_signal, graph, + [&](uint64_t, const std::string&, DSR::SignalInfo) { + callback_called.store(true); + }, Qt::DirectConnection); + + auto bench = make_latency_bench(1000, 50); + bench.minEpochIterations(10); + bench.run("signal_under_load", [&] { + callback_called.store(false); + auto node = GraphGenerator::create_test_node(0, graph->get_agent_id()); + graph->insert_node(node); + REQUIRE(callback_called.load()); + ankerl::nanobench::doNotOptimizeAway(node); + }); + + auto stats = nb_to_stats(bench); + collector.record_latency_stats("signal_with_1000_nodes", stats, {{"existing_nodes", "1000"}}); + INFO("Signal with 1000 nodes - Mean: " << stats.mean_us() << " us, p99: " << stats.p99_us() << " us"); + + auto result = collector.finalize(); + ReportGenerator reporter("results"); + reporter.export_all(result, "signal_latency_stress"); +} diff --git a/benchmarks/python/bench_baseline_graph.py b/benchmarks/python/bench_baseline_graph.py new file mode 100644 index 0000000..7a8166c --- /dev/null +++ b/benchmarks/python/bench_baseline_graph.py @@ -0,0 +1,132 @@ +#!/usr/bin/env python3 +""" +Stable Python baseline benchmarks on a fixed graph. + +This intentionally avoids graph growth during measurement. The goal is to +provide a low-noise Python baseline for lookup/query/update paths and binding +costs, not to model end-to-end insertion throughput. 
+""" + +import sys +import os +import time + +sys.path.insert(0, os.path.dirname(__file__)) + +from bench_utils import LatencyTracker, MetricsCollector, make_temp_config_file + +try: + import pydsr +except ImportError: + print("Error: pydsr module not found.") + sys.exit(1) + + +def benchmark_fixed_graph(graph: pydsr.DSRGraph, collector: MetricsCollector): + agent_id = graph.get_agent_id() + root = graph.get_node("root") + assert root is not None, "root node missing" + + # Keep the Python baseline bounded so the top-level default run stays usable. + node_ids = [] + for i in range(300): + node = pydsr.Node(agent_id, "testtype", f"baseline_node_{i}") + inserted = graph.insert_node(node) + assert inserted is not None, f"insert_node failed for baseline_node_{i}" + node_ids.append(inserted) + edge = pydsr.Edge(inserted, root.id, "testtype_e", agent_id) + assert graph.insert_or_assign_edge(edge), f"insert edge failed for baseline_node_{i}" + + for node_id in node_ids: + assert graph.get_node(node_id) is not None + graph.get_nodes() + graph.get_nodes_by_type("testtype") + graph.get_edges(root.id) + graph.get_edges_by_type("testtype_e") + + tracker = LatencyTracker(1000) + for i in range(1000): + node_id = node_ids[i % len(node_ids)] + with tracker.measure(): + node = graph.get_node(node_id) + assert node is not None + collector.record_latency_stats("node_read_by_id", tracker.stats()) + + tracker = LatencyTracker(500) + for i in range(500): + name = f"baseline_node_{i % len(node_ids)}" + with tracker.measure(): + node = graph.get_node(name) + assert node is not None + collector.record_latency_stats("node_read_by_name", tracker.stats()) + + tracker = LatencyTracker(500) + target = graph.get_node("baseline_node_0") + assert target is not None + for i in range(500): + target.attrs["level"] = pydsr.Attribute(i) + with tracker.measure(): + ok = graph.update_node(target) + assert ok + collector.record_latency_stats("node_update", tracker.stats()) + + tracker = 
LatencyTracker(100) + for _ in range(100): + with tracker.measure(): + nodes = graph.get_nodes() + assert nodes + collector.record_latency_stats("get_nodes", tracker.stats()) + + tracker = LatencyTracker(100) + for _ in range(100): + with tracker.measure(): + nodes = graph.get_nodes_by_type("testtype") + assert nodes + collector.record_latency_stats("get_nodes_by_type", tracker.stats()) + + tracker = LatencyTracker(300) + for i in range(300): + node_id = node_ids[i % len(node_ids)] + with tracker.measure(): + edge = graph.get_edge(root.id, node_id, "testtype_e") + assert edge is not None + collector.record_latency_stats("edge_read", tracker.stats()) + + tracker = LatencyTracker(100) + for _ in range(100): + with tracker.measure(): + edges = graph.get_edges_by_type("testtype_e") + assert edges + collector.record_latency_stats("get_edges_by_type", tracker.stats()) + + +def main(): + print("=" * 60) + print("DSR Python Baseline Graph Benchmarks") + print("=" * 60) + print() + + collector = MetricsCollector("python_baseline_graph") + collector.metadata["profile"] = "baseline" + + config_file = make_temp_config_file() + graph = pydsr.DSRGraph(0, "python_baseline_graph", 84, config_file) + time.sleep(0.3) + + benchmark_fixed_graph(graph, collector) + + del graph + os.unlink(config_file) + + results_dir = os.environ.get( + "BENCH_RESULTS_DIR", + os.path.join(os.path.dirname(__file__), "..", "results"), + ) + os.makedirs(results_dir, exist_ok=True) + collector.export_json(os.path.join(results_dir, "python_baseline_graph.json")) + collector.export_csv(os.path.join(results_dir, "python_baseline_graph.csv")) + print(f"\nResults exported to {results_dir}") + + +if __name__ == "__main__": + main() diff --git a/benchmarks/python/bench_binding_overhead.py b/benchmarks/python/bench_binding_overhead.py new file mode 100644 index 0000000..e8dfb38 --- /dev/null +++ b/benchmarks/python/bench_binding_overhead.py @@ -0,0 +1,201 @@ +#!/usr/bin/env python3 +""" +Benchmark: Python binding 
overhead vs C++. + +Measures the overhead introduced by pybind11 bindings. Pure Python object +creation (Node, Edge, Attribute) uses pyperf.Runner.bench_func() for +calibrated, multi-process timing. Numpy array copy benchmarks use +bench_time_func() so the array setup happens outside the timed loop. + +Graph creation overhead is measured separately with a LatencyTracker because +it is too expensive (~500 ms each) to repeat inside pyperf worker processes. +""" + +import sys +import os +import time + +sys.path.insert(0, os.path.dirname(__file__)) + +from bench_utils import (LatencyTracker, MetricsCollector, make_temp_config_file, + pyperf_to_latency_stats) + +try: + import pydsr +except ImportError: + print("Error: pydsr module not found. Build with Python bindings enabled.") + sys.exit(1) + +try: + import pyperf +except ImportError: + print("Error: pyperf module not found. Install with: pip install pyperf") + sys.exit(1) + + +# ── Pure Python object creation (bench_func) ────────────────────────────────── + +def _create_node(): + return pydsr.Node(1, "testtype", "bench_node") + + +def _create_edge(): + return pydsr.Edge(100, 200, "testtype_e", 1) + + +def _create_attr_str(): + return pydsr.Attribute("test_string") + + +def _create_attr_int(): + return pydsr.Attribute(42) + + +def _create_attr_float(): + return pydsr.Attribute(3.14159) + + +def _create_attr_list(): + return pydsr.Attribute([1.0, 2.0, 3.0]) + + +# ── Numpy copy benchmarks (bench_time_func) ─────────────────────────────────── + +def _make_numpy_set_func(size: int): + """Return a bench_time_func that times setting a numpy array attribute.""" + def time_func(loops): + if not hasattr(time_func, "_data"): + import numpy as np + time_func._data = np.random.randint(0, 255, size, dtype=np.uint8) + time_func._attr = pydsr.Attribute([0]) + for _ in range(10): # warmup + time_func._attr.value = time_func._data + data = time_func._data + attr = time_func._attr + t1 = pyperf.perf_counter() + for _ in range(loops): 
+ attr.value = data + return pyperf.perf_counter() - t1 + time_func.__name__ = f"numpy_set_{size}" + return time_func + + +def _make_numpy_get_func(size: int): + """Return a bench_time_func that times getting a numpy array attribute.""" + def time_func(loops): + if not hasattr(time_func, "_attr"): + import numpy as np + data = np.random.randint(0, 255, size, dtype=np.uint8) + attr = pydsr.Attribute([0]) + attr.value = data + for _ in range(10): # warmup + _ = attr.value + time_func._attr = attr + attr = time_func._attr + t1 = pyperf.perf_counter() + for _ in range(loops): + _ = attr.value + return pyperf.perf_counter() - t1 + time_func.__name__ = f"numpy_get_{size}" + return time_func + + +# ── Graph creation (LatencyTracker — too expensive for pyperf workers) ───────── + +def benchmark_graph_creation(collector: MetricsCollector): + tracker = LatencyTracker(10) + config_file = make_temp_config_file() + + for i in range(10): + with tracker.measure(): + g = pydsr.DSRGraph(0, f"bench_graph_{i}", 100 + i, config_file) + del g + time.sleep(0.5) + + os.unlink(config_file) + + stats = tracker.stats() + collector.record_latency_stats("graph_creation", stats) + print(f"Graph creation: mean={stats.mean_ms:.2f} ms") + + +# ── Main ────────────────────────────────────────────────────────────────────── + +def main(): + # Inject default pyperf tuning before Runner parses sys.argv. + # Worker processes always receive --worker so they are skipped here. + if "--worker" not in sys.argv: + if "--values" not in sys.argv: + sys.argv.extend(["--values", "20"]) + if "--warmups" not in sys.argv: + sys.argv.extend(["--warmups", "5"]) + + runner = pyperf.Runner() + + # runner.args may be None before the first bench_func call in some pyperf + # versions; use sys.argv directly (worker processes always receive --worker). 
+ if "--worker" not in sys.argv: + print("=" * 60) + print("DSR Python Binding Overhead Benchmarks") + print("=" * 60) + + # Pure Python object creation + bm_node = runner.bench_func("node_creation", _create_node) + bm_edge = runner.bench_func("edge_creation", _create_edge) + bm_attr_str = runner.bench_func("attribute_string", _create_attr_str) + bm_attr_int = runner.bench_func("attribute_int", _create_attr_int) + bm_attr_float = runner.bench_func("attribute_float", _create_attr_float) + bm_attr_list = runner.bench_func("attribute_list", _create_attr_list) + + # Numpy attribute benchmarks + numpy_bms = {} + try: + import numpy # noqa: F401 — check availability before spawning workers + for size in [1000, 10000, 100000, 1000000]: + numpy_bms[f"numpy_set_{size}"] = runner.bench_time_func( + f"numpy_set_{size}", _make_numpy_set_func(size)) + numpy_bms[f"numpy_get_{size}"] = runner.bench_time_func( + f"numpy_get_{size}", _make_numpy_get_func(size)) + except ImportError: + print("Numpy not available, skipping numpy benchmarks") + + # Worker processes must not run the export code (stdout is not redirected, + # so workers printing zeros would overwrite/corrupt the master's output). 
+ if "--worker" in sys.argv: + return + + collector = MetricsCollector("binding_overhead") + + pyperf_items = [ + ("node_creation", bm_node), + ("edge_creation", bm_edge), + ("attribute_string", bm_attr_str), + ("attribute_int", bm_attr_int), + ("attribute_float", bm_attr_float), + ("attribute_list", bm_attr_list), + ] + for name, bm in pyperf_items: + stats = pyperf_to_latency_stats(bm) + collector.record_latency_stats(name, stats) + print(f"{name}: mean={stats.mean_us:.3f} µs") + + for name, bm in numpy_bms.items(): + stats = pyperf_to_latency_stats(bm) + collector.record_latency_stats(name, stats) + print(f"{name}: mean={stats.mean_us:.2f} µs") + + print("\n--- Graph Creation ---") + benchmark_graph_creation(collector) + + results_dir = os.environ.get( + "BENCH_RESULTS_DIR", + os.path.join(os.path.dirname(__file__), "..", "results"), + ) + os.makedirs(results_dir, exist_ok=True) + collector.export_json(os.path.join(results_dir, "python_binding_overhead.json")) + collector.export_csv(os.path.join(results_dir, "python_binding_overhead.csv")) + print(f"\nResults exported to {results_dir}") + + +if __name__ == "__main__": + main() diff --git a/benchmarks/python/bench_graph_operations.py b/benchmarks/python/bench_graph_operations.py new file mode 100644 index 0000000..4e3412b --- /dev/null +++ b/benchmarks/python/bench_graph_operations.py @@ -0,0 +1,281 @@ +#!/usr/bin/env python3 +""" +Benchmark: Graph operations (CRUD) performance. + +Measures insert, read, update, delete performance for nodes and edges. + +pyperf is intentionally not used here: the benchmark functions share a single +DSRGraph instance and depend on each other's side-effects (e.g. edge +benchmarks rely on nodes inserted by node benchmarks). pyperf's per-worker +subprocess model would require re-running the full setup chain in each worker, +and the shared-state dependency makes clean isolation impractical. 
+""" + +import sys +import os +import time + +sys.path.insert(0, os.path.dirname(__file__)) + +from bench_utils import LatencyTracker, MetricsCollector, make_temp_config_file, warmup + +try: + import pydsr +except ImportError: + print("Error: pydsr module not found.") + sys.exit(1) + + +def benchmark_node_operations(graph: pydsr.DSRGraph, collector: MetricsCollector): + """Benchmark node CRUD operations.""" + agent_id = graph.get_agent_id() + + # --- Insert --- + tracker = LatencyTracker(2000) + base_id = 10000 + + # Warmup + for i in range(100): + node = pydsr.Node(agent_id, "testtype", f"warmup_{i}") + result = graph.insert_node(node) + assert result is not None, f"Warmup insert_node failed at i={i}" + + # Measure + for i in range(2000): + node = pydsr.Node(agent_id, "testtype", f"bench_node_{i}") + with tracker.measure(): + result = graph.insert_node(node) + assert result is not None, f"insert_node failed at i={i}" + + stats = tracker.stats() + collector.record_latency_stats("node_insert", stats) + print(f"Node insert: mean={stats.mean_us:.2f} us, p99={stats.p99_us:.2f} us") + + # --- Read by ID --- + tracker = LatencyTracker(3000) + nodes = graph.get_nodes() + node_ids = [n.id for n in nodes[:100]] + # Warmup: touch all IDs to bring them into cache + for node_id in node_ids: + node = graph.get_node(node_id) + assert node is not None, f"Warmup get_node({node_id}) returned None" + + for i in range(3000): + node_id = node_ids[i % len(node_ids)] + with tracker.measure(): + node = graph.get_node(node_id) + assert node is not None, f"get_node({node_id}) returned None" + + stats = tracker.stats() + collector.record_latency_stats("node_read_by_id", stats) + print(f"Node read (by id): mean={stats.mean_us:.2f} us") + + # --- Read by name --- + tracker = LatencyTracker(3000) + node_names = [f"bench_node_{i}" for i in range(100)] + for name in node_names: + node = graph.get_node(name) + assert node is not None, f"Warmup get_node('{name}') returned None" + + for i in 
range(3000): + name = node_names[i % len(node_names)] + with tracker.measure(): + node = graph.get_node(name) + assert node is not None, f"get_node('{name}') returned None" + + stats = tracker.stats() + collector.record_latency_stats("node_read_by_name", stats) + print(f"Node read (by name): mean={stats.mean_us:.2f} us") + + # --- Update --- + tracker = LatencyTracker(2000) + test_node = graph.get_node("bench_node_0") + assert test_node is not None, "bench_node_0 not found for update benchmark" + + for i in range(2000): + test_node.attrs["level"] = pydsr.Attribute(i) + with tracker.measure(): + result = graph.update_node(test_node) + assert result, f"update_node failed at i={i}" + + stats = tracker.stats() + collector.record_latency_stats("node_update", stats) + print(f"Node update: mean={stats.mean_us:.2f} us") + + # --- Delete --- + tracker = LatencyTracker(500) + delete_nodes = [f"bench_node_{i}" for i in range(1500, 2000)] + + for name in delete_nodes: + with tracker.measure(): + result = graph.delete_node(name) + assert result, f"delete_node('{name}') failed" + + stats = tracker.stats() + collector.record_latency_stats("node_delete", stats) + print(f"Node delete: mean={stats.mean_us:.2f} us") + + +def benchmark_edge_operations(graph: pydsr.DSRGraph, collector: MetricsCollector): + """Benchmark edge CRUD operations.""" + agent_id = graph.get_agent_id() + + # Get root node + root = graph.get_node("root") + if not root: + print("No root node found") + return + + # Create target nodes for edges + for i in range(200): + node = pydsr.Node(agent_id, "testtype", f"edge_target_{i}") + result = graph.insert_node(node) + assert result is not None, f"insert_node failed for edge_target_{i}" + + time.sleep(0.1) + + # --- Insert edge --- + tracker = LatencyTracker(200) + + for i in range(200): + target = graph.get_node(f"edge_target_{i}") + assert target is not None, f"edge_target_{i} not found for edge insert" + edge = pydsr.Edge(target.id, root.id, "testtype_e", agent_id) 
+ with tracker.measure(): + result = graph.insert_or_assign_edge(edge) + assert result, f"insert_or_assign_edge failed for edge_target_{i}" + + stats = tracker.stats() + collector.record_latency_stats("edge_insert", stats) + print(f"Edge insert: mean={stats.mean_us:.2f} us, p99={stats.p99_us:.2f} us") + + # --- Read edge --- + tracker = LatencyTracker(500) + + for i in range(500): + target = graph.get_node(f"edge_target_{i % 200}") + assert target is not None, f"edge_target_{i % 200} not found for edge read" + with tracker.measure(): + edge = graph.get_edge(root.id, target.id, "testtype_e") + assert edge is not None, f"get_edge returned None for edge_target_{i % 200}" + + stats = tracker.stats() + collector.record_latency_stats("edge_read", stats) + print(f"Edge read: mean={stats.mean_us:.2f} us") + + # --- Delete edge --- + tracker = LatencyTracker(100) + + for i in range(100, 200): + target = graph.get_node(f"edge_target_{i}") + assert target is not None, f"edge_target_{i} not found for edge delete" + with tracker.measure(): + result = graph.delete_edge(root.id, target.id, "testtype_e") + assert result, f"delete_edge failed for edge_target_{i}" + + stats = tracker.stats() + collector.record_latency_stats("edge_delete", stats) + print(f"Edge delete: mean={stats.mean_us:.2f} us") + + +def benchmark_query_operations(graph: pydsr.DSRGraph, collector: MetricsCollector): + """Benchmark query operations.""" + + # --- get_nodes --- + tracker = LatencyTracker(500) + + for _ in range(500): + with tracker.measure(): + graph.get_nodes() + + stats = tracker.stats() + collector.record_latency_stats("get_all_nodes", stats) + print(f"get_nodes(): mean={stats.mean_us:.2f} us") + + # --- get_nodes_by_type --- + tracker = LatencyTracker(500) + + for _ in range(500): + with tracker.measure(): + graph.get_nodes_by_type("testtype") + + stats = tracker.stats() + collector.record_latency_stats("get_nodes_by_type", stats) + print(f"get_nodes_by_type(): mean={stats.mean_us:.2f} us") + + # 
--- get_edges (from node) --- + root = graph.get_node("root") + if root: + tracker = LatencyTracker(500) + + for _ in range(500): + with tracker.measure(): + graph.get_edges(root.id) + + stats = tracker.stats() + collector.record_latency_stats("get_edges_from_node", stats) + print(f"get_edges(id): mean={stats.mean_us:.2f} us") + + # --- get_edges_to_id --- + if root: + tracker = LatencyTracker(500) + + for _ in range(500): + with tracker.measure(): + graph.get_edges_to_id(root.id) + + stats = tracker.stats() + collector.record_latency_stats("get_edges_to_id", stats) + print(f"get_edges_to_id(id): mean={stats.mean_us:.2f} us") + + # --- get_edges_by_type --- + tracker = LatencyTracker(500) + + for _ in range(500): + with tracker.measure(): + graph.get_edges_by_type("testtype_e") + + stats = tracker.stats() + collector.record_latency_stats("get_edges_by_type", stats) + print(f"get_edges_by_type(): mean={stats.mean_us:.2f} us") + + +def main(): + print("=" * 60) + print("DSR Python Graph Operations Benchmarks") + print("=" * 60) + print() + + collector = MetricsCollector("graph_operations") + + # Create graph + config_file = make_temp_config_file() + graph = pydsr.DSRGraph(0, "bench_graph_ops", 42, config_file) + time.sleep(0.5) + + print("--- Node Operations ---") + benchmark_node_operations(graph, collector) + + print("\n--- Edge Operations ---") + benchmark_edge_operations(graph, collector) + + print("\n--- Query Operations ---") + benchmark_query_operations(graph, collector) + + # Cleanup + del graph + os.unlink(config_file) + + # Export + results_dir = os.environ.get( + "BENCH_RESULTS_DIR", + os.path.join(os.path.dirname(__file__), "..", "results"), + ) + os.makedirs(results_dir, exist_ok=True) + collector.export_json(os.path.join(results_dir, "python_graph_operations.json")) + collector.export_csv(os.path.join(results_dir, "python_graph_operations.csv")) + print(f"\nResults exported to {results_dir}") + + +if __name__ == "__main__": + main() diff --git 
a/benchmarks/python/bench_signals.py b/benchmarks/python/bench_signals.py new file mode 100644 index 0000000..0249787 --- /dev/null +++ b/benchmarks/python/bench_signals.py @@ -0,0 +1,191 @@ +#!/usr/bin/env python3 +""" +Benchmark: Signal/callback performance. + +Measures signal connection, emission, and callback invocation overhead. + +pyperf is intentionally not used here: the signal benchmarks use async +patterns (threading.Event, callback_received.wait()) that are incompatible +with pyperf's tight synchronous loop model. The queued-signal benchmark +also relies on Qt's event loop processing between operations. +""" + +import sys +import os +import time +import threading + +sys.path.insert(0, os.path.dirname(__file__)) + +from bench_utils import LatencyTracker, MetricsCollector, make_temp_config_file + +try: + import pydsr +except ImportError: + print("Error: pydsr module not found.") + sys.exit(1) + + +def benchmark_signal_callback_latency(graph: pydsr.DSRGraph, collector: MetricsCollector): + """Measure signal callback invocation latency.""" + agent_id = graph.get_agent_id() + tracker = LatencyTracker(100) + + callback_time = [0] + callback_received = threading.Event() + expected_id = [0] + + def on_node_update(node_id: int, node_type: str): + if node_id == expected_id[0]: + callback_time[0] = time.perf_counter_ns() + callback_received.set() + + # Connect signal + pydsr.signals.connect(graph, pydsr.signals.UPDATE_NODE, on_node_update) + + # Warmup + for i in range(20): + node = pydsr.Node(agent_id, "testtype", f"warmup_sig_{i}") + result = graph.insert_node(node) + assert result is not None, f"Warmup insert_node failed at i={i}" + time.sleep(0.05) + + # Measure + for i in range(100): + callback_received.clear() + node = pydsr.Node(agent_id, "testtype", f"signal_node_{i}") + + send_time = time.perf_counter_ns() + expected_id[0] = graph.insert_node(node) + assert expected_id[0] is not None, f"insert_node failed at signal measurement i={i}" + + # Wait for 
callback + if callback_received.wait(timeout=2.0): + latency = callback_time[0] - send_time + tracker.record(latency) + + stats = tracker.stats() + collector.record_latency_stats("signal_callback_latency", stats) + print(f"Signal callback latency: mean={stats.mean_us:.2f} us, p99={stats.p99_us:.2f} us") + print(f" (received {tracker.count}/100 callbacks)") + + +def benchmark_signal_throughput(graph: pydsr.DSRGraph, collector: MetricsCollector): + """Measure how many signals can be processed per second. + + Uses a fixed insert count instead of a time-based loop to keep the + callback backlog bounded. An unbounded loop (e.g. 3 s × 40K inserts/sec) + creates a queue that outlasts the benchmark and blocks graph teardown. + """ + agent_id = graph.get_agent_id() + + callback_count = [0] + + def on_node_update(node_id: int, node_type: str): + callback_count[0] += 1 + + pydsr.signals.connect(graph, pydsr.signals.UPDATE_NODE, on_node_update) + + INSERT_COUNT = 3000 + print("Generating signals...") + start = time.perf_counter() + + for i in range(INSERT_COUNT): + node = pydsr.Node(agent_id, "testtype", f"sig_tp_{i}") + result = graph.insert_node(node) + assert result is not None, f"insert_node failed at signal throughput i={i}" + + # Wait for callbacks to drain, but give up after a timeout so teardown + # isn't blocked indefinitely if the callback rate is very slow. 
+ drain_deadline = time.perf_counter() + 5.0 + prev = -1 + while time.perf_counter() < drain_deadline: + time.sleep(0.1) + cur = callback_count[0] + if cur == prev: # no new callbacks — queue is drained + break + prev = cur + + duration = time.perf_counter() - start + callbacks_per_sec = callback_count[0] / duration + + collector.record_throughput("signal_callbacks", callback_count[0], duration) + print(f"Signal throughput: {callbacks_per_sec:.0f} callbacks/sec") + print(f" ({callback_count[0]} callbacks for {INSERT_COUNT} inserts)") + + +def benchmark_multiple_handlers(graph: pydsr.DSRGraph, collector: MetricsCollector): + """Measure impact of multiple signal handlers.""" + agent_id = graph.get_agent_id() + + for num_handlers in [1, 5, 10]: + callback_counts = [0] * num_handlers + + def make_handler(idx): + def handler(node_id: int, node_type: str): + callback_counts[idx] += 1 + return handler + + # Connect multiple handlers + handlers = [make_handler(i) for i in range(num_handlers)] + for h in handlers: + pydsr.signals.connect(graph, pydsr.signals.UPDATE_NODE, h) + + # Generate updates + insert_count = 100 + start = time.perf_counter() + + for i in range(insert_count): + node = pydsr.Node(agent_id, "testtype", f"mh_{num_handlers}_{i}") + result = graph.insert_node(node) + assert result is not None, f"insert_node failed for mh_{num_handlers}_{i}" + + time.sleep(0.3) # Let callbacks process + duration = time.perf_counter() - start + + total_callbacks = sum(callback_counts) + collector.record("callbacks_with_handlers", "throughput", + total_callbacks / duration, + "callbacks/sec", + tags={"num_handlers": str(num_handlers)}) + + print(f"{num_handlers} handlers: {total_callbacks} callbacks in {duration:.2f}s") + + +def main(): + print("=" * 60) + print("DSR Python Signal Benchmarks") + print("=" * 60) + print() + + collector = MetricsCollector("signals") + + config_file = make_temp_config_file() + graph = pydsr.DSRGraph(0, "bench_signals", 42, config_file) + 
time.sleep(0.5) + + print("--- Signal Callback Latency ---") + benchmark_signal_callback_latency(graph, collector) + + print("\n--- Signal Throughput ---") + benchmark_signal_throughput(graph, collector) + + print("\n--- Multiple Handlers Impact ---") + benchmark_multiple_handlers(graph, collector) + + del graph + os.unlink(config_file) + + # Export + results_dir = os.environ.get( + "BENCH_RESULTS_DIR", + os.path.join(os.path.dirname(__file__), "..", "results"), + ) + os.makedirs(results_dir, exist_ok=True) + collector.export_json(os.path.join(results_dir, "python_signals.json")) + collector.export_csv(os.path.join(results_dir, "python_signals.csv")) + print(f"\nResults exported to {results_dir}") + + +if __name__ == "__main__": + main() diff --git a/benchmarks/python/bench_throughput.py b/benchmarks/python/bench_throughput.py new file mode 100644 index 0000000..3157842 --- /dev/null +++ b/benchmarks/python/bench_throughput.py @@ -0,0 +1,243 @@ +#!/usr/bin/env python3 +""" +Benchmark: Single-agent throughput + latency for node/edge operations. + +Uses pyperf.Runner with bench_time_func so that pyperf calibrates the +iteration count and runs multiple worker processes for noise reduction. +Each bench_time_func performs lazy setup (graph creation) outside the timed +loop on the first call; subsequent calls in the same worker reuse the graph. + +The master process collects all Benchmark objects, converts them to +LatencyStats, and exports to python_throughput.json. +""" + +import sys +import os +import time + +sys.path.insert(0, os.path.dirname(__file__)) + +from bench_utils import MetricsCollector, make_temp_config_file, pyperf_to_latency_stats + +try: + import pydsr +except ImportError: + print("Error: pydsr module not found.") + sys.exit(1) + +try: + import pyperf +except ImportError: + print("Error: pyperf module not found. 
Install with: pip install pyperf") + sys.exit(1) + + +# ── Lazy graph initialisation (runs once per worker process) ────────────────── + +def _init_graph(tag: str, agent_id_hint: int = 43): + """Create a DSRGraph and return (graph, config_path, agent_id).""" + config = make_temp_config_file() + graph = pydsr.DSRGraph(0, f"bench_throughput_{tag}", agent_id_hint, config) + time.sleep(0.2) + return graph, config, graph.get_agent_id() + + +# ── bench_time_func implementations ────────────────────────────────────────── +# Each function signature is (loops,) -> float (elapsed seconds). +# pyperf calls time_func(loops) — use pyperf.perf_counter() directly. +# State is stored as function attributes so setup only happens once per worker. + +def _bench_node_insert(loops): + if not hasattr(_bench_node_insert, "_graph"): + graph, config, agent_id = _init_graph("insert") + _bench_node_insert._graph = graph + _bench_node_insert._config = config + _bench_node_insert._agent_id = agent_id + _bench_node_insert._counter = 0 + + graph = _bench_node_insert._graph + agent_id = _bench_node_insert._agent_id + + t1 = pyperf.perf_counter() + for _ in range(loops): + node = pydsr.Node(agent_id, "testtype", f"thr_ins_{_bench_node_insert._counter}") + _bench_node_insert._counter += 1 + graph.insert_node(node) + return pyperf.perf_counter() - t1 + + +def _bench_node_read(loops): + if not hasattr(_bench_node_read, "_graph"): + graph, config, agent_id = _init_graph("read") + node_ids = [] + for i in range(1000): + node = pydsr.Node(agent_id, "testtype", f"thr_rd_{i}") + nid = graph.insert_node(node) + assert nid is not None + node_ids.append(nid) + for nid in node_ids: + graph.get_node(nid) # cache warmup + _bench_node_read._graph = graph + _bench_node_read._config = config + _bench_node_read._node_ids = node_ids + _bench_node_read._idx = 0 + + graph = _bench_node_read._graph + node_ids = _bench_node_read._node_ids + idx = _bench_node_read._idx + + t1 = pyperf.perf_counter() + for _ in 
range(loops): + graph.get_node(node_ids[idx % len(node_ids)]) + idx += 1 + _bench_node_read._idx = idx + return pyperf.perf_counter() - t1 + + +def _bench_node_update(loops): + if not hasattr(_bench_node_update, "_graph"): + graph, config, agent_id = _init_graph("update") + node = pydsr.Node(agent_id, "testtype", "thr_upd_target") + nid = graph.insert_node(node) + assert nid is not None + target = graph.get_node("thr_upd_target") + assert target is not None + _bench_node_update._graph = graph + _bench_node_update._config = config + _bench_node_update._target = target + _bench_node_update._counter = 0 + + graph = _bench_node_update._graph + target = _bench_node_update._target + + t1 = pyperf.perf_counter() + for _ in range(loops): + target.attrs["level"] = pydsr.Attribute(_bench_node_update._counter % 1000) + _bench_node_update._counter += 1 + graph.update_node(target) + return pyperf.perf_counter() - t1 + + +def _bench_edge_insert(loops): + if not hasattr(_bench_edge_insert, "_graph"): + graph, config, agent_id = _init_graph("edge_insert", 44) + root = graph.get_node("root") + assert root is not None, "no root node" + targets = [] + for i in range(1000): + node = pydsr.Node(agent_id, "testtype", f"thr_etgt_{i}") + ins = graph.insert_node(node) + assert ins is not None + n = graph.get_node(f"thr_etgt_{i}") + assert n is not None + targets.append(n.id) + _bench_edge_insert._graph = graph + _bench_edge_insert._config = config + _bench_edge_insert._agent_id = agent_id + _bench_edge_insert._root_id = root.id + _bench_edge_insert._targets = targets + _bench_edge_insert._idx = 0 + + graph = _bench_edge_insert._graph + agent_id = _bench_edge_insert._agent_id + root_id = _bench_edge_insert._root_id + targets = _bench_edge_insert._targets + idx = _bench_edge_insert._idx + + t1 = pyperf.perf_counter() + for _ in range(loops): + tid = targets[idx % len(targets)] + edge = pydsr.Edge(tid, root_id, "testtype_e", agent_id) + graph.insert_or_assign_edge(edge) + idx += 1 + 
_bench_edge_insert._idx = idx + return pyperf.perf_counter() - t1 + + +def _bench_edge_read(loops): + if not hasattr(_bench_edge_read, "_graph"): + graph, config, agent_id = _init_graph("edge_read", 45) + root = graph.get_node("root") + assert root is not None, "no root node" + targets = [] + for i in range(1000): + node = pydsr.Node(agent_id, "testtype", f"thr_erd_{i}") + ins = graph.insert_node(node) + assert ins is not None + n = graph.get_node(f"thr_erd_{i}") + assert n is not None + targets.append(n.id) + edge = pydsr.Edge(n.id, root.id, "testtype_e", agent_id) + graph.insert_or_assign_edge(edge) + for tid in targets: + graph.get_edge(root.id, tid, "testtype_e") # cache warmup + _bench_edge_read._graph = graph + _bench_edge_read._config = config + _bench_edge_read._root_id = root.id + _bench_edge_read._targets = targets + _bench_edge_read._idx = 0 + + graph = _bench_edge_read._graph + root_id = _bench_edge_read._root_id + targets = _bench_edge_read._targets + idx = _bench_edge_read._idx + + t1 = pyperf.perf_counter() + for _ in range(loops): + graph.get_edge(root_id, targets[idx % len(targets)], "testtype_e") + idx += 1 + _bench_edge_read._idx = idx + return pyperf.perf_counter() - t1 + + +# ── Main ────────────────────────────────────────────────────────────────────── + +def main(): + # Inject default pyperf tuning before Runner parses sys.argv. + # Worker processes always receive --worker so they are skipped here. 
+ if "--worker" not in sys.argv: + if "--values" not in sys.argv: + sys.argv.extend(["--values", "20"]) + if "--warmups" not in sys.argv: + sys.argv.extend(["--warmups", "5"]) + + runner = pyperf.Runner() + + bm_node_insert = runner.bench_time_func("node_insert", _bench_node_insert) + bm_node_read = runner.bench_time_func("node_read", _bench_node_read) + bm_node_update = runner.bench_time_func("node_update", _bench_node_update) + bm_edge_insert = runner.bench_time_func("edge_insert", _bench_edge_insert) + bm_edge_read = runner.bench_time_func("edge_read", _bench_edge_read) + + # Worker processes must not run the export code (stdout is not redirected, + # so workers printing zeros would overwrite/corrupt the master's output). + if "--worker" in sys.argv: + return + collector = MetricsCollector("python_throughput") + + benchmarks = [ + ("node_insert", bm_node_insert), + ("node_read", bm_node_read), + ("node_update", bm_node_update), + ("edge_insert", bm_edge_insert), + ("edge_read", bm_edge_read), + ] + for name, bm in benchmarks: + stats = pyperf_to_latency_stats(bm) + collector.record_latency_stats(name, stats) + if stats.mean_ns > 0: + collector.record_throughput(name, 1, stats.mean_ns / 1e9) + print(f"{name}: mean={stats.mean_us:.2f} µs stddev={stats.stddev_ns/1000:.2f} µs") + + results_dir = os.environ.get( + "BENCH_RESULTS_DIR", + os.path.join(os.path.dirname(__file__), "..", "results"), + ) + os.makedirs(results_dir, exist_ok=True) + collector.export_json(os.path.join(results_dir, "python_throughput.json")) + collector.export_csv(os.path.join(results_dir, "python_throughput.csv")) + print(f"\nResults exported to {results_dir}") + + +if __name__ == "__main__": + main() diff --git a/benchmarks/python/bench_utils.py b/benchmarks/python/bench_utils.py new file mode 100644 index 0000000..c2f60f4 --- /dev/null +++ b/benchmarks/python/bench_utils.py @@ -0,0 +1,295 @@ +""" +Utility functions for DSR Python benchmarks. 
+""" + +import time +import statistics +import json +import csv +import os +from dataclasses import dataclass, field +from typing import Callable, List, Dict, Any, Optional +from contextlib import contextmanager + + +@dataclass +class LatencyStats: + """Statistics from latency measurements.""" + count: int = 0 + mean_ns: float = 0.0 + stddev_ns: float = 0.0 + min_ns: float = 0.0 + max_ns: float = 0.0 + p50_ns: float = 0.0 + p90_ns: float = 0.0 + p95_ns: float = 0.0 + p99_ns: float = 0.0 + + @property + def mean_us(self) -> float: + return self.mean_ns / 1000.0 + + @property + def mean_ms(self) -> float: + return self.mean_ns / 1_000_000.0 + + @property + def p99_us(self) -> float: + return self.p99_ns / 1000.0 + + @property + def p99_ms(self) -> float: + return self.p99_ns / 1_000_000.0 + + +class LatencyTracker: + """Collects latency samples and computes statistics.""" + + def __init__(self, expected_samples: int = 100): + self.samples: List[float] = [] + + def record(self, latency_ns: float): + """Record a latency sample in nanoseconds.""" + self.samples.append(latency_ns) + + def record_seconds(self, latency_sec: float): + """Record a latency sample in seconds.""" + self.samples.append(latency_sec * 1_000_000_000) + + @contextmanager + def measure(self): + """Context manager for measuring latency.""" + start = time.perf_counter_ns() + yield + self.samples.append(time.perf_counter_ns() - start) + + def clear(self): + self.samples.clear() + + @property + def count(self) -> int: + return len(self.samples) + + def stats(self) -> LatencyStats: + """Compute and return statistics.""" + if not self.samples: + return LatencyStats() + + sorted_samples = sorted(self.samples) + n = len(sorted_samples) + + def percentile(p: float) -> float: + idx = p * (n - 1) + lower = int(idx) + upper = min(lower + 1, n - 1) + frac = idx - lower + return sorted_samples[lower] * (1 - frac) + sorted_samples[upper] * frac + + return LatencyStats( + count=n, + 
mean_ns=statistics.mean(sorted_samples), + stddev_ns=statistics.stdev(sorted_samples) if n > 1 else 0.0, + min_ns=sorted_samples[0], + max_ns=sorted_samples[-1], + p50_ns=percentile(0.50), + p90_ns=percentile(0.90), + p95_ns=percentile(0.95), + p99_ns=percentile(0.99), + ) + + +@dataclass +class Metric: + """Individual metric measurement.""" + name: str + category: str + value: float + unit: str = "" + additional: Dict[str, float] = field(default_factory=dict) + tags: Dict[str, str] = field(default_factory=dict) + + +class MetricsCollector: + """Collects benchmark metrics.""" + + def __init__(self, benchmark_name: str = ""): + self.benchmark_name = benchmark_name + self.metrics: List[Metric] = [] + self.metadata: Dict[str, str] = {} + self.start_time = time.time() + + def record(self, name: str, category: str, value: float, + unit: str = "", tags: Optional[Dict[str, str]] = None): + self.metrics.append(Metric( + name=name, + category=category, + value=value, + unit=unit, + tags=tags or {}, + )) + + def record_latency_stats(self, name: str, stats: LatencyStats, + tags: Optional[Dict[str, str]] = None): + m = Metric( + name=name, + category="latency", + value=stats.mean_ns, + unit="ns", + tags=tags or {}, + additional={ + "count": stats.count, + "mean_ns": stats.mean_ns, + "stddev_ns": stats.stddev_ns, + "min_ns": stats.min_ns, + "max_ns": stats.max_ns, + "p50_ns": stats.p50_ns, + "p90_ns": stats.p90_ns, + "p95_ns": stats.p95_ns, + "p99_ns": stats.p99_ns, + } + ) + self.metrics.append(m) + + def record_scalability(self, name: str, scale_factor: int, value: float, + unit: str = "", tags: Optional[Dict[str, str]] = None): + m = Metric(name=name, category="scalability", value=value, unit=unit, + tags=tags or {}, additional={"scale_factor": float(scale_factor)}) + self.metrics.append(m) + + def record_throughput(self, name: str, operations: int, + duration_sec: float, tags: Optional[Dict[str, str]] = None): + ops_per_sec = operations / duration_sec if duration_sec > 0 
else 0 + m = Metric( + name=name, + category="throughput", + value=ops_per_sec, + unit="ops/sec", + tags=tags or {}, + additional={ + "total_operations": operations, + "duration_sec": duration_sec, + } + ) + self.metrics.append(m) + + def export_json(self, filepath: str): + """Export metrics to JSON.""" + os.makedirs(os.path.dirname(filepath) or ".", exist_ok=True) + result = { + "benchmark_name": self.benchmark_name, + "timestamp": time.strftime("%Y-%m-%dT%H:%M:%S"), + "total_duration_sec": time.time() - self.start_time, + "metadata": self.metadata, + "metrics": [ + { + "name": m.name, + "category": m.category, + "value": m.value, + "unit": m.unit, + "additional": m.additional, + "tags": m.tags, + } + for m in self.metrics + ] + } + with open(filepath, "w") as f: + json.dump(result, f, indent=2) + + def export_csv(self, filepath: str): + """Export metrics to CSV.""" + os.makedirs(os.path.dirname(filepath) or ".", exist_ok=True) + with open(filepath, "w", newline="") as f: + writer = csv.writer(f) + writer.writerow([ + "benchmark_name", "metric_name", "category", "value", "unit", + "mean_ns", "p50_ns", "p95_ns", "p99_ns", "count" + ]) + for m in self.metrics: + writer.writerow([ + self.benchmark_name, m.name, m.category, m.value, m.unit, + m.additional.get("mean_ns", ""), + m.additional.get("p50_ns", ""), + m.additional.get("p95_ns", ""), + m.additional.get("p99_ns", ""), + m.additional.get("count", ""), + ]) + + +def make_temp_config_file() -> str: + """Create a minimal DSR config file.""" + import tempfile + config = { + "DSRModel": { + "symbols": { + "100": { + "attribute": { + "level": {"type": 1, "value": 0} + }, + "id": "100", + "links": [], + "name": "root", + "type": "root" + } + } + } + } + fd, path = tempfile.mkstemp(suffix=".json", prefix="dsr_bench_") + with os.fdopen(fd, "w") as f: + json.dump(config, f) + return path + + +def warmup(func: Callable, iterations: int = 10): + """Run warmup iterations.""" + for _ in range(iterations): + func() + + +# ── 
pyperf integration ──────────────────────────────────────────────────────── + +try: + import pyperf as _pyperf # type: ignore + HAS_PYPERF = True +except ImportError: + _pyperf = None # type: ignore + HAS_PYPERF = False + + +def pyperf_to_latency_stats(bm) -> LatencyStats: + """Convert a pyperf Benchmark to LatencyStats. + + pyperf 'values' are mean elapsed time per operation (in seconds) for each + run. With the default 3 processes × 5 values we get ~15 data points. + Note: these are per-run averages, not individual-op samples, so percentiles + reflect variability across runs rather than per-op tail latency. + """ + if bm is None: + return LatencyStats() + try: + values_ns = [v * 1e9 for v in bm.get_values()] + except Exception: + return LatencyStats() + if not values_ns: + return LatencyStats() + + sorted_v = sorted(values_ns) + n = len(sorted_v) + + def pct(p: float) -> float: + idx = p * (n - 1) + lo = int(idx) + hi = min(lo + 1, n - 1) + f = idx - lo + return sorted_v[lo] * (1 - f) + sorted_v[hi] * f + + return LatencyStats( + count=n, + mean_ns=statistics.mean(sorted_v), + stddev_ns=statistics.stdev(sorted_v) if n > 1 else 0.0, + min_ns=sorted_v[0], + max_ns=sorted_v[-1], + p50_ns=pct(0.50), + p90_ns=pct(0.90), + p95_ns=pct(0.95), + p99_ns=pct(0.99), + ) diff --git a/benchmarks/python/run_all.py b/benchmarks/python/run_all.py new file mode 100644 index 0000000..9e362fe --- /dev/null +++ b/benchmarks/python/run_all.py @@ -0,0 +1,250 @@ +#!/usr/bin/env python3 +""" +Run all DSR Python benchmarks and record the results as a named run. 
+ +Usage: + python run_all.py # auto-timestamped run + python run_all.py --label "after-fix" # labelled run + python run_all.py --list # list previous runs + python run_all.py --delete # remove a run from the index +""" + +import sys +import os +import subprocess +import time +import json +import argparse +import platform +from datetime import datetime + +ALL_BENCHMARKS = [ + "bench_binding_overhead.py", + "bench_baseline_graph.py", + "bench_graph_operations.py", + "bench_throughput.py", + "bench_signals.py", +] + +BASELINE_BENCHMARKS = [ + "bench_baseline_graph.py", +] + +SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__)) +DEFAULT_RESULTS_ROOT = os.path.abspath(os.path.join(SCRIPT_DIR, "..", "results")) +RUNS_INDEX = os.path.join(DEFAULT_RESULTS_ROOT, "runs.json") + + +# ── Index helpers ───────────────────────────────────────────────────────────── + +def load_runs() -> list: + if not os.path.isfile(RUNS_INDEX): + return [] + try: + with open(RUNS_INDEX) as f: + return json.load(f) + except PermissionError: + print(f"WARNING: cannot read benchmark index: {RUNS_INDEX} (permission denied)") + return [] + + +def save_runs(runs: list): + os.makedirs(DEFAULT_RESULTS_ROOT, exist_ok=True) + try: + with open(RUNS_INDEX, "w") as f: + json.dump(runs, f, indent=2) + except PermissionError: + print(f"WARNING: cannot update benchmark index: {RUNS_INDEX} (permission denied)") + + +def register_run(run_info: dict): + runs = load_runs() + runs = [r for r in runs if r["id"] != run_info["id"]] + runs.append(run_info) + runs.sort(key=lambda r: r["id"]) + save_runs(runs) + + +# ── Commands ────────────────────────────────────────────────────────────────── + +def cmd_list(): + runs = load_runs() + if not runs: + print("No runs recorded yet.") + return + print(f"{'ID':<22} {'Label':<20} {'Pass/Total':>10} {'Duration':>9}") + print("-" * 70) + for r in runs: + ratio = f"{r.get('benchmarks_passed', 0)}/{r.get('benchmarks_run', 0)}" + dur = f"{r.get('total_duration_sec', 0):.1f}s" + 
label = r.get("label") or "-" + print(f"{r['id']:<22} {label:<20} {ratio:>10} {dur:>9}") + + +def cmd_delete(run_id: str): + runs = load_runs() + before = len(runs) + runs = [r for r in runs if r["id"] != run_id] + if len(runs) == before: + print(f"Run '{run_id}' not found in index.") + return + save_runs(runs) + print(f"Removed run '{run_id}' from index (result files kept on disk).") + + +def cmd_run_direct(benchmarks) -> int: + """Run benchmarks using BENCH_RESULTS_DIR already set in the environment. + + Called by the top-level run_benchmarks.py wrapper so it can manage the + run directory and index registration itself. + """ + results_dir = os.environ.get("BENCH_RESULTS_DIR", ".") + print("=" * 70) + print(" DSR Python Benchmark Suite") + print(f" Output : {results_dir}") + print("=" * 70) + print() + + env = dict(os.environ) + results = [] + suite_start = time.time() + + for bench in benchmarks: + bench_path = os.path.join(SCRIPT_DIR, bench) + print(f"\n{'=' * 70}") + print(f"Running: {bench}") + print("=" * 70) + try: + proc = subprocess.run([sys.executable, bench_path], cwd=SCRIPT_DIR, env=env, timeout=300) + results.append((bench, proc.returncode == 0)) + except subprocess.TimeoutExpired: + print(f"TIMEOUT: {bench}") + results.append((bench, False)) + except Exception as e: + print(f"ERROR: {bench}: {e}") + results.append((bench, False)) + + total_duration = time.time() - suite_start + passed = sum(1 for _, ok in results if ok) + print(f"\n {passed}/{len(results)} benchmarks completed in {total_duration:.1f}s") + return 0 if all(ok for _, ok in results) else 1 + + +def cmd_run(label, results_root, benchmarks): + ts = datetime.now() + run_id = ts.strftime("%Y%m%dT%H%M%S") + dir_name = run_id if not label else f"{run_id}_{label.replace(' ', '-')}" + run_dir = os.path.join(results_root, dir_name) + os.makedirs(run_dir, exist_ok=True) + + print("=" * 70) + print(f" DSR Python Benchmark Suite") + print(f" Run ID : {run_id}") + if label: + print(f" Label : 
{label}") + print(f" Output : {run_dir}") + print("=" * 70) + print() + + env = {**os.environ, "BENCH_RESULTS_DIR": run_dir} + + results = [] + suite_start = time.time() + + for bench in benchmarks: + bench_path = os.path.join(SCRIPT_DIR, bench) + print(f"\n{'=' * 70}") + print(f"Running: {bench}") + print("=" * 70) + + try: + proc = subprocess.run( + [sys.executable, bench_path], + cwd=SCRIPT_DIR, + env=env, + timeout=300, + ) + results.append((bench, proc.returncode == 0)) + except subprocess.TimeoutExpired: + print(f"TIMEOUT: {bench}") + results.append((bench, False)) + except Exception as e: + print(f"ERROR: {bench}: {e}") + results.append((bench, False)) + + total_duration = time.time() - suite_start + + try: + git_hash = subprocess.check_output( + ["git", "rev-parse", "--short", "HEAD"], + cwd=SCRIPT_DIR, stderr=subprocess.DEVNULL, + ).decode().strip() + except Exception: + git_hash = "" + + run_info = { + "id": run_id, + "label": label or "", + "dir": dir_name, + "timestamp": ts.isoformat(), + "total_duration_sec": round(total_duration, 2), + "benchmarks_run": len(results), + "benchmarks_passed": sum(1 for _, ok in results if ok), + "git_hash": git_hash, + "platform": platform.platform(), + "python": sys.version.split()[0], + } + + with open(os.path.join(run_dir, "run_info.json"), "w") as f: + json.dump(run_info, f, indent=2) + + register_run(run_info) + + print("\n" + "=" * 70) + print(" Summary") + print("=" * 70) + for bench, ok in results: + print(f" [{'PASS' if ok else 'FAIL'}] {bench}") + + passed = sum(1 for _, ok in results if ok) + print(f"\n {passed}/{len(results)} benchmarks completed in {total_duration:.1f}s") + print(f" Run ID : {run_id}") + print(f" Results : {run_dir}") + print(f" Index : {RUNS_INDEX}") + + return 0 if all(ok for _, ok in results) else 1 + + +# ── Entry point ─────────────────────────────────────────────────────────────── + +def main(): + parser = argparse.ArgumentParser(description="Run DSR benchmarks and track results") + 
parser.add_argument("--label", "-l", help="Human-readable label for this run") + parser.add_argument("--results-root", default=DEFAULT_RESULTS_ROOT, + help="Root directory for all run results") + parser.add_argument("--list", action="store_true", help="List all recorded runs") + parser.add_argument("--delete", metavar="RUN_ID", help="Remove a run from the index") + parser.add_argument("--direct", action="store_true", + help="Run benchmarks using BENCH_RESULTS_DIR from env, skip index registration") + parser.add_argument("--baseline", action="store_true", + help="Run only the curated low-noise Python baseline set") + args = parser.parse_args() + + benchmarks = BASELINE_BENCHMARKS if args.baseline else ALL_BENCHMARKS + + if args.list: + cmd_list() + return 0 + + if args.delete: + cmd_delete(args.delete) + return 0 + + if args.direct: + return cmd_run_direct(benchmarks) + + return cmd_run(args.label, args.results_root, benchmarks) + + +if __name__ == "__main__": + sys.exit(main()) diff --git a/benchmarks/report.py b/benchmarks/report.py new file mode 100644 index 0000000..fda994f --- /dev/null +++ b/benchmarks/report.py @@ -0,0 +1,1514 @@ +#!/usr/bin/env python3 +""" +Generate a visual HTML report from benchmark results. 
+ +Single run: + python report.py # latest run + python report.py --run 20260314T153000 + +Compare two runs: + python report.py --run 20260314T153000 --baseline 20260313T090000 + +List available runs: + python report.py --list +""" + +import json +import os +import sys +import glob +import argparse +from typing import Optional +from datetime import datetime + +SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__)) +DEFAULT_RESULTS_ROOT = os.path.abspath(os.path.join(SCRIPT_DIR, "results")) +RUNS_INDEX = os.path.join(DEFAULT_RESULTS_ROOT, "runs.json") + + +# ── Data loading ────────────────────────────────────────────────────────────── + +def load_runs_index() -> list: + if not os.path.isfile(RUNS_INDEX): + return [] + with open(RUNS_INDEX) as f: + return json.load(f) + + +def load_run_metrics(run_dir: str) -> list: + """Load all JSON metric files from a run directory. + + Scans two locations: + - /*.json Python benchmark output + - /cpp/results/*.json C++ benchmark output (written by dsr_benchmarks) + """ + SKIP = {"run_info.json", "stability_summary.json"} + search_paths = [ + (run_dir, "*.json"), + (os.path.join(run_dir, "cpp", "results"), "*.json"), + ] + + cpp_dir = os.path.join(run_dir, "cpp", "results") + metrics = [] + for directory, pattern in search_paths: + lang = "cpp" if os.path.abspath(directory) == os.path.abspath(cpp_dir) else "python" + for path in sorted(glob.glob(os.path.join(directory, pattern))): + if os.path.basename(path) in SKIP: + continue + try: + with open(path) as f: + data = json.load(f) + data["_source_file"] = os.path.basename(path) + data["_lang"] = lang + metrics.append(data) + except Exception as e: + print(f"Warning: could not load {path}: {e}", file=sys.stderr) + return metrics + + +def load_run_info(run_dir: str) -> dict: + path = os.path.join(run_dir, "run_info.json") + if os.path.isfile(path): + with open(path) as f: + return json.load(f) + return {} + + +def resolve_run_dir(run_id: str, results_root: str) -> str: + """Find 
the directory for a run_id (handles labelled dirs like 20260314T153000_label).""" + # Direct match + direct = os.path.join(results_root, run_id) + if os.path.isdir(direct): + return direct + # Prefix match (labelled) + for entry in os.listdir(results_root): + if entry.startswith(run_id): + candidate = os.path.join(results_root, entry) + if os.path.isdir(candidate): + return candidate + # Look up in index + for r in load_runs_index(): + if r["id"] == run_id: + candidate = os.path.join(results_root, r["dir"]) + if os.path.isdir(candidate): + return candidate + raise FileNotFoundError(f"Run directory not found for id '{run_id}'") + + +_UNIT_TO_NS = {"ns": 1, "us": 1_000, "µs": 1_000, "ms": 1_000_000, "s": 1_000_000_000} + + +def _to_ns(value: float, unit: str) -> float: + return value * _UNIT_TO_NS.get(unit.strip(), 1) + + +def infer_profile(bench: dict, metric: Optional[dict] = None) -> str: + metadata = bench.get("metadata", {}) or {} + meta_profile = str(metadata.get("profile", "")).strip().lower() + if meta_profile in {"baseline", "extended", "other"}: + return meta_profile + + tags = (metric or {}).get("tags", {}) or {} + tag_values = {str(v).upper() for v in tags.values()} + tag_keys = {str(k).upper() for k in tags.keys()} + + if "BASELINE" in tag_keys or "BASELINE" in tag_values: + return "baseline" + if "EXTENDED" in tag_keys or "EXTENDED" in tag_values: + return "extended" + + source = bench.get("_source_file", "").lower() + bench_name = bench.get("benchmark_name", "").lower() + if "baseline" in source or "baseline" in bench_name: + return "baseline" + if "extended" in source or "extended" in bench_name: + return "extended" + if bench_name.startswith("crdt_") or source.startswith("crdt_"): + return "baseline" + return "other" + + +def flatten_metrics(bench_files: list) -> tuple[list, list]: + """Return (latency_metrics, throughput_metrics) as flat lists.""" + latency, throughput = [], [] + latency_keys: set = set() # (bench_name, metric_name) pairs with real 
latency data + for bench in bench_files: + bench_name = bench.get("benchmark_name", bench["_source_file"]) + lang = bench.get("_lang", "python") + for m in bench.get("metrics", []): + add = m.get("additional", {}) + tags = m.get("tags", {}) + unit = m.get("unit", "") + category = m.get("category", "") + profile = infer_profile(bench, m) + + # For scalability metrics with repeated names, append the tag that + # differentiates them (e.g. graph_size) so each row is unique. + metric_name = m["name"] + if tags: + tag_suffix = "_".join(f"{k}={v}" for k, v in tags.items() + if k in ("graph_size", "num_threads", "threads", "scale_factor")) + if tag_suffix: + metric_name = f"{metric_name}@{tag_suffix}" + + entry = { + "benchmark": bench_name, + "metric": metric_name, + "lang": lang, + "profile": profile, + "value": m["value"], + "unit": unit, + "additional": add, + } + + if category == "latency": + entry.update({ + "mean_ns": add.get("mean_ns", m["value"]), + "p50_ns": add.get("p50_ns", 0), + "p95_ns": add.get("p95_ns", 0), + "p99_ns": add.get("p99_ns", 0), + "min_ns": add.get("min_ns", 0), + "max_ns": add.get("max_ns", 0), + "count": int(add.get("count", 0)), + "has_percentiles": True, + }) + latency.append(entry) + latency_keys.add((bench_name, metric_name)) + elif category == "throughput": + entry.update({ + "ops_per_sec": m["value"], + "total_ops": add.get("total_operations", 0), + "duration_sec": add.get("duration_sec", add.get("duration_ms", 0) / 1000), + }) + throughput.append(entry) + elif category == "scalability" and unit in _UNIT_TO_NS: + # Only promote scalability entries that have no proper latency + # counterpart — avoids duplicates and preserves percentile data. 
+ if (bench_name, metric_name) in latency_keys: + continue + mean_ns = _to_ns(m["value"], unit) + entry.update({ + "mean_ns": mean_ns, + "p50_ns": 0, + "p95_ns": 0, + "p99_ns": 0, + "min_ns": 0, + "max_ns": 0, + "count": int(add.get("count", 0)), + "has_percentiles": False, + }) + latency.append(entry) + return latency, throughput + + +# ── Scalability flattening ──────────────────────────────────────────────────── + +SCALE_DIMS = ("threads", "graph_size", "agents") + + +def flatten_scalability(bench_files: list) -> list: + """Return a flat list of scalability data points. + + Any metric tagged with a recognised scale dimension (threads, graph_size, + or agents) is included — regardless of category — so latency, throughput, + and scalability records all contribute. + """ + rows = [] + for bench in bench_files: + lang = bench.get("_lang", "python") + bench_name = bench.get("benchmark_name", bench["_source_file"]) + for m in bench.get("metrics", []): + tags = m.get("tags", {}) + add = m.get("additional", {}) + scale_dim = next((d for d in SCALE_DIMS if d in tags), None) + if scale_dim is None: + continue + try: + scale_val = int(tags[scale_dim]) + except (ValueError, KeyError): + continue + cat = m.get("category", "") + rows.append({ + "benchmark": bench_name, + "operation": m["name"], + "lang": lang, + "profile": infer_profile(bench, m), + "category": cat, + "scale_dim": scale_dim, + "scale_val": scale_val, + "value": m["value"], + "unit": m.get("unit", ""), + "mean_ns": add.get("mean_ns", 0.0), + "p99_ns": add.get("p99_ns", 0.0), + "ops_per_sec": m["value"] if cat == "throughput" else 0.0, + }) + return rows + + +def compute_efficiency(rows: list) -> list: + """Compute a normalised-performance series for each (benchmark, op, dim). 
+ + threads / agents → parallel efficiency = thr_N / (N × thr_1) × 100 + graph_size → relative throughput = thr_N / thr_min × 100 + (100 % at smallest graph, declining as graph grows) + + Returns a list of {benchmark, operation, scale_dim, scale_val, efficiency, + ops_per_sec} dicts. The JS chart uses the same field regardless of which + formula was applied; the label/title is updated per-dimension in JS. + """ + from collections import defaultdict + + groups: dict = defaultdict(list) + for r in rows: + if r["category"] != "throughput": + continue + key = (r["benchmark"], r["operation"], r["scale_dim"]) + groups[key].append(r) + + result = [] + for (bench, op, dim), pts in groups.items(): + pts_sorted = sorted(pts, key=lambda p: p["scale_val"]) + + if dim in ("threads", "agents"): + baseline = next((p for p in pts_sorted if p["scale_val"] == 1), None) + if baseline is None or baseline["ops_per_sec"] == 0: + continue + thr_1 = baseline["ops_per_sec"] + for p in pts_sorted: + N = p["scale_val"] + if N == 0: + continue + efficiency = (p["ops_per_sec"] / (N * thr_1)) * 100.0 + result.append({ + "benchmark": bench, "operation": op, "scale_dim": dim, + "scale_val": N, "efficiency": round(efficiency, 2), + "ops_per_sec": p["ops_per_sec"], + }) + + elif dim == "graph_size": + if not pts_sorted or pts_sorted[0]["ops_per_sec"] == 0: + continue + thr_min = pts_sorted[0]["ops_per_sec"] + for p in pts_sorted: + relative = (p["ops_per_sec"] / thr_min) * 100.0 + result.append({ + "benchmark": bench, "operation": op, "scale_dim": dim, + "scale_val": p["scale_val"], "efficiency": round(relative, 2), + "ops_per_sec": p["ops_per_sec"], + }) + + return result + + +# ── HTML generation ─────────────────────────────────────────────────────────── + +def generate_html( + run_info: dict, + bench_files: list, + output_path: str, + baseline_info: Optional[dict] = None, + baseline_files: Optional[list] = None, +): + latency, throughput = flatten_metrics(bench_files) + b_latency, b_throughput 
= (flatten_metrics(baseline_files) if baseline_files else ([], [])) + + scl_rows = flatten_scalability(bench_files) + eff_rows = compute_efficiency(scl_rows) + b_scl_rows = flatten_scalability(baseline_files) if baseline_files else [] + + run_id = run_info.get("id", "unknown") + run_label = run_info.get("label") or run_id + b_id = baseline_info.get("id", "") if baseline_info else "" + b_label = (baseline_info.get("label") or b_id) if baseline_info else "" + comparing = bool(baseline_files) + generated_at = datetime.now().strftime("%Y-%m-%d %H:%M:%S") + + latency_json = json.dumps(latency) + throughput_json = json.dumps(throughput) + b_latency_json = json.dumps(b_latency) + b_throughput_json = json.dumps(b_throughput) + run_info_json = json.dumps(run_info) + b_info_json = json.dumps(baseline_info or {}) + scl_json = json.dumps(scl_rows) + eff_json = json.dumps(eff_rows) + b_scl_json = json.dumps(b_scl_rows) + + # Summary rows + summary = [] + for b in bench_files: + summary.append({ + "benchmark": b.get("benchmark_name", b["_source_file"]), + "profile": infer_profile(b), + "timestamp": b.get("timestamp", ""), + "duration": f"{b.get('total_duration_sec', 0):.1f}s", + "metrics": len(b.get("metrics", [])), + "source": b["_source_file"], + }) + summary_json = json.dumps(summary) + + compare_tab = '' if comparing else "" + compare_panel = "" + if comparing: + compare_panel = '
' + + html = f""" + + + + +Cortex Benchmark Report — {run_label} + + + + + + + +
+
+

Cortex Benchmark Report

+ {run_label} + {f' vs baseline: {b_label}' if comparing else ""} +
+
+ Generated: {generated_at}
+ {run_info.get("git_hash") and f"git: {run_info['git_hash']}" or ""} +
+
+ + + +
+ + +
+
+
+
+

Latency — Mean (µs)

+
+
+
+

Throughput (ops/sec)

+
+
+
+
+
+

Run Info

+
+
+
+
+
+

Benchmark Files

+ + + +
BenchmarkTimestampDurationMetricsFile
+
+
+
+ + +
+
+
+ + +
+ + + +
+ +
+
Scroll to zoom · Click & drag to pan · Double-click to reset
+
+

Latency Distribution — Mean / p50 / p95 / p99

+
+
+
+

Latency Detail

+
+
+
+
+ + +
+
+
+ + +
+ + + +
+
+
+

Operations per Second

+
+
+
+

Throughput Detail

+
+
+
+
+ + +
+
+
+ + +
+
+
+

Throughput (ops/sec)

+
+
+
+

Mean Latency (µs)

+
+
+
+
+

Scaling Efficiency (% of ideal linear)

+
+
+
+

Scalability Detail

+ + + + + + +
BenchmarkOperationDimensionScaleThroughputMean LatencyEfficiency %
+
+
+
+ + +{compare_panel} + + +
+
+
+ +
+ + + + +""" + + with open(output_path, "w", encoding="utf-8") as f: + f.write(html) + print(f"Report written to: {os.path.abspath(output_path)}") + + +# ── Entry point ─────────────────────────────────────────────────────────────── + +def main(): + parser = argparse.ArgumentParser(description="Generate visual HTML benchmark report") + parser.add_argument("--run", "-r", help="Run ID to report on (default: latest)") + parser.add_argument("--baseline", "-b", help="Run ID to compare against") + parser.add_argument("--results-root", default=DEFAULT_RESULTS_ROOT) + parser.add_argument("--output", "-o", help="Output HTML file (default: /report.html)") + parser.add_argument("--list", action="store_true", help="List available runs") + args = parser.parse_args() + + runs = load_runs_index() + + if args.list: + if not runs: + print("No runs recorded. Run 'python run_all.py' first.") + return + print(f"{'ID':<22} {'Label':<20} Dir") + print("-" * 70) + for r in runs: + print(f"{r['id']:<22} {(r.get('label') or '-'):<20} {r['dir']}") + return + + # Resolve target run + if args.run: + run_dir = resolve_run_dir(args.run, args.results_root) + elif runs: + # Latest run + latest = runs[-1] + run_dir = os.path.join(args.results_root, latest["dir"]) + print(f"Using latest run: {latest['id']}") + else: + # Fallback: flat results directory (old layout) + run_dir = args.results_root + print(f"No runs index found, reading from: {run_dir}") + + run_info = load_run_info(run_dir) + bench_files = load_run_metrics(run_dir) + if not bench_files: + print(f"No metric JSON files found in: {run_dir}", file=sys.stderr) + sys.exit(1) + print(f"Loaded {len(bench_files)} metric file(s) from run '{run_info.get('id', run_dir)}'") + + # Resolve baseline + baseline_info, baseline_files = None, None + if args.baseline: + b_dir = resolve_run_dir(args.baseline, args.results_root) + baseline_info = load_run_info(b_dir) + baseline_files = load_run_metrics(b_dir) + print(f"Baseline: {len(baseline_files)} 
file(s) from run '{baseline_info.get('id', b_dir)}'") + + output_path = args.output or os.path.join(run_dir, "report.html") + generate_html(run_info, bench_files, output_path, baseline_info, baseline_files) + + +if __name__ == "__main__": + main() diff --git a/benchmarks/run_benchmarks.py b/benchmarks/run_benchmarks.py new file mode 100644 index 0000000..34ffbf1 --- /dev/null +++ b/benchmarks/run_benchmarks.py @@ -0,0 +1,873 @@ +#!/usr/bin/env python3 +""" +Top-level DSR benchmark runner — executes C++ and Python suites in one shot. + +Usage: + python run_benchmarks.py # run both suites + python run_benchmarks.py --label "after-fix" # named run + python run_benchmarks.py --cpp-only # skip Python + python run_benchmarks.py --python-only # skip C++ + python run_benchmarks.py --build # cmake build before running + python run_benchmarks.py --all # include hidden tests ([.multi], [.extended]) + python run_benchmarks.py --cpp-filter "[LATENCY]"# pass filter to dsr_benchmarks + python run_benchmarks.py --report # open HTML report when done + python run_benchmarks.py --compare # compare against a previous run + python run_benchmarks.py --list # list recorded runs + python run_benchmarks.py --delete # remove a run from the index + python run_benchmarks.py --repeat 5 # run C++ 5× and report median + python run_benchmarks.py --priority -10 # run with higher OS priority (requires root) + python run_benchmarks.py --taskset 0,1 # pin C++ benchmarks to CPU cores 0 and 1 + python run_benchmarks.py --no-cpu-tune # skip governor/turbo tuning (Linux) +""" + +import sys +import os +import subprocess +import time +import json +import argparse +import platform +import shlex +import tempfile +from typing import Optional +from datetime import datetime + +SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__)) +PYTHON_DIR = os.path.join(SCRIPT_DIR, "python") +BUILD_DIR = os.path.join(SCRIPT_DIR, "build") +RESULTS_ROOT = os.path.join(SCRIPT_DIR, "results") +RUNS_INDEX = 
os.path.join(RESULTS_ROOT, "runs.json") +BASELINE_CPP_FILTER = "[BASELINE]~[.multi]" +# Catch2 v3 has no single spec that matches both visible and hidden tests. +# _run_cpp_once detects this sentinel and runs the binary twice: +# 1. no filter → all visible tests +# 2. "[.]" → all hidden tests (tags starting with '.') +ALL_CPP_FILTER = "__ALL_INCLUDING_HIDDEN__" +DEFAULT_STABILITY_WARN_PCT = 5.0 + + +# ── Index helpers (mirrors python/run_all.py) ────────────────────────────────── + +def load_runs() -> list: + if not os.path.isfile(RUNS_INDEX): + return [] + try: + with open(RUNS_INDEX) as f: + return json.load(f) + except PermissionError: + print(f"WARNING: cannot read benchmark index: {RUNS_INDEX} (permission denied)") + return [] + + +def save_runs(runs: list): + os.makedirs(RESULTS_ROOT, exist_ok=True) + try: + fd, tmp_path = tempfile.mkstemp(prefix="runs.", suffix=".json.tmp", dir=RESULTS_ROOT) + try: + with os.fdopen(fd, "w") as f: + json.dump(runs, f, indent=2) + os.replace(tmp_path, RUNS_INDEX) + except Exception: + try: + os.unlink(tmp_path) + except OSError: + pass + raise + except PermissionError: + print(f"WARNING: cannot update benchmark index: {RUNS_INDEX} (permission denied)") + + +def register_run(run_info: dict): + runs = load_runs() + runs = [r for r in runs if r["id"] != run_info["id"]] + runs.append(run_info) + runs.sort(key=lambda r: r["id"]) + save_runs(runs) + + +# ── Locate C++ binary ───────────────────────────────────────────────────────── + +def find_cpp_binary(override: Optional[str]) -> Optional[str]: + if override: + return override if os.path.isfile(override) else None + candidate = os.path.join(BUILD_DIR, "dsr_benchmarks") + return candidate if os.path.isfile(candidate) else None + + +def win_to_wsl(path: str) -> str: + """Convert a Windows absolute path to a WSL /mnt/... 
path.""" + path = path.replace("\\", "/") + if len(path) >= 2 and path[1] == ":": + drive = path[0].lower() + path = f"/mnt/{drive}{path[2:]}" + return path + + +def is_wsl_needed() -> bool: + """Return True if we're on Windows and wsl.exe is available (ELF binary).""" + if platform.system() != "Windows": + return False + try: + subprocess.run(["wsl", "--version"], capture_output=True, timeout=3) + return True + except Exception: + return False + + +# ── Build step ──────────────────────────────────────────────────────────────── + +def build_cpp() -> bool: + if not os.path.isdir(BUILD_DIR): + print(f"Build directory not found: {BUILD_DIR}") + return False + print("Building C++ benchmarks...") + if is_wsl_needed(): + wsl_build = win_to_wsl(BUILD_DIR) + result = subprocess.run( + ["wsl", "-e", "bash", "-c", f"cmake --build {wsl_build} --parallel"], + cwd=SCRIPT_DIR, + ) + else: + result = subprocess.run( + ["cmake", "--build", BUILD_DIR, "--parallel"], + cwd=SCRIPT_DIR, + ) + return result.returncode == 0 + + +# ── Median merge ────────────────────────────────────────────────────────────── + +def _median(values: list) -> float: + """Return the median of a list of numbers (handles even-length lists).""" + import statistics + return statistics.median(values) if values else 0.0 + + +def _summarize_repeat_stability(src_dirs: list[str], dest_dir: str, + warn_pct: Optional[float] = DEFAULT_STABILITY_WARN_PCT): + import statistics + + summaries = [] + all_files: set[str] = set() + for d in src_dirs: + results_d = os.path.join(d, "results") + if os.path.isdir(results_d): + for f in os.listdir(results_d): + if f.endswith(".json"): + all_files.add(f) + + def metric_key(m: dict) -> str: + tags = m.get("tags", {}) + tag_str = ",".join(f"{k}={v}" for k, v in sorted(tags.items())) + return f"{m.get('category', '')}|{m['name']}|{m.get('unit', '')}|{tag_str}" + + for basename in sorted(all_files): + loaded = [] + for d in src_dirs: + path = os.path.join(d, "results", basename) + if 
os.path.isfile(path): + with open(path) as fh: + loaded.append(json.load(fh)) + + metric_runs: dict[str, list[dict]] = {} + for run_data in loaded: + for m in run_data.get("metrics", []): + metric_runs.setdefault(metric_key(m), []).append(m) + + for key, peers in sorted(metric_runs.items()): + values = [p["value"] for p in peers if isinstance(p.get("value"), (int, float))] + if len(values) < 2: + continue + median = statistics.median(values) + min_v = min(values) + max_v = max(values) + spread_pct = ((max_v - min_v) / median * 100.0) if median else 0.0 + stdev_pct = ((statistics.stdev(values) / median) * 100.0) if len(values) > 1 and median else 0.0 + exemplar = peers[0] + summaries.append({ + "source_file": basename, + "name": exemplar["name"], + "category": exemplar.get("category", ""), + "unit": exemplar.get("unit", ""), + "tags": exemplar.get("tags", {}), + "repeat_values": values, + "median": median, + "min": min_v, + "max": max_v, + "spread_pct": round(spread_pct, 2), + "stdev_pct": round(stdev_pct, 2), + }) + + os.makedirs(dest_dir, exist_ok=True) + out_path = os.path.join(dest_dir, "stability_summary.json") + with open(out_path, "w") as fh: + json.dump({"metrics": summaries}, fh, indent=2) + + warnings = [] + if summaries: + print("\nRepeat stability summary:") + for s in summaries: + print(f" {s['category']}/{s['name']}: median={s['median']:.3f} {s['unit']} " + f"spread={s['spread_pct']:.2f}% stdev={s['stdev_pct']:.2f}%") + if warn_pct is not None and s["spread_pct"] > warn_pct: + warnings.append(s) + + if warnings: + print(f"\nStability warnings (spread > {warn_pct:.2f}%):") + for s in warnings: + print(f" {s['category']}/{s['name']} tags={s['tags']} spread={s['spread_pct']:.2f}%") + + return { + "warn_threshold_pct": warn_pct, + "warning_count": len(warnings), + "warnings": warnings, + "metrics": summaries, + } + + +def merge_cpp_results(src_dirs: list[str], dest_dir: str): + """ + Load the same JSON result files from N run directories and write a merged 
+ copy to dest_dir where each metric's numerical fields are replaced by the + median across all N runs. Non-numeric fields (name, unit, tags, category) + are taken from the first run. + + This cancels OS-scheduler noise: a single run that was preempted by a + Windows background process no longer inflates the reported mean. + """ + import statistics as _stats + + os.makedirs(dest_dir, exist_ok=True) + + # Collect all JSON basenames present in any source directory + all_files: set[str] = set() + for d in src_dirs: + results_d = os.path.join(d, "results") + if os.path.isdir(results_d): + for f in os.listdir(results_d): + if f.endswith(".json"): + all_files.add(f) + + merged_count = 0 + for basename in sorted(all_files): + # Load this file from every run that has it + loaded = [] + for d in src_dirs: + path = os.path.join(d, "results", basename) + if os.path.isfile(path): + try: + with open(path) as fh: + loaded.append(json.load(fh)) + except Exception as e: + print(f" Warning: could not load {path}: {e}", file=sys.stderr) + + if not loaded: + continue + + if len(loaded) == 1: + # Only one run has this file — copy as-is + import shutil + shutil.copy(os.path.join(src_dirs[0], "results", basename), + os.path.join(dest_dir, basename)) + continue + + # Build merged result: start from first run's structure + merged = json.loads(json.dumps(loaded[0])) # deep copy + + # Index metrics by category+name+unit+tags so latency/throughput records + # for the same operation do not get merged into each other. 
+ def metric_key(m: dict) -> str: + tags = m.get("tags", {}) + tag_str = ",".join(f"{k}={v}" for k, v in sorted(tags.items())) + return f"{m.get('category', '')}|{m['name']}|{m.get('unit', '')}|{tag_str}" + + per_run_metrics: dict[str, list[dict]] = {} + for run_data in loaded: + for m in run_data.get("metrics", []): + k = metric_key(m) + per_run_metrics.setdefault(k, []).append(m) + + merged_metrics = [] + for m in merged.get("metrics", []): + k = metric_key(m) + peers = per_run_metrics.get(k, [m]) + if len(peers) < 2: + merged_metrics.append(m) + continue + + merged_m = json.loads(json.dumps(m)) # deep copy + # Median the top-level value + values = [p["value"] for p in peers if isinstance(p.get("value"), (int, float))] + if values: + merged_m["value"] = _median(values) + + # Median all additional numeric fields + all_add_keys: set[str] = set() + for p in peers: + all_add_keys.update(p.get("additional", {}).keys()) + for key in all_add_keys: + vals = [p.get("additional", {}).get(key) + for p in peers if isinstance(p.get("additional", {}).get(key), (int, float))] + if vals: + merged_m.setdefault("additional", {})[key] = _median(vals) + + merged_metrics.append(merged_m) + + merged["metrics"] = merged_metrics + merged.setdefault("metadata", {})["repeat_runs"] = str(len(loaded)) + merged["metadata"]["aggregation"] = "median" + + out_path = os.path.join(dest_dir, basename) + with open(out_path, "w") as fh: + json.dump(merged, fh, indent=2) + merged_count += 1 + + print(f" Merged {merged_count} result file(s) from {len(src_dirs)} runs (median)") + + +# ── CPU tuning ──────────────────────────────────────────────────────────────── + +def _cpu_count() -> int: + try: + import multiprocessing + return multiprocessing.cpu_count() + except Exception: + return 1 + + +def _read_sysfs(path: str) -> Optional[str]: + try: + with open(path) as f: + return f.read().strip() + except OSError: + return None + + +def _write_sysfs(path: str, value: str) -> bool: + try: + with open(path, 
"w") as f: + f.write(value + "\n") + return True + except OSError: + return False + + +def setup_cpu_for_benchmarking() -> dict: + """ + Configure the CPU for stable benchmarking: + - Set scaling governor to 'performance' on all CPUs + - Disable turbo boost (Intel pstate or generic cpufreq boost) + + Returns a dict of original settings so restore_cpu_settings() can revert them. + Prints a warning and returns an empty dict if the process lacks write permission. + """ + if platform.system() != "Linux": + return {} + + saved = {"governors": {}, "intel_no_turbo": None, "amd_boost": None} + any_written = False + permission_error = False + + n_cpus = _cpu_count() + for i in range(n_cpus): + gov_path = f"/sys/devices/system/cpu/cpu{i}/cpufreq/scaling_governor" + current = _read_sysfs(gov_path) + if current is None: + continue + saved["governors"][gov_path] = current + if current != "performance": + if _write_sysfs(gov_path, "performance"): + any_written = True + else: + permission_error = True + + # Intel pstate: write "1" to disable turbo + intel_path = "/sys/devices/system/cpu/intel_pstate/no_turbo" + val = _read_sysfs(intel_path) + if val is not None: + saved["intel_no_turbo"] = val + if val != "1": + if _write_sysfs(intel_path, "1"): + any_written = True + else: + permission_error = True + + # AMD / generic: write "0" to disable boost + amd_path = "/sys/devices/system/cpu/cpufreq/boost" + val = _read_sysfs(amd_path) + if val is not None: + saved["amd_boost"] = val + if val != "0": + if _write_sysfs(amd_path, "0"): + any_written = True + else: + permission_error = True + + if permission_error: + print( + "\nWARNING: Could not set CPU governor/turbo (permission denied).\n" + " Run with sudo, or manually run: sudo pyperf system tune\n" + " Benchmarks may show instability due to frequency scaling.\n" + ) + return {} + + if any_written: + print(" CPU tuning: governor=performance, turbo disabled") + + return saved + + +def restore_cpu_settings(saved: dict): + """Revert CPU 
governor and turbo settings to the values captured by setup_cpu_for_benchmarking().""" + if not saved: + return + + for path, value in saved.get("governors", {}).items(): + _write_sysfs(path, value) + + if saved.get("intel_no_turbo") is not None: + _write_sysfs("/sys/devices/system/cpu/intel_pstate/no_turbo", saved["intel_no_turbo"]) + + if saved.get("amd_boost") is not None: + _write_sysfs("/sys/devices/system/cpu/cpufreq/boost", saved["amd_boost"]) + + print(" CPU settings restored") + + +# ── Run C++ suite ───────────────────────────────────────────────────────────── + +def _build_cpp_cmd(binary: str, catch2_filter: Optional[str], verbose: bool, + priority: Optional[int], taskset: Optional[str]) -> str: + """Build the shell command string for one C++ benchmark invocation.""" + parts = [] + if taskset: + parts += [f"taskset -c {shlex.quote(taskset)}"] + if priority is not None: + parts += [f"nice -n {priority}"] + wsl_binary = win_to_wsl(binary) if is_wsl_needed() else binary + parts.append(shlex.quote(wsl_binary)) + if catch2_filter: + parts.append(shlex.quote(catch2_filter)) + if verbose: + parts.append("--verbose") + return " ".join(parts) + + +def _run_cpp_once(binary: str, cpp_cwd: str, catch2_filter: Optional[str], + verbose: bool, priority: Optional[int], taskset: Optional[str]) -> tuple[bool, float]: + # Catch2 v3 has no single-spec "run everything including hidden". + # Handle the sentinel by running visible tests then hidden tests in the same cwd. 
def _run_cpp_once(binary: str, cpp_cwd: str, catch2_filter: Optional[str],
                  verbose: bool, priority: Optional[int], taskset: Optional[str]) -> tuple[bool, float]:
    """Execute the benchmark binary once in cpp_cwd; return (ok, seconds)."""
    # Catch2 v3 cannot express "visible + hidden" in a single spec, so the
    # sentinel triggers two invocations (no filter, then "[.]") in the same cwd.
    if catch2_filter == ALL_CPP_FILTER:
        ok_visible, t_visible = _run_cpp_once(binary, cpp_cwd, None, verbose, priority, taskset)
        ok_hidden, t_hidden = _run_cpp_once(binary, cpp_cwd, "[.]", verbose, priority, taskset)
        return ok_visible and ok_hidden, t_visible + t_hidden

    os.makedirs(cpp_cwd, exist_ok=True)
    begin = time.time()
    if is_wsl_needed():
        # On Windows the ELF binary runs inside WSL; cd into the (translated)
        # working directory so result files land in the right place.
        shell_cmd = _build_cpp_cmd(binary, catch2_filter, verbose, priority, taskset)
        proc = subprocess.run(
            ["wsl", "-e", "bash", "-c", f"cd {win_to_wsl(cpp_cwd)} && {shell_cmd}"])
    else:
        argv = []
        if taskset:
            argv += ["taskset", "-c", taskset]
        if priority is not None:
            argv += ["nice", "-n", str(priority)]
        argv.append(binary)
        if catch2_filter:
            argv.append(catch2_filter)
        if verbose:
            argv.append("--verbose")
        proc = subprocess.run(argv, cwd=cpp_cwd)
    return proc.returncode == 0, time.time() - begin


def run_cpp(binary: str, run_dir: str, catch2_filter: Optional[str], verbose: bool,
            repeat: int = 1, priority: Optional[int] = None, taskset: Optional[str] = None,
            stability_warn_pct: Optional[float] = DEFAULT_STABILITY_WARN_PCT):
    """
    Run dsr_benchmarks 'repeat' times. If repeat > 1, each invocation writes
    to a separate cpp_N/ subdirectory; results are then median-merged into
    cpp/results/ so the rest of the pipeline sees a single stable result set.

    Returns (ok, total_seconds, stability_summary_or_None).
    """
    print(f"\n{'=' * 70}")
    print(f"Running: C++ benchmarks ({os.path.basename(binary)})")
    if catch2_filter == ALL_CPP_FILTER:
        print("Filter : (all — visible + hidden)")
    elif catch2_filter:
        print(f"Filter : {catch2_filter}")
    if repeat > 1:
        print(f"Repeat : {repeat}× (median aggregation)")
    if priority is not None:
        print(f"Priority: nice {priority:+d}")
    if taskset:
        print(f"CPU affinity: {taskset}")
    print("=" * 70)

    suite_start = time.time()
    success = True
    stability = None

    if repeat <= 1:
        # Single run — write straight into the canonical cpp/ directory.
        cpp_cwd = os.path.join(run_dir, "cpp")
        print(f"Output : {cpp_cwd}/results/")
        ok, _ = _run_cpp_once(binary, cpp_cwd, catch2_filter, verbose, priority, taskset)
        success = ok
    else:
        # Multiple runs → one cpp_N/ directory each, then a median merge.
        run_cwds = []
        for r in range(1, repeat + 1):
            cpp_cwd = os.path.join(run_dir, f"cpp_{r}")
            print(f"\n--- Run {r}/{repeat} → {cpp_cwd}/results/ ---")
            ok, _ = _run_cpp_once(binary, cpp_cwd, catch2_filter, verbose, priority, taskset)
            if not ok:
                print(f"  Warning: run {r} exited non-zero")
                success = False
            run_cwds.append(cpp_cwd)

        dest = os.path.join(run_dir, "cpp", "results")
        print(f"\nMerging {repeat} runs → {dest}")
        merge_cpp_results(run_cwds, dest)
        stability = _summarize_repeat_stability(run_cwds, dest, warn_pct=stability_warn_pct)

    total_dur = time.time() - suite_start
    print(f"\nC++ suite {'PASSED' if success else 'FAILED'} in {total_dur:.1f}s")
    return success, total_dur, stability
+ """ + print(f"\n{'=' * 70}") + print("Running: Python benchmarks") + print(f"Output : {run_dir}/") + print("=" * 70) + + env = {**os.environ, "BENCH_RESULTS_DIR": run_dir} + cmd = [sys.executable, os.path.join(PYTHON_DIR, "run_all.py"), "--direct"] + if baseline: + cmd.append("--baseline") + # --direct: benchmarks write to BENCH_RESULTS_DIR, skip run_all.py's own + # index registration so run_benchmarks.py stays the single source of truth. + + start = time.time() + result = subprocess.run(cmd, cwd=PYTHON_DIR, env=env) + duration = time.time() - start + + ok = result.returncode == 0 + print(f"\nPython suite {'PASSED' if ok else 'FAILED'} in {duration:.1f}s") + return ok, duration + + +# ── Ownership / permission helpers ─────────────────────────────────────────── + +def _fix_run_permissions(run_dir: str): + """ + When the script is run via sudo, chown the run directory and the shared + results index back to the original user so they remain accessible without + root. Falls back to world-readable permissions when the original user + cannot be determined (e.g. direct root login). + """ + if os.getuid() != 0: + return # Not running as root — nothing to do. + + sudo_uid_str = os.environ.get("SUDO_UID") + sudo_gid_str = os.environ.get("SUDO_GID") + + if sudo_uid_str: + uid = int(sudo_uid_str) + gid = int(sudo_gid_str) if sudo_gid_str else uid + + def _chown_tree(path: str): + for dirpath, dirnames, filenames in os.walk(path, topdown=False): + for name in filenames: + try: + os.chown(os.path.join(dirpath, name), uid, gid) + except OSError: + pass + try: + os.chown(dirpath, uid, gid) + except OSError: + pass + + _chown_tree(run_dir) + + # Also fix the shared index file and RESULTS_ROOT itself so the user + # can write new runs later without sudo. 
+ for path in (RUNS_INDEX, RESULTS_ROOT): + try: + os.chown(path, uid, gid) + except OSError: + pass + + try: + import pwd as _pwd + username = _pwd.getpwuid(uid).pw_name + print(f" Ownership transferred to {username} (uid={uid}, gid={gid})") + except Exception: + print(f" Ownership transferred to uid={uid}, gid={gid}") + else: + # Direct root login — make results world-readable as a fallback. + import stat + _file_mode = (stat.S_IRUSR | stat.S_IWUSR | + stat.S_IRGRP | + stat.S_IROTH) + _dir_mode = _file_mode | stat.S_IXUSR | stat.S_IXGRP | stat.S_IXOTH + + for dirpath, dirnames, filenames in os.walk(run_dir, topdown=False): + for name in filenames: + try: + os.chmod(os.path.join(dirpath, name), _file_mode) + except OSError: + pass + try: + os.chmod(dirpath, _dir_mode) + except OSError: + pass + + print(" Results made world-readable (root without sudo; SUDO_UID not set)") + + +# ── Commands ────────────────────────────────────────────────────────────────── + +def cmd_list(): + runs = load_runs() + if not runs: + print("No runs recorded yet.") + return + print(f"{'ID':<22} {'Label':<20} {'Suites':<12} {'Duration':>9}") + print("-" * 70) + for r in runs: + suites = ", ".join(r.get("suites_run", [])) or "-" + dur = f"{r.get('total_duration_sec', 0):.1f}s" + label = r.get("label") or "-" + print(f"{r['id']:<22} {label:<20} {suites:<12} {dur:>9}") + + +def cmd_delete(run_id: str): + runs = load_runs() + before = len(runs) + runs = [r for r in runs if r["id"] != run_id] + if len(runs) == before: + print(f"Run '{run_id}' not found in index.") + return + save_runs(runs) + print(f"Removed run '{run_id}' from index (files kept on disk).") + + +def cmd_run(args): + ts = datetime.now() + run_id = ts.strftime("%Y%m%dT%H%M%S%f") + dir_name = run_id if not args.label else f"{run_id}_{args.label.replace(' ', '-')}" + run_dir = os.path.join(RESULTS_ROOT, dir_name) + os.makedirs(run_dir, exist_ok=True) + + print("=" * 70) + print(" DSR Benchmark Suite (C++ + Python)") + print(f" Run 
ID : {run_id}") + if args.label: + print(f" Label : {args.label}") + print(f" Output : {run_dir}") + print("=" * 70) + + effective_cpp_filter = args.cpp_filter + if args.all and not effective_cpp_filter: + effective_cpp_filter = ALL_CPP_FILTER + elif args.baseline and not effective_cpp_filter: + effective_cpp_filter = BASELINE_CPP_FILTER + + # Optionally build C++ + if args.build: + if not build_cpp(): + print("Build failed — aborting.") + return 1 + + suites_run = [] + results = {} + total_start = time.time() + + # CPU tuning (Linux only, skipped with --no-cpu-tune or when Python-only) + cpu_saved = {} + if not getattr(args, "no_cpu_tune", False) and not args.python_only: + cpu_saved = setup_cpu_for_benchmarking() + + try: + # C++ suite + if not args.python_only: + binary = find_cpp_binary(args.cpp_binary) + if binary: + ok, dur, stability = run_cpp( + binary, run_dir, effective_cpp_filter, args.verbose, + repeat=args.repeat, priority=args.priority, taskset=args.taskset, + stability_warn_pct=args.stability_warn_pct, + ) + results["cpp"] = {"ok": ok, "duration_sec": dur, "stability": stability} + suites_run.append("cpp") + else: + print("\nWARNING: C++ binary not found. 
Use --cpp-binary or --build.") + print(f" Searched: {os.path.join(BUILD_DIR, 'dsr_benchmarks')}") + results["cpp"] = {"ok": False, "duration_sec": 0, "skipped": True} + + # Python suite + if not args.cpp_only: + ok, dur = run_python(run_dir, args.label, baseline=args.baseline) + results["python"] = {"ok": ok, "duration_sec": dur} + suites_run.append("python") + + finally: + restore_cpu_settings(cpu_saved) + + total_duration = time.time() - total_start + + # Gather git hash + try: + git_hash = subprocess.check_output( + ["git", "rev-parse", "--short", "HEAD"], + cwd=SCRIPT_DIR, stderr=subprocess.DEVNULL, + ).decode().strip() + except Exception: + git_hash = "" + + run_info = { + "id": run_id, + "label": args.label or "", + "dir": dir_name, + "timestamp": ts.isoformat(), + "total_duration_sec": round(total_duration, 2), + "suites_run": suites_run, + "suites_passed": [s for s in suites_run if results.get(s, {}).get("ok")], + "git_hash": git_hash, + "platform": platform.platform(), + "python": sys.version.split()[0], + } + + cpp_stability = results.get("cpp", {}).get("stability") + if cpp_stability: + run_info["cpp_stability"] = { + "warn_threshold_pct": cpp_stability.get("warn_threshold_pct"), + "warning_count": cpp_stability.get("warning_count", 0), + } + + with open(os.path.join(run_dir, "run_info.json"), "w") as f: + json.dump(run_info, f, indent=2) + + register_run(run_info) + + # Summary + print("\n" + "=" * 70) + print(" Summary") + print("=" * 70) + all_ok = True + for suite in ["cpp", "python"]: + if suite not in results: + continue + r = results[suite] + if r.get("skipped"): + print(f" [SKIP] {suite}") + else: + status = "PASS" if r["ok"] else "FAIL" + print(f" [{status}] {suite} ({r['duration_sec']:.1f}s)") + if not r["ok"]: + all_ok = False + + print(f"\n Run ID : {run_id}") + print(f" Results : {run_dir}") + print(f" Index : {RUNS_INDEX}") + + # Generate report + if args.report or args.compare: + report_args = ["--run", run_id, "--results-root", 
RESULTS_ROOT] + if args.compare: + report_args += ["--baseline", args.compare] + report_path = os.path.join(run_dir, "report.html") + report_args += ["--output", report_path] + + print(f"\nGenerating report...") + subprocess.run( + [sys.executable, os.path.join(SCRIPT_DIR, "report.py")] + report_args, + cwd=SCRIPT_DIR, + ) + + if args.open_report and os.path.isfile(report_path): + import webbrowser + webbrowser.open(f"file://{report_path}") + + _fix_run_permissions(run_dir) + return 0 if all_ok else 1 + + +# ── Entry point ─────────────────────────────────────────────────────────────── + +def main(): + parser = argparse.ArgumentParser( + description="Run DSR C++ and Python benchmarks together", + formatter_class=argparse.RawDescriptionHelpFormatter, + ) + parser.add_argument("--label", "-l", help="Human-readable label for this run") + parser.add_argument("--cpp-binary", metavar="PATH", + help=f"Path to dsr_benchmarks binary (default: {os.path.join(BUILD_DIR, 'dsr_benchmarks')})") + parser.add_argument("--cpp-filter", metavar="FILTER", + help='Catch2 test filter, e.g. 
"[LATENCY]" or "[THROUGHPUT]"') + parser.add_argument("--build", action="store_true", + help="Build C++ benchmarks before running") + parser.add_argument("--cpp-only", action="store_true", help="Skip Python suite") + parser.add_argument("--python-only", action="store_true", help="Skip C++ suite") + parser.add_argument("--all", action="store_true", + help="Run all C++ tests including hidden ones ([.multi], [.extended])") + parser.add_argument("--baseline", action="store_true", + help="Run only the curated low-noise baseline benchmark set") + parser.add_argument("--verbose", "-v", action="store_true", + help="Pass --verbose to C++ binary (shows Qt debug messages)") + parser.add_argument("--report", action="store_true", + help="Generate HTML report after the run") + parser.add_argument("--open", dest="open_report", action="store_true", + help="Open the HTML report in a browser after generation") + parser.add_argument("--compare", metavar="RUN_ID", + help="Generate a comparison report against this baseline run") + parser.add_argument("--list", action="store_true", help="List all recorded runs") + parser.add_argument("--delete", metavar="RUN_ID", + help="Remove a run from the index") + parser.add_argument("--repeat", "-r", type=int, default=1, metavar="N", + help="Run C++ benchmarks N times and report the median (reduces OS noise)") + parser.add_argument("--priority", type=int, default=None, metavar="NICE", + help="Set process nice level (e.g. -10); values < 0 require root/sudo") + parser.add_argument("--taskset", metavar="CPULIST", + help="Pin C++ benchmarks to CPU cores via taskset (e.g. 
'0,1')") + parser.add_argument("--no-cpu-tune", action="store_true", + help="Skip automatic CPU governor/turbo configuration (Linux only)") + parser.add_argument("--stability-warn-pct", type=float, default=DEFAULT_STABILITY_WARN_PCT, + metavar="PCT", + help="Warn when repeated C++ metrics exceed this spread percentage") + + args = parser.parse_args() + + if args.list: + cmd_list() + return 0 + + if args.delete: + cmd_delete(args.delete) + return 0 + + if args.all and args.baseline: + print("Error: --all and --baseline are mutually exclusive.") + return 1 + + if args.cpp_only and args.python_only: + print("Error: --cpp-only and --python-only are mutually exclusive.") + return 1 + + return cmd_run(args) + + +if __name__ == "__main__": + sys.exit(main()) diff --git a/benchmarks/scalability/agent_scaling_bench.cpp b/benchmarks/scalability/agent_scaling_bench.cpp new file mode 100644 index 0000000..3764f0d --- /dev/null +++ b/benchmarks/scalability/agent_scaling_bench.cpp @@ -0,0 +1,300 @@ +#include +#include +#include +#include +#include +#include +#include +#include + +#include "../core/timing_utils.h" +#include "../core/metrics_collector.h" +#include "../core/report_generator.h" +#include "../fixtures/multi_agent_fixture.h" +#include "../fixtures/graph_generator.h" + +using namespace DSR; +using namespace DSR::Benchmark; +using namespace std::chrono; + +// Multi-agent scaling benchmarks. Tagged [.multi] so they are excluded from +// the default test run (DDS multi-agent tests are slow and require specific +// network setup). Opt in with: --cpp-filter "[SCALABILITY][agents]" +// +// Loop over {1, 2, 4} agents. One thread per agent operates on its own +// DSRGraph instance; a 3-second window measures total throughput and latency. 
+ +static constexpr auto AGENT_DUR = std::chrono::seconds(3); + +// ── Node insert ─────────────────────────────────────────────────────────────── + +TEST_CASE("Node insert agent scaling", "[SCALABILITY][agents][.multi][PROFILE][MULTIAGENT]") { + GraphGenerator generator; + MetricsCollector collector("node_insert_agent_scaling"); + + for (uint32_t N : {1u, 2u, 4u}) { + MultiAgentFixture fixture; + auto config_file = generator.generate_empty_graph(); + REQUIRE(fixture.create_agents(N, config_file)); + fixture.wait_for_sync(); + + std::atomic total_ops{0}; + std::atomic failed_ops{0}; + std::atomic stop_flag{false}; + std::barrier sync_point(N); + + std::vector> per_thread_samples(N); + for (auto& s : per_thread_samples) s.reserve(500000 / N); + + std::vector threads; + threads.reserve(N); + + auto wall_start = steady_clock::now(); + + for (uint32_t i = 0; i < N; ++i) { + threads.emplace_back([&, agent_idx = i]() { + auto* graph = fixture.get_agent(agent_idx); + uint64_t base_id = 800000ULL + agent_idx * 200000ULL; + uint64_t local_ops = 0; + auto& samples = per_thread_samples[agent_idx]; + + sync_point.arrive_and_wait(); + + while (!stop_flag.load(std::memory_order_relaxed)) { + auto node = GraphGenerator::create_test_node( + base_id + local_ops, graph->get_agent_id()); + uint64_t ts = bench_now(); + auto res = graph->insert_node(node); + samples.push_back(bench_now() - ts); + if (!res.has_value()) + failed_ops.fetch_add(1, std::memory_order_relaxed); + local_ops++; + } + + total_ops.fetch_add(local_ops, std::memory_order_relaxed); + }); + } + + std::this_thread::sleep_for(AGENT_DUR); + stop_flag.store(true, std::memory_order_relaxed); + for (auto& th : threads) th.join(); + + if (failed_ops.load() > 0) + std::cerr << "[BENCH node_insert agents=" << N << "] " + << failed_ops.load() << " insert_node calls failed\n"; + + auto dur = duration_cast(steady_clock::now() - wall_start); + + LatencyTracker merged; + for (auto& s : per_thread_samples) + for (auto v : s) 
merged.record(v); + + const std::string n_str = std::to_string(N); + collector.record_throughput("node_insert", total_ops.load(), dur, + {{"agents", n_str}}); + if (!merged.empty()) + collector.record_latency_stats("node_insert", merged.stats(), + {{"agents", n_str}}); + + double ops_per_sec = static_cast(total_ops.load()) / + (static_cast(dur.count()) / 1000.0); + collector.record_scalability("node_insert", N, ops_per_sec, "ops/sec", + {{"agents", n_str}, {"scale_dim", "agents"}}); + } + + auto result = collector.finalize(); + ReportGenerator reporter("results"); + reporter.export_all(result, "node_insert_agent_scaling"); +} + +// ── Node read ───────────────────────────────────────────────────────────────── + +TEST_CASE("Node read agent scaling", "[SCALABILITY][agents][.multi][PROFILE][MULTIAGENT]") { + GraphGenerator generator; + MetricsCollector collector("node_read_agent_scaling"); + + for (uint32_t N : {1u, 2u, 4u}) { + MultiAgentFixture fixture; + auto config_file = generator.generate_empty_graph(); + REQUIRE(fixture.create_agents(N, config_file)); + fixture.wait_for_sync(); + + // Pre-populate 1000 nodes on agent 0; they sync to all agents. 
+ auto* graph0 = fixture.get_agent(0); + std::vector node_ids; + node_ids.reserve(1000); + for (uint64_t i = 0; i < 1000; ++i) { + auto node = GraphGenerator::create_test_node(0, graph0->get_agent_id()); + auto res = graph0->insert_node(node); + REQUIRE(res.has_value()); + node_ids.push_back(res.value()); + } + fixture.wait_for_sync(); + + const size_t pool_size = node_ids.size(); + + std::atomic total_ops{0}; + std::atomic failed_ops{0}; + std::atomic stop_flag{false}; + std::barrier sync_point(N); + + std::vector> per_thread_samples(N); + for (auto& s : per_thread_samples) s.reserve(500000 / N); + + std::vector threads; + threads.reserve(N); + + auto wall_start = steady_clock::now(); + + for (uint32_t i = 0; i < N; ++i) { + threads.emplace_back([&, agent_idx = i]() { + auto* graph = fixture.get_agent(agent_idx); + uint64_t local_ops = 0; + auto& samples = per_thread_samples[agent_idx]; + + sync_point.arrive_and_wait(); + + while (!stop_flag.load(std::memory_order_relaxed)) { + uint64_t id = node_ids[local_ops % pool_size]; + uint64_t ts = bench_now(); + auto node = graph->get_node(id); + samples.push_back(bench_now() - ts); + if (!node.has_value()) + failed_ops.fetch_add(1, std::memory_order_relaxed); + local_ops++; + } + + total_ops.fetch_add(local_ops, std::memory_order_relaxed); + }); + } + + std::this_thread::sleep_for(AGENT_DUR); + stop_flag.store(true, std::memory_order_relaxed); + for (auto& th : threads) th.join(); + + if (failed_ops.load() > 0) + std::cerr << "[BENCH node_read agents=" << N << "] " + << failed_ops.load() << " get_node calls returned empty\n"; + + auto dur = duration_cast(steady_clock::now() - wall_start); + + LatencyTracker merged; + for (auto& s : per_thread_samples) + for (auto v : s) merged.record(v); + + const std::string n_str = std::to_string(N); + collector.record_throughput("node_read", total_ops.load(), dur, + {{"agents", n_str}}); + if (!merged.empty()) + collector.record_latency_stats("node_read", merged.stats(), + {{"agents", 
n_str}}); + + double ops_per_sec = static_cast(total_ops.load()) / + (static_cast(dur.count()) / 1000.0); + collector.record_scalability("node_read", N, ops_per_sec, "ops/sec", + {{"agents", n_str}, {"scale_dim", "agents"}}); + } + + auto result = collector.finalize(); + ReportGenerator reporter("results"); + reporter.export_all(result, "node_read_agent_scaling"); +} + +// ── Node update ─────────────────────────────────────────────────────────────── + +TEST_CASE("Node update agent scaling", "[SCALABILITY][agents][.multi][PROFILE][MULTIAGENT]") { + GraphGenerator generator; + MetricsCollector collector("node_update_agent_scaling"); + + for (uint32_t N : {1u, 2u, 4u}) { + MultiAgentFixture fixture; + auto config_file = generator.generate_empty_graph(); + REQUIRE(fixture.create_agents(N, config_file)); + fixture.wait_for_sync(); + + // Each agent gets its own dedicated node to avoid update contention. + std::vector agent_node_ids(N); + for (uint32_t i = 0; i < N; ++i) { + auto* graph = fixture.get_agent(i); + auto node = GraphGenerator::create_test_node( + 700000 + i, graph->get_agent_id(), + "agent_update_node_" + std::to_string(i)); + auto res = graph->insert_node(node); + REQUIRE(res.has_value()); + agent_node_ids[i] = res.value(); + } + fixture.wait_for_sync(); + + std::atomic total_ops{0}; + std::atomic failed_ops{0}; + std::atomic stop_flag{false}; + std::barrier sync_point(N); + + std::vector> per_thread_samples(N); + for (auto& s : per_thread_samples) s.reserve(500000 / N); + + std::vector threads; + threads.reserve(N); + + auto wall_start = steady_clock::now(); + + for (uint32_t i = 0; i < N; ++i) { + threads.emplace_back([&, agent_idx = i]() { + auto* graph = fixture.get_agent(agent_idx); + uint64_t nid = agent_node_ids[agent_idx]; + uint64_t local_ops = 0; + auto& samples = per_thread_samples[agent_idx]; + + sync_point.arrive_and_wait(); + + while (!stop_flag.load(std::memory_order_relaxed)) { + auto node = graph->get_node(nid); + if (node) { + 
graph->add_or_modify_attrib_local( + *node, static_cast(local_ops % 1000)); + uint64_t ts = bench_now(); + bool ok = graph->update_node(*node); + samples.push_back(bench_now() - ts); + if (!ok) + failed_ops.fetch_add(1, std::memory_order_relaxed); + local_ops++; + } else { + failed_ops.fetch_add(1, std::memory_order_relaxed); + } + } + + total_ops.fetch_add(local_ops, std::memory_order_relaxed); + }); + } + + std::this_thread::sleep_for(AGENT_DUR); + stop_flag.store(true, std::memory_order_relaxed); + for (auto& th : threads) th.join(); + + if (failed_ops.load() > 0) + std::cerr << "[BENCH node_update agents=" << N << "] " + << failed_ops.load() << " get_node/update_node calls failed\n"; + + auto dur = duration_cast(steady_clock::now() - wall_start); + + LatencyTracker merged; + for (auto& s : per_thread_samples) + for (auto v : s) merged.record(v); + + const std::string n_str = std::to_string(N); + collector.record_throughput("node_update", total_ops.load(), dur, + {{"agents", n_str}}); + if (!merged.empty()) + collector.record_latency_stats("node_update", merged.stats(), + {{"agents", n_str}}); + + double ops_per_sec = static_cast(total_ops.load()) / + (static_cast(dur.count()) / 1000.0); + collector.record_scalability("node_update", N, ops_per_sec, "ops/sec", + {{"agents", n_str}, {"scale_dim", "agents"}}); + } + + auto result = collector.finalize(); + ReportGenerator reporter("results"); + reporter.export_all(result, "node_update_agent_scaling"); +} diff --git a/benchmarks/scalability/graph_size_impact_bench.cpp b/benchmarks/scalability/graph_size_impact_bench.cpp new file mode 100644 index 0000000..618ea84 --- /dev/null +++ b/benchmarks/scalability/graph_size_impact_bench.cpp @@ -0,0 +1,294 @@ +#include +#include + +#include "../core/nanobench_adapter.h" +#include "../core/metrics_collector.h" +#include "../core/report_generator.h" +#include "../fixtures/multi_agent_fixture.h" +#include "../fixtures/graph_generator.h" + +using namespace DSR; +using namespace 
DSR::Benchmark; + +TEST_CASE("Graph size impact on performance", "[SCALABILITY][graphsize]") { + MetricsCollector collector("graph_size_impact"); + GraphGenerator generator; + + SECTION("Node lookup performance vs graph size") { + for (uint32_t size : {100, 1000, 10000}) { + MultiAgentFixture fixture; + auto config_file = generator.generate_empty_graph(); + REQUIRE(fixture.create_agents(1, config_file)); + + auto* graph = fixture.get_agent(0); + REQUIRE(graph != nullptr); + + // Populate graph and store actual IDs + std::vector node_ids; + node_ids.reserve(size); + for (uint32_t i = 0; i < static_cast(size); ++i) { + auto node = GraphGenerator::create_test_node(0, graph->get_agent_id()); + auto result = graph->insert_node(node); + REQUIRE(result.has_value()); + node_ids.push_back(result.value()); + } + + // Cache warmup: touch every node once + for (const auto id : node_ids) { (void)graph->get_node(id); } + + size_t idx = 0; + bool last_ok = true; + auto bench = make_latency_bench(1000, 0); // manual warmup done above + bench.run("node_lookup", [&] { + auto node = graph->get_node(node_ids[idx++ % node_ids.size()]); + last_ok = node.has_value(); + ankerl::nanobench::doNotOptimizeAway(node); + }); + REQUIRE(last_ok); + + auto stats = nb_to_stats(bench); + collector.record_scalability( + "node_lookup", + size, + stats.mean_ns, + "ns", + {{"graph_size", std::to_string(size)}}); + + INFO(size << " nodes - Lookup: " << stats.mean_ns << " ns"); + } + } + + SECTION("Node insertion performance vs graph size") { + for (uint32_t size : {100, 1000, 10000}) { + MultiAgentFixture fixture; + auto config_file = generator.generate_empty_graph(); + REQUIRE(fixture.create_agents(1, config_file)); + + auto* graph = fixture.get_agent(0); + REQUIRE(graph != nullptr); + + // Populate graph to target size + for (uint32_t i = 0; i < static_cast(size); ++i) { + auto node = GraphGenerator::create_test_node( + 2000000 + i, graph->get_agent_id()); + auto res = graph->insert_node(node); + 
REQUIRE(res.has_value()); + } + + // ~35µs/op: 300 iters/epoch × 50 epochs ≈ 0.53 s + uint64_t id_counter = 3000000; + auto bench = make_latency_bench(50); + bench.minEpochIterations(300); + bench.run("node_insert", [&] { + auto node = GraphGenerator::create_test_node( + id_counter++, graph->get_agent_id()); + auto res = graph->insert_node(node); + REQUIRE(res.has_value()); + ankerl::nanobench::doNotOptimizeAway(res); + }); + + auto stats = nb_to_stats(bench); + collector.record_scalability( + "node_insert_latency", + size, + stats.mean_us(), + "us", + {{"graph_size", std::to_string(size)}}); + + INFO(size << " existing nodes - Insert: " << stats.mean_us() << " us"); + } + } + + SECTION("Edge operations vs edge count") { + for (uint32_t edge_count : {100, 1000, 5000}) { + MultiAgentFixture fixture; + auto config_file = generator.generate_empty_graph(); + REQUIRE(fixture.create_agents(1, config_file)); + + auto* graph = fixture.get_agent(0); + REQUIRE(graph != nullptr); + + auto root = graph->get_node_root(); + REQUIRE(root.has_value()); + + // Create nodes for edges and store actual IDs + std::vector node_ids; + node_ids.reserve(edge_count + 100); + for (uint32_t i = 0; i < edge_count + 100; ++i) { + auto node = GraphGenerator::create_test_node(0, graph->get_agent_id()); + auto result = graph->insert_node(node); + REQUIRE(result.has_value()); + node_ids.push_back(result.value()); + } + + // Create edges for the first edge_count nodes + for (uint32_t i = 0; i < edge_count; ++i) { + auto edge = GraphGenerator::create_test_edge( + root->id(), node_ids[i], graph->get_agent_id()); + REQUIRE(graph->insert_or_assign_edge(edge)); + } + + // Cache warmup: touch every existing edge once + for (uint32_t i = 0; i < edge_count; ++i) { + (void)graph->get_edge(root->id(), node_ids[i], "test_edge"); + } + + // Measure edge lookup performance + // ~32µs at edge_count=100 (unstable, needs 300 iters); larger counts are stable. + size_t lookup_min_iters = (edge_count <= 100) ? 
300 : 1; + size_t lookup_epochs = (edge_count <= 100) ? 50 : 200; + size_t lookup_idx = 0; + bool last_ok = true; + auto lookup_bench = make_latency_bench(lookup_epochs, 0); // manual warmup done above + lookup_bench.minEpochIterations(lookup_min_iters); + lookup_bench.run("edge_lookup", [&] { + uint64_t target = node_ids[lookup_idx++ % edge_count]; + auto edge = graph->get_edge(root->id(), target, "test_edge"); + last_ok = edge.has_value(); + ankerl::nanobench::doNotOptimizeAway(edge); + }); + REQUIRE(last_ok); + + auto lookup_stats = nb_to_stats(lookup_bench); + collector.record_scalability( + "edge_lookup", + edge_count, + lookup_stats.mean_ns, + "ns", + {{"edge_count", std::to_string(edge_count)}}); + + // Measure edge insertion performance (last 100 nodes have no edges yet) + // ~13µs/op (idempotent upsert): 800 iters/epoch × 50 epochs ≈ 0.52 s + size_t insert_idx = 0; + auto insert_bench = make_latency_bench(50); + insert_bench.minEpochIterations(800); + insert_bench.run("edge_insert", [&] { + uint64_t target = node_ids[edge_count + (insert_idx++ % 100)]; + auto edge = GraphGenerator::create_test_edge( + root->id(), target, graph->get_agent_id()); + bool ok = graph->insert_or_assign_edge(edge); + REQUIRE(ok); + ankerl::nanobench::doNotOptimizeAway(ok); + }); + + auto insert_stats = nb_to_stats(insert_bench); + collector.record_scalability( + "edge_insert_latency", + edge_count, + insert_stats.mean_us(), + "us", + {{"edge_count", std::to_string(edge_count)}}); + + INFO(edge_count << " edges - Lookup: " << lookup_stats.mean_ns + << " ns, Insert: " << insert_stats.mean_us() << " us"); + } + } + + SECTION("get_nodes performance vs graph size") { + for (uint32_t size : {100, 1000, 5000}) { + MultiAgentFixture fixture; + auto config_file = generator.generate_empty_graph(); + REQUIRE(fixture.create_agents(1, config_file)); + + auto* graph = fixture.get_agent(0); + REQUIRE(graph != nullptr); + + // Populate + for (uint32_t i = 0; i < static_cast(size); ++i) { + auto 
node = GraphGenerator::create_test_node( + 5000000 + i, graph->get_agent_id()); + auto res = graph->insert_node(node); + REQUIRE(res.has_value()); + } + + auto bench = make_latency_bench(100); + bench.run("get_all_nodes", [&] { + auto nodes = graph->get_nodes(); + ankerl::nanobench::doNotOptimizeAway(nodes); + }); + + auto stats = nb_to_stats(bench); + collector.record_scalability( + "get_all_nodes", + size, + stats.mean_us(), + "us", + {{"graph_size", std::to_string(size)}}); + + INFO(size << " nodes - get_nodes: " << stats.mean_us() << " us"); + } + } + + auto result = collector.finalize(); + ReportGenerator reporter("results"); + reporter.export_all(result, "graph_size_impact"); +} + +TEST_CASE("Memory pressure impact", "[SCALABILITY][memory]") { + MetricsCollector collector("memory_pressure"); + GraphGenerator generator; + + SECTION("Operation latency under memory pressure") { + MultiAgentFixture fixture; + auto config_file = generator.generate_empty_graph(); + REQUIRE(fixture.create_agents(1, config_file)); + + auto* graph = fixture.get_agent(0); + REQUIRE(graph != nullptr); + + // Create increasingly large graph and measure periodically + std::vector> size_vs_latency; + + for (uint32_t target_size : {1000, 5000, 10000, 20000}) { + // Add nodes to reach target size + uint64_t current_size = graph->get_nodes().size(); + for (uint64_t i = current_size; i < target_size; ++i) { + auto node = GraphGenerator::create_test_node( + 6000000 + i, graph->get_agent_id()); + auto res = graph->insert_node(node); + REQUIRE(res.has_value()); + } + + // ~28–41µs/op: 500 iters/epoch × 50 epochs ≈ 0.7–1.0 s + uint64_t id_counter = 7000000 + static_cast(target_size) * 100; + auto bench = make_latency_bench(50); + bench.minEpochIterations(500); + bench.run("insert_under_pressure", [&] { + auto node = GraphGenerator::create_test_node( + id_counter++, graph->get_agent_id()); + auto res = graph->insert_node(node); + REQUIRE(res.has_value()); + 
ankerl::nanobench::doNotOptimizeAway(res); + }); + + auto stats = nb_to_stats(bench); + collector.record_scalability( + "insert_under_pressure", + target_size, + stats.mean_us(), + "us", + {{"graph_size", std::to_string(target_size)}}); + + size_vs_latency.push_back({target_size, stats.mean_us()}); + INFO(target_size << " nodes - Insert latency: " << stats.mean_us() << " us"); + } + + // Check for non-linear degradation + if (size_vs_latency.size() >= 2) { + double first_latency = size_vs_latency.front().second; + double last_latency = size_vs_latency.back().second; + double size_ratio = static_cast(size_vs_latency.back().first) / + static_cast(size_vs_latency.front().first); + double latency_ratio = last_latency / first_latency; + + collector.record("latency_degradation_ratio", MetricCategory::Scalability, + latency_ratio / size_ratio, "x"); + + INFO("Latency degradation ratio: " << latency_ratio / size_ratio << "x"); + } + } + + auto result = collector.finalize(); + ReportGenerator reporter("results"); + reporter.export_all(result, "memory_pressure"); +} diff --git a/benchmarks/scalability/graph_size_scaling_bench.cpp b/benchmarks/scalability/graph_size_scaling_bench.cpp new file mode 100644 index 0000000..8195264 --- /dev/null +++ b/benchmarks/scalability/graph_size_scaling_bench.cpp @@ -0,0 +1,272 @@ +#include + +#include "../core/nanobench_adapter.h" +#include "../core/metrics_collector.h" +#include "../core/report_generator.h" +#include "../fixtures/multi_agent_fixture.h" +#include "../fixtures/graph_generator.h" + +using namespace DSR; +using namespace DSR::Benchmark; + +// For each operation, measures latency (1000 samples) and derives throughput +// from the nanobench mean, at three pre-existing graph sizes: {100, 1000, 10000}. +// "graph_size" = number of nodes already in the graph before measurement begins. 
+ +// ── Node insert ─────────────────────────────────────────────────────────────── + +TEST_CASE("Node insert graph size scaling", "[SCALABILITY][graphsize]") { + GraphGenerator generator; + MetricsCollector collector("node_insert_graphsize_scaling"); + + for (uint32_t N : {100u, 1000u, 10000u}) { + MultiAgentFixture fixture; + auto config_file = generator.generate_empty_graph(); + REQUIRE(fixture.create_agents(1, config_file)); + auto* graph = fixture.get_agent(0); + REQUIRE(graph != nullptr); + + // Pre-populate to target size + for (uint32_t i = 0; i < N; ++i) { + auto node = GraphGenerator::create_test_node(2000000 + i, graph->get_agent_id()); + auto res = graph->insert_node(node); + REQUIRE(res.has_value()); + } + + // ~38µs/op: 300 iters/epoch × 50 epochs ≈ 0.57 s + uint64_t id_counter = 3000000; + auto bench = make_latency_bench(50); + bench.minEpochIterations(300); + bench.run("node_insert", [&] { + auto node = GraphGenerator::create_test_node(id_counter++, graph->get_agent_id()); + auto res = graph->insert_node(node); + REQUIRE(res.has_value()); + ankerl::nanobench::doNotOptimizeAway(res); + }); + + auto stats = nb_to_stats(bench); + const std::string n_str = std::to_string(N); + collector.record_latency_stats("node_insert", stats, {{"graph_size", n_str}}); + collector.record("node_insert", MetricCategory::Throughput, + nb_throughput(bench), "ops/sec", {{"graph_size", n_str}}); + collector.record_scalability("node_insert", N, stats.mean_ns, "ns", + {{"graph_size", n_str}, {"scale_dim", "graph_size"}}); + } + + auto result = collector.finalize(); + ReportGenerator reporter("results"); + reporter.export_all(result, "node_insert_graphsize_scaling"); +} + +// ── Node read ───────────────────────────────────────────────────────────────── + +TEST_CASE("Node read graph size scaling", "[SCALABILITY][graphsize]") { + GraphGenerator generator; + MetricsCollector collector("node_read_graphsize_scaling"); + + for (uint32_t N : {100u, 1000u, 10000u}) { + 
MultiAgentFixture fixture; + auto config_file = generator.generate_empty_graph(); + REQUIRE(fixture.create_agents(1, config_file)); + auto* graph = fixture.get_agent(0); + REQUIRE(graph != nullptr); + + std::vector node_ids; + node_ids.reserve(N); + for (uint32_t i = 0; i < N; ++i) { + auto node = GraphGenerator::create_test_node(0, graph->get_agent_id()); + auto res = graph->insert_node(node); + REQUIRE(res.has_value()); + node_ids.push_back(res.value()); + } + + // Cache warmup: touch every node once so all N entries are in L3 + for (const auto id : node_ids) { (void)graph->get_node(id); } + + // ~900 ns–1 µs/op: 10 000 iters/epoch × 200 epochs ≈ 1.8 s + size_t idx = 0; + bool last_ok = true; + auto bench = make_latency_bench(200, 0); // manual warmup done above + bench.minEpochIterations(10000); + bench.run("node_read", [&] { + auto node = graph->get_node(node_ids[idx++ % node_ids.size()]); + last_ok = node.has_value(); + ankerl::nanobench::doNotOptimizeAway(node); + }); + REQUIRE(last_ok); + + auto stats = nb_to_stats(bench); + const std::string n_str = std::to_string(N); + collector.record_latency_stats("node_read", stats, {{"graph_size", n_str}}); + collector.record("node_read", MetricCategory::Throughput, + nb_throughput(bench), "ops/sec", {{"graph_size", n_str}}); + collector.record_scalability("node_read", N, stats.mean_ns, "ns", + {{"graph_size", n_str}, {"scale_dim", "graph_size"}}); + } + + auto result = collector.finalize(); + ReportGenerator reporter("results"); + reporter.export_all(result, "node_read_graphsize_scaling"); +} + +// ── Node update ─────────────────────────────────────────────────────────────── + +TEST_CASE("Node update graph size scaling", "[SCALABILITY][graphsize]") { + GraphGenerator generator; + MetricsCollector collector("node_update_graphsize_scaling"); + + for (uint32_t N : {100u, 1000u, 10000u}) { + MultiAgentFixture fixture; + auto config_file = generator.generate_empty_graph(); + REQUIRE(fixture.create_agents(1, config_file)); 
+ auto* graph = fixture.get_agent(0); + REQUIRE(graph != nullptr); + + std::vector node_ids; + node_ids.reserve(N); + for (uint32_t i = 0; i < N; ++i) { + auto node = GraphGenerator::create_test_node(0, graph->get_agent_id()); + auto res = graph->insert_node(node); + REQUIRE(res.has_value()); + node_ids.push_back(res.value()); + } + + // ~35µs/op: 350 iters/epoch × 50 epochs ≈ 0.61 s + uint64_t update_counter = 0; + size_t idx = 0; + auto bench = make_latency_bench(50); + bench.minEpochIterations(350); + bench.run("node_update", [&] { + auto node = graph->get_node(node_ids[idx++ % node_ids.size()]); + REQUIRE(node.has_value()); + graph->add_or_modify_attrib_local( + *node, static_cast(update_counter++ % 1000)); + bool ok = graph->update_node(*node); + REQUIRE(ok); + ankerl::nanobench::doNotOptimizeAway(ok); + }); + + auto stats = nb_to_stats(bench); + const std::string n_str = std::to_string(N); + collector.record_latency_stats("node_update", stats, {{"graph_size", n_str}}); + collector.record("node_update", MetricCategory::Throughput, + nb_throughput(bench), "ops/sec", {{"graph_size", n_str}}); + collector.record_scalability("node_update", N, stats.mean_ns, "ns", + {{"graph_size", n_str}, {"scale_dim", "graph_size"}}); + } + + auto result = collector.finalize(); + ReportGenerator reporter("results"); + reporter.export_all(result, "node_update_graphsize_scaling"); +} + +// ── Edge insert ─────────────────────────────────────────────────────────────── + +TEST_CASE("Edge insert graph size scaling", "[SCALABILITY][graphsize]") { + GraphGenerator generator; + MetricsCollector collector("edge_insert_graphsize_scaling"); + + for (uint32_t N : {100u, 1000u, 10000u}) { + MultiAgentFixture fixture; + auto config_file = generator.generate_empty_graph(); + REQUIRE(fixture.create_agents(1, config_file)); + auto* graph = fixture.get_agent(0); + REQUIRE(graph != nullptr); + + auto root = graph->get_node_root(); + REQUIRE(root.has_value()); + + // Pre-populate N target nodes + 
std::vector node_ids; + node_ids.reserve(N); + for (uint32_t i = 0; i < N; ++i) { + auto node = GraphGenerator::create_test_node(0, graph->get_agent_id()); + auto res = graph->insert_node(node); + REQUIRE(res.has_value()); + node_ids.push_back(res.value()); + } + + // ~12µs (N≤1000): 800 iters/epoch × 50 epochs ≈ 0.48 s + // ~225µs (N=10000): 100 iters/epoch × 50 epochs ≈ 1.13 s + size_t idx = 0; + auto bench = make_latency_bench(50); + bench.minEpochIterations(N <= 1000 ? 800 : 100); + bench.run("edge_insert", [&] { + uint64_t target = node_ids[idx++ % node_ids.size()]; + auto edge = GraphGenerator::create_test_edge( + root->id(), target, graph->get_agent_id()); + bool ok = graph->insert_or_assign_edge(edge); + REQUIRE(ok); + ankerl::nanobench::doNotOptimizeAway(ok); + }); + + auto stats = nb_to_stats(bench); + const std::string n_str = std::to_string(N); + collector.record_latency_stats("edge_insert", stats, {{"graph_size", n_str}}); + collector.record("edge_insert", MetricCategory::Throughput, + nb_throughput(bench), "ops/sec", {{"graph_size", n_str}}); + collector.record_scalability("edge_insert", N, stats.mean_ns, "ns", + {{"graph_size", n_str}, {"scale_dim", "graph_size"}}); + } + + auto result = collector.finalize(); + ReportGenerator reporter("results"); + reporter.export_all(result, "edge_insert_graphsize_scaling"); +} + +// ── Edge read ───────────────────────────────────────────────────────────────── + +TEST_CASE("Edge read graph size scaling", "[SCALABILITY][graphsize]") { + GraphGenerator generator; + MetricsCollector collector("edge_read_graphsize_scaling"); + + for (uint32_t N : {100u, 1000u, 10000u}) { + MultiAgentFixture fixture; + auto config_file = generator.generate_empty_graph(); + REQUIRE(fixture.create_agents(1, config_file)); + auto* graph = fixture.get_agent(0); + REQUIRE(graph != nullptr); + + auto root = graph->get_node_root(); + REQUIRE(root.has_value()); + + // Pre-populate N nodes + edges + std::vector target_ids; + 
target_ids.reserve(N); + for (uint32_t i = 0; i < N; ++i) { + auto node = GraphGenerator::create_test_node(0, graph->get_agent_id()); + auto res = graph->insert_node(node); + REQUIRE(res.has_value()); + target_ids.push_back(res.value()); + auto edge = GraphGenerator::create_test_edge( + root->id(), res.value(), graph->get_agent_id()); + REQUIRE(graph->insert_or_assign_edge(edge)); + } + + // Cache warmup: touch every edge once + for (const auto id : target_ids) { (void)graph->get_edge(root->id(), id, "test_edge"); } + + size_t idx = 0; + bool last_ok = true; + auto bench = make_latency_bench(1000, 0); // manual warmup done above + bench.run("edge_read", [&] { + uint64_t target = target_ids[idx++ % target_ids.size()]; + auto edge = graph->get_edge(root->id(), target, "test_edge"); + last_ok = edge.has_value(); + ankerl::nanobench::doNotOptimizeAway(edge); + }); + REQUIRE(last_ok); + + auto stats = nb_to_stats(bench); + const std::string n_str = std::to_string(N); + collector.record_latency_stats("edge_read", stats, {{"graph_size", n_str}}); + collector.record("edge_read", MetricCategory::Throughput, + nb_throughput(bench), "ops/sec", {{"graph_size", n_str}}); + collector.record_scalability("edge_read", N, stats.mean_ns, "ns", + {{"graph_size", n_str}, {"scale_dim", "graph_size"}}); + } + + auto result = collector.finalize(); + ReportGenerator reporter("results"); + reporter.export_all(result, "edge_read_graphsize_scaling"); +} diff --git a/benchmarks/scalability/multi_agent_sync_bench.cpp b/benchmarks/scalability/multi_agent_sync_bench.cpp new file mode 100644 index 0000000..6707751 --- /dev/null +++ b/benchmarks/scalability/multi_agent_sync_bench.cpp @@ -0,0 +1,286 @@ +#include +#include +#include +#include + +#include "../core/timing_utils.h" +#include "../core/metrics_collector.h" +#include "../core/report_generator.h" +#include "../fixtures/multi_agent_fixture.h" +#include "../fixtures/graph_generator.h" + +using namespace DSR; +using namespace DSR::Benchmark; + 
+TEST_CASE("Multi-agent synchronization benchmarks", "[SCALABILITY][sync][.multi][PROFILE][MULTIAGENT]") { + GraphGenerator generator; + MetricsCollector collector("multi_agent_sync"); + + SECTION("Initial sync time vs agent count") { + for (uint32_t num_agents : {2, 4, 8, 16}) { + auto config_file = generator.generate_empty_graph(); + + LatencyTracker tracker(10); + + for (int trial = 0; trial < 10; ++trial) { + MultiAgentFixture fixture; + + uint64_t start = get_unix_timestamp(); + bool created = fixture.create_agents(num_agents, config_file); + if (!created) { + WARN("Could not create " << num_agents << " agents"); + break; + } + + fixture.wait_for_sync(); + bool converged = fixture.verify_convergence(); + uint64_t elapsed = get_unix_timestamp() - start; + + if (converged) { + tracker.record(elapsed); + } + + // Cleanup before next trial + } + + if (tracker.count() > 0) { + auto stats = tracker.stats(); + collector.record_scalability( + "initial_sync_time", + num_agents, + stats.mean_ms(), + "ms", + {{"num_agents", std::to_string(num_agents)}}); + + INFO(num_agents << " agents - Initial sync: " << stats.mean_ms() << " ms"); + } + } + } + + SECTION("Convergence time after operation") { + for (uint32_t num_agents : {2, 4, 8}) { + MultiAgentFixture fixture; + auto config_file = generator.generate_empty_graph(); + + if (!fixture.create_agents(num_agents, config_file)) { + WARN("Could not create " << num_agents << " agents"); + continue; + } + fixture.wait_for_sync(); + + LatencyTracker tracker(50); + + // Measure convergence time after node insertion + for (int i = 0; i < 50; ++i) { + auto* sender = fixture.get_agent(0); + auto node = GraphGenerator::create_test_node( + 700000 + i, sender->get_agent_id(), + "sync_node_" + std::to_string(i)); + + uint64_t start = get_unix_timestamp(); + sender->insert_node(node); + + auto conv_time = fixture.measure_convergence_time(); + if (conv_time.count() >= 0) { + tracker.record(static_cast(conv_time.count()) * 1'000'000); // ms 
to ns + } + } + + if (tracker.count() > 0) { + auto stats = tracker.stats(); + collector.record_scalability( + "convergence_after_insert", + num_agents, + stats.mean_ms(), + "ms", + {{"num_agents", std::to_string(num_agents)}}); + + INFO(num_agents << " agents - Convergence time: " << stats.mean_ms() << " ms"); + } + } + } + + SECTION("Broadcast time to all agents") { + for (uint32_t num_agents : {2, 4, 8}) { + MultiAgentFixture fixture; + auto config_file = generator.generate_empty_graph(); + + if (!fixture.create_agents(num_agents, config_file)) { + WARN("Could not create " << num_agents << " agents"); + continue; + } + fixture.wait_for_sync(); + + LatencyTracker tracker(50); + + // Track when each agent receives the update + std::vector> receive_times(num_agents - 1); + std::vector> received(num_agents - 1); + + for (size_t i = 1; i < num_agents; ++i) { + auto* receiver = fixture.get_agent(i); + QObject::connect(receiver, &DSR::DSRGraph::update_node_signal, receiver, + [&, idx = i - 1](uint64_t id, const std::string& type, DSR::SignalInfo) { + if (id >= 800000 && id < 900000 && !received[idx].load()) { + receive_times[idx].store(get_unix_timestamp()); + received[idx].store(true); + } + }, Qt::DirectConnection); + } + + auto* sender = fixture.get_agent(0); + + for (int i = 0; i < 50; ++i) { + // Reset tracking + for (size_t j = 0; j < num_agents - 1; ++j) { + receive_times[j].store(0); + received[j].store(false); + } + + auto node = GraphGenerator::create_test_node( + 800000 + i, sender->get_agent_id(), + "broadcast_node_" + std::to_string(i)); + + uint64_t send_time = get_unix_timestamp(); + sender->insert_node(node); + + // Wait for all receivers + auto start = std::chrono::steady_clock::now(); + while (true) { + bool all_received = true; + for (size_t j = 0; j < num_agents - 1; ++j) { + if (!received[j].load()) { + all_received = false; + break; + } + } + + if (all_received) break; + + fixture.process_events(1); + + if (std::chrono::steady_clock::now() - start 
> std::chrono::seconds(5)) { + break; + } + } + + // Find max receive time (last agent to receive) + uint64_t max_time = 0; + for (size_t j = 0; j < num_agents - 1; ++j) { + if (received[j].load()) { + max_time = std::max(max_time, receive_times[j].load()); + } + } + + if (max_time > send_time) { + tracker.record(max_time - send_time); + } + } + + if (tracker.count() > 0) { + auto stats = tracker.stats(); + collector.record_scalability( + "broadcast_to_all", + num_agents, + stats.mean_us(), + "us", + {{"num_agents", std::to_string(num_agents)}}); + + INFO(num_agents << " agents - Broadcast time: " << stats.mean_us() << " us"); + } + } + } + + auto result = collector.finalize(); + ReportGenerator reporter("results"); + reporter.export_all(result, "multi_agent_sync"); +} + +TEST_CASE("Scaling efficiency", "[SCALABILITY][efficiency][.multi][PROFILE][MULTIAGENT]") { + GraphGenerator generator; + MetricsCollector collector("scaling_efficiency"); + + std::map throughputs; + + SECTION("Throughput scaling with agents") { + for (uint32_t num_agents : {1, 2, 4, 8}) { + MultiAgentFixture fixture; + auto config_file = generator.generate_empty_graph(); + + if (!fixture.create_agents(num_agents, config_file)) { + WARN("Could not create " << num_agents << " agents"); + continue; + } + fixture.wait_for_sync(); + + constexpr auto TEST_DURATION = std::chrono::seconds(3); + std::atomic total_ops{0}; + std::atomic stop_flag{false}; + + std::vector threads; + threads.reserve(num_agents); + + auto start = std::chrono::steady_clock::now(); + + for (size_t i = 0; i < num_agents; ++i) { + threads.emplace_back([&, agent_idx = i]() { + auto* graph = fixture.get_agent(agent_idx); + uint64_t base_id = 900000 + agent_idx * 50000; + uint64_t local_ops = 0; + + while (!stop_flag.load(std::memory_order_relaxed)) { + auto node = GraphGenerator::create_test_node( + base_id + local_ops, graph->get_agent_id()); + graph->insert_node(node); + local_ops++; + } + + total_ops.fetch_add(local_ops, 
std::memory_order_relaxed); + }); + } + + std::this_thread::sleep_for(TEST_DURATION); + stop_flag.store(true); + + for (auto& t : threads) { + t.join(); + } + + auto actual_duration = std::chrono::duration_cast( + std::chrono::steady_clock::now() - start); + + double ops_per_sec = static_cast(total_ops.load()) / + (static_cast(actual_duration.count()) / 1000.0); + + throughputs[num_agents] = ops_per_sec; + + collector.record_scalability( + "throughput_scaling", + num_agents, + ops_per_sec, + "ops/sec", + {{"num_agents", std::to_string(num_agents)}}); + + INFO(num_agents << " agents - Throughput: " << ops_per_sec << " ops/sec"); + } + + // Calculate scaling efficiency + if (throughputs.count(1) > 0 && throughputs.count(2) > 0) { + double efficiency_2 = throughputs[2] / (2 * throughputs[1]) * 100; + collector.record("scaling_efficiency_2_agents", MetricCategory::Scalability, + efficiency_2, "%"); + INFO("Scaling efficiency (2 agents): " << efficiency_2 << "%"); + } + + if (throughputs.count(1) > 0 && throughputs.count(4) > 0) { + double efficiency_4 = throughputs[4] / (4 * throughputs[1]) * 100; + collector.record("scaling_efficiency_4_agents", MetricCategory::Scalability, + efficiency_4, "%"); + INFO("Scaling efficiency (4 agents): " << efficiency_4 << "%"); + } + } + + auto result = collector.finalize(); + ReportGenerator reporter("results"); + reporter.export_all(result, "scaling_efficiency"); +} diff --git a/benchmarks/scalability/thread_scaling_bench.cpp b/benchmarks/scalability/thread_scaling_bench.cpp new file mode 100644 index 0000000..86f543d --- /dev/null +++ b/benchmarks/scalability/thread_scaling_bench.cpp @@ -0,0 +1,538 @@ +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include "../core/nanobench_adapter.h" +#include "../core/timing_utils.h" +#include "../core/metrics_collector.h" +#include "../core/report_generator.h" +#include "../fixtures/multi_agent_fixture.h" +#include "../fixtures/graph_generator.h" + 
+using namespace DSR; +using namespace DSR::Benchmark; +using namespace std::chrono; + +// Measures throughput + latency across {1, 2, 4, 8} threads for each +// operation. Each iteration runs a 5-second window; per-thread raw latency +// samples are merged into a single LatencyTracker for aggregate stats. +// A record_scalability() entry is added so the Scalability tab can plot +// the efficiency curve (scale_dim = "threads"). +// +// nanobench wraps each (op, thread-count) run so results appear in the shared +// nanobench table (stdout + results/nanobench_report.md). bench.batch() is +// set to total_ops so the table shows per-operation throughput, not wall time. + +static constexpr auto THREAD_DUR = std::chrono::seconds(5); + +// ── Node insert ─────────────────────────────────────────────────────────────── + +TEST_CASE("Node insert thread scaling", "[SCALABILITY][threads]") { + MultiAgentFixture fixture; + GraphGenerator generator; + MetricsCollector collector("node_insert_thread_scaling"); + + auto config_file = generator.generate_empty_graph(); + REQUIRE(fixture.create_agents(1, config_file)); + auto* graph = fixture.get_agent(0); + REQUIRE(graph != nullptr); + + ankerl::nanobench::Bench bench; + bench.output(&nb_report_stream()).warmup(0).epochs(1).epochIterations(1); + + for (uint32_t N : {1u, 2u, 4u, 8u}) { + std::atomic total_ops{0}; + std::atomic failed_ops{0}; + std::atomic stop_flag{false}; + std::barrier sync_point(N); + + std::vector> per_thread_samples(N); + for (auto& s : per_thread_samples) s.reserve(2000000 / N); + + std::vector threads; + threads.reserve(N); + + auto wall_start = steady_clock::now(); + + bench.run("node_insert_" + std::to_string(N) + "t", [&] { + for (uint32_t t = 0; t < N; ++t) { + threads.emplace_back([&, tid = t]() { + uint64_t base_id = 200000ULL + tid * 200000ULL; + uint64_t local_ops = 0; + auto& samples = per_thread_samples[tid]; + + sync_point.arrive_and_wait(); + + while (!stop_flag.load(std::memory_order_relaxed)) { + 
auto node = GraphGenerator::create_test_node( + base_id + local_ops, graph->get_agent_id()); + uint64_t ts = bench_now(); + auto res = graph->insert_node(node); + samples.push_back(bench_now() - ts); + if (!res.has_value()) + failed_ops.fetch_add(1, std::memory_order_relaxed); + local_ops++; + } + + total_ops.fetch_add(local_ops, std::memory_order_relaxed); + }); + } + + std::this_thread::sleep_for(THREAD_DUR); + stop_flag.store(true, std::memory_order_relaxed); + for (auto& th : threads) th.join(); + + bench.batch(total_ops.load()); + ankerl::nanobench::doNotOptimizeAway(total_ops.load()); + }); + + if (failed_ops.load() > 0) + std::cerr << "[BENCH node_insert threads=" << N << "] " + << failed_ops.load() << " insert_node calls failed\n"; + + auto dur = duration_cast(steady_clock::now() - wall_start); + + LatencyTracker merged; + for (auto& s : per_thread_samples) + for (auto v : s) merged.record(v); + + const std::string n_str = std::to_string(N); + collector.record_throughput("node_insert", total_ops.load(), dur, + {{"threads", n_str}}); + if (!merged.empty()) + collector.record_latency_stats("node_insert", merged.stats(), + {{"threads", n_str}}); + + double ops_per_sec = static_cast(total_ops.load()) / + (static_cast(dur.count()) / 1000.0); + collector.record_scalability("node_insert", N, ops_per_sec, "ops/sec", + {{"threads", n_str}, {"scale_dim", "threads"}}); + } + + auto result = collector.finalize(); + ReportGenerator reporter("results"); + reporter.export_all(result, "node_insert_thread_scaling"); +} + +// ── Node read ───────────────────────────────────────────────────────────────── + +TEST_CASE("Node read thread scaling", "[SCALABILITY][threads]") { + MultiAgentFixture fixture; + GraphGenerator generator; + MetricsCollector collector("node_read_thread_scaling"); + + auto config_file = generator.generate_empty_graph(); + REQUIRE(fixture.create_agents(1, config_file)); + auto* graph = fixture.get_agent(0); + REQUIRE(graph != nullptr); + + // Pre-populate 
once; all thread-count iterations share this pool. + std::vector node_ids; + node_ids.reserve(1000); + for (uint64_t i = 0; i < 1000; ++i) { + auto node = GraphGenerator::create_test_node(0, graph->get_agent_id()); + auto res = graph->insert_node(node); + REQUIRE(res.has_value()); + node_ids.push_back(res.value()); + } + const size_t pool_size = node_ids.size(); + + ankerl::nanobench::Bench bench; + bench.output(&nb_report_stream()).warmup(0).epochs(1).epochIterations(1); + + for (uint32_t N : {1u, 2u, 4u, 8u}) { + std::atomic total_ops{0}; + std::atomic failed_ops{0}; + std::atomic stop_flag{false}; + std::barrier sync_point(N); + + std::vector> per_thread_samples(N); + for (auto& s : per_thread_samples) s.reserve(2000000 / N); + + std::vector threads; + threads.reserve(N); + + auto wall_start = steady_clock::now(); + + bench.run("node_read_" + std::to_string(N) + "t", [&] { + for (uint32_t t = 0; t < N; ++t) { + threads.emplace_back([&, tid = t]() { + uint64_t local_ops = 0; + auto& samples = per_thread_samples[tid]; + + sync_point.arrive_and_wait(); + + while (!stop_flag.load(std::memory_order_relaxed)) { + uint64_t id = node_ids[local_ops % pool_size]; + uint64_t ts = bench_now(); + auto node = graph->get_node(id); + samples.push_back(bench_now() - ts); + if (!node.has_value()) + failed_ops.fetch_add(1, std::memory_order_relaxed); + local_ops++; + } + + total_ops.fetch_add(local_ops, std::memory_order_relaxed); + }); + } + + std::this_thread::sleep_for(THREAD_DUR); + stop_flag.store(true, std::memory_order_relaxed); + for (auto& th : threads) th.join(); + + bench.batch(total_ops.load()); + ankerl::nanobench::doNotOptimizeAway(total_ops.load()); + }); + + if (failed_ops.load() > 0) + std::cerr << "[BENCH node_read threads=" << N << "] " + << failed_ops.load() << " get_node calls returned empty\n"; + + auto dur = duration_cast(steady_clock::now() - wall_start); + + LatencyTracker merged; + for (auto& s : per_thread_samples) + for (auto v : s) merged.record(v); + 
+ const std::string n_str = std::to_string(N); + collector.record_throughput("node_read", total_ops.load(), dur, + {{"threads", n_str}}); + if (!merged.empty()) + collector.record_latency_stats("node_read", merged.stats(), + {{"threads", n_str}}); + + double ops_per_sec = static_cast(total_ops.load()) / + (static_cast(dur.count()) / 1000.0); + collector.record_scalability("node_read", N, ops_per_sec, "ops/sec", + {{"threads", n_str}, {"scale_dim", "threads"}}); + } + + auto result = collector.finalize(); + ReportGenerator reporter("results"); + reporter.export_all(result, "node_read_thread_scaling"); +} + +// ── Node update ─────────────────────────────────────────────────────────────── + +TEST_CASE("Node update thread scaling", "[SCALABILITY][threads]") { + MultiAgentFixture fixture; + GraphGenerator generator; + MetricsCollector collector("node_update_thread_scaling"); + + auto config_file = generator.generate_empty_graph(); + REQUIRE(fixture.create_agents(1, config_file)); + auto* graph = fixture.get_agent(0); + REQUIRE(graph != nullptr); + + // Pre-insert 8 nodes (one per thread for the largest N); each thread + // updates its own node to measure scaling without lock contention. 
+ constexpr uint32_t MAX_THREADS = 8; + std::vector node_ids; + node_ids.reserve(MAX_THREADS); + for (uint32_t t = 0; t < MAX_THREADS; ++t) { + auto node = GraphGenerator::create_test_node( + 500000 + t, graph->get_agent_id(), + "update_node_" + std::to_string(t)); + auto res = graph->insert_node(node); + REQUIRE(res.has_value()); + node_ids.push_back(res.value()); + } + + ankerl::nanobench::Bench bench; + bench.output(&nb_report_stream()).warmup(0).epochs(1).epochIterations(1); + + for (uint32_t N : {1u, 2u, 4u, 8u}) { + std::atomic total_ops{0}; + std::atomic failed_ops{0}; + std::atomic stop_flag{false}; + std::barrier sync_point(N); + + std::vector> per_thread_samples(N); + for (auto& s : per_thread_samples) s.reserve(2000000 / N); + + std::vector threads; + threads.reserve(N); + + auto wall_start = steady_clock::now(); + + bench.run("node_update_" + std::to_string(N) + "t", [&] { + for (uint32_t t = 0; t < N; ++t) { + threads.emplace_back([&, tid = t]() { + uint64_t local_ops = 0; + auto& samples = per_thread_samples[tid]; + uint64_t nid = node_ids[tid]; + + sync_point.arrive_and_wait(); + + while (!stop_flag.load(std::memory_order_relaxed)) { + auto node = graph->get_node(nid); + if (node) { + graph->add_or_modify_attrib_local( + *node, static_cast(local_ops % 1000)); + uint64_t ts = bench_now(); + bool ok = graph->update_node(*node); + samples.push_back(bench_now() - ts); + if (!ok) + failed_ops.fetch_add(1, std::memory_order_relaxed); + local_ops++; + } else { + failed_ops.fetch_add(1, std::memory_order_relaxed); + } + } + + total_ops.fetch_add(local_ops, std::memory_order_relaxed); + }); + } + + std::this_thread::sleep_for(THREAD_DUR); + stop_flag.store(true, std::memory_order_relaxed); + for (auto& th : threads) th.join(); + + bench.batch(total_ops.load()); + ankerl::nanobench::doNotOptimizeAway(total_ops.load()); + }); + + if (failed_ops.load() > 0) + std::cerr << "[BENCH node_update threads=" << N << "] " + << failed_ops.load() << " get_node/update_node 
calls failed\n"; + + auto dur = duration_cast(steady_clock::now() - wall_start); + + LatencyTracker merged; + for (auto& s : per_thread_samples) + for (auto v : s) merged.record(v); + + const std::string n_str = std::to_string(N); + collector.record_throughput("node_update", total_ops.load(), dur, + {{"threads", n_str}}); + if (!merged.empty()) + collector.record_latency_stats("node_update", merged.stats(), + {{"threads", n_str}}); + + double ops_per_sec = static_cast(total_ops.load()) / + (static_cast(dur.count()) / 1000.0); + collector.record_scalability("node_update", N, ops_per_sec, "ops/sec", + {{"threads", n_str}, {"scale_dim", "threads"}}); + } + + auto result = collector.finalize(); + ReportGenerator reporter("results"); + reporter.export_all(result, "node_update_thread_scaling"); +} + +// ── Edge insert ─────────────────────────────────────────────────────────────── + +TEST_CASE("Edge insert thread scaling", "[SCALABILITY][threads]") { + MultiAgentFixture fixture; + GraphGenerator generator; + MetricsCollector collector("edge_insert_thread_scaling"); + + auto config_file = generator.generate_empty_graph(); + REQUIRE(fixture.create_agents(1, config_file)); + auto* graph = fixture.get_agent(0); + REQUIRE(graph != nullptr); + + auto root = graph->get_node_root(); + REQUIRE(root.has_value()); + + // Pre-populate target node pool; shared across all N iterations. 
+ constexpr uint32_t POOL_SIZE = 10000; + std::vector pool; + pool.reserve(POOL_SIZE); + for (uint64_t i = 0; i < POOL_SIZE; ++i) { + auto node = GraphGenerator::create_test_node(0, graph->get_agent_id()); + auto res = graph->insert_node(node); + REQUIRE(res.has_value()); + pool.push_back(res.value()); + } + const size_t pool_size = pool.size(); + + ankerl::nanobench::Bench bench; + bench.output(&nb_report_stream()).warmup(0).epochs(1).epochIterations(1); + + for (uint32_t N : {1u, 2u, 4u, 8u}) { + std::atomic total_ops{0}; + std::atomic failed_ops{0}; + std::atomic stop_flag{false}; + std::barrier sync_point(N); + + std::vector> per_thread_samples(N); + for (auto& s : per_thread_samples) s.reserve(2000000 / N); + + std::vector threads; + threads.reserve(N); + + const uint32_t stride = static_cast(pool_size / N) + 1; + auto wall_start = steady_clock::now(); + + bench.run("edge_insert_" + std::to_string(N) + "t", [&] { + for (uint32_t t = 0; t < N; ++t) { + threads.emplace_back([&, tid = t]() { + uint64_t local_ops = 0; + auto& samples = per_thread_samples[tid]; + + sync_point.arrive_and_wait(); + + while (!stop_flag.load(std::memory_order_relaxed)) { + uint64_t idx = (local_ops + tid * stride) % pool_size; + auto edge = GraphGenerator::create_test_edge( + root->id(), pool[idx], graph->get_agent_id()); + uint64_t ts = bench_now(); + bool ok = graph->insert_or_assign_edge(edge); + samples.push_back(bench_now() - ts); + if (!ok) + failed_ops.fetch_add(1, std::memory_order_relaxed); + local_ops++; + } + + total_ops.fetch_add(local_ops, std::memory_order_relaxed); + }); + } + + std::this_thread::sleep_for(THREAD_DUR); + stop_flag.store(true, std::memory_order_relaxed); + for (auto& th : threads) th.join(); + + bench.batch(total_ops.load()); + ankerl::nanobench::doNotOptimizeAway(total_ops.load()); + }); + + if (failed_ops.load() > 0) + std::cerr << "[BENCH edge_insert threads=" << N << "] " + << failed_ops.load() << " insert_or_assign_edge calls failed\n"; + + auto dur 
= duration_cast(steady_clock::now() - wall_start); + + LatencyTracker merged; + for (auto& s : per_thread_samples) + for (auto v : s) merged.record(v); + + const std::string n_str = std::to_string(N); + collector.record_throughput("edge_insert", total_ops.load(), dur, + {{"threads", n_str}}); + if (!merged.empty()) + collector.record_latency_stats("edge_insert", merged.stats(), + {{"threads", n_str}}); + + double ops_per_sec = static_cast(total_ops.load()) / + (static_cast(dur.count()) / 1000.0); + collector.record_scalability("edge_insert", N, ops_per_sec, "ops/sec", + {{"threads", n_str}, {"scale_dim", "threads"}}); + } + + auto result = collector.finalize(); + ReportGenerator reporter("results"); + reporter.export_all(result, "edge_insert_thread_scaling"); +} + +// ── Edge read ───────────────────────────────────────────────────────────────── + +TEST_CASE("Edge read thread scaling", "[SCALABILITY][threads]") { + MultiAgentFixture fixture; + GraphGenerator generator; + MetricsCollector collector("edge_read_thread_scaling"); + + auto config_file = generator.generate_empty_graph(); + REQUIRE(fixture.create_agents(1, config_file)); + auto* graph = fixture.get_agent(0); + REQUIRE(graph != nullptr); + + auto root = graph->get_node_root(); + REQUIRE(root.has_value()); + + // Pre-populate 1000 nodes + edges; shared across all N iterations. 
+ constexpr uint32_t POOL_SIZE = 1000; + std::vector pool; + pool.reserve(POOL_SIZE); + for (uint64_t i = 0; i < POOL_SIZE; ++i) { + auto node = GraphGenerator::create_test_node(0, graph->get_agent_id()); + auto res = graph->insert_node(node); + REQUIRE(res.has_value()); + pool.push_back(res.value()); + auto edge = GraphGenerator::create_test_edge( + root->id(), res.value(), graph->get_agent_id()); + REQUIRE(graph->insert_or_assign_edge(edge)); + } + const size_t pool_size = pool.size(); + + ankerl::nanobench::Bench bench; + bench.output(&nb_report_stream()).warmup(0).epochs(1).epochIterations(1); + + for (uint32_t N : {1u, 2u, 4u, 8u}) { + std::atomic total_ops{0}; + std::atomic failed_ops{0}; + std::atomic stop_flag{false}; + std::barrier sync_point(N); + + std::vector> per_thread_samples(N); + for (auto& s : per_thread_samples) s.reserve(2000000 / N); + + std::vector threads; + threads.reserve(N); + + const uint32_t stride = static_cast(pool_size / N) + 1; + auto wall_start = steady_clock::now(); + + bench.run("edge_read_" + std::to_string(N) + "t", [&] { + for (uint32_t t = 0; t < N; ++t) { + threads.emplace_back([&, tid = t]() { + uint64_t local_ops = 0; + auto& samples = per_thread_samples[tid]; + + sync_point.arrive_and_wait(); + + while (!stop_flag.load(std::memory_order_relaxed)) { + uint64_t idx = (local_ops + tid * stride) % pool_size; + uint64_t ts = bench_now(); + auto edge = graph->get_edge(root->id(), pool[idx], "test_edge"); + samples.push_back(bench_now() - ts); + if (!edge.has_value()) + failed_ops.fetch_add(1, std::memory_order_relaxed); + local_ops++; + } + + total_ops.fetch_add(local_ops, std::memory_order_relaxed); + }); + } + + std::this_thread::sleep_for(THREAD_DUR); + stop_flag.store(true, std::memory_order_relaxed); + for (auto& th : threads) th.join(); + + bench.batch(total_ops.load()); + ankerl::nanobench::doNotOptimizeAway(total_ops.load()); + }); + + if (failed_ops.load() > 0) + std::cerr << "[BENCH edge_read threads=" << N << "] " + 
<< failed_ops.load() << " get_edge calls returned empty\n"; + + auto dur = duration_cast(steady_clock::now() - wall_start); + + LatencyTracker merged; + for (auto& s : per_thread_samples) + for (auto v : s) merged.record(v); + + const std::string n_str = std::to_string(N); + collector.record_throughput("edge_read", total_ops.load(), dur, + {{"threads", n_str}}); + if (!merged.empty()) + collector.record_latency_stats("edge_read", merged.stats(), + {{"threads", n_str}}); + + double ops_per_sec = static_cast(total_ops.load()) / + (static_cast(dur.count()) / 1000.0); + collector.record_scalability("edge_read", N, ops_per_sec, "ops/sec", + {{"threads", n_str}, {"scale_dim", "threads"}}); + } + + auto result = collector.finalize(); + ReportGenerator reporter("results"); + reporter.export_all(result, "edge_read_thread_scaling"); +} diff --git a/benchmarks/throughput/concurrent_writers_bench.cpp b/benchmarks/throughput/concurrent_writers_bench.cpp new file mode 100644 index 0000000..6c9bda7 --- /dev/null +++ b/benchmarks/throughput/concurrent_writers_bench.cpp @@ -0,0 +1,366 @@ +#include +#include +#include +#include +#include +#include +#include + +#include +#include "../core/nanobench_adapter.h" +#include "../core/timing_utils.h" +#include "../core/metrics_collector.h" +#include "../core/report_generator.h" +#include "../fixtures/multi_agent_fixture.h" +#include "../fixtures/graph_generator.h" + +using namespace DSR; +using namespace DSR::Benchmark; + +TEST_CASE("Concurrent writers throughput", "[THROUGHPUT][concurrent][PROFILE][LOAD]") { + MultiAgentFixture fixture; + GraphGenerator generator; + MetricsCollector collector("concurrent_writers"); + + auto config_file = generator.generate_empty_graph(); + REQUIRE(fixture.create_agents(1, config_file)); + + auto* graph = fixture.get_agent(0); + REQUIRE(graph != nullptr); + + constexpr auto TEST_DURATION = std::chrono::seconds(5); + + ankerl::nanobench::Bench bench; + 
bench.output(&nb_report_stream()).warmup(0).epochs(1).epochIterations(1); + + auto run_concurrent_test = [&](uint32_t num_threads, const std::string& test_name) { + std::atomic total_operations{0}; + std::atomic failed_ops{0}; + std::atomic stop_flag{false}; + std::barrier sync_point(num_threads); + + std::vector threads; + threads.reserve(num_threads); + + auto start = std::chrono::steady_clock::now(); + + bench.run(test_name, [&] { + for (uint32_t t = 0; t < num_threads; ++t) { + threads.emplace_back([&, thread_id = t]() { + uint64_t base_id = 100000 + thread_id * 100000; + uint64_t local_ops = 0; + + sync_point.arrive_and_wait(); + + while (!stop_flag.load(std::memory_order_relaxed)) { + auto node = GraphGenerator::create_test_node( + base_id + local_ops, graph->get_agent_id(), + "thread_" + std::to_string(thread_id) + "_node_" + std::to_string(local_ops)); + if (!graph->insert_node(node).has_value()) + failed_ops.fetch_add(1, std::memory_order_relaxed); + local_ops++; + } + + total_operations.fetch_add(local_ops, std::memory_order_relaxed); + }); + } + + std::this_thread::sleep_for(TEST_DURATION); + stop_flag.store(true, std::memory_order_relaxed); + for (auto& t : threads) t.join(); + + bench.batch(total_operations.load()); + ankerl::nanobench::doNotOptimizeAway(total_operations.load()); + }); + + if (failed_ops.load() > 0) + std::cerr << "[BENCH " << test_name << "] " + << failed_ops.load() << " insert_node calls failed\n"; + + auto actual_duration = std::chrono::duration_cast( + std::chrono::steady_clock::now() - start); + + collector.record_throughput(test_name, total_operations.load(), actual_duration, + {{"num_threads", std::to_string(num_threads)}}); + + double ops_per_sec = static_cast(total_operations.load()) / + (static_cast(actual_duration.count()) / 1000.0); + + return ops_per_sec; + }; + + SECTION("2 concurrent writers") { + double ops = run_concurrent_test(2, "concurrent_insert_2t"); + INFO("2 threads: " << ops << " ops/sec"); + CHECK(ops >= 
MIN_EXPECTED_THROUGHPUT_OPS); + } + + SECTION("4 concurrent writers") { + double ops = run_concurrent_test(4, "concurrent_insert_4t"); + INFO("4 threads: " << ops << " ops/sec"); + } + + SECTION("8 concurrent writers") { + double ops = run_concurrent_test(8, "concurrent_insert_8t"); + INFO("8 threads: " << ops << " ops/sec"); + } + + auto result = collector.finalize(); + ReportGenerator reporter("results"); + reporter.export_all(result, "concurrent_writers"); +} + +TEST_CASE("Concurrent read-write throughput", "[THROUGHPUT][concurrent][PROFILE][LOAD]") { + MultiAgentFixture fixture; + GraphGenerator generator; + MetricsCollector collector("concurrent_read_write"); + + auto config_file = generator.generate_empty_graph(); + REQUIRE(fixture.create_agents(1, config_file)); + + auto* graph = fixture.get_agent(0); + REQUIRE(graph != nullptr); + + // Pre-populate graph and store actual IDs + std::vector pre_node_ids; + pre_node_ids.reserve(1000); + for (uint64_t i = 0; i < 1000; ++i) { + auto node = GraphGenerator::create_test_node(0, graph->get_agent_id()); + auto result = graph->insert_node(node); + REQUIRE(result.has_value()); + pre_node_ids.push_back(result.value()); + } + + constexpr auto TEST_DURATION = std::chrono::seconds(5); + + ankerl::nanobench::Bench bench; + bench.output(&nb_report_stream()).warmup(0).epochs(1).epochIterations(1); + + SECTION("Mixed read-write workload") { + constexpr uint32_t NUM_READERS = 4; + constexpr uint32_t NUM_WRITERS = 2; + constexpr uint32_t TOTAL_THREADS = NUM_READERS + NUM_WRITERS; + + std::atomic read_ops{0}; + std::atomic write_ops{0}; + std::atomic write_failures{0}; + std::atomic stop_flag{false}; + std::barrier sync_point(TOTAL_THREADS); + + std::vector threads; + threads.reserve(TOTAL_THREADS); + + auto start = std::chrono::steady_clock::now(); + + bench.run("mixed_read_write", [&] { + // Reader threads + for (uint32_t t = 0; t < NUM_READERS; ++t) { + threads.emplace_back([&, thread_id = t]() { + uint64_t local_ops = 0; + 
sync_point.arrive_and_wait(); + + while (!stop_flag.load(std::memory_order_relaxed)) { + uint64_t id = pre_node_ids[local_ops % pre_node_ids.size()]; + auto node = graph->get_node(id); + ankerl::nanobench::doNotOptimizeAway(node); + local_ops++; + } + + read_ops.fetch_add(local_ops, std::memory_order_relaxed); + }); + } + + // Writer threads + for (uint32_t t = 0; t < NUM_WRITERS; ++t) { + threads.emplace_back([&, thread_id = t]() { + uint64_t base_id = 300000 + thread_id * 100000; + uint64_t local_ops = 0; + sync_point.arrive_and_wait(); + + while (!stop_flag.load(std::memory_order_relaxed)) { + auto node = GraphGenerator::create_test_node( + base_id + local_ops, graph->get_agent_id()); + if (!graph->insert_node(node).has_value()) + write_failures.fetch_add(1, std::memory_order_relaxed); + local_ops++; + } + + write_ops.fetch_add(local_ops, std::memory_order_relaxed); + }); + } + + std::this_thread::sleep_for(TEST_DURATION); + stop_flag.store(true, std::memory_order_relaxed); + for (auto& t : threads) t.join(); + + bench.batch(read_ops.load() + write_ops.load()); + ankerl::nanobench::doNotOptimizeAway(read_ops.load()); + }); + + if (write_failures.load() > 0) + std::cerr << "[BENCH concurrent_read_write] " + << write_failures.load() << " insert_node calls failed\n"; + + auto actual_duration = std::chrono::duration_cast( + std::chrono::steady_clock::now() - start); + + collector.record_throughput("concurrent_reads", read_ops.load(), actual_duration, + {{"num_readers", std::to_string(NUM_READERS)}}); + collector.record_throughput("concurrent_writes", write_ops.load(), actual_duration, + {{"num_writers", std::to_string(NUM_WRITERS)}}); + + double read_ops_sec = static_cast(read_ops.load()) / + (static_cast(actual_duration.count()) / 1000.0); + double write_ops_sec = static_cast(write_ops.load()) / + (static_cast(actual_duration.count()) / 1000.0); + + INFO("Read throughput: " << read_ops_sec << " ops/sec"); + INFO("Write throughput: " << write_ops_sec << " ops/sec"); 
+ } + + SECTION("Update contention test") { + constexpr uint32_t NUM_THREADS = 4; + + // All threads update the same node + auto test_node = GraphGenerator::create_test_node( + 0, graph->get_agent_id(), "contention_test"); + auto contention_id_opt = graph->insert_node(test_node); + REQUIRE(contention_id_opt.has_value()); + uint64_t contention_node_id = contention_id_opt.value(); + + std::atomic total_ops{0}; + std::atomic successful_ops{0}; + std::atomic stop_flag{false}; + std::barrier sync_point(NUM_THREADS); + + std::vector threads; + threads.reserve(NUM_THREADS); + + auto start = std::chrono::steady_clock::now(); + + bench.run("update_contention", [&] { + for (uint32_t t = 0; t < NUM_THREADS; ++t) { + threads.emplace_back([&, thread_id = t, node_id = contention_node_id]() { + uint64_t local_total = 0; + uint64_t local_success = 0; + sync_point.arrive_and_wait(); + + while (!stop_flag.load(std::memory_order_relaxed)) { + auto node = graph->get_node(node_id); + if (node) { + graph->add_or_modify_attrib_local( + *node, static_cast(thread_id * 1000 + local_total)); + if (graph->update_node(*node)) { + local_success++; + } + } + local_total++; + } + + total_ops.fetch_add(local_total, std::memory_order_relaxed); + successful_ops.fetch_add(local_success, std::memory_order_relaxed); + }); + } + + std::this_thread::sleep_for(TEST_DURATION); + stop_flag.store(true, std::memory_order_relaxed); + for (auto& t : threads) t.join(); + + bench.batch(total_ops.load()); + ankerl::nanobench::doNotOptimizeAway(total_ops.load()); + }); + + auto actual_duration = std::chrono::duration_cast( + std::chrono::steady_clock::now() - start); + + double success_rate = static_cast(successful_ops.load()) / + static_cast(total_ops.load()) * 100.0; + + collector.record("update_contention_total", MetricCategory::Throughput, + static_cast(total_ops.load()), "ops", + {{"num_threads", std::to_string(NUM_THREADS)}}); + collector.record("update_contention_success_rate", MetricCategory::Throughput, + 
success_rate, "%", + {{"num_threads", std::to_string(NUM_THREADS)}}); + + INFO("Total attempts: " << total_ops.load()); + INFO("Successful updates: " << successful_ops.load()); + INFO("Success rate: " << success_rate << "%"); + } + + auto result = collector.finalize(); + ReportGenerator reporter("results"); + reporter.export_all(result, "concurrent_read_write"); +} + +TEST_CASE("Multi-agent concurrent operations", "[THROUGHPUT][concurrent][multiagent][.multi][PROFILE][LOAD][MULTIAGENT]") { + MultiAgentFixture fixture; + GraphGenerator generator; + MetricsCollector collector("multiagent_concurrent"); + + auto config_file = generator.generate_empty_graph(); + REQUIRE(fixture.create_agents(4, config_file)); + fixture.wait_for_sync(); + + constexpr auto TEST_DURATION = std::chrono::seconds(5); + + SECTION("Each agent writes independently") { + std::atomic total_ops{0}; + std::atomic failed_ops{0}; + std::atomic stop_flag{false}; + std::barrier sync_point(fixture.agent_count()); + + std::vector threads; + threads.reserve(fixture.agent_count()); + + auto start = std::chrono::steady_clock::now(); + + for (size_t i = 0; i < fixture.agent_count(); ++i) { + threads.emplace_back([&, agent_idx = i]() { + auto* graph = fixture.get_agent(agent_idx); + uint64_t base_id = 600000 + agent_idx * 100000; + uint64_t local_ops = 0; + + sync_point.arrive_and_wait(); + + while (!stop_flag.load(std::memory_order_relaxed)) { + auto node = GraphGenerator::create_test_node( + base_id + local_ops, graph->get_agent_id(), + "agent_" + std::to_string(agent_idx) + "_node_" + std::to_string(local_ops)); + if (!graph->insert_node(node).has_value()) + failed_ops.fetch_add(1, std::memory_order_relaxed); + local_ops++; + } + + total_ops.fetch_add(local_ops, std::memory_order_relaxed); + }); + } + + std::this_thread::sleep_for(TEST_DURATION); + stop_flag.store(true, std::memory_order_relaxed); + + for (auto& t : threads) { + t.join(); + } + + if (failed_ops.load() > 0) + std::cerr << "[BENCH 
multiagent_concurrent] " + << failed_ops.load() << " insert_node calls failed\n"; + + auto actual_duration = std::chrono::duration_cast( + std::chrono::steady_clock::now() - start); + + collector.record_throughput("multiagent_concurrent_insert", + total_ops.load(), actual_duration, + {{"num_agents", std::to_string(fixture.agent_count())}}); + + double ops_per_sec = static_cast(total_ops.load()) / + (static_cast(actual_duration.count()) / 1000.0); + + INFO("Multi-agent concurrent throughput: " << ops_per_sec << " ops/sec"); + } + + auto result = collector.finalize(); + ReportGenerator reporter("results"); + reporter.export_all(result, "multiagent_concurrent"); +} diff --git a/benchmarks/throughput/query_ops_bench.cpp b/benchmarks/throughput/query_ops_bench.cpp new file mode 100644 index 0000000..360d71f --- /dev/null +++ b/benchmarks/throughput/query_ops_bench.cpp @@ -0,0 +1,131 @@ +#include + +#include "../core/nanobench_adapter.h" +#include "../core/metrics_collector.h" +#include "../core/report_generator.h" +#include "../fixtures/multi_agent_fixture.h" +#include "../fixtures/graph_generator.h" + +using namespace DSR; +using namespace DSR::Benchmark; + +TEST_CASE("Graph query convenience operations", "[EXTENDED][LATENCY][THROUGHPUT][query][single][.extended]") { + MultiAgentFixture fixture; + GraphGenerator generator; + MetricsCollector collector("graph_query_baseline"); + collector.add_metadata("profile", "extended"); + + auto config_file = generator.generate_empty_graph(); + REQUIRE(fixture.create_agents(1, config_file)); + auto* graph = fixture.get_agent(0); + REQUIRE(graph != nullptr); + + auto root = graph->get_node_root(); + REQUIRE(root.has_value()); + + std::vector node_ids; + node_ids.reserve(1000); + for (uint64_t i = 0; i < 1000; ++i) { + auto node = GraphGenerator::create_test_node(i, graph->get_agent_id(), "query_node_" + std::to_string(i)); + auto inserted = graph->insert_node(node); + REQUIRE(inserted.has_value()); + node_ids.push_back(*inserted); + 
+ auto edge = GraphGenerator::create_test_edge(root->id(), *inserted, graph->get_agent_id()); + REQUIRE(graph->insert_or_assign_edge(edge)); + } + + for (auto id : node_ids) { + (void)graph->get_node(id); + } + (void)graph->get_nodes(); + (void)graph->get_nodes_by_type("test_node"); + (void)graph->get_edges(root->id()); + (void)graph->get_edges_to_id(root->id()); + (void)graph->get_edges_by_type("test_edge"); + + { + auto bench = make_latency_bench(1000, 0); + bench.minEpochIterations(10); + bench.run("get_nodes", [&] { + auto nodes = graph->get_nodes(); + ankerl::nanobench::doNotOptimizeAway(nodes); + }); + collector.record_latency_stats("get_nodes", nb_to_stats(bench)); + collector.record("get_nodes", MetricCategory::Throughput, nb_throughput(bench), "ops/sec"); + } + + { + auto bench = make_latency_bench(1000, 0); + bench.minEpochIterations(10); + bench.run("get_nodes_by_type", [&] { + auto nodes = graph->get_nodes_by_type("test_node"); + ankerl::nanobench::doNotOptimizeAway(nodes); + }); + collector.record_latency_stats("get_nodes_by_type", nb_to_stats(bench)); + collector.record("get_nodes_by_type", MetricCategory::Throughput, nb_throughput(bench), "ops/sec"); + } + + { + auto bench = make_latency_bench(1000, 0); + bench.minEpochIterations(20); + bench.run("get_edges_from_root", [&] { + auto edges = graph->get_edges(root->id()); + ankerl::nanobench::doNotOptimizeAway(edges); + }); + collector.record_latency_stats("get_edges_from_root", nb_to_stats(bench)); + collector.record("get_edges_from_root", MetricCategory::Throughput, nb_throughput(bench), "ops/sec"); + } + + { + auto bench = make_latency_bench(1000, 0); + bench.minEpochIterations(20); + bench.run("get_edges_to_root", [&] { + auto edges = graph->get_edges_to_id(root->id()); + ankerl::nanobench::doNotOptimizeAway(edges); + }); + collector.record_latency_stats("get_edges_to_root", nb_to_stats(bench)); + collector.record("get_edges_to_root", MetricCategory::Throughput, nb_throughput(bench), "ops/sec"); + } 
+ + { + auto bench = make_latency_bench(1000, 0); + bench.minEpochIterations(20); + bench.run("get_edges_by_type", [&] { + auto edges = graph->get_edges_by_type("test_edge"); + ankerl::nanobench::doNotOptimizeAway(edges); + }); + collector.record_latency_stats("get_edges_by_type", nb_to_stats(bench)); + collector.record("get_edges_by_type", MetricCategory::Throughput, nb_throughput(bench), "ops/sec"); + } + + { + size_t idx = 0; + auto bench = make_latency_bench(1000, 0); + bench.minEpochIterations(5000); + bench.run("get_name_from_id", [&] { + auto name = graph->get_name_from_id(node_ids[idx++ % node_ids.size()]); + REQUIRE(name.has_value()); + ankerl::nanobench::doNotOptimizeAway(name); + }); + collector.record_latency_stats("get_name_from_id", nb_to_stats(bench)); + collector.record("get_name_from_id", MetricCategory::Throughput, nb_throughput(bench), "ops/sec"); + } + + { + size_t idx = 0; + auto bench = make_latency_bench(1000, 0); + bench.minEpochIterations(5000); + bench.run("get_id_from_name", [&] { + auto id = graph->get_id_from_name("query_node_" + std::to_string(idx++ % node_ids.size())); + REQUIRE(id.has_value()); + ankerl::nanobench::doNotOptimizeAway(id); + }); + collector.record_latency_stats("get_id_from_name", nb_to_stats(bench)); + collector.record("get_id_from_name", MetricCategory::Throughput, nb_throughput(bench), "ops/sec"); + } + + auto result = collector.finalize(); + ReportGenerator reporter("results"); + reporter.export_all(result, "graph_query_baseline"); +} diff --git a/benchmarks/throughput/single_agent_ops_bench.cpp b/benchmarks/throughput/single_agent_ops_bench.cpp new file mode 100644 index 0000000..3fb79e4 --- /dev/null +++ b/benchmarks/throughput/single_agent_ops_bench.cpp @@ -0,0 +1,397 @@ +#include +#include + +#include "../core/nanobench_adapter.h" +#include "../core/metrics_collector.h" +#include "../core/report_generator.h" +#include "../fixtures/multi_agent_fixture.h" +#include "../fixtures/graph_generator.h" + +using 
namespace DSR; +using namespace DSR::Benchmark; + +// Each operation gets its own TEST_CASE. nanobench replaces the manual +// 5-second time-window loops: it auto-tunes warmup and iteration count, +// and derives throughput from the mean latency (nb_throughput()). + +TEST_CASE("Node insertion throughput", "[THROUGHPUT][single]") { + MultiAgentFixture fixture; + GraphGenerator generator; + MetricsCollector collector("node_insert_throughput"); + + auto config_file = generator.generate_empty_graph(); + REQUIRE(fixture.create_agents(1, config_file)); + auto* graph = fixture.get_agent(0); + REQUIRE(graph != nullptr); + + uint64_t id_counter = 0; + auto sampled = run_sampled_benchmark( + 50, + 1000, + [&] { + auto node = GraphGenerator::create_test_node(id_counter++, graph->get_agent_id()); + auto res = graph->insert_node(node); + REQUIRE(res.has_value()); + }, + [&] { fixture.process_events(1); }, + 16); + + collector.record_latency_stats("node_insert", sampled.latency); + collector.record_throughput("node_insert", sampled.latency.count, sampled.wall_time); + + auto result = collector.finalize(); + ReportGenerator reporter("results"); + reporter.export_all(result, "node_insert_throughput"); +} + +TEST_CASE("Node read throughput", "[THROUGHPUT][single]") { + MultiAgentFixture fixture; + GraphGenerator generator; + MetricsCollector collector("node_read_throughput"); + + auto config_file = generator.generate_empty_graph(); + REQUIRE(fixture.create_agents(1, config_file)); + auto* graph = fixture.get_agent(0); + REQUIRE(graph != nullptr); + + std::vector node_ids; + node_ids.reserve(1000); + for (uint64_t i = 0; i < 1000; ++i) { + auto node = GraphGenerator::create_test_node(0, graph->get_agent_id()); + auto result = graph->insert_node(node); + REQUIRE(result.has_value()); + node_ids.push_back(result.value()); + } + + // Cache warmup + for (auto id : node_ids) { (void)graph->get_node(id); } + + // ~900 ns/op: 10 000 iters/epoch × 200 epochs ≈ 1.8 s + size_t idx = 0; + bool 
last_ok = true; + auto bench = make_latency_bench(200, 0); // manual warmup done above + bench.minEpochIterations(10000); + bench.run("node_read", [&] { + auto node = graph->get_node(node_ids[idx++ % node_ids.size()]); + last_ok = node.has_value(); + ankerl::nanobench::doNotOptimizeAway(node); + }); + REQUIRE(last_ok); + + collector.record_latency_stats("node_read", nb_to_stats(bench)); + collector.record("node_read", MetricCategory::Throughput, + nb_throughput(bench), "ops/sec"); + + auto result = collector.finalize(); + ReportGenerator reporter("results"); + reporter.export_all(result, "node_read_throughput"); +} + +TEST_CASE("Node update throughput", "[THROUGHPUT][single]") { + MultiAgentFixture fixture; + GraphGenerator generator; + MetricsCollector collector("node_update_throughput"); + + auto config_file = generator.generate_empty_graph(); + REQUIRE(fixture.create_agents(1, config_file)); + auto* graph = fixture.get_agent(0); + REQUIRE(graph != nullptr); + + auto test_node = GraphGenerator::create_test_node(0, graph->get_agent_id(), "update_test"); + auto insert_result = graph->insert_node(test_node); + REQUIRE(insert_result.has_value()); + uint64_t node_id = insert_result.value(); + + // ~38µs/op: 300 iters/epoch × 50 epochs ≈ 0.57 s + uint64_t update_counter = 0; + auto bench = make_latency_bench(50); + bench.minEpochIterations(300); + bench.run("node_update", [&] { + auto node = graph->get_node(node_id); + REQUIRE(node.has_value()); + graph->add_or_modify_attrib_local( + *node, static_cast(update_counter++ % 1000)); + bool ok = graph->update_node(*node); + REQUIRE(ok); + ankerl::nanobench::doNotOptimizeAway(ok); + }); + + collector.record_latency_stats("node_update", nb_to_stats(bench)); + collector.record("node_update", MetricCategory::Throughput, + nb_throughput(bench), "ops/sec"); + + auto result = collector.finalize(); + ReportGenerator reporter("results"); + reporter.export_all(result, "node_update_throughput"); +} + +TEST_CASE("Edge insertion 
throughput", "[THROUGHPUT][single]") { + MultiAgentFixture fixture; + GraphGenerator generator; + MetricsCollector collector("edge_insert_throughput"); + + auto config_file = generator.generate_empty_graph(); + REQUIRE(fixture.create_agents(1, config_file)); + auto* graph = fixture.get_agent(0); + REQUIRE(graph != nullptr); + + auto root = graph->get_node_root(); + REQUIRE(root.has_value()); + + std::vector target_ids; + target_ids.reserve(10000); + for (uint64_t i = 0; i < 10000; ++i) { + auto node = GraphGenerator::create_test_node(0, graph->get_agent_id()); + auto result = graph->insert_node(node); + REQUIRE(result.has_value()); + target_ids.push_back(result.value()); + } + + size_t idx = 0; + auto sampled = run_sampled_benchmark( + 50, + 1000, + [&] { + uint64_t target = target_ids[idx++ % target_ids.size()]; + auto edge = GraphGenerator::create_test_edge(root->id(), target, graph->get_agent_id()); + bool ok = graph->insert_or_assign_edge(edge); + REQUIRE(ok); + }, + [&] { fixture.process_events(1); }, + 8); + + collector.record_latency_stats("edge_insert", sampled.latency); + collector.record_throughput("edge_insert", sampled.latency.count, sampled.wall_time); + + auto result = collector.finalize(); + ReportGenerator reporter("results"); + reporter.export_all(result, "edge_insert_throughput"); +} + +TEST_CASE("Edge read throughput", "[THROUGHPUT][single]") { + MultiAgentFixture fixture; + GraphGenerator generator; + MetricsCollector collector("edge_read_throughput"); + + auto config_file = generator.generate_empty_graph(); + REQUIRE(fixture.create_agents(1, config_file)); + auto* graph = fixture.get_agent(0); + REQUIRE(graph != nullptr); + + auto root = graph->get_node_root(); + REQUIRE(root.has_value()); + + std::vector target_ids; + target_ids.reserve(1000); + for (uint64_t i = 0; i < 1000; ++i) { + auto node = GraphGenerator::create_test_node(0, graph->get_agent_id()); + auto result = graph->insert_node(node); + REQUIRE(result.has_value()); + 
target_ids.push_back(result.value()); + auto edge = GraphGenerator::create_test_edge( + root->id(), result.value(), graph->get_agent_id()); + REQUIRE(graph->insert_or_assign_edge(edge)); + } + + // Cache warmup + for (auto tid : target_ids) { (void)graph->get_edge(root->id(), tid, "test_edge"); } + + size_t idx = 0; + bool last_ok = true; + auto bench = make_latency_bench(1000, 0); // manual warmup done above + bench.run("edge_read", [&] { + uint64_t target = target_ids[idx++ % target_ids.size()]; + auto edge = graph->get_edge(root->id(), target, "test_edge"); + last_ok = edge.has_value(); + ankerl::nanobench::doNotOptimizeAway(edge); + }); + REQUIRE(last_ok); + + collector.record_latency_stats("edge_read", nb_to_stats(bench)); + collector.record("edge_read", MetricCategory::Throughput, + nb_throughput(bench), "ops/sec"); + + auto result = collector.finalize(); + ReportGenerator reporter("results"); + reporter.export_all(result, "edge_read_throughput"); +} + +TEST_CASE("Mixed operations throughput", "[THROUGHPUT][single]") { + MultiAgentFixture fixture; + GraphGenerator generator; + MetricsCollector collector("mixed_ops_throughput"); + + auto config_file = generator.generate_empty_graph(); + REQUIRE(fixture.create_agents(1, config_file)); + auto* graph = fixture.get_agent(0); + REQUIRE(graph != nullptr); + + auto root = graph->get_node_root(); + REQUIRE(root.has_value()); + + std::vector node_ids; + node_ids.reserve(600); // 500 initial + up to ~100 inserts from 30% insert rate × 1100 calls + for (uint64_t i = 0; i < 500; ++i) { + auto node = GraphGenerator::create_test_node(0, graph->get_agent_id()); + auto result = graph->insert_node(node); + REQUIRE(result.has_value()); + node_ids.push_back(result.value()); + } + + uint64_t ops = 0; + auto sampled = run_sampled_benchmark( + 50, + 1000, + [&] { + int op_type = static_cast(ops % 10); + if (op_type < 4) { + auto node = graph->get_node(node_ids[ops % node_ids.size()]); + ankerl::nanobench::doNotOptimizeAway(node); + 
} else if (op_type < 7) { + auto node = GraphGenerator::create_test_node(ops, graph->get_agent_id()); + auto result = graph->insert_node(node); + REQUIRE(result.has_value()); + node_ids.push_back(result.value()); + } else { + auto node = graph->get_node(node_ids[ops % node_ids.size()]); + REQUIRE(node.has_value()); + graph->add_or_modify_attrib_local( + *node, static_cast(ops)); + bool ok = graph->update_node(*node); + REQUIRE(ok); + } + ++ops; + }, + [&] { fixture.process_events(1); }, + 16); + + collector.record_latency_stats("mixed_ops", sampled.latency); + collector.record_throughput("mixed_ops", sampled.latency.count, sampled.wall_time); + + auto result = collector.finalize(); + ReportGenerator reporter("results"); + reporter.export_all(result, "mixed_ops_throughput"); +} + +TEST_CASE("Node deletion throughput", "[THROUGHPUT][single]") { + MultiAgentFixture fixture; + GraphGenerator generator; + MetricsCollector collector("node_delete_throughput"); + + auto config_file = generator.generate_empty_graph(); + REQUIRE(fixture.create_agents(1, config_file)); + auto* graph = fixture.get_agent(0); + REQUIRE(graph != nullptr); + + // Pool is 3× the expected maximum (warmup + epochs) so nanobench auto-tuning + // cannot exhaust it; the REQUIRE fires loudly if it somehow does. 
+ std::vector node_ids; + node_ids.reserve(3000); + for (uint64_t i = 0; i < 3000; ++i) { + auto node = GraphGenerator::create_test_node(0, graph->get_agent_id()); + auto res = graph->insert_node(node); + REQUIRE(res.has_value()); + node_ids.push_back(res.value()); + } + + size_t pool_idx = 0; + auto sampled = run_sampled_benchmark( + 50, + 1000, + [&] { + REQUIRE(pool_idx < node_ids.size()); + bool ok = graph->delete_node(node_ids[pool_idx++]); + REQUIRE(ok); + }, + [&] { fixture.process_events(1); }, + 16); + + collector.record_latency_stats("node_delete", sampled.latency); + collector.record_throughput("node_delete", sampled.latency.count, sampled.wall_time); + + auto result = collector.finalize(); + ReportGenerator reporter("results"); + reporter.export_all(result, "node_delete_throughput"); +} + +TEST_CASE("Edge deletion throughput", "[THROUGHPUT][single]") { + MultiAgentFixture fixture; + GraphGenerator generator; + MetricsCollector collector("edge_delete_throughput"); + + auto config_file = generator.generate_empty_graph(); + REQUIRE(fixture.create_agents(1, config_file)); + auto* graph = fixture.get_agent(0); + REQUIRE(graph != nullptr); + + auto root = graph->get_node_root(); + REQUIRE(root.has_value()); + + // Pool is 3× the expected maximum (warmup + epochs) so nanobench auto-tuning + // cannot exhaust it; the REQUIRE fires loudly if it somehow does. 
+ std::vector target_ids; + target_ids.reserve(3000); + for (uint64_t i = 0; i < 3000; ++i) { + auto node = GraphGenerator::create_test_node(0, graph->get_agent_id()); + auto res = graph->insert_node(node); + REQUIRE(res.has_value()); + auto edge = GraphGenerator::create_test_edge( + root->id(), res.value(), graph->get_agent_id()); + REQUIRE(graph->insert_or_assign_edge(edge)); + target_ids.push_back(res.value()); + } + + size_t pool_idx = 0; + auto sampled = run_sampled_benchmark( + 50, + 1000, + [&] { + REQUIRE(pool_idx < target_ids.size()); + bool ok = graph->delete_edge(root->id(), target_ids[pool_idx++], "test_edge"); + REQUIRE(ok); + }, + [&] { fixture.process_events(1); }, + 8); + + collector.record_latency_stats("edge_delete", sampled.latency); + collector.record_throughput("edge_delete", sampled.latency.count, sampled.wall_time); + + auto result = collector.finalize(); + ReportGenerator reporter("results"); + reporter.export_all(result, "edge_delete_throughput"); +} + +// Catch2 BENCHMARK macros (microbenchmark mode, run with [!benchmark]) +TEST_CASE("Single agent operations (Catch2 BENCHMARK)", "[THROUGHPUT][single][!benchmark]") { + MultiAgentFixture fixture; + GraphGenerator generator; + + auto config_file = generator.generate_empty_graph(); + REQUIRE(fixture.create_agents(1, config_file)); + + auto* graph = fixture.get_agent(0); + REQUIRE(graph != nullptr); + + BENCHMARK("Node insert") { + auto node = GraphGenerator::create_test_node(0, graph->get_agent_id()); + return graph->insert_node(node); + }; + + auto read_node = GraphGenerator::create_test_node(0, graph->get_agent_id()); + auto read_id_opt = graph->insert_node(read_node); + REQUIRE(read_id_opt.has_value()); + uint64_t read_id = read_id_opt.value(); + + BENCHMARK("Node read") { + return graph->get_node(read_id); + }; + + BENCHMARK("Node update") { + auto node = graph->get_node(read_id); + if (node) { + graph->add_or_modify_attrib_local(*node, 42); + return graph->update_node(*node); + } + return 
false; + }; +} diff --git a/benchmarks/throughput/single_agent_ops_with_latency_bench.cpp b/benchmarks/throughput/single_agent_ops_with_latency_bench.cpp new file mode 100644 index 0000000..3bde6d8 --- /dev/null +++ b/benchmarks/throughput/single_agent_ops_with_latency_bench.cpp @@ -0,0 +1,330 @@ +#include + +#include "../core/nanobench_adapter.h" +#include "../core/metrics_collector.h" +#include "../core/report_generator.h" +#include "../fixtures/multi_agent_fixture.h" +#include "../fixtures/graph_generator.h" + +using namespace DSR; +using namespace DSR::Benchmark; + +// Each TEST_CASE measures both latency and throughput simultaneously using +// nanobench. Steady-state read/update paths may raise minEpochIterations() to +// reduce timer noise. Destructive/stateful workloads use +// make_single_op_latency_bench() so graph growth does not drift across runs. +// Tags {"threads","1","graph_size","0"} mark these as the single-thread, +// empty-graph baseline for the Scalability tab. + +TEST_CASE("Node insert latency+throughput", "[THROUGHPUT][LATENCY][single]") { + MultiAgentFixture fixture; + GraphGenerator generator; + MetricsCollector collector("node_insert_lat_thr"); + + auto config_file = generator.generate_empty_graph(); + REQUIRE(fixture.create_agents(1, config_file)); + auto* graph = fixture.get_agent(0); + REQUIRE(graph != nullptr); + + uint64_t id_counter = 0; + auto sampled = run_sampled_benchmark( + 50, + 1000, + [&] { + auto node = GraphGenerator::create_test_node(id_counter++, graph->get_agent_id()); + auto res = graph->insert_node(node); + REQUIRE(res.has_value()); + }, + [&] { fixture.process_events(1); }, + 16); + + collector.record_latency_stats("node_insert", sampled.latency, + {{"threads", "1"}, {"graph_size", "0"}}); + collector.record_throughput("node_insert", sampled.latency.count, sampled.wall_time, + {{"threads", "1"}, {"graph_size", "0"}}); + + auto result = collector.finalize(); + ReportGenerator reporter("results"); + 
reporter.export_all(result, "node_insert_lat_thr"); +} + +TEST_CASE("Node read latency+throughput", "[THROUGHPUT][LATENCY][single][EXTENDED][.extended]") { + MultiAgentFixture fixture; + GraphGenerator generator; + MetricsCollector collector("node_read_lat_thr"); + collector.add_metadata("profile", "extended"); + + auto config_file = generator.generate_empty_graph(); + REQUIRE(fixture.create_agents(1, config_file)); + auto* graph = fixture.get_agent(0); + REQUIRE(graph != nullptr); + + // Pre-populate 1000 nodes for round-robin reads + std::vector node_ids; + node_ids.reserve(1000); + for (uint64_t i = 0; i < 1000; ++i) { + auto node = GraphGenerator::create_test_node(0, graph->get_agent_id()); + auto res = graph->insert_node(node); + REQUIRE(res.has_value()); + node_ids.push_back(res.value()); + } + + // Cache warmup: touch every node once so the read loop is warm + for (auto id : node_ids) { (void)graph->get_node(id); } + + // ~900 ns/op: 10 000 iters/epoch × 200 epochs ≈ 1.8 s + size_t idx = 0; + bool last_ok = true; + auto bench = make_latency_bench(200, 0); // manual warmup done above + bench.minEpochIterations(10000); + bench.run("node_read", [&] { + auto node = graph->get_node(node_ids[idx++ % node_ids.size()]); + last_ok = node.has_value(); + ankerl::nanobench::doNotOptimizeAway(node); + }); + REQUIRE(last_ok); + + auto stats = nb_to_stats(bench); + collector.record_latency_stats("node_read", stats, + {{"threads", "1"}, {"graph_size", "0"}}); + collector.record("node_read", MetricCategory::Throughput, + nb_throughput(bench), "ops/sec", + {{"threads", "1"}, {"graph_size", "0"}}); + + auto result = collector.finalize(); + ReportGenerator reporter("results"); + reporter.export_all(result, "node_read_lat_thr"); +} + +TEST_CASE("Node update latency+throughput", "[THROUGHPUT][LATENCY][single][EXTENDED][.extended]") { + MultiAgentFixture fixture; + GraphGenerator generator; + MetricsCollector collector("node_update_lat_thr"); + collector.add_metadata("profile", 
"extended"); + + auto config_file = generator.generate_empty_graph(); + REQUIRE(fixture.create_agents(1, config_file)); + auto* graph = fixture.get_agent(0); + REQUIRE(graph != nullptr); + + auto test_node = GraphGenerator::create_test_node(0, graph->get_agent_id(), "update_test"); + auto insert_result = graph->insert_node(test_node); + REQUIRE(insert_result.has_value()); + uint64_t node_id = insert_result.value(); + + // ~38µs/op: 300 iters/epoch × 50 epochs ≈ 0.57 s + uint64_t update_counter = 0; + auto bench = make_latency_bench(50); + bench.minEpochIterations(300); + bench.run("node_update", [&] { + auto node = graph->get_node(node_id); + REQUIRE(node.has_value()); + graph->add_or_modify_attrib_local( + *node, static_cast(update_counter++ % 1000)); + bool ok = graph->update_node(*node); + REQUIRE(ok); + ankerl::nanobench::doNotOptimizeAway(ok); + }); + + auto stats = nb_to_stats(bench); + collector.record_latency_stats("node_update", stats, + {{"threads", "1"}, {"graph_size", "0"}}); + collector.record("node_update", MetricCategory::Throughput, + nb_throughput(bench), "ops/sec", + {{"threads", "1"}, {"graph_size", "0"}}); + + auto result = collector.finalize(); + ReportGenerator reporter("results"); + reporter.export_all(result, "node_update_lat_thr"); +} + +TEST_CASE("Edge insert latency+throughput", "[THROUGHPUT][LATENCY][single]") { + MultiAgentFixture fixture; + GraphGenerator generator; + MetricsCollector collector("edge_insert_lat_thr"); + + auto config_file = generator.generate_empty_graph(); + REQUIRE(fixture.create_agents(1, config_file)); + auto* graph = fixture.get_agent(0); + REQUIRE(graph != nullptr); + + auto root = graph->get_node_root(); + REQUIRE(root.has_value()); + + // Pre-populate target node pool + std::vector target_ids; + target_ids.reserve(10000); + for (uint64_t i = 0; i < 10000; ++i) { + auto node = GraphGenerator::create_test_node(0, graph->get_agent_id()); + auto res = graph->insert_node(node); + REQUIRE(res.has_value()); + 
target_ids.push_back(res.value()); + } + + size_t idx = 0; + auto sampled = run_sampled_benchmark( + 50, + 1000, + [&] { + uint64_t target = target_ids[idx++ % target_ids.size()]; + auto edge = GraphGenerator::create_test_edge(root->id(), target, graph->get_agent_id()); + bool ok = graph->insert_or_assign_edge(edge); + REQUIRE(ok); + }, + [&] { fixture.process_events(1); }, + 8); + + collector.record_latency_stats("edge_insert", sampled.latency, + {{"threads", "1"}, {"graph_size", "0"}}); + collector.record_throughput("edge_insert", sampled.latency.count, sampled.wall_time, + {{"threads", "1"}, {"graph_size", "0"}}); + + auto result = collector.finalize(); + ReportGenerator reporter("results"); + reporter.export_all(result, "edge_insert_lat_thr"); +} + +TEST_CASE("Edge read latency+throughput", "[THROUGHPUT][LATENCY][single][EXTENDED][.extended]") { + MultiAgentFixture fixture; + GraphGenerator generator; + MetricsCollector collector("edge_read_lat_thr"); + collector.add_metadata("profile", "extended"); + + auto config_file = generator.generate_empty_graph(); + REQUIRE(fixture.create_agents(1, config_file)); + auto* graph = fixture.get_agent(0); + REQUIRE(graph != nullptr); + + auto root = graph->get_node_root(); + REQUIRE(root.has_value()); + + // Pre-populate 1000 nodes + edges + std::vector target_ids; + target_ids.reserve(1000); + for (uint64_t i = 0; i < 1000; ++i) { + auto node = GraphGenerator::create_test_node(0, graph->get_agent_id()); + auto res = graph->insert_node(node); + REQUIRE(res.has_value()); + target_ids.push_back(res.value()); + auto edge = GraphGenerator::create_test_edge( + root->id(), res.value(), graph->get_agent_id()); + REQUIRE(graph->insert_or_assign_edge(edge)); + } + + // Cache warmup + for (auto tid : target_ids) { (void)graph->get_edge(root->id(), tid, "test_edge"); } + + size_t idx = 0; + bool last_ok = true; + auto bench = make_latency_bench(1000, 0); // manual warmup done above + bench.run("edge_read", [&] { + uint64_t target = 
target_ids[idx++ % target_ids.size()]; + auto edge = graph->get_edge(root->id(), target, "test_edge"); + last_ok = edge.has_value(); + ankerl::nanobench::doNotOptimizeAway(edge); + }); + REQUIRE(last_ok); + + auto stats = nb_to_stats(bench); + collector.record_latency_stats("edge_read", stats, + {{"threads", "1"}, {"graph_size", "0"}}); + collector.record("edge_read", MetricCategory::Throughput, + nb_throughput(bench), "ops/sec", + {{"threads", "1"}, {"graph_size", "0"}}); + + auto result = collector.finalize(); + ReportGenerator reporter("results"); + reporter.export_all(result, "edge_read_lat_thr"); +} + +TEST_CASE("Node delete latency+throughput", "[THROUGHPUT][LATENCY][single]") { + MultiAgentFixture fixture; + GraphGenerator generator; + MetricsCollector collector("node_delete_lat_thr"); + + auto config_file = generator.generate_empty_graph(); + REQUIRE(fixture.create_agents(1, config_file)); + auto* graph = fixture.get_agent(0); + REQUIRE(graph != nullptr); + + // Pool is 3× the expected maximum (warmup + epochs) so nanobench auto-tuning + // cannot exhaust it; the REQUIRE fires loudly if it somehow does. 
+ std::vector node_ids; + node_ids.reserve(3000); + for (uint64_t i = 0; i < 3000; ++i) { + auto node = GraphGenerator::create_test_node(0, graph->get_agent_id()); + auto res = graph->insert_node(node); + REQUIRE(res.has_value()); + node_ids.push_back(res.value()); + } + + size_t pool_idx = 0; + auto sampled = run_sampled_benchmark( + 50, + 1000, + [&] { + REQUIRE(pool_idx < node_ids.size()); + bool ok = graph->delete_node(node_ids[pool_idx++]); + REQUIRE(ok); + }, + [&] { fixture.process_events(1); }, + 16); + + collector.record_latency_stats("node_delete", sampled.latency, + {{"threads", "1"}, {"graph_size", "0"}}); + collector.record_throughput("node_delete", sampled.latency.count, sampled.wall_time, + {{"threads", "1"}, {"graph_size", "0"}}); + + auto result = collector.finalize(); + ReportGenerator reporter("results"); + reporter.export_all(result, "node_delete_lat_thr"); +} + +TEST_CASE("Edge delete latency+throughput", "[THROUGHPUT][LATENCY][single]") { + MultiAgentFixture fixture; + GraphGenerator generator; + MetricsCollector collector("edge_delete_lat_thr"); + + auto config_file = generator.generate_empty_graph(); + REQUIRE(fixture.create_agents(1, config_file)); + auto* graph = fixture.get_agent(0); + REQUIRE(graph != nullptr); + + auto root = graph->get_node_root(); + REQUIRE(root.has_value()); + + // Pool is 3× the expected maximum (warmup + epochs) so nanobench auto-tuning + // cannot exhaust it; the REQUIRE fires loudly if it somehow does. 
+ std::vector target_ids; + target_ids.reserve(3000); + for (uint64_t i = 0; i < 3000; ++i) { + auto node = GraphGenerator::create_test_node(0, graph->get_agent_id()); + auto res = graph->insert_node(node); + REQUIRE(res.has_value()); + auto edge = GraphGenerator::create_test_edge( + root->id(), res.value(), graph->get_agent_id()); + REQUIRE(graph->insert_or_assign_edge(edge)); + target_ids.push_back(res.value()); + } + + size_t pool_idx = 0; + auto sampled = run_sampled_benchmark( + 50, + 1000, + [&] { + REQUIRE(pool_idx < target_ids.size()); + bool ok = graph->delete_edge(root->id(), target_ids[pool_idx++], "test_edge"); + REQUIRE(ok); + }, + [&] { fixture.process_events(1); }, + 8); + + collector.record_latency_stats("edge_delete", sampled.latency, + {{"threads", "1"}, {"graph_size", "0"}}); + collector.record_throughput("edge_delete", sampled.latency.count, sampled.wall_time, + {{"threads", "1"}, {"graph_size", "0"}}); + + auto result = collector.finalize(); + ReportGenerator reporter("results"); + reporter.export_all(result, "edge_delete_lat_thr"); +} diff --git a/core/include/dsr/core/rtps/dsrparticipant.h b/core/include/dsr/core/rtps/dsrparticipant.h index 662897a..e3e59d2 100644 --- a/core/include/dsr/core/rtps/dsrparticipant.h +++ b/core/include/dsr/core/rtps/dsrparticipant.h @@ -17,6 +17,7 @@ class DSRParticipant DSRParticipant(); virtual ~DSRParticipant(); [[nodiscard]] std::tuple init(uint32_t agent_id, const std::string& agent_name, int localhost, std::function fn, int8_t domain_id=0); + [[nodiscard]] int8_t get_domain_id() const { return domain_id_; } [[nodiscard]] const eprosima::fastdds::rtps::GUID_t& getID() const; [[nodiscard]] const char *getNodeTopicName() const { return dsrgraphType->get_name().data();} [[nodiscard]] const char *getRequestTopicName() const { return graphrequestType->get_name().data();} @@ -41,6 +42,7 @@ class DSRParticipant void remove_participant_and_entities(); private: + int8_t domain_id_ {0}; 
eprosima::fastdds::dds::DomainParticipant* mp_participant{}; eprosima::fastdds::dds::Topic* topic_node{}; diff --git a/core/include/dsr/core/rtps/dsrpublisher.h b/core/include/dsr/core/rtps/dsrpublisher.h index a55f000..65d8707 100644 --- a/core/include/dsr/core/rtps/dsrpublisher.h +++ b/core/include/dsr/core/rtps/dsrpublisher.h @@ -16,7 +16,11 @@ class DSRPublisher public: DSRPublisher(); virtual ~DSRPublisher(); - [[nodiscard]] std::tuple init(eprosima::fastdds::dds::DomainParticipant *mp_participant_, eprosima::fastdds::dds::Topic *topic, bool isStreamData = false); + [[nodiscard]] std::tuple init( + eprosima::fastdds::dds::DomainParticipant *mp_participant_, + eprosima::fastdds::dds::Topic *topic, + int8_t domain_id, + bool isStreamData = false); [[nodiscard]] eprosima::fastdds::rtps::GUID_t getParticipantID() const; bool write(IDL::GraphRequest *object); bool write(IDL::MvregNode *object); diff --git a/core/include/dsr/core/rtps/dsrsubscriber.h b/core/include/dsr/core/rtps/dsrsubscriber.h index d51f317..bedd98b 100644 --- a/core/include/dsr/core/rtps/dsrsubscriber.h +++ b/core/include/dsr/core/rtps/dsrsubscriber.h @@ -16,6 +16,7 @@ class DSRSubscriber [[nodiscard]] std::tuple init(eprosima::fastdds::dds::DomainParticipant *mp_participant_, eprosima::fastdds::dds::Topic *topic, + int8_t domain_id, const std::function& f_, std::mutex& mtx, bool isStreamData = false); diff --git a/core/rtps/dsrparticipant.cpp b/core/rtps/dsrparticipant.cpp index 02026dd..2139130 100644 --- a/core/rtps/dsrparticipant.cpp +++ b/core/rtps/dsrparticipant.cpp @@ -10,6 +10,23 @@ using namespace eprosima::fastdds::dds; using namespace eprosima::fastdds::rtps; +namespace { +std::vector host_ipv4_interfaces() +{ + std::vector ips{"127.0.0.1"}; + std::vector found; + IPFinder::getIPs(&found, false); + for (const auto& ip : found) { + if (ip.type == IPFinder::IP4) { + if (std::find(ips.begin(), ips.end(), ip.name) == ips.end()) { + ips.push_back(ip.name); + } + } + } + return ips; +} +} + 
DSRParticipant::DSRParticipant() : mp_participant(nullptr), dsrgraphType(new MvregNodePubSubType()), graphrequestType(new GraphRequestPubSubType()), @@ -32,6 +49,7 @@ DSRParticipant::~DSRParticipant() std::tuple DSRParticipant::init(uint32_t agent_id, const std::string& agent_name, int localhost, std::function fn, int8_t domain_id) { + domain_id_ = domain_id; // Create RTPSParticipant DomainParticipantQos PParam; PParam.name(("Participant_" + std::to_string(agent_id)+ " ( " + agent_name + " )").data() ); @@ -40,32 +58,24 @@ std::tuple DSRParticipant::ini //Disable the built-in Transport Layer. PParam.transport().use_builtin_transports = false; - //Create a descriptor for the new transport. - auto custom_transport = std::make_shared(); - //auto custom_transport = std::make_shared(); - //custom_transport->sendBufferSize = 33554432; // commented it will use the OS default - //custom_transport->receiveBufferSize = 33554432; // commented it will use the OS default - custom_transport->maxMessageSize = 65000; - - PParam.transport().user_transports.push_back(custom_transport); - - - custom_transport->interface_allowlist.emplace_back("127.0.0.1"); - - /*if (not localhost) - { - - - std::vector ips; - eprosima::fastdds::rtps::IPFinder::getIPs(&ips, false); - - for (auto &ip : ips) { - if (ip.type == eprosima::fastdds::rtps::IPFinder::IP4 ) { - //custom_transport->interfaceWhiteList.emplace_back(ip.name); - } + if (localhost) { + // Same-host deployments should prefer shared memory. Keep loopback UDP + // as a discovery/data fallback for environments where SHM is limited. 
+ auto shm_transport = std::make_shared(); + PParam.transport().user_transports.push_back(shm_transport); + + auto udp_transport = std::make_shared(); + udp_transport->maxMessageSize = 65000; + udp_transport->interface_allowlist.emplace_back("127.0.0.1"); + PParam.transport().user_transports.push_back(udp_transport); + } else { + auto udp_transport = std::make_shared(); + udp_transport->maxMessageSize = 65000; + for (const auto& ip : host_ipv4_interfaces()) { + udp_transport->interface_allowlist.emplace_back(ip); } - - }*/ + PParam.transport().user_transports.push_back(udp_transport); + } PParam.transport().send_socket_buffer_size = 33554432; PParam.transport().listen_socket_buffer_size = 33554432; diff --git a/core/rtps/dsrpublisher.cpp b/core/rtps/dsrpublisher.cpp index 04d3208..ea13b5d 100644 --- a/core/rtps/dsrpublisher.cpp +++ b/core/rtps/dsrpublisher.cpp @@ -15,6 +15,19 @@ using namespace eprosima::fastdds; using namespace eprosima::fastdds::rtps; using namespace eprosima::fastdds::dds; +namespace { +Locator_t domain_multicast_locator(int8_t domain_id) +{ + const auto domain = static_cast(domain_id); + Locator_t locator; + locator.port = 7900; + locator.kind = LOCATOR_KIND_UDPv4; + IPLocator::setIPv4(locator, + ("239.255." + std::to_string(domain / 250) + "." 
+ std::to_string(1 + (domain % 250))).c_str()); + return locator; +} +} + DSRPublisher::DSRPublisher() : mp_participant(nullptr), mp_publisher(nullptr), mp_writer(nullptr) {} @@ -23,7 +36,7 @@ DSRPublisher::~DSRPublisher() } std::tuple - DSRPublisher::init(eprosima::fastdds::dds::DomainParticipant *mp_participant_, eprosima::fastdds::dds::Topic *topic, bool isStreamData ) + DSRPublisher::init(eprosima::fastdds::dds::DomainParticipant *mp_participant_, eprosima::fastdds::dds::Topic *topic, int8_t domain_id, bool isStreamData ) { mp_participant = mp_participant_; @@ -46,12 +59,7 @@ std::tuple(domain_id); + Locator_t locator; + locator.port = 7900; + locator.kind = LOCATOR_KIND_UDPv4; + IPLocator::setIPv4(locator, + ("239.255." + std::to_string(domain / 250) + "." + std::to_string(1 + (domain % 250))).c_str()); + return locator; +} +} + DSRSubscriber::DSRSubscriber() : mp_participant(nullptr), mp_subscriber(nullptr), mp_reader(nullptr) {} DSRSubscriber::~DSRSubscriber() @@ -22,6 +35,7 @@ DSRSubscriber::~DSRSubscriber() std::tuple DSRSubscriber::init(eprosima::fastdds::dds::DomainParticipant *mp_participant_, eprosima::fastdds::dds::Topic *topic, + int8_t domain_id, const std::function& f_, std::mutex& mtx, bool isStreamData) @@ -52,11 +66,7 @@ std::tupleget_qos().transport().user_transports.end(); if (not local) { - Locator_t locator; - locator.port = 7900; - locator.kind = LOCATOR_KIND_UDPv4; - IPLocator::setIPv4(locator, "239.255.1.33"); - dataReaderQos.endpoint().multicast_locator_list.push_back(locator); + dataReaderQos.endpoint().multicast_locator_list.push_back(domain_multicast_locator(domain_id)); } //Check latency @@ -121,4 +131,3 @@ void DSRSubscriber::SubListener::on_data_available(eprosima::fastdds::dds::DataR { f(sub); } - diff --git a/tests/graph/edge_operations.cpp b/tests/graph/edge_operations.cpp index d983537..61da995 100644 --- a/tests/graph/edge_operations.cpp +++ b/tests/graph/edge_operations.cpp @@ -130,6 +130,33 @@ auto n = Node::create(); 
REQUIRE_FALSE(r); } + SECTION("Deleting one edge type between a pair leaves other types intact") { + auto n1 = Node::create(); + auto id1 = G.insert_node(n1); + REQUIRE(id1.has_value()); + + auto n2 = Node::create(); + auto id2 = G.insert_node(n2); + REQUIRE(id2.has_value()); + + // Two different edge types between the same node pair + auto e_in = Edge::create(*id1, *id2); + auto e_knows = Edge::create(*id1, *id2); + REQUIRE(G.insert_or_assign_edge(e_in)); + REQUIRE(G.insert_or_assign_edge(e_knows)); + + // Delete only the "in" edge + REQUIRE(G.delete_edge(*id1, *id2, std::string(in_edge_type::attr_name))); + + // "in" must be gone + REQUIRE_FALSE(G.get_edge(*id1, *id2, std::string(in_edge_type::attr_name)).has_value()); + + // "knows" must still be visible + REQUIRE(G.get_edge(*id1, *id2, std::string(knows_edge_type::attr_name)).has_value()); + auto remaining = G.get_edges_by_type(std::string(knows_edge_type::attr_name)); + REQUIRE(remaining.size() == 1); + } + } diff --git a/tests/graph/node_operations.cpp b/tests/graph/node_operations.cpp index 7d3b7b6..0edec07 100644 --- a/tests/graph/node_operations.cpp +++ b/tests/graph/node_operations.cpp @@ -6,6 +6,7 @@ #include "catch2/catch_test_macros.hpp" #include "dsr/core/types/user_types.h" +#include "dsr/core/types/type_checking/dsr_edge_type.h" #include "dsr/api/dsr_api.h" #include "../utils.h" @@ -120,6 +121,28 @@ TEST_CASE("Graph node operations", "[NODE]") { REQUIRE_FALSE(r); } + SECTION("Deleting a node removes incoming edges from other nodes") { + auto n1 = Node::create(); + auto id1 = G.insert_node(n1); + REQUIRE(id1.has_value()); + + auto n2 = Node::create(); + auto id2 = G.insert_node(n2); + REQUIRE(id2.has_value()); + + // Create edge n1 -> n2 + auto e = Edge::create(*id1, *id2); + REQUIRE(G.insert_or_assign_edge(e)); + REQUIRE(G.get_edge(*id1, *id2, std::string(in_edge_type::attr_name)).has_value()); + + // Deleting n2 must also remove the incoming edge from n1 + REQUIRE(G.delete_node(*id2)); + 
REQUIRE_FALSE(G.get_edge(*id1, *id2, std::string(in_edge_type::attr_name)).has_value()); + + // n1 should still exist + REQUIRE(G.get_node(*id1).has_value()); + } + SECTION("Create a node with an user defined name") { auto name = random_string(); Node n; diff --git a/tests/synchronization/graph_synchronization.cpp b/tests/synchronization/graph_synchronization.cpp index 76b39df..c3fba1b 100644 --- a/tests/synchronization/graph_synchronization.cpp +++ b/tests/synchronization/graph_synchronization.cpp @@ -1,10 +1,6 @@ // // Created by jc on 5/11/24. // - - - - #include "dsr/api/dsr_api.h" #include "../utils.h" #include @@ -12,9 +8,28 @@ #include "catch2/catch_test_macros.hpp" #include "catch2/generators/catch_generators.hpp" +#include "dsr/core/topics/IDLGraph.hpp" + using namespace DSR; using namespace std::chrono_literals; +namespace DSR +{ +class DSRGraphTestAccess +{ +public: + static std::map Map(DSRGraph& graph) + { + return graph.Map(); + } + + static void join_full_graph(DSRGraph& graph, IDL::OrMap&& full_graph) + { + graph.join_full_graph(std::move(full_graph)); + } +}; +} + TEST_CASE("Connect and receive the graph from other agent", "[SYNCHRONIZATION][GRAPH]"){ @@ -27,4 +42,83 @@ TEST_CASE("Connect and receive the graph from other agent", "[SYNCHRONIZATION][G std::this_thread::sleep_for(200ms); REQUIRE(G2.size() == G.size()); -} \ No newline at end of file +} + +TEST_CASE("Same-process agents discover each other and exchange updates", "[SYNCHRONIZATION][GRAPH][REGRESSION][DDS]") +{ + const auto same_host = GENERATE(true, false); + auto ctx = make_edge_config_file(); + auto id1 = static_cast(rand() % 1000 + 1000); + auto id2 = id1 + 1; + + DSRGraph loader(random_string(10), id1, ctx, same_host); + DSRGraph follower(random_string(11), id2, std::string{}, same_host); + + auto wait_until = [](auto&& predicate, std::chrono::milliseconds timeout = 2000ms) + { + const auto deadline = std::chrono::steady_clock::now() + timeout; + while 
(std::chrono::steady_clock::now() < deadline) + { + if (predicate()) + return true; + std::this_thread::sleep_for(50ms); + } + return predicate(); + }; + + REQUIRE(wait_until([&] { return follower.size() == loader.size(); })); + REQUIRE(wait_until([&] { return !loader.get_connected_agents().empty(); })); + REQUIRE(wait_until([&] { return !follower.get_connected_agents().empty(); })); + + auto root_loader = loader.get_node("root"); + REQUIRE(root_loader.has_value()); + root_loader->attrs()["same_process_loader_" + std::to_string(same_host)] = + Attribute(std::string("loader"), get_unix_timestamp(), loader.get_agent_id()); + REQUIRE(loader.update_node(root_loader.value())); + + REQUIRE(wait_until([&] { + auto root_follower = follower.get_node("root"); + return root_follower.has_value() && + root_follower->attrs().contains("same_process_loader_" + std::to_string(same_host)); + })); + + auto root_follower = follower.get_node("root"); + REQUIRE(root_follower.has_value()); + root_follower->attrs()["same_process_follower_" + std::to_string(same_host)] = + Attribute(std::string("follower"), get_unix_timestamp(), follower.get_agent_id()); + REQUIRE(follower.update_node(root_follower.value())); + + REQUIRE(wait_until([&] { + auto updated_root_loader = loader.get_node("root"); + return updated_root_loader.has_value() && + updated_root_loader->attrs().contains("same_process_follower_" + std::to_string(same_host)); + })); +} + +TEST_CASE("Full graph join does not leave empty node registers after local deletion", "[SYNCHRONIZATION][GRAPH][REGRESSION]") +{ + auto ctx = make_empty_config_file(); + DSRGraph graph(random_string(10), static_cast(rand() % 4000), ctx); + const auto initial_size = graph.size(); + + auto node = Node::create("regression_node"); + node.id(1000); + node.agent_id(graph.get_agent_id()); + + REQUIRE(graph.insert_node_with_id(node).has_value()); + REQUIRE(graph.size() == initial_size + 1); + + IDL::OrMap full_graph; + full_graph.id(graph.get_agent_id()); + 
full_graph.to_id(graph.get_agent_id()); + full_graph.m(DSRGraphTestAccess::Map(graph)); + + REQUIRE(graph.delete_node(node.id())); + REQUIRE(graph.size() == initial_size); + REQUIRE_FALSE(graph.get_node(node.id()).has_value()); + + DSRGraphTestAccess::join_full_graph(graph, std::move(full_graph)); + + REQUIRE(graph.size() == initial_size); + REQUIRE_FALSE(graph.get_node(node.id()).has_value()); +} diff --git a/tools/same_host_smoke/agent_worker.py b/tools/same_host_smoke/agent_worker.py new file mode 100644 index 0000000..cc8953d --- /dev/null +++ b/tools/same_host_smoke/agent_worker.py @@ -0,0 +1,122 @@ +#!/usr/bin/env python3 +import argparse +import json +import os +import sys +import time +from pathlib import Path + + +def parse_args() -> argparse.Namespace: + parser = argparse.ArgumentParser(description="Run one DSR agent worker process") + parser.add_argument("--agent-name", required=True) + parser.add_argument("--agent-id", required=True, type=int) + parser.add_argument("--domain-id", required=True, type=int) + parser.add_argument("--same-host", required=True, choices=("true", "false")) + parser.add_argument("--graph-file", default="") + parser.add_argument("--artifacts-dir", required=True) + parser.add_argument("--local-attr", required=True) + parser.add_argument("--local-value", required=True) + parser.add_argument("--remote-attr", required=True) + parser.add_argument("--remote-value", required=True) + parser.add_argument("--startup-delay", default=0.0, type=float) + parser.add_argument("--sync-timeout", default=30.0, type=float) + parser.add_argument("--hold-seconds", default=0.0, type=float) + return parser.parse_args() + + +def wait_for(predicate, timeout_s: float, interval_s: float = 0.1, error: str = "timeout"): + deadline = time.monotonic() + timeout_s + while time.monotonic() < deadline: + value = predicate() + if value: + return value + time.sleep(interval_s) + raise TimeoutError(error) + + +def read_root_attr(graph, attr_name: str): + root = 
graph.get_node("root") + if root is None: + return None + if attr_name not in root.attrs: + return None + return root.attrs[attr_name].value + + +def main() -> int: + args = parse_args() + artifacts_dir = Path(args.artifacts_dir) + artifacts_dir.mkdir(parents=True, exist_ok=True) + result_path = artifacts_dir / f"{args.agent_name}.json" + + build_python_wrapper = Path(__file__).resolve().parents[2] / "build" / "python-wrapper" + sys.path.insert(0, str(build_python_wrapper)) + + import pydsr + + time.sleep(args.startup_delay) + + graph = pydsr.DSRGraph( + 0, + args.agent_name, + args.agent_id, + args.graph_file, + args.same_host == "true", + args.domain_id, + ) + + result = { + "agent_name": args.agent_name, + "agent_id": args.agent_id, + "domain_id": args.domain_id, + "same_host": args.same_host == "true", + "graph_file_loaded": bool(args.graph_file), + } + + try: + initial_nodes = wait_for( + lambda: len(graph.get_nodes()) if graph.get_node("root") is not None else 0, + timeout_s=args.sync_timeout, + error="graph root never became available", + ) + result["initial_node_count"] = initial_nodes + + root = wait_for( + lambda: graph.get_node("root"), + timeout_s=args.sync_timeout, + error="root node not available", + ) + root.attrs[args.local_attr] = pydsr.Attribute(args.local_value) + update_ok = graph.update_node(root) + if not update_ok: + raise RuntimeError(f"failed to update root with {args.local_attr}") + + observed_remote = wait_for( + lambda: read_root_attr(graph, args.remote_attr), + timeout_s=args.sync_timeout, + error=f"remote attribute {args.remote_attr} not observed", + ) + if observed_remote != args.remote_value: + raise RuntimeError( + f"unexpected value for {args.remote_attr}: {observed_remote!r} != {args.remote_value!r}" + ) + + final_root = graph.get_node("root") + result["final_node_count"] = len(graph.get_nodes()) + result["local_attr_value"] = final_root.attrs[args.local_attr].value + result["remote_attr_value"] = 
final_root.attrs[args.remote_attr].value + if args.hold_seconds > 0: + time.sleep(args.hold_seconds) + result["status"] = "ok" + except Exception as exc: + result["status"] = "error" + result["error"] = str(exc) + finally: + result_path.write_text(json.dumps(result, indent=2), encoding="utf-8") + + return 0 if result["status"] == "ok" else 1 + + +if __name__ == "__main__": + raise SystemExit(main()) diff --git a/tools/same_host_smoke/run_same_host_smoke.sh b/tools/same_host_smoke/run_same_host_smoke.sh new file mode 100644 index 0000000..5a10da4 --- /dev/null +++ b/tools/same_host_smoke/run_same_host_smoke.sh @@ -0,0 +1,89 @@ +#!/usr/bin/env bash +set -euo pipefail + +ROOT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")/../.." && pwd)" +ARTIFACT_ROOT="${ROOT_DIR}/.artifacts/same_host_smoke" +GRAPH_FILE="${ROOT_DIR}/python-wrapper/etc/autonomyLab_objects.simscene.json" +WORKER="${ROOT_DIR}/tools/same_host_smoke/agent_worker.py" + +export PYTHONPATH="${ROOT_DIR}/build/python-wrapper${PYTHONPATH:+:${PYTHONPATH}}" +export LD_LIBRARY_PATH="${ROOT_DIR}/build/api:${ROOT_DIR}/build/core${LD_LIBRARY_PATH:+:${LD_LIBRARY_PATH}}" + +mkdir -p "${ARTIFACT_ROOT}" + +run_case() { + local same_host="$1" + local domain_id="$2" + local case_dir="${ARTIFACT_ROOT}/same_host_${same_host}" + + rm -rf "${case_dir}" + mkdir -p "${case_dir}" + + python3 "${WORKER}" \ + --agent-name "same_host_${same_host}_loader" \ + --agent-id $((domain_id * 10 + 1)) \ + --domain-id "${domain_id}" \ + --same-host "${same_host}" \ + --graph-file "${GRAPH_FILE}" \ + --artifacts-dir "${case_dir}" \ + --local-attr "sync_from_loader_${same_host}" \ + --local-value "loader_${same_host}" \ + --remote-attr "sync_from_follower_${same_host}" \ + --remote-value "follower_${same_host}" \ + > "${case_dir}/loader.log" 2>&1 & + local pid_a=$! 
+ + python3 "${WORKER}" \ + --agent-name "same_host_${same_host}_follower" \ + --agent-id $((domain_id * 10 + 2)) \ + --domain-id "${domain_id}" \ + --same-host "${same_host}" \ + --artifacts-dir "${case_dir}" \ + --local-attr "sync_from_follower_${same_host}" \ + --local-value "follower_${same_host}" \ + --remote-attr "sync_from_loader_${same_host}" \ + --remote-value "loader_${same_host}" \ + --startup-delay 1.0 \ + > "${case_dir}/follower.log" 2>&1 & + local pid_b=$! + + local rc=0 + wait "${pid_a}" || rc=1 + wait "${pid_b}" || rc=1 + + if [[ "${rc}" -ne 0 ]]; then + echo "Scenario same_host=${same_host} failed. See ${case_dir}" >&2 + return "${rc}" + fi + + python3 - "${case_dir}" "${same_host}" <<'PY' +import json +import sys +from pathlib import Path + +case_dir = Path(sys.argv[1]) +same_host = sys.argv[2] +loader = json.loads((case_dir / f"same_host_{same_host}_loader.json").read_text(encoding="utf-8")) +follower = json.loads((case_dir / f"same_host_{same_host}_follower.json").read_text(encoding="utf-8")) + +for result in (loader, follower): + if result["status"] != "ok": + raise SystemExit(f"{result['agent_name']} failed: {result.get('error', 'unknown error')}") + +if follower["initial_node_count"] <= 0: + raise SystemExit("Follower did not receive the initial graph") + +if loader["remote_attr_value"] != f"follower_{same_host}": + raise SystemExit("Loader did not observe follower mutation") + +if follower["remote_attr_value"] != f"loader_{same_host}": + raise SystemExit("Follower did not observe loader mutation") + +print(f"same_host={same_host}: PASS") +PY +} + +run_case true 41 +run_case false 42 + +echo "Artifacts written to ${ARTIFACT_ROOT}" diff --git a/tools/same_host_smoke/verify_transports.sh b/tools/same_host_smoke/verify_transports.sh new file mode 100644 index 0000000..d60dded --- /dev/null +++ b/tools/same_host_smoke/verify_transports.sh @@ -0,0 +1,148 @@ +#!/usr/bin/env bash +set -euo pipefail + +ROOT_DIR="$(cd "$(dirname 
"${BASH_SOURCE[0]}")/../.." && pwd)" +ARTIFACT_ROOT="${ROOT_DIR}/.artifacts/same_host_transport" +GRAPH_FILE="${ROOT_DIR}/python-wrapper/etc/autonomyLab_objects.simscene.json" +WORKER="${ROOT_DIR}/tools/same_host_smoke/agent_worker.py" + +export PYTHONPATH="${ROOT_DIR}/build/python-wrapper${PYTHONPATH:+:${PYTHONPATH}}" +export LD_LIBRARY_PATH="${ROOT_DIR}/build/api:${ROOT_DIR}/build/core${LD_LIBRARY_PATH:+:${LD_LIBRARY_PATH}}" + +mkdir -p "${ARTIFACT_ROOT}" + +snapshot_case() { + local case_dir="$1" + local pid_a="$2" + local pid_b="$3" + + { + echo "loader_pid=${pid_a}" + echo "follower_pid=${pid_b}" + } > "${case_dir}/pids.txt" + + lsof -p "${pid_a}" > "${case_dir}/loader.lsof" || true + lsof -p "${pid_b}" > "${case_dir}/follower.lsof" || true + ss -uapn > "${case_dir}/ss.txt" || true + ip maddr show > "${case_dir}/ip_maddr.txt" || true +} + +verify_case() { + local same_host="$1" + local case_dir="${ARTIFACT_ROOT}/same_host_${same_host}" + local domain_id loader_id follower_id + + if [[ "${same_host}" == "true" ]]; then + domain_id=51 + loader_id=1501 + follower_id=1502 + else + domain_id=52 + loader_id=1511 + follower_id=1512 + fi + + mkdir -p "${case_dir}" + + python3 "${WORKER}" \ + --agent-name "transport_${same_host}_loader" \ + --agent-id "${loader_id}" \ + --domain-id "${domain_id}" \ + --same-host "${same_host}" \ + --graph-file "${GRAPH_FILE}" \ + --artifacts-dir "${case_dir}" \ + --local-attr "transport_loader_${same_host}" \ + --local-value "loader_${same_host}" \ + --remote-attr "transport_follower_${same_host}" \ + --remote-value "follower_${same_host}" \ + --hold-seconds 12 \ + > "${case_dir}/loader.log" 2>&1 & + local pid_a=$! 
+ + python3 "${WORKER}" \ + --agent-name "transport_${same_host}_follower" \ + --agent-id "${follower_id}" \ + --domain-id "${domain_id}" \ + --same-host "${same_host}" \ + --artifacts-dir "${case_dir}" \ + --local-attr "transport_follower_${same_host}" \ + --local-value "follower_${same_host}" \ + --remote-attr "transport_loader_${same_host}" \ + --remote-value "loader_${same_host}" \ + --startup-delay 1 \ + --hold-seconds 12 \ + > "${case_dir}/follower.log" 2>&1 & + local pid_b=$! + + sleep 4 + snapshot_case "${case_dir}" "${pid_a}" "${pid_b}" + + wait "${pid_a}" + wait "${pid_b}" + + python3 - "${case_dir}" "${same_host}" <<'PY' +import json +import sys +from pathlib import Path + +case_dir = Path(sys.argv[1]) +same_host = sys.argv[2] + +loader = json.loads(next(case_dir.glob("*loader.json")).read_text(encoding="utf-8")) +follower = json.loads(next(case_dir.glob("*follower.json")).read_text(encoding="utf-8")) +def read_if_exists(path: Path) -> str: + return path.read_text(encoding="utf-8", errors="ignore") if path.exists() else "" + +lsof_loader = read_if_exists(case_dir / "loader.lsof") +lsof_follower = read_if_exists(case_dir / "follower.lsof") +ss_txt = read_if_exists(case_dir / "ss.txt") +ip_maddr = read_if_exists(case_dir / "ip_maddr.txt") + +for result in (loader, follower): + if result["status"] != "ok": + raise SystemExit(f"{result['agent_name']} failed: {result.get('error', 'unknown error')}") + +combined_lsof = lsof_loader + "\n" + lsof_follower + +evidence_lines = [] +for line in combined_lsof.splitlines(): + if "/dev/shm/fastdds_" in line or "239.255." 
in line: + evidence_lines.append(line.strip()) + +uses_multicast = any(marker in (combined_lsof + "\n" + ss_txt + "\n" + ip_maddr) for marker in ( + "239.255.0.1", + "239.255.0.53", +)) +uses_shm = "/dev/shm" in combined_lsof + +summary = { + "same_host": same_host == "true", + "uses_multicast": uses_multicast, + "uses_shm": uses_shm, + "evidence_lines": evidence_lines[:12], +} +(case_dir / "transport_summary.json").write_text(json.dumps(summary, indent=2), encoding="utf-8") + +if same_host == "true" and not uses_shm: + raise SystemExit("shared-memory evidence not found for same_host=true") + +# Multicast is the discovery mechanism for cross-host (same_host=false). +# For same_host=true, DSR uses SHM + loopback-UDP unicast — no multicast +# group is joined, so absence of 239.255.x.x evidence is expected and correct. +if same_host == "false" and not uses_multicast: + raise SystemExit("multicast evidence not found for same_host=false") + +if same_host == "false" and uses_shm: + raise SystemExit("unexpected shared-memory evidence found for same_host=false") + +print(json.dumps(summary)) +print("evidence:") +for line in summary["evidence_lines"]: + print(f" {line}") +PY +} + +verify_case true +verify_case false + +echo "Transport artifacts written to ${ARTIFACT_ROOT}"