From db77cede68ebb2ae72bb64fe0cf09e8947f863da Mon Sep 17 00:00:00 2001 From: aterenin Date: Tue, 2 Jun 2026 17:02:27 -0400 Subject: [PATCH 01/11] Add initial templates for Robin Hood probing. --- .../probing_scheme/probing_scheme_impl.inl | 83 ++++++++++++++++ include/cuco/probing_scheme.cuh | 97 +++++++++++++++++++ 2 files changed, 180 insertions(+) diff --git a/include/cuco/detail/probing_scheme/probing_scheme_impl.inl b/include/cuco/detail/probing_scheme/probing_scheme_impl.inl index 8bed41778..8b93076c3 100644 --- a/include/cuco/detail/probing_scheme/probing_scheme_impl.inl +++ b/include/cuco/detail/probing_scheme/probing_scheme_impl.inl @@ -208,4 +208,87 @@ double_hashing::hash_function() const noexcept return {hash1_, hash2_}; } +namespace detail { + +/** + * @brief Robin Hood inverse primitive for the linear probing sequence. + * + * @note Recovers a resident's probe distance ("age") from the slot it occupies: how many probing + * steps the resident sits from its own home bucket. For the linear sequence this is a single + * subtract — `(slot_base - resident_home) / stride mod num_buckets`. This is the linear-only + * overload; a `double_hashing` variant would add its own overload here (a modular inverse of the + * resident's per-key step, or a stored age), which is the single place that change lands. + * + * @tparam BucketSize Size of the bucket + * @tparam CGSize Size of CUDA Cooperative Groups + * @tparam Hash Unary callable type + * @tparam ProbeKey Type of probing key + * @tparam Extent Type of extent + * + * @param scheme The underlying linear probing scheme (supplies the hash function) + * @param resident_key The key currently residing in the slot + * @param slot_index The slot index at which `resident_key` resides + * @param upper_bound Upper bound of the iteration + * @return The resident's probe distance, in probing steps + */ +template +[[nodiscard]] __host__ __device__ constexpr typename Extent::value_type probe_distance( + linear_probing const& scheme, + ProbeKey resident_key, + typename Extent::value_type slot_index, + Extent upper_bound) noexcept +{ + using size_type = typename Extent::value_type; + size_type constexpr stride = CGSize * BucketSize; + auto const bound = static_cast(upper_bound); + auto const hash = scheme.hash_function(); + + // Home bucket base of the resident, using the same alignment as `make_iterator`. + size_type const resident_home = + cuco::detail::sanitize_hash(hash(resident_key)) % (bound / stride) * stride; + + // Bucket-strided base of the slot the resident currently occupies. The per-lane `thread_rank` + // offset (which is < stride) is stripped by the floor division so that the distance is measured + // in whole probing steps, consistent with the forward sequence. + size_type const slot_base = (slot_index / stride) * stride; + + // (slot_base - resident_home) mod capacity, expressed in probing steps. + return static_cast((slot_base + bound - resident_home) % bound) / stride; +} + +} // namespace detail + +template +__host__ __device__ constexpr robin_hood_probing::robin_hood_probing( + Underlying const& probing) + : Underlying{probing} +{ +} + +template +template +__host__ __device__ constexpr auto robin_hood_probing::rebind_hash_function( + NewHash const& hash) const noexcept +{ + auto const inner = static_cast(*this).rebind_hash_function(hash); + return robin_hood_probing>{inner}; +} + +template +template +__host__ __device__ constexpr typename Extent::value_type +robin_hood_probing::probe_distance(ProbeKey resident_key, + typename Extent::value_type slot_index, + Extent upper_bound) const noexcept +{ + // Delegate to the per-underlying-scheme inverse overload. Only `linear_probing` provides one + // today; wrapping a scheme without a matching overload is a compile-time error. + return cuco::detail::probe_distance( + static_cast(*this), resident_key, slot_index, upper_bound); +} + } // namespace cuco diff --git a/include/cuco/probing_scheme.cuh b/include/cuco/probing_scheme.cuh index c809794dc..f74c8fe80 100644 --- a/include/cuco/probing_scheme.cuh +++ b/include/cuco/probing_scheme.cuh @@ -209,6 +209,87 @@ class double_hashing : private detail::probing_scheme_base { Hash2 hash2_; }; +/** + * @brief Public Robin Hood probing scheme class. + * + * @note Robin Hood probing wraps an underlying probe sequence (e.g. `cuco::linear_probing`) and + * pairs it with the Robin Hood invariant: on insert, an in-flight key displaces any resident that + * sits closer to its own home than the in-flight key is to its home, and the displaced resident is + * then re-inserted from that point onward. This keeps probe lengths tightly distributed, which is + * especially valuable on GPUs where a tile's tail latency is set by its longest probe. + * + * @note This class is a thin decorator over `Underlying`. It forwards the forward probe sequence + * (`make_iterator`, `hash_function`) unchanged and contributes the `cuco::is_robin_hood_probing` + * trait that selects the displacement (insert) and early-termination (find) control flow in the + * open-addressing ref implementation. The invariant's one extra requirement — the inverse + * primitive `probe_distance`, which recovers a resident's probe distance ("age") from the slot it + * occupies — is delegated to `cuco::detail::probe_distance`, which is overloaded per underlying + * scheme. Only `cuco::linear_probing` provides that overload today; a `cuco::double_hashing` + * variant would compose simply by adding a matching `cuco::detail::probe_distance` overload, with + * no change to this class or the ref implementation. + * + * @tparam Underlying The wrapped probe-sequence scheme (e.g. `cuco::linear_probing`) + */ +template +class robin_hood_probing : private Underlying { + public: + using Underlying::cg_size; ///< Cooperative group size (from the underlying scheme) + using typename Underlying::hasher; ///< Hash function type (from the underlying scheme) + using Underlying::hash_function; ///< Forwarded: gets the function(s) used to hash keys + using Underlying::make_iterator; ///< Forwarded: the (unchanged) forward probe sequence + + /** + * @brief Constructs a Robin Hood probing scheme wrapping the given underlying scheme. + * + * @param probing The underlying probe-sequence scheme to wrap + */ + __host__ __device__ constexpr robin_hood_probing(Underlying const& probing = {}); + + /** + * @brief Makes a copy of the current probing method with the given hasher. + * + * @note Forwards to the underlying scheme's `rebind_hash_function` and re-wraps the result so the + * returned scheme is again a `robin_hood_probing`. + * + * @tparam NewHash New hasher type + * + * @param hash Hasher + * + * @return Copy of the current probing method + */ + template + [[nodiscard]] __host__ __device__ constexpr auto rebind_hash_function( + NewHash const& hash) const noexcept; + + /** + * @brief Computes the probe distance ("age") of a resident key. + * + * @note This is the inverse of the probe sequence: given a resident key and the slot index at + * which it currently lives, it returns how many probing steps that resident is from its own home + * bucket. The Robin Hood insert/find logic compares this against the in-flight key's own probe + * distance to decide displacement (insert) or early termination (find). + * + * @note Delegates to the `cuco::detail::probe_distance` overload for `Underlying`. Instantiating + * this for an `Underlying` without such an overload (e.g. `cuco::double_hashing` today) is a + * compile-time error — that is the single seam where a new underlying scheme would supply its own + * inverse. + * + * @tparam BucketSize Size of the bucket + * @tparam ProbeKey Type of probing key + * @tparam Extent Type of extent + * + * @param resident_key The key currently residing in the slot + * @param slot_index The slot index at which `resident_key` resides + * @param upper_bound Upper bound of the iteration + * @return The resident's probe distance, in probing steps + */ + template + [[nodiscard]] __host__ __device__ constexpr typename Extent::value_type probe_distance( + ProbeKey resident_key, + typename Extent::value_type slot_index, + Extent upper_bound) const noexcept; +}; + /** * @brief Trait indicating whether the given probing scheme is of `double_hashing` type or not * @@ -227,6 +308,22 @@ struct is_double_hashing : cuda::std::false_type {}; template struct is_double_hashing> : cuda::std::true_type {}; +/** + * @brief Trait indicating whether the given probing scheme is of `robin_hood_probing` type or not + * + * @tparam T Input probing scheme type + */ +template +struct is_robin_hood_probing : cuda::std::false_type {}; + +/** + * @brief Trait indicating whether the given probing scheme is of `robin_hood_probing` type or not + * + * @tparam Underlying The wrapped probe-sequence scheme + */ +template +struct is_robin_hood_probing> : cuda::std::true_type {}; + } // namespace cuco #include From d17754602b8af5fa18365a9e2ab66d8fe2cfcd34 Mon Sep 17 00:00:00 2001 From: aterenin Date: Tue, 2 Jun 2026 17:32:26 -0400 Subject: [PATCH 02/11] Add Robin Hood probe distance test. --- tests/utility/probing_scheme_test.cu | 72 ++++++++++++++++++++++++++++ 1 file changed, 72 insertions(+) diff --git a/tests/utility/probing_scheme_test.cu b/tests/utility/probing_scheme_test.cu index 39048946b..97f26dc18 100644 --- a/tests/utility/probing_scheme_test.cu +++ b/tests/utility/probing_scheme_test.cu @@ -78,6 +78,37 @@ __global__ void generate_cg_probing_sequence(Key key, } } +// Walks each lane's probe iterator and records, at every step, the probe distance reported for the +// slot that lane is visiting. Because the resident under test is `key` itself, the slot visited at +// step `i` is at probe distance `i` from `key`'s home — for every lane. Recording one column per +// lane lets the host check both that `probe_distance` inverts `make_iterator` and that it strips +// the per-lane intra-stride offset (so all lanes at a given step agree). +template +__global__ void generate_cg_probe_distance_sequence(Key key, + Extent upper_bound, + size_t seq_length, + OutputIt out_seq) +{ + auto constexpr cg_size = ProbingScheme::cg_size; + + auto const tid = blockIdx.x * blockDim.x + threadIdx.x; + auto probing_scheme = ProbingScheme{}; + + if (tid < cg_size) { + auto const tile = + cooperative_groups::tiled_partition( + cooperative_groups::this_thread_block()); + + auto iter = probing_scheme.template make_iterator(tile, key, upper_bound); + + for (size_t i = 0; i < seq_length; ++i) { + out_seq[i * cg_size + tile.thread_rank()] = + probing_scheme.template probe_distance(key, *iter, upper_bound); + ++iter; + } + } +} + TEMPLATE_TEST_CASE_SIG( "utility probing_scheme tests", "", @@ -111,3 +142,44 @@ TEMPLATE_TEST_CASE_SIG( REQUIRE(cuco::test::equal( scalar_seq.begin(), scalar_seq.end(), cg_seq.begin(), cuda::std::equal_to{})); } + +TEMPLATE_TEST_CASE_SIG( + "utility robin_hood probe_distance inverts make_iterator", + "", + ((typename Key, int32_t CGSize, int32_t BucketSize), Key, CGSize, BucketSize), + (int32_t, 1, 1), + (int32_t, 4, 1), + (int32_t, 8, 1), + (int32_t, 8, 2), + (int64_t, 4, 1), + (int64_t, 8, 2)) +{ + // Robin Hood wraps a linear probe sequence; `probe_distance` is its inverse. For `key`'s own + // probe sequence, the slot visited at step `i` must report probe distance `i`. + using probe = + cuco::robin_hood_probing>>; + + // A deliberately small capacity, so the probe sequence wraps around the table within + // `seq_length` steps — this exercises the modular-subtraction (wrap) path in `probe_distance`. + auto const upper_bound = + cuco::make_valid_extent>(cuco::extent{64}); + + // Probe distance is measured in whole probing steps and lives in `[0, num_buckets)`, where one + // step spans the full `cg_size * bucket_size` stride. Taking `seq_length` past `num_buckets` + // guarantees the walk wraps at least once. + auto const capacity = static_cast(upper_bound); + auto const num_buckets = capacity / (CGSize * BucketSize); + auto const seq_length = num_buckets + 3; + constexpr Key key{42}; + + thrust::device_vector distances(seq_length * CGSize); + generate_cg_probe_distance_sequence + <<<1, CGSize>>>(key, upper_bound, seq_length, distances.begin()); + + // Under wrap, the slot visited at step `i` sits at probe distance `i mod num_buckets`. + for (std::size_t i = 0; i < seq_length; ++i) { + for (std::int32_t r = 0; r < CGSize; ++r) { + REQUIRE(distances[i * CGSize + r] == i % num_buckets); + } + } +} From 75a78dfaf6e62e79c0969d7d2973d66ddda6e7af Mon Sep 17 00:00:00 2001 From: aterenin Date: Tue, 2 Jun 2026 17:52:22 -0400 Subject: [PATCH 03/11] Add Robin Hood get methods. --- .../open_addressing_ref_impl.cuh | 77 ++++++++++++++++++- 1 file changed, 73 insertions(+), 4 deletions(-) diff --git a/include/cuco/detail/open_addressing/open_addressing_ref_impl.cuh b/include/cuco/detail/open_addressing/open_addressing_ref_impl.cuh index ada8792bb..5c0967ff9 100644 --- a/include/cuco/detail/open_addressing/open_addressing_ref_impl.cuh +++ b/include/cuco/detail/open_addressing/open_addressing_ref_impl.cuh @@ -777,7 +777,8 @@ class open_addressing_ref_impl static_assert(cg_size == 1, "Non-CG operation is incompatible with the current probing scheme"); auto probing_iter = probing_scheme_.template make_iterator(key, storage_ref_.extent()); - auto const init_idx = *probing_iter; + auto const init_idx = *probing_iter; + [[maybe_unused]] size_type probe_step = 0; while (true) { // TODO atomic_ref::load if insert operator is present @@ -791,6 +792,13 @@ class open_addressing_ref_impl case detail::equal_result::EQUAL: return true; } } + // Robin Hood: a resident richer than us proves the key is absent. + if constexpr (cuco::is_robin_hood_probing::value) { + if (this->robin_hood_proves_absent(bucket_slots, *probing_iter, probe_step)) { + return false; + } + ++probe_step; + } ++probing_iter; if (*probing_iter == init_idx) { return false; } } @@ -816,7 +824,8 @@ class open_addressing_ref_impl { auto probing_iter = probing_scheme_.template make_iterator(group, key, storage_ref_.extent()); - auto const init_idx = *probing_iter; + auto const init_idx = *probing_iter; + [[maybe_unused]] size_type probe_step = 0; while (true) { auto const bucket_slots = storage_ref_[*probing_iter]; @@ -834,6 +843,13 @@ class open_addressing_ref_impl if (group.any(state == detail::equal_result::EQUAL)) { return true; } if (group.any(state == detail::equal_result::EMPTY)) { return false; } + // Robin Hood: a resident richer than us (in any lane's bucket) proves the key is absent. + if constexpr (cuco::is_robin_hood_probing::value) { + if (group.any(this->robin_hood_proves_absent(bucket_slots, *probing_iter, probe_step))) { + return false; + } + ++probe_step; + } ++probing_iter; if (*probing_iter == init_idx) { return false; } } @@ -857,7 +873,8 @@ class open_addressing_ref_impl static_assert(cg_size == 1, "Non-CG operation is incompatible with the current probing scheme"); auto probing_iter = probing_scheme_.template make_iterator(key, storage_ref_.extent()); - auto const init_idx = *probing_iter; + auto const init_idx = *probing_iter; + [[maybe_unused]] size_type probe_step = 0; while (true) { // TODO atomic_ref::load if insert operator is present @@ -875,6 +892,13 @@ class open_addressing_ref_impl default: continue; } } + // Robin Hood: a resident richer than us proves the key is absent. + if constexpr (cuco::is_robin_hood_probing::value) { + if (this->robin_hood_proves_absent(bucket_slots, *probing_iter, probe_step)) { + return this->end(); + } + ++probe_step; + } ++probing_iter; if (*probing_iter == init_idx) { return this->end(); } } @@ -900,7 +924,8 @@ class open_addressing_ref_impl { auto probing_iter = probing_scheme_.template make_iterator(group, key, storage_ref_.extent()); - auto const init_idx = *probing_iter; + auto const init_idx = *probing_iter; + [[maybe_unused]] size_type probe_step = 0; while (true) { auto const bucket_slots = storage_ref_[*probing_iter]; @@ -930,6 +955,13 @@ class open_addressing_ref_impl // Find an empty slot, meaning that the probe key isn't present in the container if (group.any(state == detail::equal_result::EMPTY)) { return this->end(); } + // Robin Hood: a resident richer than us (in any lane's bucket) proves the key is absent. + if constexpr (cuco::is_robin_hood_probing::value) { + if (group.any(this->robin_hood_proves_absent(bucket_slots, *probing_iter, probe_step))) { + return this->end(); + } + ++probe_step; + } ++probing_iter; if (*probing_iter == init_idx) { return this->end(); } } @@ -1592,6 +1624,43 @@ class open_addressing_ref_impl return storage_ref_.data() + probing_idx + intra_bucket_idx; } + /** + * @brief Determines whether the Robin Hood invariant proves the probe key absent at the current + * probe step. + * + * @note Only meaningful for Robin Hood probing. The key is proven absent when the bucket holds a + * resident that is "richer" than the probe key — i.e. whose own probe distance is smaller than + * the probe key's probe distance at the current step (`probe_step`). Such a resident would have + * been displaced on insertion if the probe key lived here, so the probe key cannot be present. + * + * @note Behavior is only well-defined when every slot in the bucket is occupied (the callers + * reach this check only after ruling out empty and matching slots), since probe distance is + * meaningless for an empty slot. + * + * @tparam BucketSlots Bucket slot array type + * + * @param bucket_slots The slots of the bucket currently being probed + * @param bucket_base The slot index of the first slot in the bucket + * @param probe_step The probe key's own probe distance at the current step + * + * @return True if some resident in the bucket is richer than the probe key + */ + template + [[nodiscard]] __device__ bool robin_hood_proves_absent(BucketSlots const& bucket_slots, + size_type bucket_base, + size_type probe_step) const noexcept + { + bool richer = false; + cuda::static_for([&](auto i) { + auto const resident_age = probing_scheme_.template probe_distance( + this->extract_key(bucket_slots[i()]), + static_cast(bucket_base + i()), + storage_ref_.extent()); + if (resident_age < probe_step) { richer = true; } + }); + return richer; + } + /** * @brief Extracts the key from a given value type. * From dc90ebe6b6783450efb33228cb314d28696aa551 Mon Sep 17 00:00:00 2001 From: aterenin Date: Tue, 2 Jun 2026 18:03:11 -0400 Subject: [PATCH 04/11] Add Robin Hood invariant test. --- tests/CMakeLists.txt | 1 + .../probing_scheme_invariants_test.cu | 112 ++++++++++++++++++ 2 files changed, 113 insertions(+) create mode 100644 tests/static_map/probing_scheme_invariants_test.cu diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index 51d4f42c1..07465b86e 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -89,6 +89,7 @@ ConfigureTest(STATIC_MAP_TEST static_map/insert_or_assign_test.cu static_map/insert_or_apply_test.cu static_map/key_sentinel_test.cu + static_map/probing_scheme_invariants_test.cu static_map/shared_memory_test.cu static_map/stream_test.cu static_map/rehash_test.cu diff --git a/tests/static_map/probing_scheme_invariants_test.cu b/tests/static_map/probing_scheme_invariants_test.cu new file mode 100644 index 000000000..2fb4bdd88 --- /dev/null +++ b/tests/static_map/probing_scheme_invariants_test.cu @@ -0,0 +1,112 @@ +/* + * Copyright (c) 2026, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include + +#include + +#include +#include + +#include + +#include +#include + +namespace { + +// Identity hash. `cuco::detail::sanitize_hash` is `to_positive`, so for a non-negative key `k` this +// puts its home bucket at slot `k % capacity` — letting the test hand-craft an exact Robin Hood +// layout instead of reverse-engineering a real hash function. +template +struct identity_hash { + __host__ __device__ constexpr Key operator()(Key key) const noexcept { return key; } +}; + +} // namespace + +// Validates the Robin Hood read-path early-exit (`find` / `contains`) against a hand-seeded layout, +// before the displacing `insert` exists. A scalar (cg_size == 1) Robin Hood map over a linear probe +// sequence is seeded directly through the storage pointer with a known-valid Robin Hood cluster, +// then queried with keys chosen to exercise each lookup-termination rule. +TEST_CASE("static_map robin_hood read-path early-exit", "") +{ + using Key = std::int32_t; + using Value = std::int32_t; + using size_type = std::int32_t; + auto constexpr capacity = size_type{16}; + + using extent_type = cuco::extent; + using probe_type = cuco::robin_hood_probing>>; + using map_type = cuco::static_map, + probe_type, + cuco::cuda_allocator, + cuco::storage<1>>; + using value_type = typename map_type::value_type; // cuco::pair + + auto map = map_type{extent_type{}, cuco::empty_key{-1}, cuco::empty_value{-1}}; + // Ensure the constructor's slot initialization (empty sentinels) has completed before we seed. + REQUIRE(cudaDeviceSynchronize() == cudaSuccess); + + // Hand-seed a valid Robin Hood layout (identity hash => home(k) = k % capacity): + // slot 0: key 0 home 0, distance 0 + // slot 1: key 16 home 0, distance 1 (displaced past key 0) + // slot 2: key 2 home 2, distance 0 + // slots 3..15: empty + // This is exactly what inserting {0, 16, 2} would produce, so it satisfies the Robin Hood + // invariant. We only write the three occupied (and contiguous) slots; the rest stay empty. + // The ref is used purely to reach the storage pointer; the bulk queries below build their own. + auto const ref = map.ref(cuco::op::find); + std::vector const seed{value_type{0, 0}, value_type{16, 16}, value_type{2, 2}}; + REQUIRE(cudaMemcpy(ref.storage_ref().data(), + seed.data(), + seed.size() * sizeof(value_type), + cudaMemcpyHostToDevice) == cudaSuccess); + + // Probe keys chosen to exercise every lookup-termination rule: + // 0 : present, found immediately at its home (distance 0). + // 16 : present at distance 1 — found only if we do NOT early-exit at slot 0, where the resident + // distance (0) equals our probe step (0). Guards the strict `<` (vs `<=`) richer rule. + // 2 : present at its home. + // 32 : home 0, absent — terminates via the richer-resident early-exit at slot 2 (key 2 sits at + // distance 0 < our probe step 2), before reaching the empty slot 3. + // 1 : home 1, absent — also terminates via the early-exit at slot 2. + // 3 : home 3, absent — terminates on the empty slot 3. + std::vector const probe_keys{0, 16, 2, 32, 1, 3}; + std::vector const expected_contained{true, true, true, false, false, false}; + + thrust::device_vector const d_keys(probe_keys.begin(), probe_keys.end()); + + thrust::device_vector d_contained(probe_keys.size()); + map.contains(d_keys.begin(), d_keys.end(), d_contained.begin()); + for (std::size_t i = 0; i < probe_keys.size(); ++i) { + INFO("contains, probe key = " << probe_keys[i]); + REQUIRE(static_cast(d_contained[i]) == expected_contained[i]); + } + + // `find` must return the stored value for present keys and the empty-value sentinel otherwise. + thrust::device_vector d_values(probe_keys.size()); + map.find(d_keys.begin(), d_keys.end(), d_values.begin()); + std::vector const expected_values{0, 16, 2, -1, -1, -1}; + for (std::size_t i = 0; i < probe_keys.size(); ++i) { + INFO("find, probe key = " << probe_keys[i]); + REQUIRE(d_values[i] == expected_values[i]); + } +} From 12b8e57d54b06409bc43345e5125c51fdb94a1d6 Mon Sep 17 00:00:00 2001 From: aterenin Date: Tue, 2 Jun 2026 23:00:31 -0400 Subject: [PATCH 05/11] Add non-CG insert for Robin Hood. --- .../open_addressing_ref_impl.cuh | 66 +++++++++++++++++-- 1 file changed, 62 insertions(+), 4 deletions(-) diff --git a/include/cuco/detail/open_addressing/open_addressing_ref_impl.cuh b/include/cuco/detail/open_addressing/open_addressing_ref_impl.cuh index 5c0967ff9..e8e5ad931 100644 --- a/include/cuco/detail/open_addressing/open_addressing_ref_impl.cuh +++ b/include/cuco/detail/open_addressing/open_addressing_ref_impl.cuh @@ -27,6 +27,7 @@ #include #include +#include #include #include #include @@ -378,16 +379,19 @@ class open_addressing_ref_impl { static_assert(cg_size == 1, "Non-CG operation is incompatible with the current probing scheme"); - auto const val = this->heterogeneous_value(value); - auto const key = this->extract_key(val); + auto val = this->heterogeneous_value(value); + auto key = this->extract_key(val); auto probing_iter = probing_scheme_.template make_iterator(key, storage_ref_.extent()); - auto const init_idx = *probing_iter; + auto const init_idx = *probing_iter; + [[maybe_unused]] size_type probe_step = 0; while (true) { auto const bucket_slots = storage_ref_[*probing_iter]; + [[maybe_unused]] bool retry = false; + for (auto& slot_content : bucket_slots) { auto const eq_res = this->predicate_.template operator()( key, this->extract_key(slot_content)); @@ -407,11 +411,65 @@ class open_addressing_ref_impl return false; } } - case insert_result::CONTINUE: continue; + case insert_result::CONTINUE: { + // Retry on a lost CAS. Plain probing keeps scanning this (now stale) bucket; Robin + // Hood must re-read it instead, so the in-flight pair is re-evaluated against the new + // occupants -- otherwise it could be placed past a slot it should have displaced, + // breaking the invariant (and therefore lookups). + if constexpr (cuco::is_robin_hood_probing::value) { + retry = true; + break; + } else { + continue; + } + } case insert_result::SUCCESS: return true; } + if constexpr (cuco::is_robin_hood_probing::value) { + if (retry) { break; } // leave the scan to re-read the bucket + } + } + + // Robin Hood swap test. A resident "richer" than the in-flight pair (a smaller probe + // distance than our current probe step) is displaced: we swap our pair into its slot, + // adopt the evicted resident, and re-probe forward. + if constexpr (cuco::is_robin_hood_probing::value) { + if (eq_res == detail::equal_result::UNEQUAL) { + auto const intra_bucket_index = + cuda::std::distance(bucket_slots.begin(), &slot_content); + auto const evicted_age = probing_scheme_.template probe_distance( + this->extract_key(slot_content), + static_cast(*probing_iter + intra_bucket_index), + storage_ref_.extent()); + if (evicted_age < probe_step) { + if (this->attempt_insert( + this->get_slot_ptr(*probing_iter, intra_bucket_index), slot_content, val) == + insert_result::SUCCESS) { + // Adopt the evicted pair and re-probe THIS bucket -- its bucket distance here is + // `evicted_age`, and it may belong in another slot of the same bucket: an empty one, + // or one holding an even-richer resident it can displace in turn. Re-reading the + // bucket (rather than advancing past it) is the within-bucket linear probe, i.e. the + // combined bucket+slot distance that makes displacement correct for bucket_size > 1. + // The `slot_distance` term cancels in every comparison, so it never appears here; it + // shows up only as this slot-by-slot continuation. `bit_cast` keeps the adoption + // valid for heterogeneous insert types (layout-compatible by contract; identity in + // the common case). + val = cuda::std::bit_cast(slot_content); + key = this->extract_key(val); + probe_step = evicted_age; + } + retry = true; // re-read this bucket: re-probe with the victim, or re-evaluate a lost CAS + break; + } + } } } + + if constexpr (cuco::is_robin_hood_probing::value) { + if (retry) { continue; } // re-probe (re-read this bucket, or move on after displacement) + ++probe_step; + } + ++probing_iter; if (*probing_iter == init_idx) { return false; } } From 0cc3fbd4369289a99467b65911dc2210eeb65fc5 Mon Sep 17 00:00:00 2001 From: aterenin Date: Tue, 2 Jun 2026 23:07:07 -0400 Subject: [PATCH 06/11] Add CG insert for Robin Hood. --- .../open_addressing_ref_impl.cuh | 61 ++++++++++++++++++- 1 file changed, 58 insertions(+), 3 deletions(-) diff --git a/include/cuco/detail/open_addressing/open_addressing_ref_impl.cuh b/include/cuco/detail/open_addressing/open_addressing_ref_impl.cuh index e8e5ad931..c46db6cef 100644 --- a/include/cuco/detail/open_addressing/open_addressing_ref_impl.cuh +++ b/include/cuco/detail/open_addressing/open_addressing_ref_impl.cuh @@ -490,11 +490,12 @@ class open_addressing_ref_impl __device__ bool insert(cooperative_groups::thread_block_tile group, Value value) noexcept { - auto const val = this->heterogeneous_value(value); - auto const key = this->extract_key(val); + auto val = this->heterogeneous_value(value); + auto key = this->extract_key(val); auto probing_iter = probing_scheme_.template make_iterator(group, key, storage_ref_.extent()); - auto const init_idx = *probing_iter; + auto const init_idx = *probing_iter; + [[maybe_unused]] size_type probe_step = 0; while (true) { auto const bucket_slots = storage_ref_[*probing_iter]; @@ -554,6 +555,60 @@ class open_addressing_ref_impl default: continue; } } else { + if constexpr (cuco::is_robin_hood_probing::value) { + // Robin Hood displacement: no match, no empty slot in this bucket. Displace the first + // resident in probe (lane) order that is richer than the in-flight pair, adopt it, and + // re-probe THIS bucket -- the victim may belong in another slot of it. The within-bucket + // linear probe (combined bucket+slot distance) is identical to the scalar path; the + // `slot_distance` term cancels, so the test is again `resident distance < probe_step`. + cuda::std::int32_t displace_idx = -1; + size_type evicted_age = 0; + cuda::static_for([&] __device__(auto i) { + if (displace_idx < 0) { + auto const age = probing_scheme_.template probe_distance( + this->extract_key(bucket_slots[i()]), + static_cast(*probing_iter + i()), + storage_ref_.extent()); + if (age < probe_step) { + displace_idx = i(); + evicted_age = age; + } + } + }); + + auto const group_displaceable = group.ballot(displace_idx >= 0); + if (group_displaceable) { + auto const src_lane = __ffs(group_displaceable) - 1; + auto status = insert_result::CONTINUE; + // Only `src_lane` reads `evicted` meaningfully; other lanes just need a valid value to + // feed the broadcast `shfl` below, so seed it with the empty-slot sentinel. + value_type evicted = this->empty_slot_sentinel(); + if (group.thread_rank() == src_lane) { + evicted = bucket_slots[displace_idx]; + status = + attempt_insert(this->get_slot_ptr(*probing_iter, displace_idx), evicted, val); + } + if (group.shfl(status, src_lane) == insert_result::SUCCESS) { + // Broadcast the evicted pair and its probe distance from the winning lane, and adopt + // it on every lane (all lanes need the new in-flight pair for the next scan). + auto const new_key = group.shfl(this->extract_key(evicted), src_lane); + auto const new_age = group.shfl(evicted_age, src_lane); + value_type evicted_slot; + if constexpr (has_payload) { + auto const new_payload = group.shfl(this->extract_payload(evicted), src_lane); + evicted_slot = value_type{new_key, new_payload}; + } else { + evicted_slot = new_key; + } + val = cuda::std::bit_cast(evicted_slot); + key = this->extract_key(val); + probe_step = new_age; + } + continue; // success: re-probe this bucket with the victim; lost CAS: re-read it + } + // No displaceable resident: fall through to the shared advance below. + ++probe_step; + } ++probing_iter; if (*probing_iter == init_idx) { return false; } } From 2a921319fe544639f739f3b0070f918b5aaaf2b2 Mon Sep 17 00:00:00 2001 From: aterenin Date: Tue, 2 Jun 2026 23:38:48 -0400 Subject: [PATCH 07/11] Propagate Robin Hood insert to variants. --- .../open_addressing_ref_impl.cuh | 126 +++++++++++- .../cuco/detail/static_map/static_map_ref.inl | 183 ++++++++++++++++-- 2 files changed, 284 insertions(+), 25 deletions(-) diff --git a/include/cuco/detail/open_addressing/open_addressing_ref_impl.cuh b/include/cuco/detail/open_addressing/open_addressing_ref_impl.cuh index c46db6cef..7fc5daa59 100644 --- a/include/cuco/detail/open_addressing/open_addressing_ref_impl.cuh +++ b/include/cuco/detail/open_addressing/open_addressing_ref_impl.cuh @@ -641,15 +641,21 @@ class open_addressing_ref_impl "pre-Volta GPUs."); #endif - auto const val = this->heterogeneous_value(value); - auto const key = this->extract_key(val); + auto val = this->heterogeneous_value(value); + auto key = this->extract_key(val); auto probing_iter = probing_scheme_.template make_iterator(key, storage_ref_.extent()); - auto const init_idx = *probing_iter; + auto const init_idx = *probing_iter; + [[maybe_unused]] size_type probe_step = 0; + // Robin Hood may displace the original key before the chain ends; remember the slot it landed in + // so we return an iterator to it (not to a later victim's slot). + [[maybe_unused]] value_type* placed_ptr = nullptr; while (true) { auto const bucket_slots = storage_ref_[*probing_iter]; + [[maybe_unused]] bool retry = false; + for (auto i = 0; i < bucket_size; ++i) { auto const eq_res = this->predicate_.template operator()( key, this->extract_key(bucket_slots[i])); @@ -663,16 +669,58 @@ class open_addressing_ref_impl if (eq_res == detail::equal_result::AVAILABLE) { switch (this->attempt_insert_stable(slot_ptr, bucket_slots[i], val)) { case insert_result::SUCCESS: { - this->maybe_wait_for_payload(slot_ptr); - return {iterator{slot_ptr}, true}; + // The in-flight pair is placed in an empty slot, ending any displacement chain. The + // iterator to return is the original key's slot (captured on its first placement). + auto* result_ptr = slot_ptr; + if constexpr (cuco::is_robin_hood_probing::value) { + if (placed_ptr != nullptr) { result_ptr = placed_ptr; } + } + this->maybe_wait_for_payload(result_ptr); + return {iterator{result_ptr}, true}; } case insert_result::DUPLICATE: { this->maybe_wait_for_payload(slot_ptr); return {iterator{slot_ptr}, false}; } - default: continue; + case insert_result::CONTINUE: { + if constexpr (cuco::is_robin_hood_probing::value) { + retry = true; + break; + } else { + continue; + } + } + } + if constexpr (cuco::is_robin_hood_probing::value) { + if (retry) { break; } } } + + // Robin Hood swap test (see `insert` for the full rationale). + if constexpr (cuco::is_robin_hood_probing::value) { + if (eq_res == detail::equal_result::UNEQUAL) { + auto const evicted_age = probing_scheme_.template probe_distance( + this->extract_key(bucket_slots[i]), + static_cast(*probing_iter + i), + storage_ref_.extent()); + if (evicted_age < probe_step) { + if (this->attempt_insert(slot_ptr, bucket_slots[i], val) == + insert_result::SUCCESS) { + if (placed_ptr == nullptr) { placed_ptr = slot_ptr; } // original key's slot + val = cuda::std::bit_cast(bucket_slots[i]); + key = this->extract_key(val); + probe_step = evicted_age; + } + retry = true; + break; + } + } + } + } + + if constexpr (cuco::is_robin_hood_probing::value) { + if (retry) { continue; } + ++probe_step; } ++probing_iter; if (*probing_iter == init_idx) { return {this->end(), false}; } @@ -707,11 +755,15 @@ class open_addressing_ref_impl "pre-Volta GPUs."); #endif - auto const val = this->heterogeneous_value(value); - auto const key = this->extract_key(val); + auto val = this->heterogeneous_value(value); + auto key = this->extract_key(val); auto probing_iter = probing_scheme_.template make_iterator(group, key, storage_ref_.extent()); - auto const init_idx = *probing_iter; + auto const init_idx = *probing_iter; + [[maybe_unused]] size_type probe_step = 0; + // Robin Hood may displace the original key before the chain ends; remember (broadcast) the slot + // it first landed in so we return an iterator to it. 0 means "not yet placed". + [[maybe_unused]] intptr_t placed_ptr = 0; while (true) { auto const bucket_slots = storage_ref_[*probing_iter]; @@ -751,9 +803,15 @@ class open_addressing_ref_impl switch (group.shfl(status, src_lane)) { case insert_result::SUCCESS: { + // The in-flight pair is placed in an empty slot, ending any displacement chain. Return + // the original key's slot (the first placement) if it was displaced earlier. + auto result = res; + if constexpr (cuco::is_robin_hood_probing::value) { + if (placed_ptr != 0) { result = placed_ptr; } + } if (group.thread_rank() == src_lane) { this->maybe_wait_for_payload(slot_ptr); } group.sync(); - return {iterator{reinterpret_cast(res)}, true}; + return {iterator{reinterpret_cast(result)}, true}; } case insert_result::DUPLICATE: { if (group.thread_rank() == src_lane) { this->maybe_wait_for_payload(slot_ptr); } @@ -763,6 +821,54 @@ class open_addressing_ref_impl default: continue; } } else { + if constexpr (cuco::is_robin_hood_probing::value) { + // Robin Hood displacement (see CG `insert` for the full rationale). + cuda::std::int32_t displace_idx = -1; + size_type evicted_age = 0; + cuda::static_for([&] __device__(auto i) { + if (displace_idx < 0) { + auto const age = probing_scheme_.template probe_distance( + this->extract_key(bucket_slots[i()]), + static_cast(*probing_iter + i()), + storage_ref_.extent()); + if (age < probe_step) { + displace_idx = i(); + evicted_age = age; + } + } + }); + + auto const group_displaceable = group.ballot(displace_idx >= 0); + if (group_displaceable) { + auto const src_lane = __ffs(group_displaceable) - 1; + auto status = insert_result::CONTINUE; + value_type evicted = this->empty_slot_sentinel(); + intptr_t displaced = 0; + if (group.thread_rank() == src_lane) { + auto* dptr = this->get_slot_ptr(*probing_iter, displace_idx); + evicted = bucket_slots[displace_idx]; + status = attempt_insert(dptr, evicted, val); + displaced = reinterpret_cast(dptr); + } + if (group.shfl(status, src_lane) == insert_result::SUCCESS) { + if (placed_ptr == 0) { placed_ptr = group.shfl(displaced, src_lane); } + auto const new_key = group.shfl(this->extract_key(evicted), src_lane); + auto const new_age = group.shfl(evicted_age, src_lane); + value_type evicted_slot; + if constexpr (has_payload) { + auto const new_payload = group.shfl(this->extract_payload(evicted), src_lane); + evicted_slot = value_type{new_key, new_payload}; + } else { + evicted_slot = new_key; + } + val = cuda::std::bit_cast(evicted_slot); + key = this->extract_key(val); + probe_step = new_age; + } + continue; + } + ++probe_step; + } ++probing_iter; if (*probing_iter == init_idx) { return {this->end(), false}; } } diff --git a/include/cuco/detail/static_map/static_map_ref.inl b/include/cuco/detail/static_map/static_map_ref.inl index 7f8ff043b..315e5b3fb 100644 --- a/include/cuco/detail/static_map/static_map_ref.inl +++ b/include/cuco/detail/static_map/static_map_ref.inl @@ -495,6 +495,7 @@ class operator_impl< using key_type = typename base_type::key_type; using value_type = typename base_type::value_type; using mapped_type = T; + using size_type = typename base_type::size_type; static constexpr auto cg_size = base_type::cg_size; static constexpr auto bucket_size = base_type::bucket_size; @@ -515,17 +516,20 @@ class operator_impl< ref_type& ref_ = static_cast(*this); - auto const val = ref_.impl_.heterogeneous_value(value); - auto const key = ref_.impl_.extract_key(val); + auto val = ref_.impl_.heterogeneous_value(value); + auto key = ref_.impl_.extract_key(val); auto const probing_scheme = ref_.impl_.probing_scheme(); auto storage_ref = ref_.impl_.storage_ref(); auto probing_iter = probing_scheme.template make_iterator(key, storage_ref.extent()); - auto const init_idx = *probing_iter; + auto const init_idx = *probing_iter; + [[maybe_unused]] size_type probe_step = 0; while (true) { auto const bucket_slots = storage_ref[*probing_iter]; + [[maybe_unused]] bool retry = false; + for (auto& slot_content : bucket_slots) { auto const eq_res = ref_.impl_.predicate_.template operator()(key, slot_content.first); @@ -540,8 +544,37 @@ class operator_impl< } if (eq_res == detail::equal_result::AVAILABLE) { if (attempt_insert_or_assign(slot_ptr, val)) { return; } + if constexpr (cuco::is_robin_hood_probing::value) { + retry = true; + break; + } + } + + // Robin Hood swap test (see `open_addressing_ref_impl::insert` for the rationale). + if constexpr (cuco::is_robin_hood_probing::value) { + if (eq_res == detail::equal_result::UNEQUAL) { + auto const evicted_age = probing_scheme.template probe_distance( + ref_.impl_.extract_key(slot_content), + static_cast(*probing_iter + intra_bucket_index), + storage_ref.extent()); + if (evicted_age < probe_step) { + if (ref_.impl_.attempt_insert(slot_ptr, slot_content, val) == + detail::insert_result::SUCCESS) { + val = cuda::std::bit_cast(slot_content); + key = ref_.impl_.extract_key(val); + probe_step = evicted_age; + } + retry = true; + break; + } + } } } + + if constexpr (cuco::is_robin_hood_probing::value) { + if (retry) { continue; } + ++probe_step; + } ++probing_iter; if (*probing_iter == init_idx) { return; } } @@ -565,13 +598,14 @@ class operator_impl< { ref_type& ref_ = static_cast(*this); - auto const val = ref_.impl_.heterogeneous_value(value); - auto const key = ref_.impl_.extract_key(val); + auto val = ref_.impl_.heterogeneous_value(value); + auto key = ref_.impl_.extract_key(val); auto const probing_scheme = ref_.impl_.probing_scheme(); auto storage_ref = ref_.impl_.storage_ref(); auto probing_iter = probing_scheme.template make_iterator(group, key, storage_ref.extent()); - auto const init_idx = *probing_iter; + auto const init_idx = *probing_iter; + [[maybe_unused]] size_type probe_step = 0; while (true) { auto const bucket_slots = storage_ref[*probing_iter]; @@ -612,6 +646,45 @@ class operator_impl< // Exit if inserted or assigned if (group.shfl(status, src_lane)) { return; } } else { + if constexpr (cuco::is_robin_hood_probing::value) { + // Robin Hood displacement (see CG `open_addressing_ref_impl::insert`). + cuda::std::int32_t displace_idx = -1; + size_type evicted_age = 0; + cuda::static_for([&] __device__(auto i) { + if (displace_idx < 0) { + auto const age = probing_scheme.template probe_distance( + ref_.impl_.extract_key(bucket_slots[i()]), + static_cast(*probing_iter + i()), + storage_ref.extent()); + if (age < probe_step) { + displace_idx = i(); + evicted_age = age; + } + } + }); + + auto const group_displaceable = group.ballot(displace_idx >= 0); + if (group_displaceable) { + auto const src_lane = __ffs(group_displaceable) - 1; + auto status = detail::insert_result::CONTINUE; + value_type evicted = ref_.impl_.empty_slot_sentinel(); + if (group.thread_rank() == src_lane) { + evicted = bucket_slots[displace_idx]; + status = ref_.impl_.attempt_insert( + ref_.impl_.get_slot_ptr(*probing_iter, displace_idx), evicted, val); + } + if (group.shfl(status, src_lane) == detail::insert_result::SUCCESS) { + auto const new_key = group.shfl(ref_.impl_.extract_key(evicted), src_lane); + auto const new_payload = group.shfl(ref_.impl_.extract_payload(evicted), src_lane); + auto const new_age = group.shfl(evicted_age, src_lane); + val = cuda::std::bit_cast(value_type{new_key, new_payload}); + key = ref_.impl_.extract_key(val); + probe_step = new_age; + } + continue; + } + ++probe_step; + } ++probing_iter; if (*probing_iter == init_idx) { return; } } @@ -670,6 +743,7 @@ class operator_impl< using ref_type = static_map_ref; using key_type = typename base_type::key_type; using value_type = typename base_type::value_type; + using size_type = typename base_type::size_type; static constexpr auto cg_size = base_type::cg_size; static constexpr auto bucket_size = base_type::bucket_size; @@ -886,14 +960,15 @@ class operator_impl< { ref_type& ref_ = static_cast(*this); - auto const val = ref_.impl_.heterogeneous_value(value); - auto const key = ref_.impl_.extract_key(val); + auto val = ref_.impl_.heterogeneous_value(value); + auto key = ref_.impl_.extract_key(val); auto const probing_scheme = ref_.impl_.probing_scheme(); auto storage_ref = ref_.impl_.storage_ref(); auto probing_iter = probing_scheme.template make_iterator(key, storage_ref.extent()); - auto const init_idx = *probing_iter; - auto const empty_value = ref_.empty_value_sentinel(); + auto const init_idx = *probing_iter; + auto const empty_value = ref_.empty_value_sentinel(); + [[maybe_unused]] size_type probe_step = 0; // wait for payload only when init != sentinel and insert strategy is not `packed_cas` auto constexpr wait_for_payload = (not UseDirectApply) and (sizeof(value_type) > 8); @@ -901,6 +976,8 @@ class operator_impl< while (true) { auto const bucket_slots = storage_ref[*probing_iter]; + [[maybe_unused]] bool retry = false; + for (auto& slot_content : bucket_slots) { auto const eq_res = ref_.impl_.predicate_.template operator()(key, slot_content.first); @@ -928,9 +1005,45 @@ class operator_impl< op(cuda::atomic_ref{slot_ptr->second}, val.second); return false; } - default: continue; + case insert_result::CONTINUE: { + if constexpr (cuco::is_robin_hood_probing< + typename ref_type::probing_scheme_type>::value) { + retry = true; + break; + } else { + continue; + } + } + } + if constexpr (cuco::is_robin_hood_probing::value) { + if (retry) { break; } } } + + // Robin Hood swap test (see `open_addressing_ref_impl::insert` for the rationale). + if constexpr (cuco::is_robin_hood_probing::value) { + if (eq_res == detail::equal_result::UNEQUAL) { + auto const evicted_age = probing_scheme.template probe_distance( + ref_.impl_.extract_key(slot_content), + static_cast(*probing_iter + intra_bucket_index), + storage_ref.extent()); + if (evicted_age < probe_step) { + if (ref_.impl_.attempt_insert(slot_ptr, slot_content, val) == + detail::insert_result::SUCCESS) { + val = cuda::std::bit_cast(slot_content); + key = ref_.impl_.extract_key(val); + probe_step = evicted_age; + } + retry = true; + break; + } + } + } + } + + if constexpr (cuco::is_robin_hood_probing::value) { + if (retry) { continue; } + ++probe_step; } ++probing_iter; if (*probing_iter == init_idx) { return false; } @@ -964,14 +1077,15 @@ class operator_impl< { ref_type& ref_ = static_cast(*this); - auto const val = ref_.impl_.heterogeneous_value(value); - auto const key = ref_.impl_.extract_key(val); + auto val = ref_.impl_.heterogeneous_value(value); + auto key = ref_.impl_.extract_key(val); auto const probing_scheme = ref_.impl_.probing_scheme(); auto storage_ref = ref_.impl_.storage_ref(); auto probing_iter = probing_scheme.template make_iterator(group, key, storage_ref.extent()); - auto const init_idx = *probing_iter; - auto const empty_value = ref_.empty_value_sentinel(); + auto const init_idx = *probing_iter; + auto const empty_value = ref_.empty_value_sentinel(); + [[maybe_unused]] size_type probe_step = 0; // wait for payload only when init != sentinel and insert strategy is not `packed_cas` auto constexpr wait_for_payload = (not UseDirectApply) and (sizeof(value_type) > 8); @@ -1030,6 +1144,45 @@ class operator_impl< default: continue; } } else { + if constexpr (cuco::is_robin_hood_probing::value) { + // Robin Hood displacement (see CG `open_addressing_ref_impl::insert`). + cuda::std::int32_t displace_idx = -1; + size_type evicted_age = 0; + cuda::static_for([&] __device__(auto i) { + if (displace_idx < 0) { + auto const age = probing_scheme.template probe_distance( + ref_.impl_.extract_key(bucket_slots[i()]), + static_cast(*probing_iter + i()), + storage_ref.extent()); + if (age < probe_step) { + displace_idx = i(); + evicted_age = age; + } + } + }); + + auto const group_displaceable = group.ballot(displace_idx >= 0); + if (group_displaceable) { + auto const src_lane = __ffs(group_displaceable) - 1; + auto status = detail::insert_result::CONTINUE; + value_type evicted = ref_.impl_.empty_slot_sentinel(); + if (group.thread_rank() == src_lane) { + evicted = bucket_slots[displace_idx]; + status = ref_.impl_.attempt_insert( + ref_.impl_.get_slot_ptr(*probing_iter, displace_idx), evicted, val); + } + if (group.shfl(status, src_lane) == detail::insert_result::SUCCESS) { + auto const new_key = group.shfl(ref_.impl_.extract_key(evicted), src_lane); + auto const new_payload = group.shfl(ref_.impl_.extract_payload(evicted), src_lane); + auto const new_age = group.shfl(evicted_age, src_lane); + val = cuda::std::bit_cast(value_type{new_key, new_payload}); + key = ref_.impl_.extract_key(val); + probe_step = new_age; + } + continue; + } + ++probe_step; + } ++probing_iter; if (*probing_iter == init_idx) { return false; } } From c27f483efbb0aaa5b354ae79c65d9c199da5edf6 Mon Sep 17 00:00:00 2001 From: aterenin Date: Thu, 4 Jun 2026 12:11:56 -0400 Subject: [PATCH 08/11] Add initial tests and fix insert_or_assign for Robin Hood. --- .../cuco/detail/static_map/static_map_ref.inl | 137 +++++++++++++++--- tests/static_map/contains_test.cu | 27 +++- tests/static_map/find_test.cu | 38 ++++- tests/static_map/insert_and_find_test.cu | 31 +++- tests/static_map/insert_or_apply_test.cu | 76 +++++++++- tests/static_map/insert_or_assign_test.cu | 93 +++++++++++- tests/static_map/robin_hood_invariant.cuh | 111 ++++++++++++++ tests/test_utils.hpp | 2 +- 8 files changed, 466 insertions(+), 49 deletions(-) create mode 100644 tests/static_map/robin_hood_invariant.cuh diff --git a/include/cuco/detail/static_map/static_map_ref.inl b/include/cuco/detail/static_map/static_map_ref.inl index 315e5b3fb..df323db6d 100644 --- a/include/cuco/detail/static_map/static_map_ref.inl +++ b/include/cuco/detail/static_map/static_map_ref.inl @@ -538,15 +538,35 @@ class operator_impl< // If the key is already in the container, update the payload and return if (eq_res == detail::equal_result::EQUAL) { - cuda::atomic_ref payload_ref(slot_ptr->second); - payload_ref.store(val.second, cuda::memory_order_relaxed); - return; + if constexpr (cuco::is_robin_hood_probing::value) { + // Robin Hood may relocate this key; assign via a full-slot CAS that keeps the key and + // fails if it moved. On failure re-probe and retry -- the loop re-finds the key. + auto desired = slot_content; + desired.second = val.second; + if (ref_.impl_.attempt_insert(slot_ptr, slot_content, desired) == + detail::insert_result::SUCCESS) { + return; + } + retry = true; + break; + } else { + cuda::atomic_ref payload_ref(slot_ptr->second); + payload_ref.store(val.second, cuda::memory_order_relaxed); + return; + } } if (eq_res == detail::equal_result::AVAILABLE) { - if (attempt_insert_or_assign(slot_ptr, val)) { return; } if constexpr (cuco::is_robin_hood_probing::value) { + // Insert the new pair with a full-slot CAS; on a lost CAS (rival insert) or a duplicate, + // retry -- the loop re-finds the key and assigns it via the EQUAL full-slot CAS. + if (ref_.impl_.attempt_insert(slot_ptr, slot_content, val) == + detail::insert_result::SUCCESS) { + return; + } retry = true; break; + } else { + if (attempt_insert_or_assign(slot_ptr, val)) { return; } } } @@ -629,22 +649,48 @@ class operator_impl< auto const group_contains_equal = group.ballot(state == detail::equal_result::EQUAL); if (group_contains_equal) { auto const src_lane = __ffs(group_contains_equal) - 1; - if (group.thread_rank() == src_lane) { - cuda::atomic_ref payload_ref(slot_ptr->second); - payload_ref.store(val.second, cuda::memory_order_relaxed); + if constexpr (cuco::is_robin_hood_probing::value) { + // src_lane assigns via a full-slot CAS (key fixed); a relocation or rival update fails it, + // so the group re-probes and retries -- the loop re-finds the key. + auto const success = [&, target_idx = intra_bucket_index]() { + if (group.thread_rank() != src_lane) { return false; } + auto desired = bucket_slots[target_idx]; + desired.second = val.second; + return ref_.impl_.attempt_insert(slot_ptr, bucket_slots[target_idx], desired) == + detail::insert_result::SUCCESS; + }(); + if (group.shfl(success, src_lane)) { return; } + continue; + } else { + if (group.thread_rank() == src_lane) { + cuda::atomic_ref payload_ref(slot_ptr->second); + payload_ref.store(val.second, cuda::memory_order_relaxed); + } + group.sync(); + return; } - group.sync(); - return; } auto const group_contains_available = group.ballot(state == detail::equal_result::AVAILABLE); if (group_contains_available) { auto const src_lane = __ffs(group_contains_available) - 1; - auto const status = - (group.thread_rank() == src_lane) ? attempt_insert_or_assign(slot_ptr, val) : false; + if constexpr (cuco::is_robin_hood_probing::value) { + // Insert the new pair with a full-slot CAS; on a lost CAS or duplicate, re-probe and retry + // (the loop re-finds the key and assigns via the EQUAL full-slot CAS). + auto const success = [&, target_idx = intra_bucket_index]() { + if (group.thread_rank() != src_lane) { return false; } + return ref_.impl_.attempt_insert(slot_ptr, bucket_slots[target_idx], val) == + detail::insert_result::SUCCESS; + }(); + if (group.shfl(success, src_lane)) { return; } + continue; + } else { + auto const status = + (group.thread_rank() == src_lane) ? attempt_insert_or_assign(slot_ptr, val) : false; - // Exit if inserted or assigned - if (group.shfl(status, src_lane)) { return; } + // Exit if inserted or assigned + if (group.shfl(status, src_lane)) { return; } + } } else { if constexpr (cuco::is_robin_hood_probing::value) { // Robin Hood displacement (see CG `open_addressing_ref_impl::insert`). @@ -986,18 +1032,41 @@ class operator_impl< // If the key is already in the container, update the payload and return if (eq_res == detail::equal_result::EQUAL) { - // wait for payload only when performing insert operation - if constexpr (wait_for_payload) { - ref_.impl_.wait_for_payload(slot_ptr->second, empty_value); + if constexpr (cuco::is_robin_hood_probing< + typename ref_type::probing_scheme_type>::value) { + // Lift `op` to the whole slot, keeping the key, and CAS it. A relocation (or a rival + // update) makes the CAS fail; re-probe and retry -- the loop re-finds the key. + auto desired = slot_content; + // `desired` is a local copy, so this `op` is just local arithmetic -- the `atomic_ref`'s + // atomicity does nothing here and is used only because `Op`'s signature requires one. The + // real atomic is the full-slot CAS below. + op(cuda::atomic_ref{desired.second}, val.second); + if (ref_.impl_.attempt_insert(slot_ptr, slot_content, desired) == + detail::insert_result::SUCCESS) { + return false; + } + retry = true; + break; + } else { + // wait for payload only when performing insert operation + if constexpr (wait_for_payload) { + ref_.impl_.wait_for_payload(slot_ptr->second, empty_value); + } + op(cuda::atomic_ref{slot_ptr->second}, val.second); + return false; } - op(cuda::atomic_ref{slot_ptr->second}, val.second); - return false; } if (eq_res == detail::equal_result::AVAILABLE) { switch (ref_.template attempt_insert_or_apply( slot_ptr, slot_content, val, op)) { case insert_result::SUCCESS: return true; case insert_result::DUPLICATE: { + if constexpr (cuco::is_robin_hood_probing< + typename ref_type::probing_scheme_type>::value) { + // Key is present now; re-probe so it is found EQUAL and updated via the full-slot CAS. + retry = true; + break; + } // wait for payload only when performing insert operation if constexpr (wait_for_payload) { ref_.impl_.wait_for_payload(slot_ptr->second, empty_value); @@ -1112,13 +1181,31 @@ class operator_impl< auto const group_contains_equal = group.ballot(state == detail::equal_result::EQUAL); if (group_contains_equal) { auto const src_lane = __ffs(group_contains_equal) - 1; - if (group.thread_rank() == src_lane) { - if constexpr (wait_for_payload) { - ref_.impl_.wait_for_payload(slot_ptr->second, empty_value); + if constexpr (cuco::is_robin_hood_probing< + typename ref_type::probing_scheme_type>::value) { + // src_lane lifts `op` to the slot (key fixed) and CASes it; a relocation or rival update + // fails the CAS, so the group re-probes and retries -- the loop re-finds the key. + auto const success = [&, target_idx = intra_bucket_index]() { + if (group.thread_rank() != src_lane) { return false; } + auto desired = bucket_slots[target_idx]; + // `desired` is a local copy, so this `op` is just local arithmetic -- the `atomic_ref`'s + // atomicity does nothing here and is used only because `Op`'s signature requires one. The + // real atomic is the full-slot CAS below. + op(cuda::atomic_ref{desired.second}, val.second); + return ref_.impl_.attempt_insert(slot_ptr, bucket_slots[target_idx], desired) == + detail::insert_result::SUCCESS; + }(); + if (group.shfl(success, src_lane)) { return false; } + continue; + } else { + if (group.thread_rank() == src_lane) { + if constexpr (wait_for_payload) { + ref_.impl_.wait_for_payload(slot_ptr->second, empty_value); + } + op(cuda::atomic_ref{slot_ptr->second}, val.second); } - op(cuda::atomic_ref{slot_ptr->second}, val.second); + return false; } - return false; } auto const group_contains_available = group.ballot(state == detail::equal_result::AVAILABLE); @@ -1133,6 +1220,10 @@ class operator_impl< switch (group.shfl(status, src_lane)) { case insert_result::SUCCESS: return true; case insert_result::DUPLICATE: { + if constexpr (cuco::is_robin_hood_probing< + typename ref_type::probing_scheme_type>::value) { + continue; // key present now: re-probe, find it EQUAL, apply via the full-slot CAS + } if (group.thread_rank() == src_lane) { if constexpr (wait_for_payload) { ref_.impl_.wait_for_payload(slot_ptr->second, empty_value); diff --git a/tests/static_map/contains_test.cu b/tests/static_map/contains_test.cu index 0b3604528..9f2769a56 100644 --- a/tests/static_map/contains_test.cu +++ b/tests/static_map/contains_test.cu @@ -14,6 +14,7 @@ * limitations under the License. */ +#include #include #include @@ -79,6 +80,12 @@ void test_unique_sequence(Map& map, size_type num_keys) map.insert(pairs_begin, pairs_begin + num_keys); + // Robin Hood-specific: the populated table must satisfy the per-bucket layout invariant. + if constexpr (cuco::is_robin_hood_probing< + typename std::decay_t::probing_scheme_type>::value) { + cuco::test::check_robin_hood_invariant(map); + } + SECTION("All inserted keys should be contained.") { REQUIRE(map.count(keys_begin, keys_begin + num_keys) == num_keys); @@ -131,12 +138,19 @@ TEMPLATE_TEST_CASE_SIG( (int32_t, int32_t, cuco::test::probe_sequence::linear_probing, 1), (int32_t, int32_t, cuco::test::probe_sequence::linear_probing, 2), (int64_t, int64_t, cuco::test::probe_sequence::linear_probing, 1), - (int64_t, int64_t, cuco::test::probe_sequence::linear_probing, 2) + (int64_t, int64_t, cuco::test::probe_sequence::linear_probing, 2), + // Robin Hood mirrors the linear-probing rows. Only single-CAS (<= 8-byte) slots are + // unconditional; wider-slot RH displacement needs a packed atom.cas.b128 (gated below). + (int32_t, int32_t, cuco::test::probe_sequence::robin_hood, 1), + (int32_t, int32_t, cuco::test::probe_sequence::robin_hood, 2) #if defined(CUCO_HAS_128BIT_ATOMICS) , (__int128_t, __int128_t, cuco::test::probe_sequence::double_hashing, 2), (__int128_t, int64_t, cuco::test::probe_sequence::double_hashing, 1), - (int32_t, __int128_t, cuco::test::probe_sequence::linear_probing, 2) + (int32_t, __int128_t, cuco::test::probe_sequence::linear_probing, 2), + // Wider-slot Robin Hood rows: the packed displacement CAS needs atom.cas.b128. + (int64_t, int64_t, cuco::test::probe_sequence::robin_hood, 1), + (int64_t, int64_t, cuco::test::probe_sequence::robin_hood, 2) #endif ) { @@ -145,9 +159,12 @@ TEMPLATE_TEST_CASE_SIG( // XXX: testing static extent is intended, DO NOT CHANGE using extent_type = cuco::extent; using probe = std::conditional_t< - Probe == cuco::test::probe_sequence::linear_probing, - cuco::linear_probing>, - cuco::double_hashing, cuco::murmurhash3_32>>; + Probe == cuco::test::probe_sequence::double_hashing, + cuco::double_hashing, cuco::murmurhash3_32>, + std::conditional_t< + Probe == cuco::test::probe_sequence::robin_hood, + cuco::robin_hood_probing>>, + cuco::linear_probing>>>; auto map = cuco::static_map #include #include @@ -73,6 +74,12 @@ void test_unique_sequence(Map& map, size_type num_keys) map.insert(pairs_begin, pairs_begin + num_keys); + // Robin Hood-specific: the populated table must satisfy the per-bucket Robin Hood layout invariant + // (a no-op for linear/double hashing). + if constexpr (cuco::is_robin_hood_probing::value) { + cuco::test::check_robin_hood_invariant(map); + } + SECTION("All inserted keys should be correctly recovered during find") { map.find(keys_begin, keys_begin + num_keys, d_results.begin()); @@ -163,12 +170,30 @@ TEMPLATE_TEST_CASE_SIG( (int64_t, int32_t, cuco::test::probe_sequence::linear_probing, 1), (int64_t, int64_t, cuco::test::probe_sequence::linear_probing, 1), (int64_t, int32_t, cuco::test::probe_sequence::linear_probing, 2), - (int64_t, int64_t, cuco::test::probe_sequence::linear_probing, 2) + (int64_t, int64_t, cuco::test::probe_sequence::linear_probing, 2), + // Robin Hood mirrors the linear-probing rows. Only single-CAS (<= 8-byte) slots are + // unconditional: RH displacement swaps into an *occupied* slot, which requires a packed CAS of + // the whole slot. The wider-slot RH rows therefore live under CUCO_HAS_128BIT_ATOMICS below + // (LP/DH avoid this because they only ever CAS into empty slots via back-to-back CAS). + (int8_t, int8_t, cuco::test::probe_sequence::robin_hood, 1), + (int8_t, int8_t, cuco::test::probe_sequence::robin_hood, 2), + (int8_t, int16_t, cuco::test::probe_sequence::robin_hood, 2), + (int16_t, int16_t, cuco::test::probe_sequence::robin_hood, 1), + (int16_t, int16_t, cuco::test::probe_sequence::robin_hood, 2), + (int32_t, int32_t, cuco::test::probe_sequence::robin_hood, 1), + (int32_t, int32_t, cuco::test::probe_sequence::robin_hood, 2) #if defined(CUCO_HAS_128BIT_ATOMICS) , (__int128_t, __int128_t, cuco::test::probe_sequence::double_hashing, 2), (__int128_t, int64_t, cuco::test::probe_sequence::double_hashing, 1), - (int32_t, __int128_t, cuco::test::probe_sequence::linear_probing, 2) + (int32_t, __int128_t, cuco::test::probe_sequence::linear_probing, 2), + // Wider-slot Robin Hood rows: the packed displacement CAS needs atom.cas.b128. + (int32_t, int64_t, cuco::test::probe_sequence::robin_hood, 1), + (int32_t, int64_t, cuco::test::probe_sequence::robin_hood, 2), + (int64_t, int32_t, cuco::test::probe_sequence::robin_hood, 1), + (int64_t, int32_t, cuco::test::probe_sequence::robin_hood, 2), + (int64_t, int64_t, cuco::test::probe_sequence::robin_hood, 1), + (int64_t, int64_t, cuco::test::probe_sequence::robin_hood, 2) #endif ) { @@ -177,9 +202,12 @@ TEMPLATE_TEST_CASE_SIG( // XXX: testing static extent is intended, DO NOT CHANGE using extent_type = cuco::extent; using probe = std::conditional_t< - Probe == cuco::test::probe_sequence::linear_probing, - cuco::linear_probing>, - cuco::double_hashing, cuco::murmurhash3_32>>; + Probe == cuco::test::probe_sequence::double_hashing, + cuco::double_hashing, cuco::murmurhash3_32>, + std::conditional_t< + Probe == cuco::test::probe_sequence::robin_hood, + cuco::robin_hood_probing>>, + cuco::linear_probing>>>; constexpr size_type gold_capacity = [&]() { if constexpr (cuco::is_double_hashing::value) { diff --git a/tests/static_map/insert_and_find_test.cu b/tests/static_map/insert_and_find_test.cu index 17665eb2b..d380bf724 100644 --- a/tests/static_map/insert_and_find_test.cu +++ b/tests/static_map/insert_and_find_test.cu @@ -15,6 +15,7 @@ * limitations under the License. */ +#include #include #include @@ -52,12 +53,23 @@ TEMPLATE_TEST_CASE_SIG( (int64_t, int32_t, cuco::test::probe_sequence::linear_probing, 1), (int64_t, int64_t, cuco::test::probe_sequence::linear_probing, 1), (int64_t, int32_t, cuco::test::probe_sequence::linear_probing, 2), - (int64_t, int64_t, cuco::test::probe_sequence::linear_probing, 2) + (int64_t, int64_t, cuco::test::probe_sequence::linear_probing, 2), + // Robin Hood mirrors the linear-probing rows. Only single-CAS (<= 8-byte) slots are + // unconditional; wider-slot RH displacement needs a packed atom.cas.b128 (gated below). + (int32_t, int32_t, cuco::test::probe_sequence::robin_hood, 1), + (int32_t, int32_t, cuco::test::probe_sequence::robin_hood, 2) #if defined(CUCO_HAS_128BIT_ATOMICS) , (__int128_t, __int128_t, cuco::test::probe_sequence::double_hashing, 2), (__int128_t, int64_t, cuco::test::probe_sequence::double_hashing, 1), - (int32_t, __int128_t, cuco::test::probe_sequence::linear_probing, 2) + (int32_t, __int128_t, cuco::test::probe_sequence::linear_probing, 2), + // Wider-slot Robin Hood rows: the packed displacement CAS needs atom.cas.b128. + (int32_t, int64_t, cuco::test::probe_sequence::robin_hood, 1), + (int32_t, int64_t, cuco::test::probe_sequence::robin_hood, 2), + (int64_t, int32_t, cuco::test::probe_sequence::robin_hood, 1), + (int64_t, int32_t, cuco::test::probe_sequence::robin_hood, 2), + (int64_t, int64_t, cuco::test::probe_sequence::robin_hood, 1), + (int64_t, int64_t, cuco::test::probe_sequence::robin_hood, 2) #endif ) { @@ -66,9 +78,12 @@ TEMPLATE_TEST_CASE_SIG( #endif { using probe = std::conditional_t< - Probe == cuco::test::probe_sequence::linear_probing, - cuco::linear_probing>, - cuco::double_hashing, cuco::murmurhash3_32>>; + Probe == cuco::test::probe_sequence::double_hashing, + cuco::double_hashing, cuco::murmurhash3_32>, + std::conditional_t< + Probe == cuco::test::probe_sequence::robin_hood, + cuco::robin_hood_probing>>, + cuco::linear_probing>>>; constexpr size_type num_keys{400}; @@ -100,6 +115,12 @@ TEMPLATE_TEST_CASE_SIG( map.insert_and_find(pairs_begin, pairs_begin + num_keys, found2.begin(), inserted.begin()); REQUIRE(cuco::test::none_of(inserted.begin(), inserted.end(), cuda::std::identity{})); + // Robin Hood-specific: the populated table must satisfy the per-bucket layout invariant. + if constexpr (cuco::is_robin_hood_probing< + typename std::decay_t::probing_scheme_type>::value) { + cuco::test::check_robin_hood_invariant(map); + } + // both found1 and found2 should be same, as keys will be referring to same slot REQUIRE( cuco::test::equal(found1.begin(), found1.end(), found2.begin(), cuda::std::equal_to{})); diff --git a/tests/static_map/insert_or_apply_test.cu b/tests/static_map/insert_or_apply_test.cu index 0a4d07ea3..6bc151b17 100644 --- a/tests/static_map/insert_or_apply_test.cu +++ b/tests/static_map/insert_or_apply_test.cu @@ -14,6 +14,7 @@ * limitations under the License. */ +#include #include #include @@ -54,6 +55,12 @@ void test_insert_or_apply(Map& map, size_type num_keys, size_type num_unique_key map.insert_or_apply(pairs_begin, pairs_begin + num_keys, plus_op); } + // Robin Hood-specific: the populated table must satisfy the per-bucket layout invariant. + if constexpr (cuco::is_robin_hood_probing< + typename std::decay_t::probing_scheme_type>::value) { + cuco::test::check_robin_hood_invariant(map); + } + REQUIRE(map.size() == num_unique_keys); thrust::device_vector d_keys(num_unique_keys); @@ -161,12 +168,23 @@ TEMPLATE_TEST_CASE_SIG( (int64_t, int32_t, cuco::test::probe_sequence::linear_probing, 1), (int64_t, int64_t, cuco::test::probe_sequence::linear_probing, 1), (int64_t, int32_t, cuco::test::probe_sequence::linear_probing, 2), - (int64_t, int64_t, cuco::test::probe_sequence::linear_probing, 2) + (int64_t, int64_t, cuco::test::probe_sequence::linear_probing, 2), + // Robin Hood mirrors the linear-probing rows. Only single-CAS (<= 8-byte) slots are + // unconditional; wider-slot RH displacement needs a packed atom.cas.b128 (gated below). + (int32_t, int32_t, cuco::test::probe_sequence::robin_hood, 1), + (int32_t, int32_t, cuco::test::probe_sequence::robin_hood, 2) #if defined(CUCO_HAS_128BIT_ATOMICS) , (__int128_t, __int128_t, cuco::test::probe_sequence::double_hashing, 2), (__int128_t, int64_t, cuco::test::probe_sequence::double_hashing, 1), - (int32_t, __int128_t, cuco::test::probe_sequence::linear_probing, 2) + (int32_t, __int128_t, cuco::test::probe_sequence::linear_probing, 2), + // Wider-slot Robin Hood rows: the packed displacement CAS needs atom.cas.b128. + (int32_t, int64_t, cuco::test::probe_sequence::robin_hood, 1), + (int32_t, int64_t, cuco::test::probe_sequence::robin_hood, 2), + (int64_t, int32_t, cuco::test::probe_sequence::robin_hood, 1), + (int64_t, int32_t, cuco::test::probe_sequence::robin_hood, 2), + (int64_t, int64_t, cuco::test::probe_sequence::robin_hood, 1), + (int64_t, int64_t, cuco::test::probe_sequence::robin_hood, 2) #endif ) { @@ -174,9 +192,12 @@ TEMPLATE_TEST_CASE_SIG( constexpr size_type num_unique_keys{100}; using probe = std::conditional_t< - Probe == cuco::test::probe_sequence::linear_probing, - cuco::linear_probing>, - cuco::double_hashing, cuco::murmurhash3_32>>; + Probe == cuco::test::probe_sequence::double_hashing, + cuco::double_hashing, cuco::murmurhash3_32>, + std::conditional_t< + Probe == cuco::test::probe_sequence::robin_hood, + cuco::robin_hood_probing>>, + cuco::linear_probing>>>; using map_type = cuco::static_map(map, num_keys, num_unique_keys, static_cast(0)); } } + +// Dedicated Robin Hood coverage for insert_or_apply: the probe-enum test above is disabled upstream, +// so this is the only active exercise of the displacing RH insert_or_apply (reduction + lock-free +// displacement together). It runs at a high load factor (~0.95 on the unique keys) so displacement +// actually fires, and reuses `test_insert_or_apply`, whose tail asserts the structural RH invariant. +TEMPLATE_TEST_CASE_SIG("static_map robin_hood insert_or_apply (high load)", + "", + ((typename Key, typename Value, int CGSize), Key, Value, CGSize), + (int32_t, int32_t, 1), + (int32_t, int32_t, 2) +#if defined(CUCO_HAS_128BIT_ATOMICS) + , + (int64_t, int64_t, 1), + (int64_t, int64_t, 2) +#endif +) +{ + constexpr size_type num_unique_keys = 5'000; + constexpr size_type num_keys = 10'000; // each unique key inserted twice + + using probe = cuco::robin_hood_probing>>; + using map_type = cuco::static_map, + cuda::thread_scope_device, + cuda::std::equal_to, + probe, + cuco::cuda_allocator, + cuco::storage<2>>; + + // Size the table for ~0.95 load on the unique keys, so it is nearly full and the displacing insert + // path (and the structural invariant check inside the helper) is genuinely stressed. + constexpr size_type capacity = static_cast(num_unique_keys / 0.95); + + SECTION("sentinel equals init; has_init = true") + { + auto map = map_type{capacity, cuco::empty_key{-1}, cuco::empty_value{0}}; + test_insert_or_apply(map, num_keys, num_unique_keys, static_cast(0)); + } + SECTION("sentinel equals init; has_init = false") + { + auto map = map_type{capacity, cuco::empty_key{-1}, cuco::empty_value{0}}; + test_insert_or_apply(map, num_keys, num_unique_keys, static_cast(0)); + } +} diff --git a/tests/static_map/insert_or_assign_test.cu b/tests/static_map/insert_or_assign_test.cu index 2cccd62ad..067224e62 100644 --- a/tests/static_map/insert_or_assign_test.cu +++ b/tests/static_map/insert_or_assign_test.cu @@ -14,6 +14,7 @@ * limitations under the License. */ +#include #include #include @@ -21,9 +22,11 @@ #include #include +#include #include #include #include +#include #include #include @@ -53,6 +56,12 @@ void test_insert_or_assign(Map& map, size_type num_keys) map.insert_or_assign(query_pairs_begin, query_pairs_begin + num_keys); + // Robin Hood-specific: the populated table must satisfy the per-bucket layout invariant. + if constexpr (cuco::is_robin_hood_probing< + typename std::decay_t::probing_scheme_type>::value) { + cuco::test::check_robin_hood_invariant(map); + } + auto const updated_size = map.size(); // all keys are present in the map so the size shouldn't change REQUIRE(updated_size == initial_size); @@ -93,21 +102,35 @@ TEMPLATE_TEST_CASE_SIG( (int64_t, int32_t, cuco::test::probe_sequence::linear_probing, 1), (int64_t, int64_t, cuco::test::probe_sequence::linear_probing, 1), (int64_t, int32_t, cuco::test::probe_sequence::linear_probing, 2), - (int64_t, int64_t, cuco::test::probe_sequence::linear_probing, 2) + (int64_t, int64_t, cuco::test::probe_sequence::linear_probing, 2), + // Robin Hood mirrors the linear-probing rows. Only single-CAS (<= 8-byte) slots are + // unconditional; wider-slot RH displacement needs a packed atom.cas.b128 (gated below). + (int32_t, int32_t, cuco::test::probe_sequence::robin_hood, 1), + (int32_t, int32_t, cuco::test::probe_sequence::robin_hood, 2) #if defined(CUCO_HAS_128BIT_ATOMICS) , (__int128_t, __int128_t, cuco::test::probe_sequence::double_hashing, 2), (__int128_t, int64_t, cuco::test::probe_sequence::double_hashing, 1), - (int32_t, __int128_t, cuco::test::probe_sequence::linear_probing, 2) + (int32_t, __int128_t, cuco::test::probe_sequence::linear_probing, 2), + // Wider-slot Robin Hood rows: the packed displacement CAS needs atom.cas.b128. + (int32_t, int64_t, cuco::test::probe_sequence::robin_hood, 1), + (int32_t, int64_t, cuco::test::probe_sequence::robin_hood, 2), + (int64_t, int32_t, cuco::test::probe_sequence::robin_hood, 1), + (int64_t, int32_t, cuco::test::probe_sequence::robin_hood, 2), + (int64_t, int64_t, cuco::test::probe_sequence::robin_hood, 1), + (int64_t, int64_t, cuco::test::probe_sequence::robin_hood, 2) #endif ) { constexpr size_type num_keys{400}; using probe = std::conditional_t< - Probe == cuco::test::probe_sequence::linear_probing, - cuco::linear_probing>, - cuco::double_hashing, cuco::murmurhash3_32>>; + Probe == cuco::test::probe_sequence::double_hashing, + cuco::double_hashing, cuco::murmurhash3_32>, + std::conditional_t< + Probe == cuco::test::probe_sequence::robin_hood, + cuco::robin_hood_probing>>, + cuco::linear_probing>>>; auto map = cuco::static_mapkey value swap from a misplaced assign). +TEMPLATE_TEST_CASE_SIG("static_map robin_hood insert_or_assign (high load)", + "", + ((typename Key, typename Value, int CGSize), Key, Value, CGSize), + (int32_t, int32_t, 1), + (int32_t, int32_t, 2) +#if defined(CUCO_HAS_128BIT_ATOMICS) + , + (int64_t, int64_t, 1), + (int64_t, int64_t, 2) +#endif +) +{ + constexpr size_type num_unique_keys = 5'000; + constexpr size_type num_keys = 10'000; // each unique key assigned twice (same value) + + using probe = cuco::robin_hood_probing>>; + using map_type = cuco::static_map, + cuda::thread_scope_device, + cuda::std::equal_to, + probe, + cuco::cuda_allocator, + cuco::storage<2>>; + + // ~0.95 load on the unique keys, so the table is nearly full and displacement is stressed. + constexpr size_type capacity = static_cast(num_unique_keys / 0.95); + auto map = map_type{capacity, cuco::empty_key{-1}, cuco::empty_value{-1}}; + + // Every occurrence of key k carries the same value (k * 2). + auto pairs_begin = cuda::make_transform_iterator( + cuda::counting_iterator(0), + cuda::proclaim_return_type>([num_unique_keys] __device__(auto i) { + auto const k = static_cast(i % num_unique_keys); + return cuco::pair{k, static_cast(k * 2)}; + })); + + map.insert_or_assign(pairs_begin, pairs_begin + num_keys); + + REQUIRE(map.size() == num_unique_keys); + cuco::test::check_robin_hood_invariant(map); + + thrust::device_vector d_keys(num_unique_keys); + thrust::device_vector d_values(num_unique_keys); + map.retrieve_all(d_keys.begin(), d_values.begin()); + + auto const zip = thrust::make_zip_iterator(cuda::std::tuple{d_keys.begin(), d_values.begin()}); + REQUIRE(cuco::test::all_of( + zip, zip + num_unique_keys, cuda::proclaim_return_type([] __device__(auto const& p) { + return cuda::std::get<1>(p) == static_cast(cuda::std::get<0>(p) * 2); + }))); +} diff --git a/tests/static_map/robin_hood_invariant.cuh b/tests/static_map/robin_hood_invariant.cuh new file mode 100644 index 000000000..5d44be7e6 --- /dev/null +++ b/tests/static_map/robin_hood_invariant.cuh @@ -0,0 +1,111 @@ +/* + * Copyright (c) 2026, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include +#include + +#include + +#include + +namespace cuco { +namespace test { + +// Per-probe-step Robin Hood layout check. The unit is the *stride group* of `cg_size * bucket_size` +// contiguous slots that one probing step examines -- a single bucket for scalar probing, the whole +// cooperative-group window for CG probing. Within a stride group the slot order is free (the probe +// step distance is identical for every slot in it, so the intra-group offset cancels in all +// comparisons), so the invariant is only meaningful *between* groups. For each occupied group `g` +// (with predecessor `pg`), the resident probe-step distances ("ages") must satisfy: +// +// (1) Contiguity. If `g` holds any overflowed resident (distance >= 1), `pg` must be full -- +// otherwise that resident would have stopped in `pg`'s free slot instead of probing past it. +// (2) Balance. No resident of `pg` may be more than one probing step *richer* than the poorest +// resident of `g` (`min_age(pg) >= max_age(g) - 1`) -- otherwise the poorest resident of `g` +// should have displaced it. This is the property that distinguishes Robin Hood from plain +// linear probing, and (via condition 1) it inductively forces the whole home-to-position run +// to be full. +// +// `probe_distance` is reused here -- it is exercised independently by the utility probe-distance +// test, so a bug in *insert* (a layout that violates the invariant) is still caught. +template +__global__ void robin_hood_invariant_kernel(Ref ref, int* violations) +{ + using size_type = typename Ref::size_type; + constexpr int bs = Ref::bucket_size; + constexpr int stride = Ref::cg_size * Ref::bucket_size; + auto const storage_ref = ref.storage_ref(); + auto const slots = storage_ref.data(); + auto const num_groups = storage_ref.capacity() / stride; + auto const extent = storage_ref.extent(); + auto const empty_key = ref.empty_key_sentinel(); + auto const scheme = ref.probing_scheme(); + + for (size_type g = blockIdx.x * blockDim.x + threadIdx.x; g < num_groups; + g += gridDim.x * blockDim.x) { + int occupied_g = 0; + size_type max_age_g = 0; + for (int s = 0; s < stride; ++s) { + auto const slot = slots[g * stride + s]; + if (slot.first != empty_key) { + ++occupied_g; + auto const age = scheme.template probe_distance( + slot.first, static_cast(g * stride + s), extent); + if (age > max_age_g) { max_age_g = age; } + } + } + if (occupied_g == 0) { continue; } + + size_type const pg = (g + num_groups - 1) % num_groups; + int occupied_p = 0; + size_type min_age_p = 0; + for (int s = 0; s < stride; ++s) { + auto const slot = slots[pg * stride + s]; + if (slot.first != empty_key) { + auto const age = scheme.template probe_distance( + slot.first, static_cast(pg * stride + s), extent); + if (occupied_p == 0 || age < min_age_p) { min_age_p = age; } + ++occupied_p; + } + } + + if (max_age_g >= 1 && occupied_p < stride) { atomicAdd(violations, 1); } // (1) + if (occupied_p > 0 && min_age_p + 1 < max_age_g) { atomicAdd(violations, 1); } // (2) + } +} + +// Asserts that a populated Robin Hood `map` satisfies the per-bucket layout invariant above. No-op +// to call only on Robin Hood maps -- `probe_distance` exists only on `robin_hood_probing`, so guard +// the call site with `cuco::is_robin_hood_probing<...>`. +template +void check_robin_hood_invariant(Map& map) +{ + auto const ref = map.ref(cuco::op::find); + + thrust::device_vector d_violations(1, 0); + auto constexpr block_size = 128; + auto const grid_size = (map.capacity() + block_size - 1) / block_size; + robin_hood_invariant_kernel<<>>( + ref, thrust::raw_pointer_cast(d_violations.data())); + CUCO_CUDA_TRY(cudaDeviceSynchronize()); + + REQUIRE(d_violations[0] == 0); +} + +} // namespace test +} // namespace cuco diff --git a/tests/test_utils.hpp b/tests/test_utils.hpp index 2c77eda36..d71336f8b 100644 --- a/tests/test_utils.hpp +++ b/tests/test_utils.hpp @@ -33,7 +33,7 @@ namespace cg = cooperative_groups; constexpr int32_t block_size = 128; -enum class probe_sequence { linear_probing, double_hashing }; +enum class probe_sequence { linear_probing, double_hashing, robin_hood }; // User-defined logical algorithms to reduce compilation time template From d2341e53ce74f6c60a20a1723a59e8f7aac17f6f Mon Sep 17 00:00:00 2001 From: aterenin Date: Fri, 5 Jun 2026 15:53:47 -0400 Subject: [PATCH 09/11] Add Robin Hood erase and tests. --- .../open_addressing_ref_impl.cuh | 196 +++++++++++--- .../cuco/detail/static_map/static_map_ref.inl | 84 ++++-- tests/static_map/erase_test.cu | 254 +++++++++++++++++- tests/static_map/robin_hood_invariant.cuh | 16 +- 4 files changed, 474 insertions(+), 76 deletions(-) diff --git a/include/cuco/detail/open_addressing/open_addressing_ref_impl.cuh b/include/cuco/detail/open_addressing/open_addressing_ref_impl.cuh index 7fc5daa59..e64efb43d 100644 --- a/include/cuco/detail/open_addressing/open_addressing_ref_impl.cuh +++ b/include/cuco/detail/open_addressing/open_addressing_ref_impl.cuh @@ -400,7 +400,13 @@ class open_addressing_ref_impl // If the key is already in the container, return false if (eq_res == detail::equal_result::EQUAL) { return false; } } - if (eq_res == detail::equal_result::AVAILABLE) { + // Robin Hood claims only a true empty here; a tombstone carries an age and is handled as a + // resident by the displacement test below. Skipping it must gate the CAS (once claimed it is + // already consumed), so it is folded into this condition. For non-Robin-Hood the second + // clause is a compile-time `false`, leaving the original `eq_res == AVAILABLE`. + if (eq_res == detail::equal_result::AVAILABLE and + not(cuco::is_robin_hood_probing::value and + this->is_erased(slot_content))) { auto const intra_bucket_index = cuda::std::distance(bucket_slots.begin(), &slot_content); switch (attempt_insert( this->get_slot_ptr(*probing_iter, intra_bucket_index), slot_content, val)) { @@ -431,20 +437,22 @@ class open_addressing_ref_impl } // Robin Hood swap test. A resident "richer" than the in-flight pair (a smaller probe - // distance than our current probe step) is displaced: we swap our pair into its slot, - // adopt the evicted resident, and re-probe forward. + // distance than our current probe step) is displaced: we swap our pair into its slot, adopt + // the evicted resident, and re-probe forward. A tombstone is treated as a resident too -- + // its age comes from its payload (`robin_hood_age`) -- but picking one up *consumes* it: we + // take the slot and are done, since there is nothing to carry forward. if constexpr (cuco::is_robin_hood_probing::value) { - if (eq_res == detail::equal_result::UNEQUAL) { + if (eq_res == detail::equal_result::UNEQUAL or this->is_erased(slot_content)) { auto const intra_bucket_index = cuda::std::distance(bucket_slots.begin(), &slot_content); - auto const evicted_age = probing_scheme_.template probe_distance( - this->extract_key(slot_content), - static_cast(*probing_iter + intra_bucket_index), - storage_ref_.extent()); + auto const evicted_age = this->robin_hood_age( + slot_content, static_cast(*probing_iter + intra_bucket_index)); if (evicted_age < probe_step) { if (this->attempt_insert( this->get_slot_ptr(*probing_iter, intra_bucket_index), slot_content, val) == insert_result::SUCCESS) { + // Consuming a tombstone reuses its freed slot -- nothing to carry, so we are done. + if (this->is_erased(slot_content)) { return true; } // Adopt the evicted pair and re-probe THIS bucket -- its bucket distance here is // `evicted_age`, and it may belong in another slot of the same bucket: an empty one, // or one holding an even-richer resident it can displace in turn. Re-reading the @@ -506,9 +514,18 @@ class open_addressing_ref_impl if (result.state_ == detail::equal_result::UNEQUAL) { switch (this->predicate_.template operator()( key, this->extract_key(bucket_slots[i()]))) { - case detail::equal_result::AVAILABLE: - result = bucket_probing_results{detail::equal_result::AVAILABLE, i()}; + case detail::equal_result::AVAILABLE: { + // Robin Hood: only a true empty is AVAILABLE; a tombstone is a resident handled by + // the displacement scan below, so leave it UNEQUAL here. + bool empty_slot = true; + if constexpr (cuco::is_robin_hood_probing::value) { + empty_slot = not this->is_erased(bucket_slots[i()]); + } + if (empty_slot) { + result = bucket_probing_results{detail::equal_result::AVAILABLE, i()}; + } break; + } case detail::equal_result::EQUAL: { if constexpr (!allows_duplicates) { result = bucket_probing_results{detail::equal_result::EQUAL, i()}; @@ -565,10 +582,10 @@ class open_addressing_ref_impl size_type evicted_age = 0; cuda::static_for([&] __device__(auto i) { if (displace_idx < 0) { - auto const age = probing_scheme_.template probe_distance( - this->extract_key(bucket_slots[i()]), - static_cast(*probing_iter + i()), - storage_ref_.extent()); + // `robin_hood_age` so a tombstone uses its payload-stored age: it is displaced (i.e. + // consumed) exactly when richer than the in-flight pair, like any other resident. + auto const age = + this->robin_hood_age(bucket_slots[i()], static_cast(*probing_iter + i())); if (age < probe_step) { displace_idx = i(); evicted_age = age; @@ -589,6 +606,8 @@ class open_addressing_ref_impl attempt_insert(this->get_slot_ptr(*probing_iter, displace_idx), evicted, val); } if (group.shfl(status, src_lane) == insert_result::SUCCESS) { + // Consuming a tombstone reuses its freed slot -- nothing to carry, so we are done. + if (group.shfl(this->is_erased(evicted), src_lane)) { return true; } // Broadcast the evicted pair and its probe distance from the winning lane, and adopt // it on every lane (all lanes need the new in-flight pair for the next scan). auto const new_key = group.shfl(this->extract_key(evicted), src_lane); @@ -666,7 +685,11 @@ class open_addressing_ref_impl this->maybe_wait_for_payload(slot_ptr); return {iterator{slot_ptr}, false}; } - if (eq_res == detail::equal_result::AVAILABLE) { + // Robin Hood claims only a true empty here; a tombstone is handled as a resident by the + // displacement test below (see `insert`). + if (eq_res == detail::equal_result::AVAILABLE and + not(cuco::is_robin_hood_probing::value and + this->is_erased(bucket_slots[i]))) { switch (this->attempt_insert_stable(slot_ptr, bucket_slots[i], val)) { case insert_result::SUCCESS: { // The in-flight pair is placed in an empty slot, ending any displacement chain. The @@ -696,16 +719,23 @@ class open_addressing_ref_impl } } - // Robin Hood swap test (see `insert` for the full rationale). + // Robin Hood swap test (see `insert` for the full rationale). A tombstone is a resident too + // (age from its payload); picking one up consumes it -- the in-flight pair lands there and we + // are done. if constexpr (cuco::is_robin_hood_probing::value) { - if (eq_res == detail::equal_result::UNEQUAL) { - auto const evicted_age = probing_scheme_.template probe_distance( - this->extract_key(bucket_slots[i]), - static_cast(*probing_iter + i), - storage_ref_.extent()); + if (eq_res == detail::equal_result::UNEQUAL or this->is_erased(bucket_slots[i])) { + auto const evicted_age = + this->robin_hood_age(bucket_slots[i], static_cast(*probing_iter + i)); if (evicted_age < probe_step) { if (this->attempt_insert(slot_ptr, bucket_slots[i], val) == insert_result::SUCCESS) { + if (this->is_erased(bucket_slots[i])) { + // Consumed a tombstone: the in-flight pair is placed here; return the original + // key's slot (this one if it was never displaced). + auto* result_ptr = (placed_ptr != nullptr) ? placed_ptr : slot_ptr; + this->maybe_wait_for_payload(result_ptr); + return {iterator{result_ptr}, true}; + } if (placed_ptr == nullptr) { placed_ptr = slot_ptr; } // original key's slot val = cuda::std::bit_cast(bucket_slots[i]); key = this->extract_key(val); @@ -774,6 +804,13 @@ class open_addressing_ref_impl if (result.state_ == detail::equal_result::UNEQUAL) { auto res = this->predicate_.template operator()( key, this->extract_key(bucket_slots[i()])); + // Robin Hood: a tombstone is a resident handled by the displacement scan below, not + // AVAILABLE, so leave it UNEQUAL here. + if constexpr (cuco::is_robin_hood_probing::value) { + if (res == detail::equal_result::AVAILABLE and this->is_erased(bucket_slots[i()])) { + res = detail::equal_result::UNEQUAL; + } + } if (res != detail::equal_result::UNEQUAL) { result = bucket_probing_results{res, i()}; } } }); @@ -827,10 +864,10 @@ class open_addressing_ref_impl size_type evicted_age = 0; cuda::static_for([&] __device__(auto i) { if (displace_idx < 0) { - auto const age = probing_scheme_.template probe_distance( - this->extract_key(bucket_slots[i()]), - static_cast(*probing_iter + i()), - storage_ref_.extent()); + // `robin_hood_age` so a tombstone uses its payload-stored age: it is displaced (i.e. + // consumed) exactly when richer than the in-flight pair, like any other resident. + auto const age = + this->robin_hood_age(bucket_slots[i()], static_cast(*probing_iter + i())); if (age < probe_step) { displace_idx = i(); evicted_age = age; @@ -852,6 +889,15 @@ class open_addressing_ref_impl } if (group.shfl(status, src_lane) == insert_result::SUCCESS) { if (placed_ptr == 0) { placed_ptr = group.shfl(displaced, src_lane); } + // Consumed a tombstone: the in-flight pair is placed in its slot; we are done. Return + // the original key's slot (`placed_ptr`, which is this slot if it was never displaced). + if (group.shfl(this->is_erased(evicted), src_lane)) { + if (group.thread_rank() == src_lane) { + this->maybe_wait_for_payload(reinterpret_cast(displaced)); + } + group.sync(); + return {iterator{reinterpret_cast(placed_ptr)}, true}; + } auto const new_key = group.shfl(this->extract_key(evicted), src_lane); auto const new_age = group.shfl(evicted_age, src_lane); value_type evicted_slot; @@ -905,9 +951,15 @@ class open_addressing_ref_impl // Key exists, return true if successfully deleted if (eq_res == detail::equal_result::EQUAL) { auto const intra_bucket_index = cuda::std::distance(bucket_slots.begin(), &slot_content); - switch (attempt_insert_stable(this->get_slot_ptr(*probing_iter, intra_bucket_index), - slot_content, - this->erased_slot_sentinel())) { + // Robin Hood records the erased key's age in the tombstone payload (1a); other schemes use + // the plain erased sentinel. + value_type erased = this->erased_slot_sentinel(); + if constexpr (cuco::is_robin_hood_probing::value) { + erased = this->robin_hood_erased_sentinel( + slot_content, static_cast(*probing_iter + intra_bucket_index)); + } + switch (attempt_insert_stable( + this->get_slot_ptr(*probing_iter, intra_bucket_index), slot_content, erased)) { case insert_result::SUCCESS: return true; case insert_result::DUPLICATE: return false; default: continue; @@ -956,12 +1008,19 @@ class open_addressing_ref_impl auto const group_contains_equal = group.ballot(state == detail::equal_result::EQUAL); if (group_contains_equal) { auto const src_lane = __ffs(group_contains_equal) - 1; - auto const status = - (group.thread_rank() == src_lane) - ? attempt_insert_stable(this->get_slot_ptr(*probing_iter, intra_bucket_index), - bucket_slots[intra_bucket_index], - this->erased_slot_sentinel()) - : insert_result::CONTINUE; + auto status = insert_result::CONTINUE; + if (group.thread_rank() == src_lane) { + // Robin Hood records the erased key's age in the tombstone payload (1a); other schemes use + // the plain erased sentinel. + value_type erased = this->erased_slot_sentinel(); + if constexpr (cuco::is_robin_hood_probing::value) { + erased = this->robin_hood_erased_sentinel( + bucket_slots[intra_bucket_index], + static_cast(*probing_iter + intra_bucket_index)); + } + status = attempt_insert_stable( + this->get_slot_ptr(*probing_iter, intra_bucket_index), bucket_slots[intra_bucket_index], erased); + } switch (group.shfl(status, src_lane)) { case insert_result::SUCCESS: return true; @@ -1871,15 +1930,76 @@ class open_addressing_ref_impl { bool richer = false; cuda::static_for([&](auto i) { - auto const resident_age = probing_scheme_.template probe_distance( - this->extract_key(bucket_slots[i()]), - static_cast(bucket_base + i()), - storage_ref_.extent()); + auto const resident_age = + this->robin_hood_age(bucket_slots[i()], static_cast(bucket_base + i())); if (resident_age < probe_step) { richer = true; } }); return richer; } + /** + * @brief Whether `slot` holds a tombstone (erased marker). + * + * @note Returns false when erase is disabled (the erased and empty sentinels coincide, so no slot + * is a tombstone) -- this keeps the test correct even for empty slots. + * + * @param slot The slot to test + * + * @return True if `slot` is an erased tombstone + */ + [[nodiscard]] __device__ bool is_erased(value_type const& slot) const noexcept + { + return not cuco::detail::bitwise_compare(this->erased_key_sentinel(), + this->empty_key_sentinel()) and + cuco::detail::bitwise_compare(this->extract_key(slot), this->erased_key_sentinel()); + } + + /** + * @brief Robin Hood probe distance ("age") of an occupied slot. + * + * A live key's age is its `probe_distance`. A Robin Hood tombstone keeps the age of the key it + * replaced in its payload (the original key is gone and cannot be rehashed; see `erase`), so it + * is read back here -- a tombstone then participates in every Robin Hood comparison exactly like + * the resident it stood in for. + * + * @param slot The (occupied) slot + * @param slot_index The slot's index + * + * @return The slot's probe distance + */ + [[nodiscard]] __device__ size_type robin_hood_age(value_type const& slot, + size_type slot_index) const noexcept + { + if constexpr (has_payload) { + if (this->is_erased(slot)) { return static_cast(this->extract_payload(slot)); } + } + return probing_scheme_.template probe_distance( + this->extract_key(slot), slot_index, storage_ref_.extent()); + } + + /** + * @brief The Robin Hood tombstone for erasing the live key currently in `slot` at `slot_index`. + * + * The erased key's age is stashed in the payload (1a) so the tombstone keeps its place in the + * Robin Hood ordering (the original key is gone and cannot be rehashed). Other probing schemes use + * the plain `erased_slot_sentinel()` and never call this. + * + * @param slot The slot's current (live) contents + * @param slot_index The slot's index + * + * @return The value to CAS into the slot to erase it + */ + [[nodiscard]] __device__ value_type robin_hood_erased_sentinel(value_type const& slot, + size_type slot_index) const noexcept + { + static_assert(has_payload, + "Robin Hood erase requires a mapped payload to store the tombstone age"); + auto const age = probing_scheme_.template probe_distance( + this->extract_key(slot), slot_index, storage_ref_.extent()); + return cuco::pair{this->erased_key_sentinel(), + static_castempty_value_sentinel())>(age)}; + } + /** * @brief Extracts the key from a given value type. * diff --git a/include/cuco/detail/static_map/static_map_ref.inl b/include/cuco/detail/static_map/static_map_ref.inl index df323db6d..767e3b944 100644 --- a/include/cuco/detail/static_map/static_map_ref.inl +++ b/include/cuco/detail/static_map/static_map_ref.inl @@ -557,29 +557,33 @@ class operator_impl< } if (eq_res == detail::equal_result::AVAILABLE) { if constexpr (cuco::is_robin_hood_probing::value) { - // Insert the new pair with a full-slot CAS; on a lost CAS (rival insert) or a duplicate, - // retry -- the loop re-finds the key and assigns it via the EQUAL full-slot CAS. - if (ref_.impl_.attempt_insert(slot_ptr, slot_content, val) == - detail::insert_result::SUCCESS) { - return; + // Claim only a true empty; a tombstone is handled as a resident by the displacement test + // below. On a lost CAS (rival insert) or a duplicate, retry -- the loop re-finds the key + // and assigns it via the EQUAL full-slot CAS. + if (not ref_.impl_.is_erased(slot_content)) { + if (ref_.impl_.attempt_insert(slot_ptr, slot_content, val) == + detail::insert_result::SUCCESS) { + return; + } + retry = true; + break; } - retry = true; - break; } else { if (attempt_insert_or_assign(slot_ptr, val)) { return; } } } - // Robin Hood swap test (see `open_addressing_ref_impl::insert` for the rationale). + // Robin Hood swap test (see `open_addressing_ref_impl::insert` for the rationale). A + // tombstone is a resident too (age from its payload); picking one up consumes it -- the pair + // lands there and we are done. if constexpr (cuco::is_robin_hood_probing::value) { - if (eq_res == detail::equal_result::UNEQUAL) { - auto const evicted_age = probing_scheme.template probe_distance( - ref_.impl_.extract_key(slot_content), - static_cast(*probing_iter + intra_bucket_index), - storage_ref.extent()); + if (eq_res == detail::equal_result::UNEQUAL or ref_.impl_.is_erased(slot_content)) { + auto const evicted_age = ref_.impl_.robin_hood_age( + slot_content, static_cast(*probing_iter + intra_bucket_index)); if (evicted_age < probe_step) { if (ref_.impl_.attempt_insert(slot_ptr, slot_content, val) == detail::insert_result::SUCCESS) { + if (ref_.impl_.is_erased(slot_content)) { return; } // consumed tombstone -- done val = cuda::std::bit_cast(slot_content); key = ref_.impl_.extract_key(val); probe_step = evicted_age; @@ -636,6 +640,14 @@ class operator_impl< if (result.state_ == detail::equal_result::UNEQUAL) { auto res = ref_.impl_.predicate_.template operator()( key, bucket_slots[i()].first); + // Robin Hood: a tombstone is a resident handled by the displacement scan, not AVAILABLE. + if constexpr (cuco::is_robin_hood_probing< + typename ref_type::probing_scheme_type>::value) { + if (res == detail::equal_result::AVAILABLE and + ref_.impl_.is_erased(bucket_slots[i()])) { + res = detail::equal_result::UNEQUAL; + } + } if (res != detail::equal_result::UNEQUAL) { result = detail::bucket_probing_results{res, i()}; } @@ -698,10 +710,9 @@ class operator_impl< size_type evicted_age = 0; cuda::static_for([&] __device__(auto i) { if (displace_idx < 0) { - auto const age = probing_scheme.template probe_distance( - ref_.impl_.extract_key(bucket_slots[i()]), - static_cast(*probing_iter + i()), - storage_ref.extent()); + // `robin_hood_age` so a tombstone uses its payload-stored age (like any resident). + auto const age = ref_.impl_.robin_hood_age( + bucket_slots[i()], static_cast(*probing_iter + i())); if (age < probe_step) { displace_idx = i(); evicted_age = age; @@ -720,6 +731,8 @@ class operator_impl< ref_.impl_.get_slot_ptr(*probing_iter, displace_idx), evicted, val); } if (group.shfl(status, src_lane) == detail::insert_result::SUCCESS) { + // Consuming a tombstone reuses its slot -- nothing to carry, so we are done. + if (group.shfl(ref_.impl_.is_erased(evicted), src_lane)) { return; } auto const new_key = group.shfl(ref_.impl_.extract_key(evicted), src_lane); auto const new_payload = group.shfl(ref_.impl_.extract_payload(evicted), src_lane); auto const new_age = group.shfl(evicted_age, src_lane); @@ -1056,7 +1069,11 @@ class operator_impl< return false; } } - if (eq_res == detail::equal_result::AVAILABLE) { + // Robin Hood claims only a true empty here; a tombstone is handled as a resident by the + // displacement test below. + if (eq_res == detail::equal_result::AVAILABLE and + not(cuco::is_robin_hood_probing::value and + ref_.impl_.is_erased(slot_content))) { switch (ref_.template attempt_insert_or_apply( slot_ptr, slot_content, val, op)) { case insert_result::SUCCESS: return true; @@ -1089,16 +1106,18 @@ class operator_impl< } } - // Robin Hood swap test (see `open_addressing_ref_impl::insert` for the rationale). + // Robin Hood swap test (see `open_addressing_ref_impl::insert` for the rationale). A + // tombstone is a resident too (age from its payload); picking one up consumes it -- the + // in-flight pair lands there, completing the insert. if constexpr (cuco::is_robin_hood_probing::value) { - if (eq_res == detail::equal_result::UNEQUAL) { - auto const evicted_age = probing_scheme.template probe_distance( - ref_.impl_.extract_key(slot_content), - static_cast(*probing_iter + intra_bucket_index), - storage_ref.extent()); + if (eq_res == detail::equal_result::UNEQUAL or ref_.impl_.is_erased(slot_content)) { + auto const evicted_age = ref_.impl_.robin_hood_age( + slot_content, static_cast(*probing_iter + intra_bucket_index)); if (evicted_age < probe_step) { if (ref_.impl_.attempt_insert(slot_ptr, slot_content, val) == detail::insert_result::SUCCESS) { + // Consuming a tombstone places the in-flight pair in its slot -- insert complete. + if (ref_.impl_.is_erased(slot_content)) { return true; } val = cuda::std::bit_cast(slot_content); key = ref_.impl_.extract_key(val); probe_step = evicted_age; @@ -1168,6 +1187,14 @@ class operator_impl< if (result.state_ == detail::equal_result::UNEQUAL) { auto res = ref_.impl_.predicate_.template operator()( key, bucket_slots[i()].first); + // Robin Hood: a tombstone is a resident handled by the displacement scan, not AVAILABLE. + if constexpr (cuco::is_robin_hood_probing< + typename ref_type::probing_scheme_type>::value) { + if (res == detail::equal_result::AVAILABLE and + ref_.impl_.is_erased(bucket_slots[i()])) { + res = detail::equal_result::UNEQUAL; + } + } if (res != detail::equal_result::UNEQUAL) { result = detail::bucket_probing_results{res, i()}; } @@ -1241,10 +1268,9 @@ class operator_impl< size_type evicted_age = 0; cuda::static_for([&] __device__(auto i) { if (displace_idx < 0) { - auto const age = probing_scheme.template probe_distance( - ref_.impl_.extract_key(bucket_slots[i()]), - static_cast(*probing_iter + i()), - storage_ref.extent()); + // `robin_hood_age` so a tombstone uses its payload-stored age (like any resident). + auto const age = ref_.impl_.robin_hood_age( + bucket_slots[i()], static_cast(*probing_iter + i())); if (age < probe_step) { displace_idx = i(); evicted_age = age; @@ -1263,6 +1289,8 @@ class operator_impl< ref_.impl_.get_slot_ptr(*probing_iter, displace_idx), evicted, val); } if (group.shfl(status, src_lane) == detail::insert_result::SUCCESS) { + // Consuming a tombstone places the in-flight pair in its slot -- insert complete. + if (group.shfl(ref_.impl_.is_erased(evicted), src_lane)) { return true; } auto const new_key = group.shfl(ref_.impl_.extract_key(evicted), src_lane); auto const new_payload = group.shfl(ref_.impl_.extract_payload(evicted), src_lane); auto const new_age = group.shfl(evicted_age, src_lane); diff --git a/tests/static_map/erase_test.cu b/tests/static_map/erase_test.cu index 8bf09abe2..024068242 100644 --- a/tests/static_map/erase_test.cu +++ b/tests/static_map/erase_test.cu @@ -14,10 +14,12 @@ * limitations under the License. */ +#include #include #include #include +#include #include #include @@ -102,21 +104,35 @@ TEMPLATE_TEST_CASE_SIG( (int64_t, int32_t, cuco::test::probe_sequence::linear_probing, 1), (int64_t, int64_t, cuco::test::probe_sequence::linear_probing, 1), (int64_t, int32_t, cuco::test::probe_sequence::linear_probing, 2), - (int64_t, int64_t, cuco::test::probe_sequence::linear_probing, 2) + (int64_t, int64_t, cuco::test::probe_sequence::linear_probing, 2), + // Robin Hood mirrors the linear-probing rows. Only single-CAS (<= 8-byte) slots are + // unconditional; wider-slot RH displacement needs a packed atom.cas.b128 (gated below). + (int32_t, int32_t, cuco::test::probe_sequence::robin_hood, 1), + (int32_t, int32_t, cuco::test::probe_sequence::robin_hood, 2) #if defined(CUCO_HAS_128BIT_ATOMICS) , (__int128_t, __int128_t, cuco::test::probe_sequence::double_hashing, 2), (__int128_t, int64_t, cuco::test::probe_sequence::double_hashing, 1), - (int32_t, __int128_t, cuco::test::probe_sequence::linear_probing, 2) + (int32_t, __int128_t, cuco::test::probe_sequence::linear_probing, 2), + // Wider-slot Robin Hood rows: the packed displacement CAS needs atom.cas.b128. + (int32_t, int64_t, cuco::test::probe_sequence::robin_hood, 1), + (int32_t, int64_t, cuco::test::probe_sequence::robin_hood, 2), + (int64_t, int32_t, cuco::test::probe_sequence::robin_hood, 1), + (int64_t, int32_t, cuco::test::probe_sequence::robin_hood, 2), + (int64_t, int64_t, cuco::test::probe_sequence::robin_hood, 1), + (int64_t, int64_t, cuco::test::probe_sequence::robin_hood, 2) #endif ) { constexpr size_type num_keys{1'000'000}; using probe = std::conditional_t< - Probe == cuco::test::probe_sequence::linear_probing, - cuco::linear_probing>, - cuco::double_hashing, cuco::murmurhash3_32>>; + Probe == cuco::test::probe_sequence::double_hashing, + cuco::double_hashing, cuco::murmurhash3_32>, + std::conditional_t< + Probe == cuco::test::probe_sequence::robin_hood, + cuco::robin_hood_probing>>, + cuco::linear_probing>>>; auto map = cuco::static_map>>; + using map_type = cuco::static_map, + cuda::thread_scope_device, + cuda::std::equal_to, + probe, + cuco::cuda_allocator, + cuco::storage<2>>; + + constexpr size_type capacity = static_cast(num_keys / 0.85); + auto map = map_type{capacity, + cuco::empty_key{-1}, + cuco::empty_value{-1}, + cuco::erased_key{-2}}; + + // Keys 1..num_keys (avoid the -1 / -2 sentinels). + auto keys_begin = cuda::counting_iterator(1); + auto pairs_begin = cuda::make_transform_iterator( + keys_begin, cuda::proclaim_return_type>([] __device__(Key k) { + return cuco::pair{k, static_cast(k)}; + })); + + map.insert(pairs_begin, pairs_begin + num_keys); + REQUIRE(map.size() == num_keys); + + constexpr size_type num_erased = num_keys / 2; + map.erase(keys_begin, keys_begin + num_erased); + REQUIRE(map.size() == num_keys - num_erased); + + thrust::device_vector d_contained(num_keys); + map.contains(keys_begin, keys_begin + num_keys, d_contained.begin()); + REQUIRE(cuco::test::none_of( + d_contained.begin(), d_contained.begin() + num_erased, cuda::std::identity{})); + REQUIRE( + cuco::test::all_of(d_contained.begin() + num_erased, d_contained.end(), cuda::std::identity{})); +} + +// Robin Hood erase reuse + structural invariant at high load: insert, erase half (-> tombstones), +// re-insert (-> consume tombstones), checking the per-bucket Robin Hood layout invariant after each +// step (tombstones counted as residents, age read from their payload). A wrongly consumed tombstone +// (the age inversion) corrupts the layout and trips the invariant; a lost/duplicate key trips +// size/contains. +TEMPLATE_TEST_CASE_SIG("static_map robin_hood erase reuse + invariant", + "", + ((typename Key, typename Value, int CGSize), Key, Value, CGSize), + (int32_t, int32_t, 1), + (int32_t, int32_t, 2) +#if defined(CUCO_HAS_128BIT_ATOMICS) + , + (int64_t, int64_t, 1), + (int64_t, int64_t, 2) +#endif +) +{ + constexpr size_type num_keys = 10'000; + + using probe = cuco::robin_hood_probing>>; + using map_type = cuco::static_map, + cuda::thread_scope_device, + cuda::std::equal_to, + probe, + cuco::cuda_allocator, + cuco::storage<2>>; + + constexpr size_type capacity = static_cast(num_keys / 0.85); + auto map = map_type{capacity, + cuco::empty_key{-1}, + cuco::empty_value{-1}, + cuco::erased_key{-2}}; + + auto keys_begin = cuda::counting_iterator(1); + auto pairs_begin = cuda::make_transform_iterator( + keys_begin, cuda::proclaim_return_type>([] __device__(Key k) { + return cuco::pair{k, static_cast(k)}; + })); + + map.insert(pairs_begin, pairs_begin + num_keys); + REQUIRE(map.size() == num_keys); + cuco::test::check_robin_hood_invariant(map); + + constexpr size_type num_erased = num_keys / 2; + map.erase(keys_begin, keys_begin + num_erased); + REQUIRE(map.size() == num_keys - num_erased); + cuco::test::check_robin_hood_invariant(map); // tombstones-as-residents layout still valid + + map.insert(pairs_begin, pairs_begin + num_erased); // consume tombstones / fill empties + REQUIRE(map.size() == num_keys); + cuco::test::check_robin_hood_invariant(map); // layout valid after reuse + + thrust::device_vector d_contained(num_keys); + map.contains(keys_begin, keys_begin + num_keys, d_contained.begin()); + REQUIRE(cuco::test::all_of(d_contained.begin(), d_contained.end(), cuda::std::identity{})); +} + +namespace { +enum class reinsert_via { insert_or_assign, insert_or_apply, insert_and_find }; + +// Robin Hood erase + reuse through a specific re-insert API: insert, check invariant, erase the first +// half (-> tombstones), check invariant, then re-insert that half via `how` (-> consume tombstones) +// and check invariant + that every key is present. Exercises the tombstone path of the chosen insert +// variant. +template +void test_rh_erase_reuse(size_type num_keys, reinsert_via how) +{ + using probe = cuco::robin_hood_probing>>; + using map_type = cuco::static_map, + cuda::thread_scope_device, + cuda::std::equal_to, + probe, + cuco::cuda_allocator, + cuco::storage<2>>; + + auto const capacity = static_cast(num_keys / 0.85); + auto map = map_type{capacity, + cuco::empty_key{-1}, + cuco::empty_value{-1}, + cuco::erased_key{-2}}; + + auto keys_begin = cuda::counting_iterator(1); + auto pairs_begin = cuda::make_transform_iterator( + keys_begin, cuda::proclaim_return_type>([] __device__(Key k) { + return cuco::pair{k, static_cast(k)}; + })); + + map.insert(pairs_begin, pairs_begin + num_keys); + REQUIRE(map.size() == num_keys); + cuco::test::check_robin_hood_invariant(map); + + auto const num_erased = num_keys / 2; + map.erase(keys_begin, keys_begin + num_erased); + REQUIRE(map.size() == num_keys - num_erased); + cuco::test::check_robin_hood_invariant(map); + + // re-insert the erased half through the variant under test (consuming tombstones) + switch (how) { + case reinsert_via::insert_or_assign: + map.insert_or_assign(pairs_begin, pairs_begin + num_erased); + break; + case reinsert_via::insert_or_apply: + map.insert_or_apply(pairs_begin, pairs_begin + num_erased, cuco::reduce::plus{}); + break; + case reinsert_via::insert_and_find: { + thrust::device_vector found(num_erased); + thrust::device_vector inserted(num_erased); + map.insert_and_find(pairs_begin, pairs_begin + num_erased, found.begin(), inserted.begin()); + break; + } + } + + REQUIRE(map.size() == num_keys); + cuco::test::check_robin_hood_invariant(map); + + thrust::device_vector d_contained(num_keys); + map.contains(keys_begin, keys_begin + num_keys, d_contained.begin()); + REQUIRE(cuco::test::all_of(d_contained.begin(), d_contained.end(), cuda::std::identity{})); +} +} // namespace + +TEMPLATE_TEST_CASE_SIG("static_map robin_hood erase + insert_or_assign reuse", + "", + ((typename Key, typename Value, int CGSize), Key, Value, CGSize), + (int32_t, int32_t, 1), + (int32_t, int32_t, 2) +#if defined(CUCO_HAS_128BIT_ATOMICS) + , + (int64_t, int64_t, 1), + (int64_t, int64_t, 2) +#endif +) +{ + test_rh_erase_reuse(10'000, reinsert_via::insert_or_assign); +} + +TEMPLATE_TEST_CASE_SIG("static_map robin_hood erase + insert_or_apply reuse", + "", + ((typename Key, typename Value, int CGSize), Key, Value, CGSize), + (int32_t, int32_t, 1), + (int32_t, int32_t, 2) +#if defined(CUCO_HAS_128BIT_ATOMICS) + , + (int64_t, int64_t, 1), + (int64_t, int64_t, 2) +#endif +) +{ + test_rh_erase_reuse(10'000, reinsert_via::insert_or_apply); +} + +TEMPLATE_TEST_CASE_SIG("static_map robin_hood erase + insert_and_find reuse", + "", + ((typename Key, typename Value, int CGSize), Key, Value, CGSize), + (int32_t, int32_t, 1), + (int32_t, int32_t, 2) +#if defined(CUCO_HAS_128BIT_ATOMICS) + , + (int64_t, int64_t, 1), + (int64_t, int64_t, 2) +#endif +) +{ + test_rh_erase_reuse(10'000, reinsert_via::insert_and_find); +} diff --git a/tests/static_map/robin_hood_invariant.cuh b/tests/static_map/robin_hood_invariant.cuh index 5d44be7e6..78750c4ad 100644 --- a/tests/static_map/robin_hood_invariant.cuh +++ b/tests/static_map/robin_hood_invariant.cuh @@ -54,6 +54,7 @@ __global__ void robin_hood_invariant_kernel(Ref ref, int* violations) auto const num_groups = storage_ref.capacity() / stride; auto const extent = storage_ref.extent(); auto const empty_key = ref.empty_key_sentinel(); + auto const erased_key = ref.erased_key_sentinel(); auto const scheme = ref.probing_scheme(); for (size_type g = blockIdx.x * blockDim.x + threadIdx.x; g < num_groups; @@ -62,10 +63,13 @@ __global__ void robin_hood_invariant_kernel(Ref ref, int* violations) size_type max_age_g = 0; for (int s = 0; s < stride; ++s) { auto const slot = slots[g * stride + s]; - if (slot.first != empty_key) { + if (slot.first != empty_key) { // tombstones count as residents (erase enabled => != empty) ++occupied_g; - auto const age = scheme.template probe_distance( - slot.first, static_cast(g * stride + s), extent); + // A tombstone keeps its age in its payload; a live key's age is its probe distance. + auto const age = (slot.first == erased_key) + ? static_cast(slot.second) + : scheme.template probe_distance( + slot.first, static_cast(g * stride + s), extent); if (age > max_age_g) { max_age_g = age; } } } @@ -77,8 +81,10 @@ __global__ void robin_hood_invariant_kernel(Ref ref, int* violations) for (int s = 0; s < stride; ++s) { auto const slot = slots[pg * stride + s]; if (slot.first != empty_key) { - auto const age = scheme.template probe_distance( - slot.first, static_cast(pg * stride + s), extent); + auto const age = (slot.first == erased_key) + ? static_cast(slot.second) + : scheme.template probe_distance( + slot.first, static_cast(pg * stride + s), extent); if (occupied_p == 0 || age < min_age_p) { min_age_p = age; } ++occupied_p; } From 1f3d64e4eb072741f5bb5b7b032741b6b9506480 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Fri, 5 Jun 2026 19:56:32 +0000 Subject: [PATCH 10/11] [pre-commit.ci] auto code formatting --- .../open_addressing_ref_impl.cuh | 76 ++++++++++--------- .../probing_scheme/probing_scheme_impl.inl | 6 +- .../cuco/detail/static_map/static_map_ref.inl | 51 +++++++------ include/cuco/probing_scheme.cuh | 8 +- tests/static_map/contains_test.cu | 2 +- tests/static_map/erase_test.cu | 28 +++---- tests/static_map/find_test.cu | 6 +- tests/static_map/insert_and_find_test.cu | 2 +- tests/static_map/insert_or_apply_test.cu | 15 ++-- tests/static_map/insert_or_assign_test.cu | 6 +- .../probing_scheme_invariants_test.cu | 22 +++--- tests/static_map/robin_hood_invariant.cuh | 6 +- 12 files changed, 113 insertions(+), 115 deletions(-) diff --git a/include/cuco/detail/open_addressing/open_addressing_ref_impl.cuh b/include/cuco/detail/open_addressing/open_addressing_ref_impl.cuh index e64efb43d..0c63eef8b 100644 --- a/include/cuco/detail/open_addressing/open_addressing_ref_impl.cuh +++ b/include/cuco/detail/open_addressing/open_addressing_ref_impl.cuh @@ -401,8 +401,8 @@ class open_addressing_ref_impl if (eq_res == detail::equal_result::EQUAL) { return false; } } // Robin Hood claims only a true empty here; a tombstone carries an age and is handled as a - // resident by the displacement test below. Skipping it must gate the CAS (once claimed it is - // already consumed), so it is folded into this condition. For non-Robin-Hood the second + // resident by the displacement test below. Skipping it must gate the CAS (once claimed it + // is already consumed), so it is folded into this condition. For non-Robin-Hood the second // clause is a compile-time `false`, leaving the original `eq_res == AVAILABLE`. if (eq_res == detail::equal_result::AVAILABLE and not(cuco::is_robin_hood_probing::value and @@ -448,25 +448,26 @@ class open_addressing_ref_impl auto const evicted_age = this->robin_hood_age( slot_content, static_cast(*probing_iter + intra_bucket_index)); if (evicted_age < probe_step) { - if (this->attempt_insert( - this->get_slot_ptr(*probing_iter, intra_bucket_index), slot_content, val) == - insert_result::SUCCESS) { + if (this->attempt_insert(this->get_slot_ptr(*probing_iter, intra_bucket_index), + slot_content, + val) == insert_result::SUCCESS) { // Consuming a tombstone reuses its freed slot -- nothing to carry, so we are done. if (this->is_erased(slot_content)) { return true; } // Adopt the evicted pair and re-probe THIS bucket -- its bucket distance here is - // `evicted_age`, and it may belong in another slot of the same bucket: an empty one, - // or one holding an even-richer resident it can displace in turn. Re-reading the - // bucket (rather than advancing past it) is the within-bucket linear probe, i.e. the - // combined bucket+slot distance that makes displacement correct for bucket_size > 1. - // The `slot_distance` term cancels in every comparison, so it never appears here; it - // shows up only as this slot-by-slot continuation. `bit_cast` keeps the adoption - // valid for heterogeneous insert types (layout-compatible by contract; identity in - // the common case). + // `evicted_age`, and it may belong in another slot of the same bucket: an empty + // one, or one holding an even-richer resident it can displace in turn. Re-reading + // the bucket (rather than advancing past it) is the within-bucket linear probe, + // i.e. the combined bucket+slot distance that makes displacement correct for + // bucket_size > 1. The `slot_distance` term cancels in every comparison, so it + // never appears here; it shows up only as this slot-by-slot continuation. + // `bit_cast` keeps the adoption valid for heterogeneous insert types + // (layout-compatible by contract; identity in the common case). val = cuda::std::bit_cast(slot_content); key = this->extract_key(val); probe_step = evicted_age; } - retry = true; // re-read this bucket: re-probe with the victim, or re-evaluate a lost CAS + retry = + true; // re-read this bucket: re-probe with the victim, or re-evaluate a lost CAS break; } } @@ -584,8 +585,8 @@ class open_addressing_ref_impl if (displace_idx < 0) { // `robin_hood_age` so a tombstone uses its payload-stored age: it is displaced (i.e. // consumed) exactly when richer than the in-flight pair, like any other resident. - auto const age = - this->robin_hood_age(bucket_slots[i()], static_cast(*probing_iter + i())); + auto const age = this->robin_hood_age(bucket_slots[i()], + static_cast(*probing_iter + i())); if (age < probe_step) { displace_idx = i(); evicted_age = age; @@ -666,8 +667,8 @@ class open_addressing_ref_impl probing_scheme_.template make_iterator(key, storage_ref_.extent()); auto const init_idx = *probing_iter; [[maybe_unused]] size_type probe_step = 0; - // Robin Hood may displace the original key before the chain ends; remember the slot it landed in - // so we return an iterator to it (not to a later victim's slot). + // Robin Hood may displace the original key before the chain ends; remember the slot it landed + // in so we return an iterator to it (not to a later victim's slot). [[maybe_unused]] value_type* placed_ptr = nullptr; while (true) { @@ -720,15 +721,14 @@ class open_addressing_ref_impl } // Robin Hood swap test (see `insert` for the full rationale). A tombstone is a resident too - // (age from its payload); picking one up consumes it -- the in-flight pair lands there and we - // are done. + // (age from its payload); picking one up consumes it -- the in-flight pair lands there and + // we are done. if constexpr (cuco::is_robin_hood_probing::value) { if (eq_res == detail::equal_result::UNEQUAL or this->is_erased(bucket_slots[i])) { auto const evicted_age = this->robin_hood_age(bucket_slots[i], static_cast(*probing_iter + i)); if (evicted_age < probe_step) { - if (this->attempt_insert(slot_ptr, bucket_slots[i], val) == - insert_result::SUCCESS) { + if (this->attempt_insert(slot_ptr, bucket_slots[i], val) == insert_result::SUCCESS) { if (this->is_erased(bucket_slots[i])) { // Consumed a tombstone: the in-flight pair is placed here; return the original // key's slot (this one if it was never displaced). @@ -866,8 +866,8 @@ class open_addressing_ref_impl if (displace_idx < 0) { // `robin_hood_age` so a tombstone uses its payload-stored age: it is displaced (i.e. // consumed) exactly when richer than the in-flight pair, like any other resident. - auto const age = - this->robin_hood_age(bucket_slots[i()], static_cast(*probing_iter + i())); + auto const age = this->robin_hood_age(bucket_slots[i()], + static_cast(*probing_iter + i())); if (age < probe_step) { displace_idx = i(); evicted_age = age; @@ -890,7 +890,8 @@ class open_addressing_ref_impl if (group.shfl(status, src_lane) == insert_result::SUCCESS) { if (placed_ptr == 0) { placed_ptr = group.shfl(displaced, src_lane); } // Consumed a tombstone: the in-flight pair is placed in its slot; we are done. Return - // the original key's slot (`placed_ptr`, which is this slot if it was never displaced). + // the original key's slot (`placed_ptr`, which is this slot if it was never + // displaced). if (group.shfl(this->is_erased(evicted), src_lane)) { if (group.thread_rank() == src_lane) { this->maybe_wait_for_payload(reinterpret_cast(displaced)); @@ -951,8 +952,8 @@ class open_addressing_ref_impl // Key exists, return true if successfully deleted if (eq_res == detail::equal_result::EQUAL) { auto const intra_bucket_index = cuda::std::distance(bucket_slots.begin(), &slot_content); - // Robin Hood records the erased key's age in the tombstone payload (1a); other schemes use - // the plain erased sentinel. + // Robin Hood records the erased key's age in the tombstone payload (1a); other schemes + // use the plain erased sentinel. value_type erased = this->erased_slot_sentinel(); if constexpr (cuco::is_robin_hood_probing::value) { erased = this->robin_hood_erased_sentinel( @@ -1010,16 +1011,17 @@ class open_addressing_ref_impl auto const src_lane = __ffs(group_contains_equal) - 1; auto status = insert_result::CONTINUE; if (group.thread_rank() == src_lane) { - // Robin Hood records the erased key's age in the tombstone payload (1a); other schemes use - // the plain erased sentinel. + // Robin Hood records the erased key's age in the tombstone payload (1a); other schemes + // use the plain erased sentinel. value_type erased = this->erased_slot_sentinel(); if constexpr (cuco::is_robin_hood_probing::value) { erased = this->robin_hood_erased_sentinel( bucket_slots[intra_bucket_index], static_cast(*probing_iter + intra_bucket_index)); } - status = attempt_insert_stable( - this->get_slot_ptr(*probing_iter, intra_bucket_index), bucket_slots[intra_bucket_index], erased); + status = attempt_insert_stable(this->get_slot_ptr(*probing_iter, intra_bucket_index), + bucket_slots[intra_bucket_index], + erased); } switch (group.shfl(status, src_lane)) { @@ -1925,8 +1927,8 @@ class open_addressing_ref_impl */ template [[nodiscard]] __device__ bool robin_hood_proves_absent(BucketSlots const& bucket_slots, - size_type bucket_base, - size_type probe_step) const noexcept + size_type bucket_base, + size_type probe_step) const noexcept { bool richer = false; cuda::static_for([&](auto i) { @@ -1981,16 +1983,16 @@ class open_addressing_ref_impl * @brief The Robin Hood tombstone for erasing the live key currently in `slot` at `slot_index`. * * The erased key's age is stashed in the payload (1a) so the tombstone keeps its place in the - * Robin Hood ordering (the original key is gone and cannot be rehashed). Other probing schemes use - * the plain `erased_slot_sentinel()` and never call this. + * Robin Hood ordering (the original key is gone and cannot be rehashed). Other probing schemes + * use the plain `erased_slot_sentinel()` and never call this. * * @param slot The slot's current (live) contents * @param slot_index The slot's index * * @return The value to CAS into the slot to erase it */ - [[nodiscard]] __device__ value_type robin_hood_erased_sentinel(value_type const& slot, - size_type slot_index) const noexcept + [[nodiscard]] __device__ value_type + robin_hood_erased_sentinel(value_type const& slot, size_type slot_index) const noexcept { static_assert(has_payload, "Robin Hood erase requires a mapped payload to store the tombstone age"); diff --git a/include/cuco/detail/probing_scheme/probing_scheme_impl.inl b/include/cuco/detail/probing_scheme/probing_scheme_impl.inl index 8b93076c3..2dbb9bdcb 100644 --- a/include/cuco/detail/probing_scheme/probing_scheme_impl.inl +++ b/include/cuco/detail/probing_scheme/probing_scheme_impl.inl @@ -231,11 +231,7 @@ namespace detail { * @param upper_bound Upper bound of the iteration * @return The resident's probe distance, in probing steps */ -template +template [[nodiscard]] __host__ __device__ constexpr typename Extent::value_type probe_distance( linear_probing const& scheme, ProbeKey resident_key, diff --git a/include/cuco/detail/static_map/static_map_ref.inl b/include/cuco/detail/static_map/static_map_ref.inl index 767e3b944..b71a40f9f 100644 --- a/include/cuco/detail/static_map/static_map_ref.inl +++ b/include/cuco/detail/static_map/static_map_ref.inl @@ -538,7 +538,8 @@ class operator_impl< // If the key is already in the container, update the payload and return if (eq_res == detail::equal_result::EQUAL) { - if constexpr (cuco::is_robin_hood_probing::value) { + if constexpr (cuco::is_robin_hood_probing< + typename ref_type::probing_scheme_type>::value) { // Robin Hood may relocate this key; assign via a full-slot CAS that keeps the key and // fails if it moved. On failure re-probe and retry -- the loop re-finds the key. auto desired = slot_content; @@ -556,10 +557,11 @@ class operator_impl< } } if (eq_res == detail::equal_result::AVAILABLE) { - if constexpr (cuco::is_robin_hood_probing::value) { - // Claim only a true empty; a tombstone is handled as a resident by the displacement test - // below. On a lost CAS (rival insert) or a duplicate, retry -- the loop re-finds the key - // and assigns it via the EQUAL full-slot CAS. + if constexpr (cuco::is_robin_hood_probing< + typename ref_type::probing_scheme_type>::value) { + // Claim only a true empty; a tombstone is handled as a resident by the displacement + // test below. On a lost CAS (rival insert) or a duplicate, retry -- the loop re-finds + // the key and assigns it via the EQUAL full-slot CAS. if (not ref_.impl_.is_erased(slot_content)) { if (ref_.impl_.attempt_insert(slot_ptr, slot_content, val) == detail::insert_result::SUCCESS) { @@ -574,8 +576,8 @@ class operator_impl< } // Robin Hood swap test (see `open_addressing_ref_impl::insert` for the rationale). A - // tombstone is a resident too (age from its payload); picking one up consumes it -- the pair - // lands there and we are done. + // tombstone is a resident too (age from its payload); picking one up consumes it -- the + // pair lands there and we are done. if constexpr (cuco::is_robin_hood_probing::value) { if (eq_res == detail::equal_result::UNEQUAL or ref_.impl_.is_erased(slot_content)) { auto const evicted_age = ref_.impl_.robin_hood_age( @@ -640,7 +642,8 @@ class operator_impl< if (result.state_ == detail::equal_result::UNEQUAL) { auto res = ref_.impl_.predicate_.template operator()( key, bucket_slots[i()].first); - // Robin Hood: a tombstone is a resident handled by the displacement scan, not AVAILABLE. + // Robin Hood: a tombstone is a resident handled by the displacement scan, not + // AVAILABLE. if constexpr (cuco::is_robin_hood_probing< typename ref_type::probing_scheme_type>::value) { if (res == detail::equal_result::AVAILABLE and @@ -662,8 +665,8 @@ class operator_impl< if (group_contains_equal) { auto const src_lane = __ffs(group_contains_equal) - 1; if constexpr (cuco::is_robin_hood_probing::value) { - // src_lane assigns via a full-slot CAS (key fixed); a relocation or rival update fails it, - // so the group re-probes and retries -- the loop re-finds the key. + // src_lane assigns via a full-slot CAS (key fixed); a relocation or rival update fails + // it, so the group re-probes and retries -- the loop re-finds the key. auto const success = [&, target_idx = intra_bucket_index]() { if (group.thread_rank() != src_lane) { return false; } auto desired = bucket_slots[target_idx]; @@ -687,8 +690,8 @@ class operator_impl< if (group_contains_available) { auto const src_lane = __ffs(group_contains_available) - 1; if constexpr (cuco::is_robin_hood_probing::value) { - // Insert the new pair with a full-slot CAS; on a lost CAS or duplicate, re-probe and retry - // (the loop re-finds the key and assigns via the EQUAL full-slot CAS). + // Insert the new pair with a full-slot CAS; on a lost CAS or duplicate, re-probe and + // retry (the loop re-finds the key and assigns via the EQUAL full-slot CAS). auto const success = [&, target_idx = intra_bucket_index]() { if (group.thread_rank() != src_lane) { return false; } return ref_.impl_.attempt_insert(slot_ptr, bucket_slots[target_idx], val) == @@ -1050,9 +1053,9 @@ class operator_impl< // Lift `op` to the whole slot, keeping the key, and CAS it. A relocation (or a rival // update) makes the CAS fail; re-probe and retry -- the loop re-finds the key. auto desired = slot_content; - // `desired` is a local copy, so this `op` is just local arithmetic -- the `atomic_ref`'s - // atomicity does nothing here and is used only because `Op`'s signature requires one. The - // real atomic is the full-slot CAS below. + // `desired` is a local copy, so this `op` is just local arithmetic -- the + // `atomic_ref`'s atomicity does nothing here and is used only because `Op`'s signature + // requires one. The real atomic is the full-slot CAS below. op(cuda::atomic_ref{desired.second}, val.second); if (ref_.impl_.attempt_insert(slot_ptr, slot_content, desired) == detail::insert_result::SUCCESS) { @@ -1080,7 +1083,8 @@ class operator_impl< case insert_result::DUPLICATE: { if constexpr (cuco::is_robin_hood_probing< typename ref_type::probing_scheme_type>::value) { - // Key is present now; re-probe so it is found EQUAL and updated via the full-slot CAS. + // Key is present now; re-probe so it is found EQUAL and updated via the full-slot + // CAS. retry = true; break; } @@ -1101,7 +1105,8 @@ class operator_impl< } } } - if constexpr (cuco::is_robin_hood_probing::value) { + if constexpr (cuco::is_robin_hood_probing< + typename ref_type::probing_scheme_type>::value) { if (retry) { break; } } } @@ -1187,7 +1192,8 @@ class operator_impl< if (result.state_ == detail::equal_result::UNEQUAL) { auto res = ref_.impl_.predicate_.template operator()( key, bucket_slots[i()].first); - // Robin Hood: a tombstone is a resident handled by the displacement scan, not AVAILABLE. + // Robin Hood: a tombstone is a resident handled by the displacement scan, not + // AVAILABLE. if constexpr (cuco::is_robin_hood_probing< typename ref_type::probing_scheme_type>::value) { if (res == detail::equal_result::AVAILABLE and @@ -1208,16 +1214,15 @@ class operator_impl< auto const group_contains_equal = group.ballot(state == detail::equal_result::EQUAL); if (group_contains_equal) { auto const src_lane = __ffs(group_contains_equal) - 1; - if constexpr (cuco::is_robin_hood_probing< - typename ref_type::probing_scheme_type>::value) { + if constexpr (cuco::is_robin_hood_probing::value) { // src_lane lifts `op` to the slot (key fixed) and CASes it; a relocation or rival update // fails the CAS, so the group re-probes and retries -- the loop re-finds the key. auto const success = [&, target_idx = intra_bucket_index]() { if (group.thread_rank() != src_lane) { return false; } auto desired = bucket_slots[target_idx]; - // `desired` is a local copy, so this `op` is just local arithmetic -- the `atomic_ref`'s - // atomicity does nothing here and is used only because `Op`'s signature requires one. The - // real atomic is the full-slot CAS below. + // `desired` is a local copy, so this `op` is just local arithmetic -- the + // `atomic_ref`'s atomicity does nothing here and is used only because `Op`'s signature + // requires one. The real atomic is the full-slot CAS below. op(cuda::atomic_ref{desired.second}, val.second); return ref_.impl_.attempt_insert(slot_ptr, bucket_slots[target_idx], desired) == detail::insert_result::SUCCESS; diff --git a/include/cuco/probing_scheme.cuh b/include/cuco/probing_scheme.cuh index f74c8fe80..3bf8b98c2 100644 --- a/include/cuco/probing_scheme.cuh +++ b/include/cuco/probing_scheme.cuh @@ -233,10 +233,10 @@ class double_hashing : private detail::probing_scheme_base { template class robin_hood_probing : private Underlying { public: - using Underlying::cg_size; ///< Cooperative group size (from the underlying scheme) - using typename Underlying::hasher; ///< Hash function type (from the underlying scheme) - using Underlying::hash_function; ///< Forwarded: gets the function(s) used to hash keys - using Underlying::make_iterator; ///< Forwarded: the (unchanged) forward probe sequence + using typename Underlying::hasher; ///< Hash function type (from the underlying scheme) + using Underlying::cg_size; ///< Cooperative group size (from the underlying scheme) + using Underlying::hash_function; ///< Forwarded: gets the function(s) used to hash keys + using Underlying::make_iterator; ///< Forwarded: the (unchanged) forward probe sequence /** * @brief Constructs a Robin Hood probing scheme wrapping the given underlying scheme. diff --git a/tests/static_map/contains_test.cu b/tests/static_map/contains_test.cu index 9f2769a56..9c1312482 100644 --- a/tests/static_map/contains_test.cu +++ b/tests/static_map/contains_test.cu @@ -14,7 +14,6 @@ * limitations under the License. */ -#include #include #include @@ -30,6 +29,7 @@ #include #include +#include using size_type = int32_t; diff --git a/tests/static_map/erase_test.cu b/tests/static_map/erase_test.cu index 024068242..5f2942ef1 100644 --- a/tests/static_map/erase_test.cu +++ b/tests/static_map/erase_test.cu @@ -14,7 +14,6 @@ * limitations under the License. */ -#include #include #include @@ -26,6 +25,7 @@ #include #include +#include using size_type = int32_t; @@ -180,10 +180,8 @@ TEMPLATE_TEST_CASE_SIG("static_map robin_hood erase read-path", cuco::storage<2>>; constexpr size_type capacity = static_cast(num_keys / 0.85); - auto map = map_type{capacity, - cuco::empty_key{-1}, - cuco::empty_value{-1}, - cuco::erased_key{-2}}; + auto map = map_type{ + capacity, cuco::empty_key{-1}, cuco::empty_value{-1}, cuco::erased_key{-2}}; // Keys 1..num_keys (avoid the -1 / -2 sentinels). auto keys_begin = cuda::counting_iterator(1); @@ -237,10 +235,8 @@ TEMPLATE_TEST_CASE_SIG("static_map robin_hood erase reuse + invariant", cuco::storage<2>>; constexpr size_type capacity = static_cast(num_keys / 0.85); - auto map = map_type{capacity, - cuco::empty_key{-1}, - cuco::empty_value{-1}, - cuco::erased_key{-2}}; + auto map = map_type{ + capacity, cuco::empty_key{-1}, cuco::empty_value{-1}, cuco::erased_key{-2}}; auto keys_begin = cuda::counting_iterator(1); auto pairs_begin = cuda::make_transform_iterator( @@ -269,10 +265,10 @@ TEMPLATE_TEST_CASE_SIG("static_map robin_hood erase reuse + invariant", namespace { enum class reinsert_via { insert_or_assign, insert_or_apply, insert_and_find }; -// Robin Hood erase + reuse through a specific re-insert API: insert, check invariant, erase the first -// half (-> tombstones), check invariant, then re-insert that half via `how` (-> consume tombstones) -// and check invariant + that every key is present. Exercises the tombstone path of the chosen insert -// variant. +// Robin Hood erase + reuse through a specific re-insert API: insert, check invariant, erase the +// first half (-> tombstones), check invariant, then re-insert that half via `how` (-> consume +// tombstones) and check invariant + that every key is present. Exercises the tombstone path of the +// chosen insert variant. template void test_rh_erase_reuse(size_type num_keys, reinsert_via how) { @@ -287,10 +283,8 @@ void test_rh_erase_reuse(size_type num_keys, reinsert_via how) cuco::storage<2>>; auto const capacity = static_cast(num_keys / 0.85); - auto map = map_type{capacity, - cuco::empty_key{-1}, - cuco::empty_value{-1}, - cuco::erased_key{-2}}; + auto map = map_type{ + capacity, cuco::empty_key{-1}, cuco::empty_value{-1}, cuco::erased_key{-2}}; auto keys_begin = cuda::counting_iterator(1); auto pairs_begin = cuda::make_transform_iterator( diff --git a/tests/static_map/find_test.cu b/tests/static_map/find_test.cu index e9fbe3b48..7d6b85ed7 100644 --- a/tests/static_map/find_test.cu +++ b/tests/static_map/find_test.cu @@ -14,7 +14,6 @@ * limitations under the License. */ -#include #include #include @@ -31,6 +30,7 @@ #include #include +#include using size_type = int32_t; @@ -74,8 +74,8 @@ void test_unique_sequence(Map& map, size_type num_keys) map.insert(pairs_begin, pairs_begin + num_keys); - // Robin Hood-specific: the populated table must satisfy the per-bucket Robin Hood layout invariant - // (a no-op for linear/double hashing). + // Robin Hood-specific: the populated table must satisfy the per-bucket Robin Hood layout + // invariant (a no-op for linear/double hashing). if constexpr (cuco::is_robin_hood_probing::value) { cuco::test::check_robin_hood_invariant(map); } diff --git a/tests/static_map/insert_and_find_test.cu b/tests/static_map/insert_and_find_test.cu index d380bf724..3010c3333 100644 --- a/tests/static_map/insert_and_find_test.cu +++ b/tests/static_map/insert_and_find_test.cu @@ -15,7 +15,6 @@ * limitations under the License. */ -#include #include #include @@ -27,6 +26,7 @@ #include #include +#include using size_type = std::size_t; diff --git a/tests/static_map/insert_or_apply_test.cu b/tests/static_map/insert_or_apply_test.cu index 6bc151b17..692e7af31 100644 --- a/tests/static_map/insert_or_apply_test.cu +++ b/tests/static_map/insert_or_apply_test.cu @@ -14,7 +14,6 @@ * limitations under the License. */ -#include #include #include @@ -28,6 +27,7 @@ #include #include +#include #include @@ -310,10 +310,11 @@ TEMPLATE_TEST_CASE_SIG( } } -// Dedicated Robin Hood coverage for insert_or_apply: the probe-enum test above is disabled upstream, -// so this is the only active exercise of the displacing RH insert_or_apply (reduction + lock-free -// displacement together). It runs at a high load factor (~0.95 on the unique keys) so displacement -// actually fires, and reuses `test_insert_or_apply`, whose tail asserts the structural RH invariant. +// Dedicated Robin Hood coverage for insert_or_apply: the probe-enum test above is disabled +// upstream, so this is the only active exercise of the displacing RH insert_or_apply (reduction + +// lock-free displacement together). It runs at a high load factor (~0.95 on the unique keys) so +// displacement actually fires, and reuses `test_insert_or_apply`, whose tail asserts the structural +// RH invariant. TEMPLATE_TEST_CASE_SIG("static_map robin_hood insert_or_apply (high load)", "", ((typename Key, typename Value, int CGSize), Key, Value, CGSize), @@ -339,8 +340,8 @@ TEMPLATE_TEST_CASE_SIG("static_map robin_hood insert_or_apply (high load)", cuco::cuda_allocator, cuco::storage<2>>; - // Size the table for ~0.95 load on the unique keys, so it is nearly full and the displacing insert - // path (and the structural invariant check inside the helper) is genuinely stressed. + // Size the table for ~0.95 load on the unique keys, so it is nearly full and the displacing + // insert path (and the structural invariant check inside the helper) is genuinely stressed. constexpr size_type capacity = static_cast(num_unique_keys / 0.95); SECTION("sentinel equals init; has_init = true") diff --git a/tests/static_map/insert_or_assign_test.cu b/tests/static_map/insert_or_assign_test.cu index 067224e62..66b1359a7 100644 --- a/tests/static_map/insert_or_assign_test.cu +++ b/tests/static_map/insert_or_assign_test.cu @@ -14,7 +14,6 @@ * limitations under the License. */ -#include #include #include @@ -30,6 +29,7 @@ #include #include +#include using size_type = std::size_t; @@ -146,8 +146,8 @@ TEMPLATE_TEST_CASE_SIG( } // Dedicated Robin Hood coverage for *concurrent* insert + assign. The probe-enum test above is -// phased (insert all keys, then assign in a separate pass), so it never exercises an assign racing a -// displacement -- the hazard RH introduces. Here every occurrence of key k assigns the same value +// phased (insert all keys, then assign in a separate pass), so it never exercises an assign racing +// a displacement -- the hazard RH introduces. Here every occurrence of key k assigns the same value // (k * 2), so the final value is deterministic regardless of ordering; but a displacement-vs-assign // race would land an assign on a different key's slot and corrupt it. Runs at ~0.95 load in one // concurrent pass with duplicates, and verifies each key's value individually (a value-multiset diff --git a/tests/static_map/probing_scheme_invariants_test.cu b/tests/static_map/probing_scheme_invariants_test.cu index 2fb4bdd88..277ec1921 100644 --- a/tests/static_map/probing_scheme_invariants_test.cu +++ b/tests/static_map/probing_scheme_invariants_test.cu @@ -44,21 +44,21 @@ struct identity_hash { // then queried with keys chosen to exercise each lookup-termination rule. TEST_CASE("static_map robin_hood read-path early-exit", "") { - using Key = std::int32_t; - using Value = std::int32_t; - using size_type = std::int32_t; - auto constexpr capacity = size_type{16}; + using Key = std::int32_t; + using Value = std::int32_t; + using size_type = std::int32_t; + auto constexpr capacity = size_type{16}; using extent_type = cuco::extent; using probe_type = cuco::robin_hood_probing>>; using map_type = cuco::static_map, - probe_type, - cuco::cuda_allocator, - cuco::storage<1>>; + Value, + extent_type, + cuda::thread_scope_device, + cuda::std::equal_to, + probe_type, + cuco::cuda_allocator, + cuco::storage<1>>; using value_type = typename map_type::value_type; // cuco::pair auto map = map_type{extent_type{}, cuco::empty_key{-1}, cuco::empty_value{-1}}; diff --git a/tests/static_map/robin_hood_invariant.cuh b/tests/static_map/robin_hood_invariant.cuh index 78750c4ad..34165435c 100644 --- a/tests/static_map/robin_hood_invariant.cuh +++ b/tests/static_map/robin_hood_invariant.cuh @@ -75,7 +75,7 @@ __global__ void robin_hood_invariant_kernel(Ref ref, int* violations) } if (occupied_g == 0) { continue; } - size_type const pg = (g + num_groups - 1) % num_groups; + size_type const pg = (g + num_groups - 1) % num_groups; int occupied_p = 0; size_type min_age_p = 0; for (int s = 0; s < stride; ++s) { @@ -90,8 +90,8 @@ __global__ void robin_hood_invariant_kernel(Ref ref, int* violations) } } - if (max_age_g >= 1 && occupied_p < stride) { atomicAdd(violations, 1); } // (1) - if (occupied_p > 0 && min_age_p + 1 < max_age_g) { atomicAdd(violations, 1); } // (2) + if (max_age_g >= 1 && occupied_p < stride) { atomicAdd(violations, 1); } // (1) + if (occupied_p > 0 && min_age_p + 1 < max_age_g) { atomicAdd(violations, 1); } // (2) } } From f26495adc0fd4827be0362398b5c3e2b65d43e66 Mon Sep 17 00:00:00 2001 From: aterenin Date: Sat, 6 Jun 2026 12:38:04 -0400 Subject: [PATCH 11/11] Disallow 96-bit slots for Robin Hood. --- .../open_addressing/open_addressing_ref_impl.cuh | 16 ++++++++++++++++ tests/static_map/erase_test.cu | 9 ++++----- tests/static_map/find_test.cu | 9 ++++----- tests/static_map/insert_and_find_test.cu | 9 ++++----- tests/static_map/insert_or_assign_test.cu | 9 ++++----- 5 files changed, 32 insertions(+), 20 deletions(-) diff --git a/include/cuco/detail/open_addressing/open_addressing_ref_impl.cuh b/include/cuco/detail/open_addressing/open_addressing_ref_impl.cuh index 0c63eef8b..60a845eac 100644 --- a/include/cuco/detail/open_addressing/open_addressing_ref_impl.cuh +++ b/include/cuco/detail/open_addressing/open_addressing_ref_impl.cuh @@ -130,6 +130,22 @@ class open_addressing_ref_impl storage_ref_type::bucket_size; ///< Number of elements handled per bucket static constexpr auto thread_scope = Scope; ///< CUDA thread scope + // Robin Hood displacement swaps the in-flight pair into an *occupied* slot, which needs a single + // atomic CAS of the whole slot. That requires a packable slot: <= 8 bytes (atom.cas.b64), or + // padding-free and <= 16 bytes on an sm_90+ build (atom.cas.b128). A non-packable slot (e.g. a + // padded `pair`) would fall back to a split key/value CAS, which cannot move an + // occupied slot -- displacement would livelock. Reject it at compile time rather than hang. + static constexpr bool robin_hood_slot_is_single_cas = sizeof(value_type) <= 8 +#if defined(CUCO_HAS_128BIT_ATOMICS) + or cuco::detail::is_packable() +#endif + ; + static_assert(not cuco::is_robin_hood_probing::value or + robin_hood_slot_is_single_cas, + "Robin Hood probing requires a single-CAS slot: the key+value must fit in 8 bytes, " + "or be packable (padding-free) and <= 16 bytes on an sm_90+ build. A padded slot " + "(e.g. pair) is unsupported -- displacement would livelock."); + /** * @brief Constructs open_addressing_ref_impl. * diff --git a/tests/static_map/erase_test.cu b/tests/static_map/erase_test.cu index 5f2942ef1..749c9e25e 100644 --- a/tests/static_map/erase_test.cu +++ b/tests/static_map/erase_test.cu @@ -114,11 +114,10 @@ TEMPLATE_TEST_CASE_SIG( (__int128_t, __int128_t, cuco::test::probe_sequence::double_hashing, 2), (__int128_t, int64_t, cuco::test::probe_sequence::double_hashing, 1), (int32_t, __int128_t, cuco::test::probe_sequence::linear_probing, 2), - // Wider-slot Robin Hood rows: the packed displacement CAS needs atom.cas.b128. - (int32_t, int64_t, cuco::test::probe_sequence::robin_hood, 1), - (int32_t, int64_t, cuco::test::probe_sequence::robin_hood, 2), - (int64_t, int32_t, cuco::test::probe_sequence::robin_hood, 1), - (int64_t, int32_t, cuco::test::probe_sequence::robin_hood, 2), + // Wider-slot Robin Hood rows: RH displacement needs a single packed atom.cas.b128, so the slot + // must be packable (padding-free). Only int64/int64 qualifies -- int32/int64 and int64/int32 are + // padded (not is_packable), fall back to a split (back-to-back) CAS, and RH displacement would + // livelock on an occupied slot. (int64_t, int64_t, cuco::test::probe_sequence::robin_hood, 1), (int64_t, int64_t, cuco::test::probe_sequence::robin_hood, 2) #endif diff --git a/tests/static_map/find_test.cu b/tests/static_map/find_test.cu index 7d6b85ed7..2dda5e89f 100644 --- a/tests/static_map/find_test.cu +++ b/tests/static_map/find_test.cu @@ -187,11 +187,10 @@ TEMPLATE_TEST_CASE_SIG( (__int128_t, __int128_t, cuco::test::probe_sequence::double_hashing, 2), (__int128_t, int64_t, cuco::test::probe_sequence::double_hashing, 1), (int32_t, __int128_t, cuco::test::probe_sequence::linear_probing, 2), - // Wider-slot Robin Hood rows: the packed displacement CAS needs atom.cas.b128. - (int32_t, int64_t, cuco::test::probe_sequence::robin_hood, 1), - (int32_t, int64_t, cuco::test::probe_sequence::robin_hood, 2), - (int64_t, int32_t, cuco::test::probe_sequence::robin_hood, 1), - (int64_t, int32_t, cuco::test::probe_sequence::robin_hood, 2), + // Wider-slot Robin Hood rows: RH displacement needs a single packed atom.cas.b128, so the slot + // must be packable (padding-free). Only int64/int64 qualifies -- int32/int64 and int64/int32 are + // padded (not is_packable), fall back to a split (back-to-back) CAS, and RH displacement would + // livelock on an occupied slot. (int64_t, int64_t, cuco::test::probe_sequence::robin_hood, 1), (int64_t, int64_t, cuco::test::probe_sequence::robin_hood, 2) #endif diff --git a/tests/static_map/insert_and_find_test.cu b/tests/static_map/insert_and_find_test.cu index 3010c3333..be3c55a45 100644 --- a/tests/static_map/insert_and_find_test.cu +++ b/tests/static_map/insert_and_find_test.cu @@ -63,11 +63,10 @@ TEMPLATE_TEST_CASE_SIG( (__int128_t, __int128_t, cuco::test::probe_sequence::double_hashing, 2), (__int128_t, int64_t, cuco::test::probe_sequence::double_hashing, 1), (int32_t, __int128_t, cuco::test::probe_sequence::linear_probing, 2), - // Wider-slot Robin Hood rows: the packed displacement CAS needs atom.cas.b128. - (int32_t, int64_t, cuco::test::probe_sequence::robin_hood, 1), - (int32_t, int64_t, cuco::test::probe_sequence::robin_hood, 2), - (int64_t, int32_t, cuco::test::probe_sequence::robin_hood, 1), - (int64_t, int32_t, cuco::test::probe_sequence::robin_hood, 2), + // Wider-slot Robin Hood rows: RH displacement needs a single packed atom.cas.b128, so the slot + // must be packable (padding-free). Only int64/int64 qualifies -- int32/int64 and int64/int32 are + // padded (not is_packable), fall back to a split (back-to-back) CAS, and RH displacement would + // livelock on an occupied slot. (int64_t, int64_t, cuco::test::probe_sequence::robin_hood, 1), (int64_t, int64_t, cuco::test::probe_sequence::robin_hood, 2) #endif diff --git a/tests/static_map/insert_or_assign_test.cu b/tests/static_map/insert_or_assign_test.cu index 66b1359a7..843417072 100644 --- a/tests/static_map/insert_or_assign_test.cu +++ b/tests/static_map/insert_or_assign_test.cu @@ -112,11 +112,10 @@ TEMPLATE_TEST_CASE_SIG( (__int128_t, __int128_t, cuco::test::probe_sequence::double_hashing, 2), (__int128_t, int64_t, cuco::test::probe_sequence::double_hashing, 1), (int32_t, __int128_t, cuco::test::probe_sequence::linear_probing, 2), - // Wider-slot Robin Hood rows: the packed displacement CAS needs atom.cas.b128. - (int32_t, int64_t, cuco::test::probe_sequence::robin_hood, 1), - (int32_t, int64_t, cuco::test::probe_sequence::robin_hood, 2), - (int64_t, int32_t, cuco::test::probe_sequence::robin_hood, 1), - (int64_t, int32_t, cuco::test::probe_sequence::robin_hood, 2), + // Wider-slot Robin Hood rows: RH displacement needs a single packed atom.cas.b128, so the slot + // must be packable (padding-free). Only int64/int64 qualifies -- int32/int64 and int64/int32 are + // padded (not is_packable), fall back to a split (back-to-back) CAS, and RH displacement would + // livelock on an occupied slot. (int64_t, int64_t, cuco::test::probe_sequence::robin_hood, 1), (int64_t, int64_t, cuco::test::probe_sequence::robin_hood, 2) #endif