diff --git a/include/cuco/detail/open_addressing/open_addressing_ref_impl.cuh b/include/cuco/detail/open_addressing/open_addressing_ref_impl.cuh index ada8792bb..60a845eac 100644 --- a/include/cuco/detail/open_addressing/open_addressing_ref_impl.cuh +++ b/include/cuco/detail/open_addressing/open_addressing_ref_impl.cuh @@ -27,6 +27,7 @@ #include #include +#include #include #include #include @@ -129,6 +130,22 @@ class open_addressing_ref_impl storage_ref_type::bucket_size; ///< Number of elements handled per bucket static constexpr auto thread_scope = Scope; ///< CUDA thread scope + // Robin Hood displacement swaps the in-flight pair into an *occupied* slot, which needs a single + // atomic CAS of the whole slot. That requires a packable slot: <= 8 bytes (atom.cas.b64), or + // padding-free and <= 16 bytes on an sm_90+ build (atom.cas.b128). A non-packable slot (e.g. a + // padded `pair`) would fall back to a split key/value CAS, which cannot move an + // occupied slot -- displacement would livelock. Reject it at compile time rather than hang. + static constexpr bool robin_hood_slot_is_single_cas = sizeof(value_type) <= 8 +#if defined(CUCO_HAS_128BIT_ATOMICS) + or cuco::detail::is_packable() +#endif + ; + static_assert(not cuco::is_robin_hood_probing::value or + robin_hood_slot_is_single_cas, + "Robin Hood probing requires a single-CAS slot: the key+value must fit in 8 bytes, " + "or be packable (padding-free) and <= 16 bytes on an sm_90+ build. A padded slot " + "(e.g. pair) is unsupported -- displacement would livelock."); + /** * @brief Constructs open_addressing_ref_impl. * @@ -378,16 +395,19 @@ class open_addressing_ref_impl { static_assert(cg_size == 1, "Non-CG operation is incompatible with the current probing scheme"); - auto const val = this->heterogeneous_value(value); - auto const key = this->extract_key(val); + auto val = this->heterogeneous_value(value); + auto key = this->extract_key(val); auto probing_iter = probing_scheme_.template make_iterator(key, storage_ref_.extent()); - auto const init_idx = *probing_iter; + auto const init_idx = *probing_iter; + [[maybe_unused]] size_type probe_step = 0; while (true) { auto const bucket_slots = storage_ref_[*probing_iter]; + [[maybe_unused]] bool retry = false; + for (auto& slot_content : bucket_slots) { auto const eq_res = this->predicate_.template operator()( key, this->extract_key(slot_content)); @@ -396,7 +416,13 @@ class open_addressing_ref_impl // If the key is already in the container, return false if (eq_res == detail::equal_result::EQUAL) { return false; } } - if (eq_res == detail::equal_result::AVAILABLE) { + // Robin Hood claims only a true empty here; a tombstone carries an age and is handled as a + // resident by the displacement test below. Skipping it must gate the CAS (once claimed it + // is already consumed), so it is folded into this condition. For non-Robin-Hood the second + // clause is a compile-time `false`, leaving the original `eq_res == AVAILABLE`. + if (eq_res == detail::equal_result::AVAILABLE and + not(cuco::is_robin_hood_probing::value and + this->is_erased(slot_content))) { auto const intra_bucket_index = cuda::std::distance(bucket_slots.begin(), &slot_content); switch (attempt_insert( this->get_slot_ptr(*probing_iter, intra_bucket_index), slot_content, val)) { @@ -407,11 +433,68 @@ class open_addressing_ref_impl return false; } } - case insert_result::CONTINUE: continue; + case insert_result::CONTINUE: { + // Retry on a lost CAS. Plain probing keeps scanning this (now stale) bucket; Robin + // Hood must re-read it instead, so the in-flight pair is re-evaluated against the new + // occupants -- otherwise it could be placed past a slot it should have displaced, + // breaking the invariant (and therefore lookups). + if constexpr (cuco::is_robin_hood_probing::value) { + retry = true; + break; + } else { + continue; + } + } case insert_result::SUCCESS: return true; } + if constexpr (cuco::is_robin_hood_probing::value) { + if (retry) { break; } // leave the scan to re-read the bucket + } + } + + // Robin Hood swap test. A resident "richer" than the in-flight pair (a smaller probe + // distance than our current probe step) is displaced: we swap our pair into its slot, adopt + // the evicted resident, and re-probe forward. A tombstone is treated as a resident too -- + // its age comes from its payload (`robin_hood_age`) -- but picking one up *consumes* it: we + // take the slot and are done, since there is nothing to carry forward. + if constexpr (cuco::is_robin_hood_probing::value) { + if (eq_res == detail::equal_result::UNEQUAL or this->is_erased(slot_content)) { + auto const intra_bucket_index = + cuda::std::distance(bucket_slots.begin(), &slot_content); + auto const evicted_age = this->robin_hood_age( + slot_content, static_cast(*probing_iter + intra_bucket_index)); + if (evicted_age < probe_step) { + if (this->attempt_insert(this->get_slot_ptr(*probing_iter, intra_bucket_index), + slot_content, + val) == insert_result::SUCCESS) { + // Consuming a tombstone reuses its freed slot -- nothing to carry, so we are done. + if (this->is_erased(slot_content)) { return true; } + // Adopt the evicted pair and re-probe THIS bucket -- its bucket distance here is + // `evicted_age`, and it may belong in another slot of the same bucket: an empty + // one, or one holding an even-richer resident it can displace in turn. Re-reading + // the bucket (rather than advancing past it) is the within-bucket linear probe, + // i.e. the combined bucket+slot distance that makes displacement correct for + // bucket_size > 1. The `slot_distance` term cancels in every comparison, so it + // never appears here; it shows up only as this slot-by-slot continuation. + // `bit_cast` keeps the adoption valid for heterogeneous insert types + // (layout-compatible by contract; identity in the common case). + val = cuda::std::bit_cast(slot_content); + key = this->extract_key(val); + probe_step = evicted_age; + } + retry = + true; // re-read this bucket: re-probe with the victim, or re-evaluate a lost CAS + break; + } + } } } + + if constexpr (cuco::is_robin_hood_probing::value) { + if (retry) { continue; } // re-probe (re-read this bucket, or move on after displacement) + ++probe_step; + } + ++probing_iter; if (*probing_iter == init_idx) { return false; } } @@ -432,11 +515,12 @@ class open_addressing_ref_impl __device__ bool insert(cooperative_groups::thread_block_tile group, Value value) noexcept { - auto const val = this->heterogeneous_value(value); - auto const key = this->extract_key(val); + auto val = this->heterogeneous_value(value); + auto key = this->extract_key(val); auto probing_iter = probing_scheme_.template make_iterator(group, key, storage_ref_.extent()); - auto const init_idx = *probing_iter; + auto const init_idx = *probing_iter; + [[maybe_unused]] size_type probe_step = 0; while (true) { auto const bucket_slots = storage_ref_[*probing_iter]; @@ -447,9 +531,18 @@ class open_addressing_ref_impl if (result.state_ == detail::equal_result::UNEQUAL) { switch (this->predicate_.template operator()( key, this->extract_key(bucket_slots[i()]))) { - case detail::equal_result::AVAILABLE: - result = bucket_probing_results{detail::equal_result::AVAILABLE, i()}; + case detail::equal_result::AVAILABLE: { + // Robin Hood: only a true empty is AVAILABLE; a tombstone is a resident handled by + // the displacement scan below, so leave it UNEQUAL here. + bool empty_slot = true; + if constexpr (cuco::is_robin_hood_probing::value) { + empty_slot = not this->is_erased(bucket_slots[i()]); + } + if (empty_slot) { + result = bucket_probing_results{detail::equal_result::AVAILABLE, i()}; + } break; + } case detail::equal_result::EQUAL: { if constexpr (!allows_duplicates) { result = bucket_probing_results{detail::equal_result::EQUAL, i()}; @@ -496,6 +589,62 @@ class open_addressing_ref_impl default: continue; } } else { + if constexpr (cuco::is_robin_hood_probing::value) { + // Robin Hood displacement: no match, no empty slot in this bucket. Displace the first + // resident in probe (lane) order that is richer than the in-flight pair, adopt it, and + // re-probe THIS bucket -- the victim may belong in another slot of it. The within-bucket + // linear probe (combined bucket+slot distance) is identical to the scalar path; the + // `slot_distance` term cancels, so the test is again `resident distance < probe_step`. + cuda::std::int32_t displace_idx = -1; + size_type evicted_age = 0; + cuda::static_for([&] __device__(auto i) { + if (displace_idx < 0) { + // `robin_hood_age` so a tombstone uses its payload-stored age: it is displaced (i.e. + // consumed) exactly when richer than the in-flight pair, like any other resident. + auto const age = this->robin_hood_age(bucket_slots[i()], + static_cast(*probing_iter + i())); + if (age < probe_step) { + displace_idx = i(); + evicted_age = age; + } + } + }); + + auto const group_displaceable = group.ballot(displace_idx >= 0); + if (group_displaceable) { + auto const src_lane = __ffs(group_displaceable) - 1; + auto status = insert_result::CONTINUE; + // Only `src_lane` reads `evicted` meaningfully; other lanes just need a valid value to + // feed the broadcast `shfl` below, so seed it with the empty-slot sentinel. + value_type evicted = this->empty_slot_sentinel(); + if (group.thread_rank() == src_lane) { + evicted = bucket_slots[displace_idx]; + status = + attempt_insert(this->get_slot_ptr(*probing_iter, displace_idx), evicted, val); + } + if (group.shfl(status, src_lane) == insert_result::SUCCESS) { + // Consuming a tombstone reuses its freed slot -- nothing to carry, so we are done. + if (group.shfl(this->is_erased(evicted), src_lane)) { return true; } + // Broadcast the evicted pair and its probe distance from the winning lane, and adopt + // it on every lane (all lanes need the new in-flight pair for the next scan). + auto const new_key = group.shfl(this->extract_key(evicted), src_lane); + auto const new_age = group.shfl(evicted_age, src_lane); + value_type evicted_slot; + if constexpr (has_payload) { + auto const new_payload = group.shfl(this->extract_payload(evicted), src_lane); + evicted_slot = value_type{new_key, new_payload}; + } else { + evicted_slot = new_key; + } + val = cuda::std::bit_cast(evicted_slot); + key = this->extract_key(val); + probe_step = new_age; + } + continue; // success: re-probe this bucket with the victim; lost CAS: re-read it + } + // No displaceable resident: fall through to the shared advance below. + ++probe_step; + } ++probing_iter; if (*probing_iter == init_idx) { return false; } } @@ -528,15 +677,21 @@ class open_addressing_ref_impl "pre-Volta GPUs."); #endif - auto const val = this->heterogeneous_value(value); - auto const key = this->extract_key(val); + auto val = this->heterogeneous_value(value); + auto key = this->extract_key(val); auto probing_iter = probing_scheme_.template make_iterator(key, storage_ref_.extent()); - auto const init_idx = *probing_iter; + auto const init_idx = *probing_iter; + [[maybe_unused]] size_type probe_step = 0; + // Robin Hood may displace the original key before the chain ends; remember the slot it landed + // in so we return an iterator to it (not to a later victim's slot). + [[maybe_unused]] value_type* placed_ptr = nullptr; while (true) { auto const bucket_slots = storage_ref_[*probing_iter]; + [[maybe_unused]] bool retry = false; + for (auto i = 0; i < bucket_size; ++i) { auto const eq_res = this->predicate_.template operator()( key, this->extract_key(bucket_slots[i])); @@ -547,19 +702,71 @@ class open_addressing_ref_impl this->maybe_wait_for_payload(slot_ptr); return {iterator{slot_ptr}, false}; } - if (eq_res == detail::equal_result::AVAILABLE) { + // Robin Hood claims only a true empty here; a tombstone is handled as a resident by the + // displacement test below (see `insert`). + if (eq_res == detail::equal_result::AVAILABLE and + not(cuco::is_robin_hood_probing::value and + this->is_erased(bucket_slots[i]))) { switch (this->attempt_insert_stable(slot_ptr, bucket_slots[i], val)) { case insert_result::SUCCESS: { - this->maybe_wait_for_payload(slot_ptr); - return {iterator{slot_ptr}, true}; + // The in-flight pair is placed in an empty slot, ending any displacement chain. The + // iterator to return is the original key's slot (captured on its first placement). + auto* result_ptr = slot_ptr; + if constexpr (cuco::is_robin_hood_probing::value) { + if (placed_ptr != nullptr) { result_ptr = placed_ptr; } + } + this->maybe_wait_for_payload(result_ptr); + return {iterator{result_ptr}, true}; } case insert_result::DUPLICATE: { this->maybe_wait_for_payload(slot_ptr); return {iterator{slot_ptr}, false}; } - default: continue; + case insert_result::CONTINUE: { + if constexpr (cuco::is_robin_hood_probing::value) { + retry = true; + break; + } else { + continue; + } + } + } + if constexpr (cuco::is_robin_hood_probing::value) { + if (retry) { break; } } } + + // Robin Hood swap test (see `insert` for the full rationale). A tombstone is a resident too + // (age from its payload); picking one up consumes it -- the in-flight pair lands there and + // we are done. + if constexpr (cuco::is_robin_hood_probing::value) { + if (eq_res == detail::equal_result::UNEQUAL or this->is_erased(bucket_slots[i])) { + auto const evicted_age = + this->robin_hood_age(bucket_slots[i], static_cast(*probing_iter + i)); + if (evicted_age < probe_step) { + if (this->attempt_insert(slot_ptr, bucket_slots[i], val) == insert_result::SUCCESS) { + if (this->is_erased(bucket_slots[i])) { + // Consumed a tombstone: the in-flight pair is placed here; return the original + // key's slot (this one if it was never displaced). + auto* result_ptr = (placed_ptr != nullptr) ? placed_ptr : slot_ptr; + this->maybe_wait_for_payload(result_ptr); + return {iterator{result_ptr}, true}; + } + if (placed_ptr == nullptr) { placed_ptr = slot_ptr; } // original key's slot + val = cuda::std::bit_cast(bucket_slots[i]); + key = this->extract_key(val); + probe_step = evicted_age; + } + retry = true; + break; + } + } + } + } + + if constexpr (cuco::is_robin_hood_probing::value) { + if (retry) { continue; } + ++probe_step; } ++probing_iter; if (*probing_iter == init_idx) { return {this->end(), false}; } @@ -594,11 +801,15 @@ class open_addressing_ref_impl "pre-Volta GPUs."); #endif - auto const val = this->heterogeneous_value(value); - auto const key = this->extract_key(val); + auto val = this->heterogeneous_value(value); + auto key = this->extract_key(val); auto probing_iter = probing_scheme_.template make_iterator(group, key, storage_ref_.extent()); - auto const init_idx = *probing_iter; + auto const init_idx = *probing_iter; + [[maybe_unused]] size_type probe_step = 0; + // Robin Hood may displace the original key before the chain ends; remember (broadcast) the slot + // it first landed in so we return an iterator to it. 0 means "not yet placed". + [[maybe_unused]] intptr_t placed_ptr = 0; while (true) { auto const bucket_slots = storage_ref_[*probing_iter]; @@ -609,6 +820,13 @@ class open_addressing_ref_impl if (result.state_ == detail::equal_result::UNEQUAL) { auto res = this->predicate_.template operator()( key, this->extract_key(bucket_slots[i()])); + // Robin Hood: a tombstone is a resident handled by the displacement scan below, not + // AVAILABLE, so leave it UNEQUAL here. + if constexpr (cuco::is_robin_hood_probing::value) { + if (res == detail::equal_result::AVAILABLE and this->is_erased(bucket_slots[i()])) { + res = detail::equal_result::UNEQUAL; + } + } if (res != detail::equal_result::UNEQUAL) { result = bucket_probing_results{res, i()}; } } }); @@ -638,9 +856,15 @@ class open_addressing_ref_impl switch (group.shfl(status, src_lane)) { case insert_result::SUCCESS: { + // The in-flight pair is placed in an empty slot, ending any displacement chain. Return + // the original key's slot (the first placement) if it was displaced earlier. + auto result = res; + if constexpr (cuco::is_robin_hood_probing::value) { + if (placed_ptr != 0) { result = placed_ptr; } + } if (group.thread_rank() == src_lane) { this->maybe_wait_for_payload(slot_ptr); } group.sync(); - return {iterator{reinterpret_cast(res)}, true}; + return {iterator{reinterpret_cast(result)}, true}; } case insert_result::DUPLICATE: { if (group.thread_rank() == src_lane) { this->maybe_wait_for_payload(slot_ptr); } @@ -650,6 +874,64 @@ class open_addressing_ref_impl default: continue; } } else { + if constexpr (cuco::is_robin_hood_probing::value) { + // Robin Hood displacement (see CG `insert` for the full rationale). + cuda::std::int32_t displace_idx = -1; + size_type evicted_age = 0; + cuda::static_for([&] __device__(auto i) { + if (displace_idx < 0) { + // `robin_hood_age` so a tombstone uses its payload-stored age: it is displaced (i.e. + // consumed) exactly when richer than the in-flight pair, like any other resident. + auto const age = this->robin_hood_age(bucket_slots[i()], + static_cast(*probing_iter + i())); + if (age < probe_step) { + displace_idx = i(); + evicted_age = age; + } + } + }); + + auto const group_displaceable = group.ballot(displace_idx >= 0); + if (group_displaceable) { + auto const src_lane = __ffs(group_displaceable) - 1; + auto status = insert_result::CONTINUE; + value_type evicted = this->empty_slot_sentinel(); + intptr_t displaced = 0; + if (group.thread_rank() == src_lane) { + auto* dptr = this->get_slot_ptr(*probing_iter, displace_idx); + evicted = bucket_slots[displace_idx]; + status = attempt_insert(dptr, evicted, val); + displaced = reinterpret_cast(dptr); + } + if (group.shfl(status, src_lane) == insert_result::SUCCESS) { + if (placed_ptr == 0) { placed_ptr = group.shfl(displaced, src_lane); } + // Consumed a tombstone: the in-flight pair is placed in its slot; we are done. Return + // the original key's slot (`placed_ptr`, which is this slot if it was never + // displaced). + if (group.shfl(this->is_erased(evicted), src_lane)) { + if (group.thread_rank() == src_lane) { + this->maybe_wait_for_payload(reinterpret_cast(displaced)); + } + group.sync(); + return {iterator{reinterpret_cast(placed_ptr)}, true}; + } + auto const new_key = group.shfl(this->extract_key(evicted), src_lane); + auto const new_age = group.shfl(evicted_age, src_lane); + value_type evicted_slot; + if constexpr (has_payload) { + auto const new_payload = group.shfl(this->extract_payload(evicted), src_lane); + evicted_slot = value_type{new_key, new_payload}; + } else { + evicted_slot = new_key; + } + val = cuda::std::bit_cast(evicted_slot); + key = this->extract_key(val); + probe_step = new_age; + } + continue; + } + ++probe_step; + } ++probing_iter; if (*probing_iter == init_idx) { return {this->end(), false}; } } @@ -686,9 +968,15 @@ class open_addressing_ref_impl // Key exists, return true if successfully deleted if (eq_res == detail::equal_result::EQUAL) { auto const intra_bucket_index = cuda::std::distance(bucket_slots.begin(), &slot_content); - switch (attempt_insert_stable(this->get_slot_ptr(*probing_iter, intra_bucket_index), - slot_content, - this->erased_slot_sentinel())) { + // Robin Hood records the erased key's age in the tombstone payload (1a); other schemes + // use the plain erased sentinel. + value_type erased = this->erased_slot_sentinel(); + if constexpr (cuco::is_robin_hood_probing::value) { + erased = this->robin_hood_erased_sentinel( + slot_content, static_cast(*probing_iter + intra_bucket_index)); + } + switch (attempt_insert_stable( + this->get_slot_ptr(*probing_iter, intra_bucket_index), slot_content, erased)) { case insert_result::SUCCESS: return true; case insert_result::DUPLICATE: return false; default: continue; @@ -737,12 +1025,20 @@ class open_addressing_ref_impl auto const group_contains_equal = group.ballot(state == detail::equal_result::EQUAL); if (group_contains_equal) { auto const src_lane = __ffs(group_contains_equal) - 1; - auto const status = - (group.thread_rank() == src_lane) - ? attempt_insert_stable(this->get_slot_ptr(*probing_iter, intra_bucket_index), - bucket_slots[intra_bucket_index], - this->erased_slot_sentinel()) - : insert_result::CONTINUE; + auto status = insert_result::CONTINUE; + if (group.thread_rank() == src_lane) { + // Robin Hood records the erased key's age in the tombstone payload (1a); other schemes + // use the plain erased sentinel. + value_type erased = this->erased_slot_sentinel(); + if constexpr (cuco::is_robin_hood_probing::value) { + erased = this->robin_hood_erased_sentinel( + bucket_slots[intra_bucket_index], + static_cast(*probing_iter + intra_bucket_index)); + } + status = attempt_insert_stable(this->get_slot_ptr(*probing_iter, intra_bucket_index), + bucket_slots[intra_bucket_index], + erased); + } switch (group.shfl(status, src_lane)) { case insert_result::SUCCESS: return true; @@ -777,7 +1073,8 @@ class open_addressing_ref_impl static_assert(cg_size == 1, "Non-CG operation is incompatible with the current probing scheme"); auto probing_iter = probing_scheme_.template make_iterator(key, storage_ref_.extent()); - auto const init_idx = *probing_iter; + auto const init_idx = *probing_iter; + [[maybe_unused]] size_type probe_step = 0; while (true) { // TODO atomic_ref::load if insert operator is present @@ -791,6 +1088,13 @@ class open_addressing_ref_impl case detail::equal_result::EQUAL: return true; } } + // Robin Hood: a resident richer than us proves the key is absent. + if constexpr (cuco::is_robin_hood_probing::value) { + if (this->robin_hood_proves_absent(bucket_slots, *probing_iter, probe_step)) { + return false; + } + ++probe_step; + } ++probing_iter; if (*probing_iter == init_idx) { return false; } } @@ -816,7 +1120,8 @@ class open_addressing_ref_impl { auto probing_iter = probing_scheme_.template make_iterator(group, key, storage_ref_.extent()); - auto const init_idx = *probing_iter; + auto const init_idx = *probing_iter; + [[maybe_unused]] size_type probe_step = 0; while (true) { auto const bucket_slots = storage_ref_[*probing_iter]; @@ -834,6 +1139,13 @@ class open_addressing_ref_impl if (group.any(state == detail::equal_result::EQUAL)) { return true; } if (group.any(state == detail::equal_result::EMPTY)) { return false; } + // Robin Hood: a resident richer than us (in any lane's bucket) proves the key is absent. + if constexpr (cuco::is_robin_hood_probing::value) { + if (group.any(this->robin_hood_proves_absent(bucket_slots, *probing_iter, probe_step))) { + return false; + } + ++probe_step; + } ++probing_iter; if (*probing_iter == init_idx) { return false; } } @@ -857,7 +1169,8 @@ class open_addressing_ref_impl static_assert(cg_size == 1, "Non-CG operation is incompatible with the current probing scheme"); auto probing_iter = probing_scheme_.template make_iterator(key, storage_ref_.extent()); - auto const init_idx = *probing_iter; + auto const init_idx = *probing_iter; + [[maybe_unused]] size_type probe_step = 0; while (true) { // TODO atomic_ref::load if insert operator is present @@ -875,6 +1188,13 @@ class open_addressing_ref_impl default: continue; } } + // Robin Hood: a resident richer than us proves the key is absent. + if constexpr (cuco::is_robin_hood_probing::value) { + if (this->robin_hood_proves_absent(bucket_slots, *probing_iter, probe_step)) { + return this->end(); + } + ++probe_step; + } ++probing_iter; if (*probing_iter == init_idx) { return this->end(); } } @@ -900,7 +1220,8 @@ class open_addressing_ref_impl { auto probing_iter = probing_scheme_.template make_iterator(group, key, storage_ref_.extent()); - auto const init_idx = *probing_iter; + auto const init_idx = *probing_iter; + [[maybe_unused]] size_type probe_step = 0; while (true) { auto const bucket_slots = storage_ref_[*probing_iter]; @@ -930,6 +1251,13 @@ class open_addressing_ref_impl // Find an empty slot, meaning that the probe key isn't present in the container if (group.any(state == detail::equal_result::EMPTY)) { return this->end(); } + // Robin Hood: a resident richer than us (in any lane's bucket) proves the key is absent. + if constexpr (cuco::is_robin_hood_probing::value) { + if (group.any(this->robin_hood_proves_absent(bucket_slots, *probing_iter, probe_step))) { + return this->end(); + } + ++probe_step; + } ++probing_iter; if (*probing_iter == init_idx) { return this->end(); } } @@ -1592,6 +1920,104 @@ class open_addressing_ref_impl return storage_ref_.data() + probing_idx + intra_bucket_idx; } + /** + * @brief Determines whether the Robin Hood invariant proves the probe key absent at the current + * probe step. + * + * @note Only meaningful for Robin Hood probing. The key is proven absent when the bucket holds a + * resident that is "richer" than the probe key — i.e. whose own probe distance is smaller than + * the probe key's probe distance at the current step (`probe_step`). Such a resident would have + * been displaced on insertion if the probe key lived here, so the probe key cannot be present. + * + * @note Behavior is only well-defined when every slot in the bucket is occupied (the callers + * reach this check only after ruling out empty and matching slots), since probe distance is + * meaningless for an empty slot. + * + * @tparam BucketSlots Bucket slot array type + * + * @param bucket_slots The slots of the bucket currently being probed + * @param bucket_base The slot index of the first slot in the bucket + * @param probe_step The probe key's own probe distance at the current step + * + * @return True if some resident in the bucket is richer than the probe key + */ + template + [[nodiscard]] __device__ bool robin_hood_proves_absent(BucketSlots const& bucket_slots, + size_type bucket_base, + size_type probe_step) const noexcept + { + bool richer = false; + cuda::static_for([&](auto i) { + auto const resident_age = + this->robin_hood_age(bucket_slots[i()], static_cast(bucket_base + i())); + if (resident_age < probe_step) { richer = true; } + }); + return richer; + } + + /** + * @brief Whether `slot` holds a tombstone (erased marker). + * + * @note Returns false when erase is disabled (the erased and empty sentinels coincide, so no slot + * is a tombstone) -- this keeps the test correct even for empty slots. + * + * @param slot The slot to test + * + * @return True if `slot` is an erased tombstone + */ + [[nodiscard]] __device__ bool is_erased(value_type const& slot) const noexcept + { + return not cuco::detail::bitwise_compare(this->erased_key_sentinel(), + this->empty_key_sentinel()) and + cuco::detail::bitwise_compare(this->extract_key(slot), this->erased_key_sentinel()); + } + + /** + * @brief Robin Hood probe distance ("age") of an occupied slot. + * + * A live key's age is its `probe_distance`. A Robin Hood tombstone keeps the age of the key it + * replaced in its payload (the original key is gone and cannot be rehashed; see `erase`), so it + * is read back here -- a tombstone then participates in every Robin Hood comparison exactly like + * the resident it stood in for. + * + * @param slot The (occupied) slot + * @param slot_index The slot's index + * + * @return The slot's probe distance + */ + [[nodiscard]] __device__ size_type robin_hood_age(value_type const& slot, + size_type slot_index) const noexcept + { + if constexpr (has_payload) { + if (this->is_erased(slot)) { return static_cast(this->extract_payload(slot)); } + } + return probing_scheme_.template probe_distance( + this->extract_key(slot), slot_index, storage_ref_.extent()); + } + + /** + * @brief The Robin Hood tombstone for erasing the live key currently in `slot` at `slot_index`. + * + * The erased key's age is stashed in the payload (1a) so the tombstone keeps its place in the + * Robin Hood ordering (the original key is gone and cannot be rehashed). Other probing schemes + * use the plain `erased_slot_sentinel()` and never call this. + * + * @param slot The slot's current (live) contents + * @param slot_index The slot's index + * + * @return The value to CAS into the slot to erase it + */ + [[nodiscard]] __device__ value_type + robin_hood_erased_sentinel(value_type const& slot, size_type slot_index) const noexcept + { + static_assert(has_payload, + "Robin Hood erase requires a mapped payload to store the tombstone age"); + auto const age = probing_scheme_.template probe_distance( + this->extract_key(slot), slot_index, storage_ref_.extent()); + return cuco::pair{this->erased_key_sentinel(), + static_castempty_value_sentinel())>(age)}; + } + /** * @brief Extracts the key from a given value type. * diff --git a/include/cuco/detail/probing_scheme/probing_scheme_impl.inl b/include/cuco/detail/probing_scheme/probing_scheme_impl.inl index 8bed41778..2dbb9bdcb 100644 --- a/include/cuco/detail/probing_scheme/probing_scheme_impl.inl +++ b/include/cuco/detail/probing_scheme/probing_scheme_impl.inl @@ -208,4 +208,83 @@ double_hashing::hash_function() const noexcept return {hash1_, hash2_}; } +namespace detail { + +/** + * @brief Robin Hood inverse primitive for the linear probing sequence. + * + * @note Recovers a resident's probe distance ("age") from the slot it occupies: how many probing + * steps the resident sits from its own home bucket. For the linear sequence this is a single + * subtract — `(slot_base - resident_home) / stride mod num_buckets`. This is the linear-only + * overload; a `double_hashing` variant would add its own overload here (a modular inverse of the + * resident's per-key step, or a stored age), which is the single place that change lands. + * + * @tparam BucketSize Size of the bucket + * @tparam CGSize Size of CUDA Cooperative Groups + * @tparam Hash Unary callable type + * @tparam ProbeKey Type of probing key + * @tparam Extent Type of extent + * + * @param scheme The underlying linear probing scheme (supplies the hash function) + * @param resident_key The key currently residing in the slot + * @param slot_index The slot index at which `resident_key` resides + * @param upper_bound Upper bound of the iteration + * @return The resident's probe distance, in probing steps + */ +template +[[nodiscard]] __host__ __device__ constexpr typename Extent::value_type probe_distance( + linear_probing const& scheme, + ProbeKey resident_key, + typename Extent::value_type slot_index, + Extent upper_bound) noexcept +{ + using size_type = typename Extent::value_type; + size_type constexpr stride = CGSize * BucketSize; + auto const bound = static_cast(upper_bound); + auto const hash = scheme.hash_function(); + + // Home bucket base of the resident, using the same alignment as `make_iterator`. + size_type const resident_home = + cuco::detail::sanitize_hash(hash(resident_key)) % (bound / stride) * stride; + + // Bucket-strided base of the slot the resident currently occupies. The per-lane `thread_rank` + // offset (which is < stride) is stripped by the floor division so that the distance is measured + // in whole probing steps, consistent with the forward sequence. + size_type const slot_base = (slot_index / stride) * stride; + + // (slot_base - resident_home) mod capacity, expressed in probing steps. + return static_cast((slot_base + bound - resident_home) % bound) / stride; +} + +} // namespace detail + +template +__host__ __device__ constexpr robin_hood_probing::robin_hood_probing( + Underlying const& probing) + : Underlying{probing} +{ +} + +template +template +__host__ __device__ constexpr auto robin_hood_probing::rebind_hash_function( + NewHash const& hash) const noexcept +{ + auto const inner = static_cast(*this).rebind_hash_function(hash); + return robin_hood_probing>{inner}; +} + +template +template +__host__ __device__ constexpr typename Extent::value_type +robin_hood_probing::probe_distance(ProbeKey resident_key, + typename Extent::value_type slot_index, + Extent upper_bound) const noexcept +{ + // Delegate to the per-underlying-scheme inverse overload. Only `linear_probing` provides one + // today; wrapping a scheme without a matching overload is a compile-time error. + return cuco::detail::probe_distance( + static_cast(*this), resident_key, slot_index, upper_bound); +} + } // namespace cuco diff --git a/include/cuco/detail/static_map/static_map_ref.inl b/include/cuco/detail/static_map/static_map_ref.inl index 7f8ff043b..b71a40f9f 100644 --- a/include/cuco/detail/static_map/static_map_ref.inl +++ b/include/cuco/detail/static_map/static_map_ref.inl @@ -495,6 +495,7 @@ class operator_impl< using key_type = typename base_type::key_type; using value_type = typename base_type::value_type; using mapped_type = T; + using size_type = typename base_type::size_type; static constexpr auto cg_size = base_type::cg_size; static constexpr auto bucket_size = base_type::bucket_size; @@ -515,17 +516,20 @@ class operator_impl< ref_type& ref_ = static_cast(*this); - auto const val = ref_.impl_.heterogeneous_value(value); - auto const key = ref_.impl_.extract_key(val); + auto val = ref_.impl_.heterogeneous_value(value); + auto key = ref_.impl_.extract_key(val); auto const probing_scheme = ref_.impl_.probing_scheme(); auto storage_ref = ref_.impl_.storage_ref(); auto probing_iter = probing_scheme.template make_iterator(key, storage_ref.extent()); - auto const init_idx = *probing_iter; + auto const init_idx = *probing_iter; + [[maybe_unused]] size_type probe_step = 0; while (true) { auto const bucket_slots = storage_ref[*probing_iter]; + [[maybe_unused]] bool retry = false; + for (auto& slot_content : bucket_slots) { auto const eq_res = ref_.impl_.predicate_.template operator()(key, slot_content.first); @@ -534,14 +538,69 @@ class operator_impl< // If the key is already in the container, update the payload and return if (eq_res == detail::equal_result::EQUAL) { - cuda::atomic_ref payload_ref(slot_ptr->second); - payload_ref.store(val.second, cuda::memory_order_relaxed); - return; + if constexpr (cuco::is_robin_hood_probing< + typename ref_type::probing_scheme_type>::value) { + // Robin Hood may relocate this key; assign via a full-slot CAS that keeps the key and + // fails if it moved. On failure re-probe and retry -- the loop re-finds the key. + auto desired = slot_content; + desired.second = val.second; + if (ref_.impl_.attempt_insert(slot_ptr, slot_content, desired) == + detail::insert_result::SUCCESS) { + return; + } + retry = true; + break; + } else { + cuda::atomic_ref payload_ref(slot_ptr->second); + payload_ref.store(val.second, cuda::memory_order_relaxed); + return; + } } if (eq_res == detail::equal_result::AVAILABLE) { - if (attempt_insert_or_assign(slot_ptr, val)) { return; } + if constexpr (cuco::is_robin_hood_probing< + typename ref_type::probing_scheme_type>::value) { + // Claim only a true empty; a tombstone is handled as a resident by the displacement + // test below. On a lost CAS (rival insert) or a duplicate, retry -- the loop re-finds + // the key and assigns it via the EQUAL full-slot CAS. + if (not ref_.impl_.is_erased(slot_content)) { + if (ref_.impl_.attempt_insert(slot_ptr, slot_content, val) == + detail::insert_result::SUCCESS) { + return; + } + retry = true; + break; + } + } else { + if (attempt_insert_or_assign(slot_ptr, val)) { return; } + } + } + + // Robin Hood swap test (see `open_addressing_ref_impl::insert` for the rationale). A + // tombstone is a resident too (age from its payload); picking one up consumes it -- the + // pair lands there and we are done. + if constexpr (cuco::is_robin_hood_probing::value) { + if (eq_res == detail::equal_result::UNEQUAL or ref_.impl_.is_erased(slot_content)) { + auto const evicted_age = ref_.impl_.robin_hood_age( + slot_content, static_cast(*probing_iter + intra_bucket_index)); + if (evicted_age < probe_step) { + if (ref_.impl_.attempt_insert(slot_ptr, slot_content, val) == + detail::insert_result::SUCCESS) { + if (ref_.impl_.is_erased(slot_content)) { return; } // consumed tombstone -- done + val = cuda::std::bit_cast(slot_content); + key = ref_.impl_.extract_key(val); + probe_step = evicted_age; + } + retry = true; + break; + } + } } } + + if constexpr (cuco::is_robin_hood_probing::value) { + if (retry) { continue; } + ++probe_step; + } ++probing_iter; if (*probing_iter == init_idx) { return; } } @@ -565,13 +624,14 @@ class operator_impl< { ref_type& ref_ = static_cast(*this); - auto const val = ref_.impl_.heterogeneous_value(value); - auto const key = ref_.impl_.extract_key(val); + auto val = ref_.impl_.heterogeneous_value(value); + auto key = ref_.impl_.extract_key(val); auto const probing_scheme = ref_.impl_.probing_scheme(); auto storage_ref = ref_.impl_.storage_ref(); auto probing_iter = probing_scheme.template make_iterator(group, key, storage_ref.extent()); - auto const init_idx = *probing_iter; + auto const init_idx = *probing_iter; + [[maybe_unused]] size_type probe_step = 0; while (true) { auto const bucket_slots = storage_ref[*probing_iter]; @@ -582,6 +642,15 @@ class operator_impl< if (result.state_ == detail::equal_result::UNEQUAL) { auto res = ref_.impl_.predicate_.template operator()( key, bucket_slots[i()].first); + // Robin Hood: a tombstone is a resident handled by the displacement scan, not + // AVAILABLE. + if constexpr (cuco::is_robin_hood_probing< + typename ref_type::probing_scheme_type>::value) { + if (res == detail::equal_result::AVAILABLE and + ref_.impl_.is_erased(bucket_slots[i()])) { + res = detail::equal_result::UNEQUAL; + } + } if (res != detail::equal_result::UNEQUAL) { result = detail::bucket_probing_results{res, i()}; } @@ -595,23 +664,89 @@ class operator_impl< auto const group_contains_equal = group.ballot(state == detail::equal_result::EQUAL); if (group_contains_equal) { auto const src_lane = __ffs(group_contains_equal) - 1; - if (group.thread_rank() == src_lane) { - cuda::atomic_ref payload_ref(slot_ptr->second); - payload_ref.store(val.second, cuda::memory_order_relaxed); + if constexpr (cuco::is_robin_hood_probing::value) { + // src_lane assigns via a full-slot CAS (key fixed); a relocation or rival update fails + // it, so the group re-probes and retries -- the loop re-finds the key. + auto const success = [&, target_idx = intra_bucket_index]() { + if (group.thread_rank() != src_lane) { return false; } + auto desired = bucket_slots[target_idx]; + desired.second = val.second; + return ref_.impl_.attempt_insert(slot_ptr, bucket_slots[target_idx], desired) == + detail::insert_result::SUCCESS; + }(); + if (group.shfl(success, src_lane)) { return; } + continue; + } else { + if (group.thread_rank() == src_lane) { + cuda::atomic_ref payload_ref(slot_ptr->second); + payload_ref.store(val.second, cuda::memory_order_relaxed); + } + group.sync(); + return; } - group.sync(); - return; } auto const group_contains_available = group.ballot(state == detail::equal_result::AVAILABLE); if (group_contains_available) { auto const src_lane = __ffs(group_contains_available) - 1; - auto const status = - (group.thread_rank() == src_lane) ? attempt_insert_or_assign(slot_ptr, val) : false; + if constexpr (cuco::is_robin_hood_probing::value) { + // Insert the new pair with a full-slot CAS; on a lost CAS or duplicate, re-probe and + // retry (the loop re-finds the key and assigns via the EQUAL full-slot CAS). + auto const success = [&, target_idx = intra_bucket_index]() { + if (group.thread_rank() != src_lane) { return false; } + return ref_.impl_.attempt_insert(slot_ptr, bucket_slots[target_idx], val) == + detail::insert_result::SUCCESS; + }(); + if (group.shfl(success, src_lane)) { return; } + continue; + } else { + auto const status = + (group.thread_rank() == src_lane) ? attempt_insert_or_assign(slot_ptr, val) : false; - // Exit if inserted or assigned - if (group.shfl(status, src_lane)) { return; } + // Exit if inserted or assigned + if (group.shfl(status, src_lane)) { return; } + } } else { + if constexpr (cuco::is_robin_hood_probing::value) { + // Robin Hood displacement (see CG `open_addressing_ref_impl::insert`). + cuda::std::int32_t displace_idx = -1; + size_type evicted_age = 0; + cuda::static_for([&] __device__(auto i) { + if (displace_idx < 0) { + // `robin_hood_age` so a tombstone uses its payload-stored age (like any resident). + auto const age = ref_.impl_.robin_hood_age( + bucket_slots[i()], static_cast(*probing_iter + i())); + if (age < probe_step) { + displace_idx = i(); + evicted_age = age; + } + } + }); + + auto const group_displaceable = group.ballot(displace_idx >= 0); + if (group_displaceable) { + auto const src_lane = __ffs(group_displaceable) - 1; + auto status = detail::insert_result::CONTINUE; + value_type evicted = ref_.impl_.empty_slot_sentinel(); + if (group.thread_rank() == src_lane) { + evicted = bucket_slots[displace_idx]; + status = ref_.impl_.attempt_insert( + ref_.impl_.get_slot_ptr(*probing_iter, displace_idx), evicted, val); + } + if (group.shfl(status, src_lane) == detail::insert_result::SUCCESS) { + // Consuming a tombstone reuses its slot -- nothing to carry, so we are done. + if (group.shfl(ref_.impl_.is_erased(evicted), src_lane)) { return; } + auto const new_key = group.shfl(ref_.impl_.extract_key(evicted), src_lane); + auto const new_payload = group.shfl(ref_.impl_.extract_payload(evicted), src_lane); + auto const new_age = group.shfl(evicted_age, src_lane); + val = cuda::std::bit_cast(value_type{new_key, new_payload}); + key = ref_.impl_.extract_key(val); + probe_step = new_age; + } + continue; + } + ++probe_step; + } ++probing_iter; if (*probing_iter == init_idx) { return; } } @@ -670,6 +805,7 @@ class operator_impl< using ref_type = static_map_ref; using key_type = typename base_type::key_type; using value_type = typename base_type::value_type; + using size_type = typename base_type::size_type; static constexpr auto cg_size = base_type::cg_size; static constexpr auto bucket_size = base_type::bucket_size; @@ -886,14 +1022,15 @@ class operator_impl< { ref_type& ref_ = static_cast(*this); - auto const val = ref_.impl_.heterogeneous_value(value); - auto const key = ref_.impl_.extract_key(val); + auto val = ref_.impl_.heterogeneous_value(value); + auto key = ref_.impl_.extract_key(val); auto const probing_scheme = ref_.impl_.probing_scheme(); auto storage_ref = ref_.impl_.storage_ref(); auto probing_iter = probing_scheme.template make_iterator(key, storage_ref.extent()); - auto const init_idx = *probing_iter; - auto const empty_value = ref_.empty_value_sentinel(); + auto const init_idx = *probing_iter; + auto const empty_value = ref_.empty_value_sentinel(); + [[maybe_unused]] size_type probe_step = 0; // wait for payload only when init != sentinel and insert strategy is not `packed_cas` auto constexpr wait_for_payload = (not UseDirectApply) and (sizeof(value_type) > 8); @@ -901,6 +1038,8 @@ class operator_impl< while (true) { auto const bucket_slots = storage_ref[*probing_iter]; + [[maybe_unused]] bool retry = false; + for (auto& slot_content : bucket_slots) { auto const eq_res = ref_.impl_.predicate_.template operator()(key, slot_content.first); @@ -909,18 +1048,46 @@ class operator_impl< // If the key is already in the container, update the payload and return if (eq_res == detail::equal_result::EQUAL) { - // wait for payload only when performing insert operation - if constexpr (wait_for_payload) { - ref_.impl_.wait_for_payload(slot_ptr->second, empty_value); + if constexpr (cuco::is_robin_hood_probing< + typename ref_type::probing_scheme_type>::value) { + // Lift `op` to the whole slot, keeping the key, and CAS it. A relocation (or a rival + // update) makes the CAS fail; re-probe and retry -- the loop re-finds the key. + auto desired = slot_content; + // `desired` is a local copy, so this `op` is just local arithmetic -- the + // `atomic_ref`'s atomicity does nothing here and is used only because `Op`'s signature + // requires one. The real atomic is the full-slot CAS below. + op(cuda::atomic_ref{desired.second}, val.second); + if (ref_.impl_.attempt_insert(slot_ptr, slot_content, desired) == + detail::insert_result::SUCCESS) { + return false; + } + retry = true; + break; + } else { + // wait for payload only when performing insert operation + if constexpr (wait_for_payload) { + ref_.impl_.wait_for_payload(slot_ptr->second, empty_value); + } + op(cuda::atomic_ref{slot_ptr->second}, val.second); + return false; } - op(cuda::atomic_ref{slot_ptr->second}, val.second); - return false; } - if (eq_res == detail::equal_result::AVAILABLE) { + // Robin Hood claims only a true empty here; a tombstone is handled as a resident by the + // displacement test below. + if (eq_res == detail::equal_result::AVAILABLE and + not(cuco::is_robin_hood_probing::value and + ref_.impl_.is_erased(slot_content))) { switch (ref_.template attempt_insert_or_apply( slot_ptr, slot_content, val, op)) { case insert_result::SUCCESS: return true; case insert_result::DUPLICATE: { + if constexpr (cuco::is_robin_hood_probing< + typename ref_type::probing_scheme_type>::value) { + // Key is present now; re-probe so it is found EQUAL and updated via the full-slot + // CAS. + retry = true; + break; + } // wait for payload only when performing insert operation if constexpr (wait_for_payload) { ref_.impl_.wait_for_payload(slot_ptr->second, empty_value); @@ -928,10 +1095,49 @@ class operator_impl< op(cuda::atomic_ref{slot_ptr->second}, val.second); return false; } - default: continue; + case insert_result::CONTINUE: { + if constexpr (cuco::is_robin_hood_probing< + typename ref_type::probing_scheme_type>::value) { + retry = true; + break; + } else { + continue; + } + } + } + if constexpr (cuco::is_robin_hood_probing< + typename ref_type::probing_scheme_type>::value) { + if (retry) { break; } + } + } + + // Robin Hood swap test (see `open_addressing_ref_impl::insert` for the rationale). A + // tombstone is a resident too (age from its payload); picking one up consumes it -- the + // in-flight pair lands there, completing the insert. + if constexpr (cuco::is_robin_hood_probing::value) { + if (eq_res == detail::equal_result::UNEQUAL or ref_.impl_.is_erased(slot_content)) { + auto const evicted_age = ref_.impl_.robin_hood_age( + slot_content, static_cast(*probing_iter + intra_bucket_index)); + if (evicted_age < probe_step) { + if (ref_.impl_.attempt_insert(slot_ptr, slot_content, val) == + detail::insert_result::SUCCESS) { + // Consuming a tombstone places the in-flight pair in its slot -- insert complete. + if (ref_.impl_.is_erased(slot_content)) { return true; } + val = cuda::std::bit_cast(slot_content); + key = ref_.impl_.extract_key(val); + probe_step = evicted_age; + } + retry = true; + break; + } } } } + + if constexpr (cuco::is_robin_hood_probing::value) { + if (retry) { continue; } + ++probe_step; + } ++probing_iter; if (*probing_iter == init_idx) { return false; } } @@ -964,14 +1170,15 @@ class operator_impl< { ref_type& ref_ = static_cast(*this); - auto const val = ref_.impl_.heterogeneous_value(value); - auto const key = ref_.impl_.extract_key(val); + auto val = ref_.impl_.heterogeneous_value(value); + auto key = ref_.impl_.extract_key(val); auto const probing_scheme = ref_.impl_.probing_scheme(); auto storage_ref = ref_.impl_.storage_ref(); auto probing_iter = probing_scheme.template make_iterator(group, key, storage_ref.extent()); - auto const init_idx = *probing_iter; - auto const empty_value = ref_.empty_value_sentinel(); + auto const init_idx = *probing_iter; + auto const empty_value = ref_.empty_value_sentinel(); + [[maybe_unused]] size_type probe_step = 0; // wait for payload only when init != sentinel and insert strategy is not `packed_cas` auto constexpr wait_for_payload = (not UseDirectApply) and (sizeof(value_type) > 8); @@ -985,6 +1192,15 @@ class operator_impl< if (result.state_ == detail::equal_result::UNEQUAL) { auto res = ref_.impl_.predicate_.template operator()( key, bucket_slots[i()].first); + // Robin Hood: a tombstone is a resident handled by the displacement scan, not + // AVAILABLE. + if constexpr (cuco::is_robin_hood_probing< + typename ref_type::probing_scheme_type>::value) { + if (res == detail::equal_result::AVAILABLE and + ref_.impl_.is_erased(bucket_slots[i()])) { + res = detail::equal_result::UNEQUAL; + } + } if (res != detail::equal_result::UNEQUAL) { result = detail::bucket_probing_results{res, i()}; } @@ -998,13 +1214,30 @@ class operator_impl< auto const group_contains_equal = group.ballot(state == detail::equal_result::EQUAL); if (group_contains_equal) { auto const src_lane = __ffs(group_contains_equal) - 1; - if (group.thread_rank() == src_lane) { - if constexpr (wait_for_payload) { - ref_.impl_.wait_for_payload(slot_ptr->second, empty_value); + if constexpr (cuco::is_robin_hood_probing::value) { + // src_lane lifts `op` to the slot (key fixed) and CASes it; a relocation or rival update + // fails the CAS, so the group re-probes and retries -- the loop re-finds the key. + auto const success = [&, target_idx = intra_bucket_index]() { + if (group.thread_rank() != src_lane) { return false; } + auto desired = bucket_slots[target_idx]; + // `desired` is a local copy, so this `op` is just local arithmetic -- the + // `atomic_ref`'s atomicity does nothing here and is used only because `Op`'s signature + // requires one. The real atomic is the full-slot CAS below. + op(cuda::atomic_ref{desired.second}, val.second); + return ref_.impl_.attempt_insert(slot_ptr, bucket_slots[target_idx], desired) == + detail::insert_result::SUCCESS; + }(); + if (group.shfl(success, src_lane)) { return false; } + continue; + } else { + if (group.thread_rank() == src_lane) { + if constexpr (wait_for_payload) { + ref_.impl_.wait_for_payload(slot_ptr->second, empty_value); + } + op(cuda::atomic_ref{slot_ptr->second}, val.second); } - op(cuda::atomic_ref{slot_ptr->second}, val.second); + return false; } - return false; } auto const group_contains_available = group.ballot(state == detail::equal_result::AVAILABLE); @@ -1019,6 +1252,10 @@ class operator_impl< switch (group.shfl(status, src_lane)) { case insert_result::SUCCESS: return true; case insert_result::DUPLICATE: { + if constexpr (cuco::is_robin_hood_probing< + typename ref_type::probing_scheme_type>::value) { + continue; // key present now: re-probe, find it EQUAL, apply via the full-slot CAS + } if (group.thread_rank() == src_lane) { if constexpr (wait_for_payload) { ref_.impl_.wait_for_payload(slot_ptr->second, empty_value); @@ -1030,6 +1267,46 @@ class operator_impl< default: continue; } } else { + if constexpr (cuco::is_robin_hood_probing::value) { + // Robin Hood displacement (see CG `open_addressing_ref_impl::insert`). + cuda::std::int32_t displace_idx = -1; + size_type evicted_age = 0; + cuda::static_for([&] __device__(auto i) { + if (displace_idx < 0) { + // `robin_hood_age` so a tombstone uses its payload-stored age (like any resident). + auto const age = ref_.impl_.robin_hood_age( + bucket_slots[i()], static_cast(*probing_iter + i())); + if (age < probe_step) { + displace_idx = i(); + evicted_age = age; + } + } + }); + + auto const group_displaceable = group.ballot(displace_idx >= 0); + if (group_displaceable) { + auto const src_lane = __ffs(group_displaceable) - 1; + auto status = detail::insert_result::CONTINUE; + value_type evicted = ref_.impl_.empty_slot_sentinel(); + if (group.thread_rank() == src_lane) { + evicted = bucket_slots[displace_idx]; + status = ref_.impl_.attempt_insert( + ref_.impl_.get_slot_ptr(*probing_iter, displace_idx), evicted, val); + } + if (group.shfl(status, src_lane) == detail::insert_result::SUCCESS) { + // Consuming a tombstone places the in-flight pair in its slot -- insert complete. + if (group.shfl(ref_.impl_.is_erased(evicted), src_lane)) { return true; } + auto const new_key = group.shfl(ref_.impl_.extract_key(evicted), src_lane); + auto const new_payload = group.shfl(ref_.impl_.extract_payload(evicted), src_lane); + auto const new_age = group.shfl(evicted_age, src_lane); + val = cuda::std::bit_cast(value_type{new_key, new_payload}); + key = ref_.impl_.extract_key(val); + probe_step = new_age; + } + continue; + } + ++probe_step; + } ++probing_iter; if (*probing_iter == init_idx) { return false; } } diff --git a/include/cuco/probing_scheme.cuh b/include/cuco/probing_scheme.cuh index c809794dc..3bf8b98c2 100644 --- a/include/cuco/probing_scheme.cuh +++ b/include/cuco/probing_scheme.cuh @@ -209,6 +209,87 @@ class double_hashing : private detail::probing_scheme_base { Hash2 hash2_; }; +/** + * @brief Public Robin Hood probing scheme class. + * + * @note Robin Hood probing wraps an underlying probe sequence (e.g. `cuco::linear_probing`) and + * pairs it with the Robin Hood invariant: on insert, an in-flight key displaces any resident that + * sits closer to its own home than the in-flight key is to its home, and the displaced resident is + * then re-inserted from that point onward. This keeps probe lengths tightly distributed, which is + * especially valuable on GPUs where a tile's tail latency is set by its longest probe. + * + * @note This class is a thin decorator over `Underlying`. It forwards the forward probe sequence + * (`make_iterator`, `hash_function`) unchanged and contributes the `cuco::is_robin_hood_probing` + * trait that selects the displacement (insert) and early-termination (find) control flow in the + * open-addressing ref implementation. The invariant's one extra requirement — the inverse + * primitive `probe_distance`, which recovers a resident's probe distance ("age") from the slot it + * occupies — is delegated to `cuco::detail::probe_distance`, which is overloaded per underlying + * scheme. Only `cuco::linear_probing` provides that overload today; a `cuco::double_hashing` + * variant would compose simply by adding a matching `cuco::detail::probe_distance` overload, with + * no change to this class or the ref implementation. + * + * @tparam Underlying The wrapped probe-sequence scheme (e.g. `cuco::linear_probing`) + */ +template +class robin_hood_probing : private Underlying { + public: + using typename Underlying::hasher; ///< Hash function type (from the underlying scheme) + using Underlying::cg_size; ///< Cooperative group size (from the underlying scheme) + using Underlying::hash_function; ///< Forwarded: gets the function(s) used to hash keys + using Underlying::make_iterator; ///< Forwarded: the (unchanged) forward probe sequence + + /** + * @brief Constructs a Robin Hood probing scheme wrapping the given underlying scheme. + * + * @param probing The underlying probe-sequence scheme to wrap + */ + __host__ __device__ constexpr robin_hood_probing(Underlying const& probing = {}); + + /** + * @brief Makes a copy of the current probing method with the given hasher. + * + * @note Forwards to the underlying scheme's `rebind_hash_function` and re-wraps the result so the + * returned scheme is again a `robin_hood_probing`. + * + * @tparam NewHash New hasher type + * + * @param hash Hasher + * + * @return Copy of the current probing method + */ + template + [[nodiscard]] __host__ __device__ constexpr auto rebind_hash_function( + NewHash const& hash) const noexcept; + + /** + * @brief Computes the probe distance ("age") of a resident key. + * + * @note This is the inverse of the probe sequence: given a resident key and the slot index at + * which it currently lives, it returns how many probing steps that resident is from its own home + * bucket. The Robin Hood insert/find logic compares this against the in-flight key's own probe + * distance to decide displacement (insert) or early termination (find). + * + * @note Delegates to the `cuco::detail::probe_distance` overload for `Underlying`. Instantiating + * this for an `Underlying` without such an overload (e.g. `cuco::double_hashing` today) is a + * compile-time error — that is the single seam where a new underlying scheme would supply its own + * inverse. + * + * @tparam BucketSize Size of the bucket + * @tparam ProbeKey Type of probing key + * @tparam Extent Type of extent + * + * @param resident_key The key currently residing in the slot + * @param slot_index The slot index at which `resident_key` resides + * @param upper_bound Upper bound of the iteration + * @return The resident's probe distance, in probing steps + */ + template + [[nodiscard]] __host__ __device__ constexpr typename Extent::value_type probe_distance( + ProbeKey resident_key, + typename Extent::value_type slot_index, + Extent upper_bound) const noexcept; +}; + /** * @brief Trait indicating whether the given probing scheme is of `double_hashing` type or not * @@ -227,6 +308,22 @@ struct is_double_hashing : cuda::std::false_type {}; template struct is_double_hashing> : cuda::std::true_type {}; +/** + * @brief Trait indicating whether the given probing scheme is of `robin_hood_probing` type or not + * + * @tparam T Input probing scheme type + */ +template +struct is_robin_hood_probing : cuda::std::false_type {}; + +/** + * @brief Trait indicating whether the given probing scheme is of `robin_hood_probing` type or not + * + * @tparam Underlying The wrapped probe-sequence scheme + */ +template +struct is_robin_hood_probing> : cuda::std::true_type {}; + } // namespace cuco #include diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index 51d4f42c1..07465b86e 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -89,6 +89,7 @@ ConfigureTest(STATIC_MAP_TEST static_map/insert_or_assign_test.cu static_map/insert_or_apply_test.cu static_map/key_sentinel_test.cu + static_map/probing_scheme_invariants_test.cu static_map/shared_memory_test.cu static_map/stream_test.cu static_map/rehash_test.cu diff --git a/tests/static_map/contains_test.cu b/tests/static_map/contains_test.cu index 0b3604528..9c1312482 100644 --- a/tests/static_map/contains_test.cu +++ b/tests/static_map/contains_test.cu @@ -29,6 +29,7 @@ #include #include +#include using size_type = int32_t; @@ -79,6 +80,12 @@ void test_unique_sequence(Map& map, size_type num_keys) map.insert(pairs_begin, pairs_begin + num_keys); + // Robin Hood-specific: the populated table must satisfy the per-bucket layout invariant. + if constexpr (cuco::is_robin_hood_probing< + typename std::decay_t::probing_scheme_type>::value) { + cuco::test::check_robin_hood_invariant(map); + } + SECTION("All inserted keys should be contained.") { REQUIRE(map.count(keys_begin, keys_begin + num_keys) == num_keys); @@ -131,12 +138,19 @@ TEMPLATE_TEST_CASE_SIG( (int32_t, int32_t, cuco::test::probe_sequence::linear_probing, 1), (int32_t, int32_t, cuco::test::probe_sequence::linear_probing, 2), (int64_t, int64_t, cuco::test::probe_sequence::linear_probing, 1), - (int64_t, int64_t, cuco::test::probe_sequence::linear_probing, 2) + (int64_t, int64_t, cuco::test::probe_sequence::linear_probing, 2), + // Robin Hood mirrors the linear-probing rows. Only single-CAS (<= 8-byte) slots are + // unconditional; wider-slot RH displacement needs a packed atom.cas.b128 (gated below). + (int32_t, int32_t, cuco::test::probe_sequence::robin_hood, 1), + (int32_t, int32_t, cuco::test::probe_sequence::robin_hood, 2) #if defined(CUCO_HAS_128BIT_ATOMICS) , (__int128_t, __int128_t, cuco::test::probe_sequence::double_hashing, 2), (__int128_t, int64_t, cuco::test::probe_sequence::double_hashing, 1), - (int32_t, __int128_t, cuco::test::probe_sequence::linear_probing, 2) + (int32_t, __int128_t, cuco::test::probe_sequence::linear_probing, 2), + // Wider-slot Robin Hood rows: the packed displacement CAS needs atom.cas.b128. + (int64_t, int64_t, cuco::test::probe_sequence::robin_hood, 1), + (int64_t, int64_t, cuco::test::probe_sequence::robin_hood, 2) #endif ) { @@ -145,9 +159,12 @@ TEMPLATE_TEST_CASE_SIG( // XXX: testing static extent is intended, DO NOT CHANGE using extent_type = cuco::extent; using probe = std::conditional_t< - Probe == cuco::test::probe_sequence::linear_probing, - cuco::linear_probing>, - cuco::double_hashing, cuco::murmurhash3_32>>; + Probe == cuco::test::probe_sequence::double_hashing, + cuco::double_hashing, cuco::murmurhash3_32>, + std::conditional_t< + Probe == cuco::test::probe_sequence::robin_hood, + cuco::robin_hood_probing>>, + cuco::linear_probing>>>; auto map = cuco::static_map #include +#include #include #include #include #include +#include using size_type = int32_t; @@ -102,21 +104,34 @@ TEMPLATE_TEST_CASE_SIG( (int64_t, int32_t, cuco::test::probe_sequence::linear_probing, 1), (int64_t, int64_t, cuco::test::probe_sequence::linear_probing, 1), (int64_t, int32_t, cuco::test::probe_sequence::linear_probing, 2), - (int64_t, int64_t, cuco::test::probe_sequence::linear_probing, 2) + (int64_t, int64_t, cuco::test::probe_sequence::linear_probing, 2), + // Robin Hood mirrors the linear-probing rows. Only single-CAS (<= 8-byte) slots are + // unconditional; wider-slot RH displacement needs a packed atom.cas.b128 (gated below). + (int32_t, int32_t, cuco::test::probe_sequence::robin_hood, 1), + (int32_t, int32_t, cuco::test::probe_sequence::robin_hood, 2) #if defined(CUCO_HAS_128BIT_ATOMICS) , (__int128_t, __int128_t, cuco::test::probe_sequence::double_hashing, 2), (__int128_t, int64_t, cuco::test::probe_sequence::double_hashing, 1), - (int32_t, __int128_t, cuco::test::probe_sequence::linear_probing, 2) + (int32_t, __int128_t, cuco::test::probe_sequence::linear_probing, 2), + // Wider-slot Robin Hood rows: RH displacement needs a single packed atom.cas.b128, so the slot + // must be packable (padding-free). Only int64/int64 qualifies -- int32/int64 and int64/int32 are + // padded (not is_packable), fall back to a split (back-to-back) CAS, and RH displacement would + // livelock on an occupied slot. + (int64_t, int64_t, cuco::test::probe_sequence::robin_hood, 1), + (int64_t, int64_t, cuco::test::probe_sequence::robin_hood, 2) #endif ) { constexpr size_type num_keys{1'000'000}; using probe = std::conditional_t< - Probe == cuco::test::probe_sequence::linear_probing, - cuco::linear_probing>, - cuco::double_hashing, cuco::murmurhash3_32>>; + Probe == cuco::test::probe_sequence::double_hashing, + cuco::double_hashing, cuco::murmurhash3_32>, + std::conditional_t< + Probe == cuco::test::probe_sequence::robin_hood, + cuco::robin_hood_probing>>, + cuco::linear_probing>>>; auto map = cuco::static_map>>; + using map_type = cuco::static_map, + cuda::thread_scope_device, + cuda::std::equal_to, + probe, + cuco::cuda_allocator, + cuco::storage<2>>; + + constexpr size_type capacity = static_cast(num_keys / 0.85); + auto map = map_type{ + capacity, cuco::empty_key{-1}, cuco::empty_value{-1}, cuco::erased_key{-2}}; + + // Keys 1..num_keys (avoid the -1 / -2 sentinels). + auto keys_begin = cuda::counting_iterator(1); + auto pairs_begin = cuda::make_transform_iterator( + keys_begin, cuda::proclaim_return_type>([] __device__(Key k) { + return cuco::pair{k, static_cast(k)}; + })); + + map.insert(pairs_begin, pairs_begin + num_keys); + REQUIRE(map.size() == num_keys); + + constexpr size_type num_erased = num_keys / 2; + map.erase(keys_begin, keys_begin + num_erased); + REQUIRE(map.size() == num_keys - num_erased); + + thrust::device_vector d_contained(num_keys); + map.contains(keys_begin, keys_begin + num_keys, d_contained.begin()); + REQUIRE(cuco::test::none_of( + d_contained.begin(), d_contained.begin() + num_erased, cuda::std::identity{})); + REQUIRE( + cuco::test::all_of(d_contained.begin() + num_erased, d_contained.end(), cuda::std::identity{})); +} + +// Robin Hood erase reuse + structural invariant at high load: insert, erase half (-> tombstones), +// re-insert (-> consume tombstones), checking the per-bucket Robin Hood layout invariant after each +// step (tombstones counted as residents, age read from their payload). A wrongly consumed tombstone +// (the age inversion) corrupts the layout and trips the invariant; a lost/duplicate key trips +// size/contains. +TEMPLATE_TEST_CASE_SIG("static_map robin_hood erase reuse + invariant", + "", + ((typename Key, typename Value, int CGSize), Key, Value, CGSize), + (int32_t, int32_t, 1), + (int32_t, int32_t, 2) +#if defined(CUCO_HAS_128BIT_ATOMICS) + , + (int64_t, int64_t, 1), + (int64_t, int64_t, 2) +#endif +) +{ + constexpr size_type num_keys = 10'000; + + using probe = cuco::robin_hood_probing>>; + using map_type = cuco::static_map, + cuda::thread_scope_device, + cuda::std::equal_to, + probe, + cuco::cuda_allocator, + cuco::storage<2>>; + + constexpr size_type capacity = static_cast(num_keys / 0.85); + auto map = map_type{ + capacity, cuco::empty_key{-1}, cuco::empty_value{-1}, cuco::erased_key{-2}}; + + auto keys_begin = cuda::counting_iterator(1); + auto pairs_begin = cuda::make_transform_iterator( + keys_begin, cuda::proclaim_return_type>([] __device__(Key k) { + return cuco::pair{k, static_cast(k)}; + })); + + map.insert(pairs_begin, pairs_begin + num_keys); + REQUIRE(map.size() == num_keys); + cuco::test::check_robin_hood_invariant(map); + + constexpr size_type num_erased = num_keys / 2; + map.erase(keys_begin, keys_begin + num_erased); + REQUIRE(map.size() == num_keys - num_erased); + cuco::test::check_robin_hood_invariant(map); // tombstones-as-residents layout still valid + + map.insert(pairs_begin, pairs_begin + num_erased); // consume tombstones / fill empties + REQUIRE(map.size() == num_keys); + cuco::test::check_robin_hood_invariant(map); // layout valid after reuse + + thrust::device_vector d_contained(num_keys); + map.contains(keys_begin, keys_begin + num_keys, d_contained.begin()); + REQUIRE(cuco::test::all_of(d_contained.begin(), d_contained.end(), cuda::std::identity{})); +} + +namespace { +enum class reinsert_via { insert_or_assign, insert_or_apply, insert_and_find }; + +// Robin Hood erase + reuse through a specific re-insert API: insert, check invariant, erase the +// first half (-> tombstones), check invariant, then re-insert that half via `how` (-> consume +// tombstones) and check invariant + that every key is present. Exercises the tombstone path of the +// chosen insert variant. +template +void test_rh_erase_reuse(size_type num_keys, reinsert_via how) +{ + using probe = cuco::robin_hood_probing>>; + using map_type = cuco::static_map, + cuda::thread_scope_device, + cuda::std::equal_to, + probe, + cuco::cuda_allocator, + cuco::storage<2>>; + + auto const capacity = static_cast(num_keys / 0.85); + auto map = map_type{ + capacity, cuco::empty_key{-1}, cuco::empty_value{-1}, cuco::erased_key{-2}}; + + auto keys_begin = cuda::counting_iterator(1); + auto pairs_begin = cuda::make_transform_iterator( + keys_begin, cuda::proclaim_return_type>([] __device__(Key k) { + return cuco::pair{k, static_cast(k)}; + })); + + map.insert(pairs_begin, pairs_begin + num_keys); + REQUIRE(map.size() == num_keys); + cuco::test::check_robin_hood_invariant(map); + + auto const num_erased = num_keys / 2; + map.erase(keys_begin, keys_begin + num_erased); + REQUIRE(map.size() == num_keys - num_erased); + cuco::test::check_robin_hood_invariant(map); + + // re-insert the erased half through the variant under test (consuming tombstones) + switch (how) { + case reinsert_via::insert_or_assign: + map.insert_or_assign(pairs_begin, pairs_begin + num_erased); + break; + case reinsert_via::insert_or_apply: + map.insert_or_apply(pairs_begin, pairs_begin + num_erased, cuco::reduce::plus{}); + break; + case reinsert_via::insert_and_find: { + thrust::device_vector found(num_erased); + thrust::device_vector inserted(num_erased); + map.insert_and_find(pairs_begin, pairs_begin + num_erased, found.begin(), inserted.begin()); + break; + } + } + + REQUIRE(map.size() == num_keys); + cuco::test::check_robin_hood_invariant(map); + + thrust::device_vector d_contained(num_keys); + map.contains(keys_begin, keys_begin + num_keys, d_contained.begin()); + REQUIRE(cuco::test::all_of(d_contained.begin(), d_contained.end(), cuda::std::identity{})); +} +} // namespace + +TEMPLATE_TEST_CASE_SIG("static_map robin_hood erase + insert_or_assign reuse", + "", + ((typename Key, typename Value, int CGSize), Key, Value, CGSize), + (int32_t, int32_t, 1), + (int32_t, int32_t, 2) +#if defined(CUCO_HAS_128BIT_ATOMICS) + , + (int64_t, int64_t, 1), + (int64_t, int64_t, 2) +#endif +) +{ + test_rh_erase_reuse(10'000, reinsert_via::insert_or_assign); +} + +TEMPLATE_TEST_CASE_SIG("static_map robin_hood erase + insert_or_apply reuse", + "", + ((typename Key, typename Value, int CGSize), Key, Value, CGSize), + (int32_t, int32_t, 1), + (int32_t, int32_t, 2) +#if defined(CUCO_HAS_128BIT_ATOMICS) + , + (int64_t, int64_t, 1), + (int64_t, int64_t, 2) +#endif +) +{ + test_rh_erase_reuse(10'000, reinsert_via::insert_or_apply); +} + +TEMPLATE_TEST_CASE_SIG("static_map robin_hood erase + insert_and_find reuse", + "", + ((typename Key, typename Value, int CGSize), Key, Value, CGSize), + (int32_t, int32_t, 1), + (int32_t, int32_t, 2) +#if defined(CUCO_HAS_128BIT_ATOMICS) + , + (int64_t, int64_t, 1), + (int64_t, int64_t, 2) +#endif +) +{ + test_rh_erase_reuse(10'000, reinsert_via::insert_and_find); +} diff --git a/tests/static_map/find_test.cu b/tests/static_map/find_test.cu index 5d9376309..2dda5e89f 100644 --- a/tests/static_map/find_test.cu +++ b/tests/static_map/find_test.cu @@ -30,6 +30,7 @@ #include #include +#include using size_type = int32_t; @@ -73,6 +74,12 @@ void test_unique_sequence(Map& map, size_type num_keys) map.insert(pairs_begin, pairs_begin + num_keys); + // Robin Hood-specific: the populated table must satisfy the per-bucket Robin Hood layout + // invariant (a no-op for linear/double hashing). + if constexpr (cuco::is_robin_hood_probing::value) { + cuco::test::check_robin_hood_invariant(map); + } + SECTION("All inserted keys should be correctly recovered during find") { map.find(keys_begin, keys_begin + num_keys, d_results.begin()); @@ -163,12 +170,29 @@ TEMPLATE_TEST_CASE_SIG( (int64_t, int32_t, cuco::test::probe_sequence::linear_probing, 1), (int64_t, int64_t, cuco::test::probe_sequence::linear_probing, 1), (int64_t, int32_t, cuco::test::probe_sequence::linear_probing, 2), - (int64_t, int64_t, cuco::test::probe_sequence::linear_probing, 2) + (int64_t, int64_t, cuco::test::probe_sequence::linear_probing, 2), + // Robin Hood mirrors the linear-probing rows. Only single-CAS (<= 8-byte) slots are + // unconditional: RH displacement swaps into an *occupied* slot, which requires a packed CAS of + // the whole slot. The wider-slot RH rows therefore live under CUCO_HAS_128BIT_ATOMICS below + // (LP/DH avoid this because they only ever CAS into empty slots via back-to-back CAS). + (int8_t, int8_t, cuco::test::probe_sequence::robin_hood, 1), + (int8_t, int8_t, cuco::test::probe_sequence::robin_hood, 2), + (int8_t, int16_t, cuco::test::probe_sequence::robin_hood, 2), + (int16_t, int16_t, cuco::test::probe_sequence::robin_hood, 1), + (int16_t, int16_t, cuco::test::probe_sequence::robin_hood, 2), + (int32_t, int32_t, cuco::test::probe_sequence::robin_hood, 1), + (int32_t, int32_t, cuco::test::probe_sequence::robin_hood, 2) #if defined(CUCO_HAS_128BIT_ATOMICS) , (__int128_t, __int128_t, cuco::test::probe_sequence::double_hashing, 2), (__int128_t, int64_t, cuco::test::probe_sequence::double_hashing, 1), - (int32_t, __int128_t, cuco::test::probe_sequence::linear_probing, 2) + (int32_t, __int128_t, cuco::test::probe_sequence::linear_probing, 2), + // Wider-slot Robin Hood rows: RH displacement needs a single packed atom.cas.b128, so the slot + // must be packable (padding-free). Only int64/int64 qualifies -- int32/int64 and int64/int32 are + // padded (not is_packable), fall back to a split (back-to-back) CAS, and RH displacement would + // livelock on an occupied slot. + (int64_t, int64_t, cuco::test::probe_sequence::robin_hood, 1), + (int64_t, int64_t, cuco::test::probe_sequence::robin_hood, 2) #endif ) { @@ -177,9 +201,12 @@ TEMPLATE_TEST_CASE_SIG( // XXX: testing static extent is intended, DO NOT CHANGE using extent_type = cuco::extent; using probe = std::conditional_t< - Probe == cuco::test::probe_sequence::linear_probing, - cuco::linear_probing>, - cuco::double_hashing, cuco::murmurhash3_32>>; + Probe == cuco::test::probe_sequence::double_hashing, + cuco::double_hashing, cuco::murmurhash3_32>, + std::conditional_t< + Probe == cuco::test::probe_sequence::robin_hood, + cuco::robin_hood_probing>>, + cuco::linear_probing>>>; constexpr size_type gold_capacity = [&]() { if constexpr (cuco::is_double_hashing::value) { diff --git a/tests/static_map/insert_and_find_test.cu b/tests/static_map/insert_and_find_test.cu index 17665eb2b..be3c55a45 100644 --- a/tests/static_map/insert_and_find_test.cu +++ b/tests/static_map/insert_and_find_test.cu @@ -26,6 +26,7 @@ #include #include +#include using size_type = std::size_t; @@ -52,12 +53,22 @@ TEMPLATE_TEST_CASE_SIG( (int64_t, int32_t, cuco::test::probe_sequence::linear_probing, 1), (int64_t, int64_t, cuco::test::probe_sequence::linear_probing, 1), (int64_t, int32_t, cuco::test::probe_sequence::linear_probing, 2), - (int64_t, int64_t, cuco::test::probe_sequence::linear_probing, 2) + (int64_t, int64_t, cuco::test::probe_sequence::linear_probing, 2), + // Robin Hood mirrors the linear-probing rows. Only single-CAS (<= 8-byte) slots are + // unconditional; wider-slot RH displacement needs a packed atom.cas.b128 (gated below). + (int32_t, int32_t, cuco::test::probe_sequence::robin_hood, 1), + (int32_t, int32_t, cuco::test::probe_sequence::robin_hood, 2) #if defined(CUCO_HAS_128BIT_ATOMICS) , (__int128_t, __int128_t, cuco::test::probe_sequence::double_hashing, 2), (__int128_t, int64_t, cuco::test::probe_sequence::double_hashing, 1), - (int32_t, __int128_t, cuco::test::probe_sequence::linear_probing, 2) + (int32_t, __int128_t, cuco::test::probe_sequence::linear_probing, 2), + // Wider-slot Robin Hood rows: RH displacement needs a single packed atom.cas.b128, so the slot + // must be packable (padding-free). Only int64/int64 qualifies -- int32/int64 and int64/int32 are + // padded (not is_packable), fall back to a split (back-to-back) CAS, and RH displacement would + // livelock on an occupied slot. + (int64_t, int64_t, cuco::test::probe_sequence::robin_hood, 1), + (int64_t, int64_t, cuco::test::probe_sequence::robin_hood, 2) #endif ) { @@ -66,9 +77,12 @@ TEMPLATE_TEST_CASE_SIG( #endif { using probe = std::conditional_t< - Probe == cuco::test::probe_sequence::linear_probing, - cuco::linear_probing>, - cuco::double_hashing, cuco::murmurhash3_32>>; + Probe == cuco::test::probe_sequence::double_hashing, + cuco::double_hashing, cuco::murmurhash3_32>, + std::conditional_t< + Probe == cuco::test::probe_sequence::robin_hood, + cuco::robin_hood_probing>>, + cuco::linear_probing>>>; constexpr size_type num_keys{400}; @@ -100,6 +114,12 @@ TEMPLATE_TEST_CASE_SIG( map.insert_and_find(pairs_begin, pairs_begin + num_keys, found2.begin(), inserted.begin()); REQUIRE(cuco::test::none_of(inserted.begin(), inserted.end(), cuda::std::identity{})); + // Robin Hood-specific: the populated table must satisfy the per-bucket layout invariant. + if constexpr (cuco::is_robin_hood_probing< + typename std::decay_t::probing_scheme_type>::value) { + cuco::test::check_robin_hood_invariant(map); + } + // both found1 and found2 should be same, as keys will be referring to same slot REQUIRE( cuco::test::equal(found1.begin(), found1.end(), found2.begin(), cuda::std::equal_to{})); diff --git a/tests/static_map/insert_or_apply_test.cu b/tests/static_map/insert_or_apply_test.cu index 0a4d07ea3..692e7af31 100644 --- a/tests/static_map/insert_or_apply_test.cu +++ b/tests/static_map/insert_or_apply_test.cu @@ -27,6 +27,7 @@ #include #include +#include #include @@ -54,6 +55,12 @@ void test_insert_or_apply(Map& map, size_type num_keys, size_type num_unique_key map.insert_or_apply(pairs_begin, pairs_begin + num_keys, plus_op); } + // Robin Hood-specific: the populated table must satisfy the per-bucket layout invariant. + if constexpr (cuco::is_robin_hood_probing< + typename std::decay_t::probing_scheme_type>::value) { + cuco::test::check_robin_hood_invariant(map); + } + REQUIRE(map.size() == num_unique_keys); thrust::device_vector d_keys(num_unique_keys); @@ -161,12 +168,23 @@ TEMPLATE_TEST_CASE_SIG( (int64_t, int32_t, cuco::test::probe_sequence::linear_probing, 1), (int64_t, int64_t, cuco::test::probe_sequence::linear_probing, 1), (int64_t, int32_t, cuco::test::probe_sequence::linear_probing, 2), - (int64_t, int64_t, cuco::test::probe_sequence::linear_probing, 2) + (int64_t, int64_t, cuco::test::probe_sequence::linear_probing, 2), + // Robin Hood mirrors the linear-probing rows. Only single-CAS (<= 8-byte) slots are + // unconditional; wider-slot RH displacement needs a packed atom.cas.b128 (gated below). + (int32_t, int32_t, cuco::test::probe_sequence::robin_hood, 1), + (int32_t, int32_t, cuco::test::probe_sequence::robin_hood, 2) #if defined(CUCO_HAS_128BIT_ATOMICS) , (__int128_t, __int128_t, cuco::test::probe_sequence::double_hashing, 2), (__int128_t, int64_t, cuco::test::probe_sequence::double_hashing, 1), - (int32_t, __int128_t, cuco::test::probe_sequence::linear_probing, 2) + (int32_t, __int128_t, cuco::test::probe_sequence::linear_probing, 2), + // Wider-slot Robin Hood rows: the packed displacement CAS needs atom.cas.b128. + (int32_t, int64_t, cuco::test::probe_sequence::robin_hood, 1), + (int32_t, int64_t, cuco::test::probe_sequence::robin_hood, 2), + (int64_t, int32_t, cuco::test::probe_sequence::robin_hood, 1), + (int64_t, int32_t, cuco::test::probe_sequence::robin_hood, 2), + (int64_t, int64_t, cuco::test::probe_sequence::robin_hood, 1), + (int64_t, int64_t, cuco::test::probe_sequence::robin_hood, 2) #endif ) { @@ -174,9 +192,12 @@ TEMPLATE_TEST_CASE_SIG( constexpr size_type num_unique_keys{100}; using probe = std::conditional_t< - Probe == cuco::test::probe_sequence::linear_probing, - cuco::linear_probing>, - cuco::double_hashing, cuco::murmurhash3_32>>; + Probe == cuco::test::probe_sequence::double_hashing, + cuco::double_hashing, cuco::murmurhash3_32>, + std::conditional_t< + Probe == cuco::test::probe_sequence::robin_hood, + cuco::robin_hood_probing>>, + cuco::linear_probing>>>; using map_type = cuco::static_map(map, num_keys, num_unique_keys, static_cast(0)); } } + +// Dedicated Robin Hood coverage for insert_or_apply: the probe-enum test above is disabled +// upstream, so this is the only active exercise of the displacing RH insert_or_apply (reduction + +// lock-free displacement together). It runs at a high load factor (~0.95 on the unique keys) so +// displacement actually fires, and reuses `test_insert_or_apply`, whose tail asserts the structural +// RH invariant. +TEMPLATE_TEST_CASE_SIG("static_map robin_hood insert_or_apply (high load)", + "", + ((typename Key, typename Value, int CGSize), Key, Value, CGSize), + (int32_t, int32_t, 1), + (int32_t, int32_t, 2) +#if defined(CUCO_HAS_128BIT_ATOMICS) + , + (int64_t, int64_t, 1), + (int64_t, int64_t, 2) +#endif +) +{ + constexpr size_type num_unique_keys = 5'000; + constexpr size_type num_keys = 10'000; // each unique key inserted twice + + using probe = cuco::robin_hood_probing>>; + using map_type = cuco::static_map, + cuda::thread_scope_device, + cuda::std::equal_to, + probe, + cuco::cuda_allocator, + cuco::storage<2>>; + + // Size the table for ~0.95 load on the unique keys, so it is nearly full and the displacing + // insert path (and the structural invariant check inside the helper) is genuinely stressed. + constexpr size_type capacity = static_cast(num_unique_keys / 0.95); + + SECTION("sentinel equals init; has_init = true") + { + auto map = map_type{capacity, cuco::empty_key{-1}, cuco::empty_value{0}}; + test_insert_or_apply(map, num_keys, num_unique_keys, static_cast(0)); + } + SECTION("sentinel equals init; has_init = false") + { + auto map = map_type{capacity, cuco::empty_key{-1}, cuco::empty_value{0}}; + test_insert_or_apply(map, num_keys, num_unique_keys, static_cast(0)); + } +} diff --git a/tests/static_map/insert_or_assign_test.cu b/tests/static_map/insert_or_assign_test.cu index 2cccd62ad..843417072 100644 --- a/tests/static_map/insert_or_assign_test.cu +++ b/tests/static_map/insert_or_assign_test.cu @@ -21,12 +21,15 @@ #include #include +#include #include #include #include +#include #include #include +#include using size_type = std::size_t; @@ -53,6 +56,12 @@ void test_insert_or_assign(Map& map, size_type num_keys) map.insert_or_assign(query_pairs_begin, query_pairs_begin + num_keys); + // Robin Hood-specific: the populated table must satisfy the per-bucket layout invariant. + if constexpr (cuco::is_robin_hood_probing< + typename std::decay_t::probing_scheme_type>::value) { + cuco::test::check_robin_hood_invariant(map); + } + auto const updated_size = map.size(); // all keys are present in the map so the size shouldn't change REQUIRE(updated_size == initial_size); @@ -93,21 +102,34 @@ TEMPLATE_TEST_CASE_SIG( (int64_t, int32_t, cuco::test::probe_sequence::linear_probing, 1), (int64_t, int64_t, cuco::test::probe_sequence::linear_probing, 1), (int64_t, int32_t, cuco::test::probe_sequence::linear_probing, 2), - (int64_t, int64_t, cuco::test::probe_sequence::linear_probing, 2) + (int64_t, int64_t, cuco::test::probe_sequence::linear_probing, 2), + // Robin Hood mirrors the linear-probing rows. Only single-CAS (<= 8-byte) slots are + // unconditional; wider-slot RH displacement needs a packed atom.cas.b128 (gated below). + (int32_t, int32_t, cuco::test::probe_sequence::robin_hood, 1), + (int32_t, int32_t, cuco::test::probe_sequence::robin_hood, 2) #if defined(CUCO_HAS_128BIT_ATOMICS) , (__int128_t, __int128_t, cuco::test::probe_sequence::double_hashing, 2), (__int128_t, int64_t, cuco::test::probe_sequence::double_hashing, 1), - (int32_t, __int128_t, cuco::test::probe_sequence::linear_probing, 2) + (int32_t, __int128_t, cuco::test::probe_sequence::linear_probing, 2), + // Wider-slot Robin Hood rows: RH displacement needs a single packed atom.cas.b128, so the slot + // must be packable (padding-free). Only int64/int64 qualifies -- int32/int64 and int64/int32 are + // padded (not is_packable), fall back to a split (back-to-back) CAS, and RH displacement would + // livelock on an occupied slot. + (int64_t, int64_t, cuco::test::probe_sequence::robin_hood, 1), + (int64_t, int64_t, cuco::test::probe_sequence::robin_hood, 2) #endif ) { constexpr size_type num_keys{400}; using probe = std::conditional_t< - Probe == cuco::test::probe_sequence::linear_probing, - cuco::linear_probing>, - cuco::double_hashing, cuco::murmurhash3_32>>; + Probe == cuco::test::probe_sequence::double_hashing, + cuco::double_hashing, cuco::murmurhash3_32>, + std::conditional_t< + Probe == cuco::test::probe_sequence::robin_hood, + cuco::robin_hood_probing>>, + cuco::linear_probing>>>; auto map = cuco::static_mapkey value swap from a misplaced assign). +TEMPLATE_TEST_CASE_SIG("static_map robin_hood insert_or_assign (high load)", + "", + ((typename Key, typename Value, int CGSize), Key, Value, CGSize), + (int32_t, int32_t, 1), + (int32_t, int32_t, 2) +#if defined(CUCO_HAS_128BIT_ATOMICS) + , + (int64_t, int64_t, 1), + (int64_t, int64_t, 2) +#endif +) +{ + constexpr size_type num_unique_keys = 5'000; + constexpr size_type num_keys = 10'000; // each unique key assigned twice (same value) + + using probe = cuco::robin_hood_probing>>; + using map_type = cuco::static_map, + cuda::thread_scope_device, + cuda::std::equal_to, + probe, + cuco::cuda_allocator, + cuco::storage<2>>; + + // ~0.95 load on the unique keys, so the table is nearly full and displacement is stressed. + constexpr size_type capacity = static_cast(num_unique_keys / 0.95); + auto map = map_type{capacity, cuco::empty_key{-1}, cuco::empty_value{-1}}; + + // Every occurrence of key k carries the same value (k * 2). + auto pairs_begin = cuda::make_transform_iterator( + cuda::counting_iterator(0), + cuda::proclaim_return_type>([num_unique_keys] __device__(auto i) { + auto const k = static_cast(i % num_unique_keys); + return cuco::pair{k, static_cast(k * 2)}; + })); + + map.insert_or_assign(pairs_begin, pairs_begin + num_keys); + + REQUIRE(map.size() == num_unique_keys); + cuco::test::check_robin_hood_invariant(map); + + thrust::device_vector d_keys(num_unique_keys); + thrust::device_vector d_values(num_unique_keys); + map.retrieve_all(d_keys.begin(), d_values.begin()); + + auto const zip = thrust::make_zip_iterator(cuda::std::tuple{d_keys.begin(), d_values.begin()}); + REQUIRE(cuco::test::all_of( + zip, zip + num_unique_keys, cuda::proclaim_return_type([] __device__(auto const& p) { + return cuda::std::get<1>(p) == static_cast(cuda::std::get<0>(p) * 2); + }))); +} diff --git a/tests/static_map/probing_scheme_invariants_test.cu b/tests/static_map/probing_scheme_invariants_test.cu new file mode 100644 index 000000000..277ec1921 --- /dev/null +++ b/tests/static_map/probing_scheme_invariants_test.cu @@ -0,0 +1,112 @@ +/* + * Copyright (c) 2026, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include + +#include + +#include +#include + +#include + +#include +#include + +namespace { + +// Identity hash. `cuco::detail::sanitize_hash` is `to_positive`, so for a non-negative key `k` this +// puts its home bucket at slot `k % capacity` — letting the test hand-craft an exact Robin Hood +// layout instead of reverse-engineering a real hash function. +template +struct identity_hash { + __host__ __device__ constexpr Key operator()(Key key) const noexcept { return key; } +}; + +} // namespace + +// Validates the Robin Hood read-path early-exit (`find` / `contains`) against a hand-seeded layout, +// before the displacing `insert` exists. A scalar (cg_size == 1) Robin Hood map over a linear probe +// sequence is seeded directly through the storage pointer with a known-valid Robin Hood cluster, +// then queried with keys chosen to exercise each lookup-termination rule. +TEST_CASE("static_map robin_hood read-path early-exit", "") +{ + using Key = std::int32_t; + using Value = std::int32_t; + using size_type = std::int32_t; + auto constexpr capacity = size_type{16}; + + using extent_type = cuco::extent; + using probe_type = cuco::robin_hood_probing>>; + using map_type = cuco::static_map, + probe_type, + cuco::cuda_allocator, + cuco::storage<1>>; + using value_type = typename map_type::value_type; // cuco::pair + + auto map = map_type{extent_type{}, cuco::empty_key{-1}, cuco::empty_value{-1}}; + // Ensure the constructor's slot initialization (empty sentinels) has completed before we seed. + REQUIRE(cudaDeviceSynchronize() == cudaSuccess); + + // Hand-seed a valid Robin Hood layout (identity hash => home(k) = k % capacity): + // slot 0: key 0 home 0, distance 0 + // slot 1: key 16 home 0, distance 1 (displaced past key 0) + // slot 2: key 2 home 2, distance 0 + // slots 3..15: empty + // This is exactly what inserting {0, 16, 2} would produce, so it satisfies the Robin Hood + // invariant. We only write the three occupied (and contiguous) slots; the rest stay empty. + // The ref is used purely to reach the storage pointer; the bulk queries below build their own. + auto const ref = map.ref(cuco::op::find); + std::vector const seed{value_type{0, 0}, value_type{16, 16}, value_type{2, 2}}; + REQUIRE(cudaMemcpy(ref.storage_ref().data(), + seed.data(), + seed.size() * sizeof(value_type), + cudaMemcpyHostToDevice) == cudaSuccess); + + // Probe keys chosen to exercise every lookup-termination rule: + // 0 : present, found immediately at its home (distance 0). + // 16 : present at distance 1 — found only if we do NOT early-exit at slot 0, where the resident + // distance (0) equals our probe step (0). Guards the strict `<` (vs `<=`) richer rule. + // 2 : present at its home. + // 32 : home 0, absent — terminates via the richer-resident early-exit at slot 2 (key 2 sits at + // distance 0 < our probe step 2), before reaching the empty slot 3. + // 1 : home 1, absent — also terminates via the early-exit at slot 2. + // 3 : home 3, absent — terminates on the empty slot 3. + std::vector const probe_keys{0, 16, 2, 32, 1, 3}; + std::vector const expected_contained{true, true, true, false, false, false}; + + thrust::device_vector const d_keys(probe_keys.begin(), probe_keys.end()); + + thrust::device_vector d_contained(probe_keys.size()); + map.contains(d_keys.begin(), d_keys.end(), d_contained.begin()); + for (std::size_t i = 0; i < probe_keys.size(); ++i) { + INFO("contains, probe key = " << probe_keys[i]); + REQUIRE(static_cast(d_contained[i]) == expected_contained[i]); + } + + // `find` must return the stored value for present keys and the empty-value sentinel otherwise. + thrust::device_vector d_values(probe_keys.size()); + map.find(d_keys.begin(), d_keys.end(), d_values.begin()); + std::vector const expected_values{0, 16, 2, -1, -1, -1}; + for (std::size_t i = 0; i < probe_keys.size(); ++i) { + INFO("find, probe key = " << probe_keys[i]); + REQUIRE(d_values[i] == expected_values[i]); + } +} diff --git a/tests/static_map/robin_hood_invariant.cuh b/tests/static_map/robin_hood_invariant.cuh new file mode 100644 index 000000000..34165435c --- /dev/null +++ b/tests/static_map/robin_hood_invariant.cuh @@ -0,0 +1,117 @@ +/* + * Copyright (c) 2026, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include +#include + +#include + +#include + +namespace cuco { +namespace test { + +// Per-probe-step Robin Hood layout check. The unit is the *stride group* of `cg_size * bucket_size` +// contiguous slots that one probing step examines -- a single bucket for scalar probing, the whole +// cooperative-group window for CG probing. Within a stride group the slot order is free (the probe +// step distance is identical for every slot in it, so the intra-group offset cancels in all +// comparisons), so the invariant is only meaningful *between* groups. For each occupied group `g` +// (with predecessor `pg`), the resident probe-step distances ("ages") must satisfy: +// +// (1) Contiguity. If `g` holds any overflowed resident (distance >= 1), `pg` must be full -- +// otherwise that resident would have stopped in `pg`'s free slot instead of probing past it. +// (2) Balance. No resident of `pg` may be more than one probing step *richer* than the poorest +// resident of `g` (`min_age(pg) >= max_age(g) - 1`) -- otherwise the poorest resident of `g` +// should have displaced it. This is the property that distinguishes Robin Hood from plain +// linear probing, and (via condition 1) it inductively forces the whole home-to-position run +// to be full. +// +// `probe_distance` is reused here -- it is exercised independently by the utility probe-distance +// test, so a bug in *insert* (a layout that violates the invariant) is still caught. +template +__global__ void robin_hood_invariant_kernel(Ref ref, int* violations) +{ + using size_type = typename Ref::size_type; + constexpr int bs = Ref::bucket_size; + constexpr int stride = Ref::cg_size * Ref::bucket_size; + auto const storage_ref = ref.storage_ref(); + auto const slots = storage_ref.data(); + auto const num_groups = storage_ref.capacity() / stride; + auto const extent = storage_ref.extent(); + auto const empty_key = ref.empty_key_sentinel(); + auto const erased_key = ref.erased_key_sentinel(); + auto const scheme = ref.probing_scheme(); + + for (size_type g = blockIdx.x * blockDim.x + threadIdx.x; g < num_groups; + g += gridDim.x * blockDim.x) { + int occupied_g = 0; + size_type max_age_g = 0; + for (int s = 0; s < stride; ++s) { + auto const slot = slots[g * stride + s]; + if (slot.first != empty_key) { // tombstones count as residents (erase enabled => != empty) + ++occupied_g; + // A tombstone keeps its age in its payload; a live key's age is its probe distance. + auto const age = (slot.first == erased_key) + ? static_cast(slot.second) + : scheme.template probe_distance( + slot.first, static_cast(g * stride + s), extent); + if (age > max_age_g) { max_age_g = age; } + } + } + if (occupied_g == 0) { continue; } + + size_type const pg = (g + num_groups - 1) % num_groups; + int occupied_p = 0; + size_type min_age_p = 0; + for (int s = 0; s < stride; ++s) { + auto const slot = slots[pg * stride + s]; + if (slot.first != empty_key) { + auto const age = (slot.first == erased_key) + ? static_cast(slot.second) + : scheme.template probe_distance( + slot.first, static_cast(pg * stride + s), extent); + if (occupied_p == 0 || age < min_age_p) { min_age_p = age; } + ++occupied_p; + } + } + + if (max_age_g >= 1 && occupied_p < stride) { atomicAdd(violations, 1); } // (1) + if (occupied_p > 0 && min_age_p + 1 < max_age_g) { atomicAdd(violations, 1); } // (2) + } +} + +// Asserts that a populated Robin Hood `map` satisfies the per-bucket layout invariant above. No-op +// to call only on Robin Hood maps -- `probe_distance` exists only on `robin_hood_probing`, so guard +// the call site with `cuco::is_robin_hood_probing<...>`. +template +void check_robin_hood_invariant(Map& map) +{ + auto const ref = map.ref(cuco::op::find); + + thrust::device_vector d_violations(1, 0); + auto constexpr block_size = 128; + auto const grid_size = (map.capacity() + block_size - 1) / block_size; + robin_hood_invariant_kernel<<>>( + ref, thrust::raw_pointer_cast(d_violations.data())); + CUCO_CUDA_TRY(cudaDeviceSynchronize()); + + REQUIRE(d_violations[0] == 0); +} + +} // namespace test +} // namespace cuco diff --git a/tests/test_utils.hpp b/tests/test_utils.hpp index 2c77eda36..d71336f8b 100644 --- a/tests/test_utils.hpp +++ b/tests/test_utils.hpp @@ -33,7 +33,7 @@ namespace cg = cooperative_groups; constexpr int32_t block_size = 128; -enum class probe_sequence { linear_probing, double_hashing }; +enum class probe_sequence { linear_probing, double_hashing, robin_hood }; // User-defined logical algorithms to reduce compilation time template diff --git a/tests/utility/probing_scheme_test.cu b/tests/utility/probing_scheme_test.cu index 39048946b..97f26dc18 100644 --- a/tests/utility/probing_scheme_test.cu +++ b/tests/utility/probing_scheme_test.cu @@ -78,6 +78,37 @@ __global__ void generate_cg_probing_sequence(Key key, } } +// Walks each lane's probe iterator and records, at every step, the probe distance reported for the +// slot that lane is visiting. Because the resident under test is `key` itself, the slot visited at +// step `i` is at probe distance `i` from `key`'s home — for every lane. Recording one column per +// lane lets the host check both that `probe_distance` inverts `make_iterator` and that it strips +// the per-lane intra-stride offset (so all lanes at a given step agree). +template +__global__ void generate_cg_probe_distance_sequence(Key key, + Extent upper_bound, + size_t seq_length, + OutputIt out_seq) +{ + auto constexpr cg_size = ProbingScheme::cg_size; + + auto const tid = blockIdx.x * blockDim.x + threadIdx.x; + auto probing_scheme = ProbingScheme{}; + + if (tid < cg_size) { + auto const tile = + cooperative_groups::tiled_partition( + cooperative_groups::this_thread_block()); + + auto iter = probing_scheme.template make_iterator(tile, key, upper_bound); + + for (size_t i = 0; i < seq_length; ++i) { + out_seq[i * cg_size + tile.thread_rank()] = + probing_scheme.template probe_distance(key, *iter, upper_bound); + ++iter; + } + } +} + TEMPLATE_TEST_CASE_SIG( "utility probing_scheme tests", "", @@ -111,3 +142,44 @@ TEMPLATE_TEST_CASE_SIG( REQUIRE(cuco::test::equal( scalar_seq.begin(), scalar_seq.end(), cg_seq.begin(), cuda::std::equal_to{})); } + +TEMPLATE_TEST_CASE_SIG( + "utility robin_hood probe_distance inverts make_iterator", + "", + ((typename Key, int32_t CGSize, int32_t BucketSize), Key, CGSize, BucketSize), + (int32_t, 1, 1), + (int32_t, 4, 1), + (int32_t, 8, 1), + (int32_t, 8, 2), + (int64_t, 4, 1), + (int64_t, 8, 2)) +{ + // Robin Hood wraps a linear probe sequence; `probe_distance` is its inverse. For `key`'s own + // probe sequence, the slot visited at step `i` must report probe distance `i`. + using probe = + cuco::robin_hood_probing>>; + + // A deliberately small capacity, so the probe sequence wraps around the table within + // `seq_length` steps — this exercises the modular-subtraction (wrap) path in `probe_distance`. + auto const upper_bound = + cuco::make_valid_extent>(cuco::extent{64}); + + // Probe distance is measured in whole probing steps and lives in `[0, num_buckets)`, where one + // step spans the full `cg_size * bucket_size` stride. Taking `seq_length` past `num_buckets` + // guarantees the walk wraps at least once. + auto const capacity = static_cast(upper_bound); + auto const num_buckets = capacity / (CGSize * BucketSize); + auto const seq_length = num_buckets + 3; + constexpr Key key{42}; + + thrust::device_vector distances(seq_length * CGSize); + generate_cg_probe_distance_sequence + <<<1, CGSize>>>(key, upper_bound, seq_length, distances.begin()); + + // Under wrap, the slot visited at step `i` sits at probe distance `i mod num_buckets`. + for (std::size_t i = 0; i < seq_length; ++i) { + for (std::int32_t r = 0; r < CGSize; ++r) { + REQUIRE(distances[i * CGSize + r] == i % num_buckets); + } + } +}