/* * Licensed to the Apache Software Foundation (ASF) under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information * regarding copyright ownership. The ASF licenses this file * to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, * software distributed under the License is distributed on an * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY * KIND, either express or implied. See the License for the * specific language governing permissions and limitations * under the License. */ #ifndef THETA_UPDATE_SKETCH_BASE_HPP_ #define THETA_UPDATE_SKETCH_BASE_HPP_ #include #include #include #include #include "MurmurHash3.h" #include "theta_comparators.hpp" #include "theta_constants.hpp" namespace datasketches { template< typename Entry, typename ExtractKey, typename Allocator > struct theta_update_sketch_base { using resize_factor = theta_constants::resize_factor; using comparator = compare_by_key; theta_update_sketch_base(uint8_t lg_cur_size, uint8_t lg_nom_size, resize_factor rf, float p, uint64_t theta, uint64_t seed, const Allocator& allocator, bool is_empty = true); theta_update_sketch_base(const theta_update_sketch_base& other); theta_update_sketch_base(theta_update_sketch_base&& other) noexcept; ~theta_update_sketch_base(); theta_update_sketch_base& operator=(const theta_update_sketch_base& other); theta_update_sketch_base& operator=(theta_update_sketch_base&& other); using iterator = Entry*; inline uint64_t hash_and_screen(const void* data, size_t length); inline std::pair find(uint64_t key) const; static inline std::pair find(Entry* entries, uint8_t lg_size, uint64_t key); template inline void insert(iterator it, FwdEntry&& entry); iterator begin() const; iterator end() const; // resize threshold = 0.5 tuned for speed static constexpr double RESIZE_THRESHOLD = 0.5; // hash table rebuild threshold = 15/16 static constexpr double REBUILD_THRESHOLD = 15.0 / 16.0; static constexpr uint8_t STRIDE_HASH_BITS = 7; static constexpr uint32_t STRIDE_MASK = (1 << STRIDE_HASH_BITS) - 1; Allocator allocator_; bool is_empty_; uint8_t lg_cur_size_; uint8_t lg_nom_size_; resize_factor rf_; float p_; uint32_t num_entries_; uint64_t theta_; uint64_t seed_; Entry* entries_; void resize(); void rebuild(); void trim(); void reset(); static inline uint32_t get_capacity(uint8_t lg_cur_size, uint8_t lg_nom_size); static inline uint32_t get_stride(uint64_t key, uint8_t lg_size); static void consolidate_non_empty(Entry* entries, size_t size, size_t num); }; /// Theta base builder template class theta_base_builder { public: /** * Creates and instance of the builder with default parameters. * @param allocator instance of an Allocator to pass to created sketches */ theta_base_builder(const Allocator& allocator); /** * Set log2(k), where k is a nominal number of entries in the sketch * @param lg_k base 2 logarithm of nominal number of entries * @return this builder */ Derived& set_lg_k(uint8_t lg_k); /** * Set resize factor for the internal hash table (defaults to 8) * @param rf resize factor * @return this builder */ Derived& set_resize_factor(resize_factor rf); /** * Set sampling probability (initial theta). The default is 1, so the sketch retains * all entries until it reaches the limit, at which point it goes into the estimation mode * and reduces the effective sampling probability (theta) as necessary. * @param p sampling probability * @return this builder */ Derived& set_p(float p); /** * Set the seed for the hash function. Should be used carefully if needed. * Sketches produced with different seed are not compatible * and cannot be mixed in set operations. * @param seed hash seed * @return this builder */ Derived& set_seed(uint64_t seed); protected: Allocator allocator_; uint8_t lg_k_; resize_factor rf_; float p_; uint64_t seed_; uint64_t starting_theta() const; uint8_t starting_lg_size() const; }; // key extractor struct trivial_extract_key { template auto operator()(T&& entry) const -> decltype(std::forward(entry)) { return std::forward(entry); } }; // key not zero template class key_not_zero { public: bool operator()(const Entry& entry) const { return ExtractKey()(entry) != 0; } }; template class key_not_zero_less_than { public: explicit key_not_zero_less_than(const Key& key): key(key) {} bool operator()(const Entry& entry) const { return ExtractKey()(entry) != 0 && ExtractKey()(entry) < this->key; } private: Key key; }; // MurMur3 hash functions static inline uint64_t compute_hash(const void* data, size_t length, uint64_t seed) { HashState hashes; MurmurHash3_x64_128(data, length, seed, hashes); return (hashes.h1 >> 1); // Java implementation does unsigned shift >>> to make values positive } // iterators template class theta_iterator { public: using iterator_category = std::input_iterator_tag; using value_type = Entry; using difference_type = std::ptrdiff_t; using pointer = Entry*; using reference = Entry&; theta_iterator(Entry* entries, uint32_t size, uint32_t index); theta_iterator& operator++(); theta_iterator operator++(int); bool operator==(const theta_iterator& other) const; bool operator!=(const theta_iterator& other) const; reference operator*() const; pointer operator->() const; private: Entry* entries_; uint32_t size_; uint32_t index_; }; template class theta_const_iterator { public: using iterator_category = std::input_iterator_tag; using value_type = const Entry; using difference_type = std::ptrdiff_t; using pointer = const Entry*; using reference = const Entry&; theta_const_iterator(const Entry* entries, uint32_t size, uint32_t index); theta_const_iterator& operator++(); theta_const_iterator operator++(int); bool operator==(const theta_const_iterator& other) const; bool operator!=(const theta_const_iterator& other) const; reference operator*() const; pointer operator->() const; private: const Entry* entries_; uint32_t size_; uint32_t index_; }; // double value canonicalization for compatibility with Java static inline int64_t canonical_double(double value) { union { int64_t long_value; double double_value; } long_double_union; if (value == 0.0) { long_double_union.double_value = 0.0; // canonicalize -0.0 to 0.0 } else if (std::isnan(value)) { long_double_union.long_value = 0x7ff8000000000000L; // canonicalize NaN using value from Java's Double.doubleToLongBits() } else { long_double_union.double_value = value; } return long_double_union.long_value; } } /* namespace datasketches */ #include "theta_update_sketch_base_impl.hpp" #endif