/* * Licensed to the Apache Software Foundation (ASF) under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information * regarding copyright ownership. The ASF licenses this file * to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, * software distributed under the License is distributed on an * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY * KIND, either express or implied. See the License for the * specific language governing permissions and limitations * under the License. */ #ifndef FREQUENT_ITEMS_SKETCH_HPP_ #define FREQUENT_ITEMS_SKETCH_HPP_ #include #include #include #include #include #include "reverse_purge_hash_map.hpp" #include "common_defs.hpp" #include "serde.hpp" namespace datasketches { /// Frequent items error type enum frequent_items_error_type { NO_FALSE_POSITIVES, ///< include an item in the result list if get_lower_bound(item) > threshold NO_FALSE_NEGATIVES ///< include an item in the result list if get_upper_bound(item) > threshold }; /** * Frequent Items sketch. * * Based on Java implementation here: * https://github.com/apache/datasketches-java/blob/master/src/main/java/org/apache/datasketches/frequencies/ItemsSketch.java * @author Alexander Saydakov */ template< typename T, typename W = uint64_t, typename H = std::hash, typename E = std::equal_to, typename A = std::allocator > class frequent_items_sketch { static_assert(std::is_arithmetic::value, "Arithmetic type expected"); public: static const uint8_t LG_MIN_MAP_SIZE = 3; /** * Construct this sketch with parameters lg_max_map_size and lg_start_map_size. * * @param lg_max_map_size Log2 of the physical size of the internal hash map managed by this * sketch. The maximum capacity of this internal hash map is 0.75 times 2^lg_max_map_size. * Both the ultimate accuracy and size of this sketch are functions of lg_max_map_size. * @param lg_start_map_size Log2 of the starting physical size of the internal hash * map managed by this sketch. * @param equal instance of Equality operator * @param allocator instance of an Allocator */ explicit frequent_items_sketch(uint8_t lg_max_map_size, uint8_t lg_start_map_size = LG_MIN_MAP_SIZE, const E& equal = E(), const A& allocator = A()); /** * Update this sketch with an item and a positive weight (frequency count). * @param item for which the weight should be increased (lvalue) * @param weight the amount by which the weight of the item should be increased * A count of zero is a no-op, and a negative count will throw an exception. */ void update(const T& item, W weight = 1); /** * Update this sketch with an item and a positive weight (frequency count). * @param item for which the weight should be increased (rvalue) * @param weight the amount by which the weight of the item should be increased * A count of zero is a no-op, and a negative count will throw an exception. */ void update(T&& item, W weight = 1); /** * This function merges the other sketch into this one. * The other sketch may be of a different size. * @param other sketch to be merged into this (lvalue) */ void merge(const frequent_items_sketch& other); /** * This function merges the other sketch into this one. * The other sketch may be of a different size. * @param other sketch to be merged into this (rvalue) */ void merge(frequent_items_sketch&& other); /** * @return true if this sketch is empty */ bool is_empty() const; /** * @return the number of active items in the sketch */ uint32_t get_num_active_items() const; /** * Returns the sum of the weights (frequencies) in the stream seen so far by the sketch * * @return the total weight of all items in the stream seen so far by the sketch */ W get_total_weight() const; /** * Returns the estimate of the weight (frequency) of the given item. * Note: The true frequency of a item would be the sum of the counts as a result of the * two update functions. * * @param item the given item * @return the estimate of the weight (frequency) of the given item */ W get_estimate(const T& item) const; /** * Returns the guaranteed lower bound weight (frequency) of the given item. * * @param item the given item. * @return the guaranteed lower bound weight of the given item. That is, a number which * is guaranteed to be no larger than the real weight. */ W get_lower_bound(const T& item) const; /** * Returns the guaranteed upper bound weight (frequency) of the given item. * * @param item the given item * @return the guaranteed upper bound weight of the given item. That is, a number which * is guaranteed to be no smaller than the real frequency. */ W get_upper_bound(const T& item) const; /** * @return An upper bound on the maximum error of get_estimate(item) for any item. * This is equivalent to the maximum distance between the upper bound and the lower bound * for any item. */ W get_maximum_error() const; /** * Returns epsilon value of this sketch. * This is just the value 3.5 / max_map_size. * @return epsilon used by the sketch to compute error. */ double get_epsilon() const; /** * Returns epsilon used to compute a priori error. * This is just the value 3.5 / maxMapSize. * @param lg_max_map_size the planned map size to be used when constructing this sketch. * @return epsilon used to compute a priori error. */ static double get_epsilon(uint8_t lg_max_map_size); /** * Returns the estimated a priori error given the max_map_size for the sketch and the * estimated_total_stream_weight. * @param lg_max_map_size the planned map size to be used when constructing this sketch. * @param estimated_total_weight the estimated total stream weight. * @return the estimated a priori error. */ static double get_apriori_error(uint8_t lg_max_map_size, W estimated_total_weight); class row; using vector_row = typename std::vector::template rebind_alloc>; /** * Returns an array of rows that include frequent items, estimates, upper and lower bounds * given an error_type and using get_maximum_error() as a threshold. * *

The method first examines all active items in the sketch (items that have a counter). * *

If error_type = NO_FALSE_NEGATIVES, this will include an item in the result * list if get_upper_bound(item) > threshold. * There will be no false negatives, i.e., no Type II error. * There may be items in the set with true frequencies less than the threshold * (false positives).

* *

If error_type = NO_FALSE_POSITIVES, this will include an item in the result * list if get_lower_bound(item) > threshold. * There will be no false positives, i.e., no Type I error. * There may be items omitted from the set with true frequencies greater than the * threshold (false negatives).

* * @param err_type determines whether no false positives or no false negatives are desired. * @return an array of frequent items */ vector_row get_frequent_items(frequent_items_error_type err_type) const; /** * Returns an array of rows that include frequent items, estimates, upper and lower bounds * given an error_type and a threshold. * *

The method first examines all active items in the sketch (items that have a counter). * *

If error_type = NO_FALSE_NEGATIVES, this will include an item in the result * list if get_upper_bound(item) > threshold. * There will be no false negatives, i.e., no Type II error. * There may be items in the set with true frequencies less than the threshold * (false positives).

* *

If error_type = NO_FALSE_POSITIVES, this will include an item in the result * list if get_lower_bound(item) > threshold. * There will be no false positives, i.e., no Type I error. * There may be items omitted from the set with true frequencies greater than the * threshold (false negatives).

* * @param err_type determines whether no false positives or no false negatives are desired. * @param threshold to include items in the result list * @return an array of frequent items */ vector_row get_frequent_items(frequent_items_error_type err_type, W threshold) const; /** * Computes size needed to serialize the current state of the sketch. * This can be expensive since every item needs to be looked at. * @param sd instance of a SerDe * @return size in bytes needed to serialize this sketch */ template> size_t get_serialized_size_bytes(const SerDe& sd = SerDe()) const; /** * This method serializes the sketch into a given stream in a binary form * @param os output stream * @param sd instance of a SerDe */ template> void serialize(std::ostream& os, const SerDe& sd = SerDe()) const; // This is a convenience alias for users // The type returned by the following serialize method using vector_bytes = std::vector::template rebind_alloc>; /** * This method serializes the sketch as a vector of bytes. * An optional header can be reserved in front of the sketch. * It is a blank space of a given size. * This header is used in Datasketches PostgreSQL extension. * @param header_size_bytes space to reserve in front of the sketch * @param sd instance of a SerDe * @return serialized sketch as a vector of bytes */ template> vector_bytes serialize(unsigned header_size_bytes = 0, const SerDe& sd = SerDe()) const; /** * This method deserializes a sketch from a given stream. * @param is input stream * @param sd instance of a SerDe * @param equal instance of Equality operator * @param allocator instance of an Allocator * @return an instance of the sketch */ template> static frequent_items_sketch deserialize(std::istream& is, const SerDe& sd = SerDe(), const E& equal = E(), const A& allocator = A()); /** * This method deserializes a sketch from a given array of bytes. * @param bytes pointer to the array of bytes * @param size the size of the array * @param sd instance of a SerDe * @param equal instance of Equality operator * @param allocator instance of an Allocator * @return an instance of the sketch */ template> static frequent_items_sketch deserialize(const void* bytes, size_t size, const SerDe& sd = SerDe(), const E& equal = E(), const A& allocator = A()); /** * Returns a human readable summary of this sketch * @param print_items if true include the list of items retained by the sketch */ string to_string(bool print_items = false) const; private: static const uint8_t SERIAL_VERSION = 1; static const uint8_t FAMILY_ID = 10; static const uint8_t PREAMBLE_LONGS_EMPTY = 1; static const uint8_t PREAMBLE_LONGS_NONEMPTY = 4; static constexpr double EPSILON_FACTOR = 3.5; // due to a mistake different bits were used in C++ and Java to indicate empty sketch // therefore both are set and checked for compatibility with historical binary format enum flags { IS_EMPTY_1 = 0, IS_EMPTY_2 = 2 }; W total_weight; W offset; reverse_purge_hash_map map; static void check_preamble_longs(uint8_t preamble_longs, bool is_empty); static void check_serial_version(uint8_t serial_version); static void check_family_id(uint8_t family_id); static void check_size(uint8_t lg_cur_size, uint8_t lg_max_size); // version for integral signed type template::value && std::is_signed::value, int>::type = 0> static inline void check_weight(WW weight); // version for integral unsigned type template::value && std::is_unsigned::value, int>::type = 0> static inline void check_weight(WW weight); // version for floating point type template::value, int>::type = 0> static inline void check_weight(WW weight); // for deserialize class items_deleter; }; /// Row in the output from #get_frequent_items template class frequent_items_sketch::row { public: row(const T* item, W weight, W offset): item(item), weight(weight), offset(offset) {} /// @return item const T& get_item() const { return *item; } /// @return frequency (weight) estimate W get_estimate() const { return weight + offset; } /// @return estimate lower bound W get_lower_bound() const { return weight; } /// @return estimate upper bound W get_upper_bound() const { return weight + offset; } private: const T* item; W weight; W offset; }; } #include "frequent_items_sketch_impl.hpp" # endif