/* * Licensed to the Apache Software Foundation (ASF) under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information * regarding copyright ownership. The ASF licenses this file * to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, * software distributed under the License is distributed on an * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY * KIND, either express or implied. See the License for the * specific language governing permissions and limitations * under the License. */ #ifndef _QUANTILES_SKETCH_HPP_ #define _QUANTILES_SKETCH_HPP_ #include #include #include #include "quantiles_sorted_view.hpp" #include "common_defs.hpp" #include "serde.hpp" #include "optional.hpp" namespace datasketches { /// Constants for Quantiles sketch namespace quantiles_constants { /// default value of parameter K const uint16_t DEFAULT_K = 128; /// minimum value of parameter K const uint16_t MIN_K = 2; /// maximum value of parameter K const uint16_t MAX_K = 1 << 15; } /** * This is a stochastic streaming sketch that enables near-real time analysis of the * approximate distribution from a very large stream in a single pass. * The analysis is obtained using get_rank() and get_quantile() functions, * the Probability Mass Function from get_PMF() and the Cumulative Distribution Function from get_CDF(). * *

Consider a large stream of one million values such as packet sizes coming into a network node. * The natural rank of any specific size value is its index in the hypothetical sorted * array of values. * The normalized rank is the natural rank divided by the stream size, * in this case one million. * The value corresponding to the normalized rank of 0.5 represents the 50th percentile or median * value of the distribution, or get_quantile(0.5). Similarly, the 95th percentile is obtained from * get_quantile(0.95).

* *

From the min and max values, for example, 1 and 1000 bytes, * you can obtain the PMF from get_PMF(100, 500, 900) that will result in an array of * 4 fractional values such as {.4, .3, .2, .1}, which means that *

    *
  • 40% of the values were < 100,
  • *
  • 30% of the values were ≥ 100 and < 500,
  • *
  • 20% of the values were ≥ 500 and < 900, and
  • *
  • 10% of the values were ≥ 900.
  • *
* A frequency histogram can be obtained by multiplying these fractions by get_n(), * which is the total count of values received. * The get_CDF() works similarly, but produces the cumulative distribution instead. * *

As of November 2021, this implementation produces serialized sketches which are binary-compatible * with the equivalent Java implementation only when template parameter T = double * (64-bit double precision values). * *

The accuracy of this sketch is a function of the configured value k, which also affects * the overall size of the sketch. Accuracy of this quantile sketch is always with respect to * the normalized rank. A k of 128 produces a normalized, rank error of about 1.7%. * For example, the median item returned from getQuantile(0.5) will be between the actual items * from the hypothetically sorted array of input items at normalized ranks of 0.483 and 0.517, with * a confidence of about 99%.

* *
Table Guide for DoublesSketch Size in Bytes and Approximate Error:
          K => |      16      32      64     128     256     512   1,024
    ~ Error => | 12.145%  6.359%  3.317%  1.725%  0.894%  0.463%  0.239%
             N | Size in Bytes ->
------------------------------------------------------------------------
             0 |       8       8       8       8       8       8       8
             1 |      72      72      72      72      72      72      72
             3 |      72      72      72      72      72      72      72
             7 |     104     104     104     104     104     104     104
            15 |     168     168     168     168     168     168     168
            31 |     296     296     296     296     296     296     296
            63 |     424     552     552     552     552     552     552
           127 |     552     808   1,064   1,064   1,064   1,064   1,064
           255 |     680   1,064   1,576   2,088   2,088   2,088   2,088
           511 |     808   1,320   2,088   3,112   4,136   4,136   4,136
         1,023 |     936   1,576   2,600   4,136   6,184   8,232   8,232
         2,047 |   1,064   1,832   3,112   5,160   8,232  12,328  16,424
         4,095 |   1,192   2,088   3,624   6,184  10,280  16,424  24,616
         8,191 |   1,320   2,344   4,136   7,208  12,328  20,520  32,808
        16,383 |   1,448   2,600   4,648   8,232  14,376  24,616  41,000
        32,767 |   1,576   2,856   5,160   9,256  16,424  28,712  49,192
        65,535 |   1,704   3,112   5,672  10,280  18,472  32,808  57,384
       131,071 |   1,832   3,368   6,184  11,304  20,520  36,904  65,576
       262,143 |   1,960   3,624   6,696  12,328  22,568  41,000  73,768
       524,287 |   2,088   3,880   7,208  13,352  24,616  45,096  81,960
     1,048,575 |   2,216   4,136   7,720  14,376  26,664  49,192  90,152
     2,097,151 |   2,344   4,392   8,232  15,400  28,712  53,288  98,344
     4,194,303 |   2,472   4,648   8,744  16,424  30,760  57,384 106,536
     8,388,607 |   2,600   4,904   9,256  17,448  32,808  61,480 114,728
    16,777,215 |   2,728   5,160   9,768  18,472  34,856  65,576 122,920
    33,554,431 |   2,856   5,416  10,280  19,496  36,904  69,672 131,112
    67,108,863 |   2,984   5,672  10,792  20,520  38,952  73,768 139,304
   134,217,727 |   3,112   5,928  11,304  21,544  41,000  77,864 147,496
   268,435,455 |   3,240   6,184  11,816  22,568  43,048  81,960 155,688
   536,870,911 |   3,368   6,440  12,328  23,592  45,096  86,056 163,880
 1,073,741,823 |   3,496   6,696  12,840  24,616  47,144  90,152 172,072
 2,147,483,647 |   3,624   6,952  13,352  25,640  49,192  94,248 180,264
 4,294,967,295 |   3,752   7,208  13,864  26,664  51,240  98,344 188,456

 * 
*

There is more documentation available on * datasketches.apache.org.

* *

This is an implementation of the Low Discrepancy Mergeable Quantiles Sketch * described in section 3.2 of the journal version of the paper "Mergeable Summaries" * by Agarwal, Cormode, Huang, Phillips, Wei, and Yi. *

* *

This algorithm is independent of the distribution of items and * requires only that the items be comparable.

* *

This algorithm intentionally inserts randomness into the sampling process for items that * ultimately get retained in the sketch. The results produced by this algorithm are not * deterministic. For example, if the same stream is inserted into two different instances of this * sketch, the answers obtained from the two sketches may not be identical.

* *

Similarly, there may be directional inconsistencies. For example, the result * obtained from get_quantile(rank) input into the reverse directional query * get_rank(item) may not result in the original item.

* * @author Kevin Lang * @author Lee Rhodes * @author Alexander Saydakov * @author Jon Malkin */ template , // strict weak ordering function (see C++ named requirements: Compare) typename Allocator = std::allocator> class quantiles_sketch { public: using value_type = T; using allocator_type = Allocator; using comparator = Comparator; using quantile_return_type = typename quantiles_sorted_view::quantile_return_type; using vector_double = typename quantiles_sorted_view::vector_double; /** * Constructor * @param k affects the size of the sketch and its estimation error * @param comparator strict weak ordering function (see C++ named requirements: Compare) * @param allocator used to allocate memory */ explicit quantiles_sketch(uint16_t k = quantiles_constants::DEFAULT_K, const Comparator& comparator = Comparator(), const Allocator& allocator = Allocator()); /** * Copy constructor * @param other sketch to be copied */ quantiles_sketch(const quantiles_sketch& other); /** Move constructor * @param other sketch to be moved */ quantiles_sketch(quantiles_sketch&& other) noexcept; ~quantiles_sketch(); /** * Copy assignment * @param other sketch to be copied * @return reference to this sketch */ quantiles_sketch& operator=(const quantiles_sketch& other); /** * Move assignment * @param other sketch to be moved * @return reference to this sketch */ quantiles_sketch& operator=(quantiles_sketch&& other) noexcept; /** * @brief Type converting constructor * @param other quantiles sketch of a different type * @param comparator instance of a Comparator * @param allocator instance of an Allocator */ template explicit quantiles_sketch(const quantiles_sketch& other, const Comparator& comparator = Comparator(), const Allocator& allocator = Allocator()); /** * Updates this sketch with the given data item. * @param item from a stream of items */ template void update(FwdT&& item); /** * Merges another sketch into this one. * @param other sketch to merge into this one */ template void merge(FwdSk&& other); /** * Returns true if this sketch is empty. * @return empty flag */ bool is_empty() const; /** * Returns configured parameter k * @return parameter k */ uint16_t get_k() const; /** * Returns the length of the input stream. * @return stream length */ uint64_t get_n() const; /** * Returns the number of retained items (samples) in the sketch. * @return the number of retained items */ uint32_t get_num_retained() const; /** * Returns true if this sketch is in estimation mode. * @return estimation mode flag */ bool is_estimation_mode() const; /** * Returns the min item of the stream. * If the sketch is empty this throws std::runtime_error. * @return the min item of the stream */ const T& get_min_item() const; /** * Returns the max item of the stream. * If the sketch is empty this throws std::runtime_error. * @return the max item of the stream */ const T& get_max_item() const; /** * Returns an instance of the comparator for this sketch. * @return comparator */ Comparator get_comparator() const; /** * Returns the allocator for this sketch. * @return allocator */ allocator_type get_allocator() const; /** * Returns an approximation to the data item associated with the given rank * of a hypothetical sorted version of the input stream so far. *

* If the sketch is empty this throws std::runtime_error. * * @param rank the specified normalized rank in the hypothetical sorted stream. * @param inclusive if true the weight of the given item is included into the rank. * Otherwise the rank equals the sum of the weights of all items that are less than the given item * according to the Comparator. * @return the approximation to the item at the given rank */ quantile_return_type get_quantile(double rank, bool inclusive = true) const; /** * Returns an approximation to the normalized rank of the given item from 0 to 1, inclusive. * *

The resulting approximation has a probabilistic guarantee that can be obtained from the * get_normalized_rank_error(false) function. * *

If the sketch is empty this throws std::runtime_error. * * @param item to be ranked * @param inclusive if true the weight of the given item is included into the rank. * Otherwise the rank equals the sum of the weights of all items that are less than the given item * according to the Comparator. * @return an approximate normalized rank of the given item */ double get_rank(const T& item, bool inclusive = true) const; /** * Returns an approximation to the Probability Mass Function (PMF) of the input stream * given a set of split points (items). * *

The resulting approximations have a probabilistic guarantee that can be obtained from the * get_normalized_rank_error(true) function. * *

If the sketch is empty this throws std::runtime_error. * * @param split_points an array of m unique, monotonically increasing items * that divide the input domain into m+1 consecutive disjoint intervals (bins). * * @param size of the array of split points. * * @param inclusive if true the rank of an item includes its own weight, and therefore * if the sketch contains items equal to a slit point, then in PMF such items are * included into the interval to the left of split point. Otherwise they are included into the interval * to the right of split point. * * @return an array of m+1 doubles each of which is an approximation * to the fraction of the input stream items (the mass) that fall into one of those intervals. */ vector_double get_PMF(const T* split_points, uint32_t size, bool inclusive = true) const; /** * Returns an approximation to the Cumulative Distribution Function (CDF), which is the * cumulative analog of the PMF, of the input stream given a set of split points (items). * *

The resulting approximations have a probabilistic guarantee that can be obtained from the * get_normalized_rank_error(false) function. * *

If the sketch is empty this throws std::runtime_error. * * @param split_points an array of m unique, monotonically increasing items * that divide the input domain into m+1 consecutive disjoint intervals. * * @param size of the array of split points. * * @param inclusive if true the rank of an item includes its own weight, and therefore * if the sketch contains items equal to a slit point, then in CDF such items are * included into the interval to the left of split point. Otherwise they are included into * the interval to the right of split point. * * @return an array of m+1 double values, which are a consecutive approximation to the CDF * of the input stream given the split_points. The value at array position j of the returned * CDF array is the sum of the returned values in positions 0 through j of the returned PMF * array. This can be viewed as array of ranks of the given split points plus one more value * that is always 1. */ vector_double get_CDF(const T* split_points, uint32_t size, bool inclusive = true) const; /** * Computes size needed to serialize the current state of the sketch. * This version is for fixed-size arithmetic types (integral and floating point). * @param sd instance of a SerDe * @return size in bytes needed to serialize this sketch */ template, typename TT = T, typename std::enable_if::value, int>::type = 0> size_t get_serialized_size_bytes(const SerDe& sd = SerDe()) const; /** * Computes size needed to serialize the current state of the sketch. * This version is for all other types and can be expensive since every item needs to be looked at. * @param sd instance of a SerDe * @return size in bytes needed to serialize this sketch */ template, typename TT = T, typename std::enable_if::value, int>::type = 0> size_t get_serialized_size_bytes(const SerDe& sd = SerDe()) const; /** * This method serializes the sketch into a given stream in a binary form * @param os output stream * @param sd instance of a SerDe */ template> void serialize(std::ostream& os, const SerDe& sd = SerDe()) const; // This is a convenience alias for users // The type returned by the following serialize method using vector_bytes = std::vector::template rebind_alloc>; /** * This method serializes the sketch as a vector of bytes. * An optional header can be reserved in front of the sketch. * It is a blank space of a given size. * This header is used in Datasketches PostgreSQL extension. * @param header_size_bytes space to reserve in front of the sketch * @param sd instance of a SerDe * @return serialized sketch as a vector of bytes */ template> vector_bytes serialize(unsigned header_size_bytes = 0, const SerDe& sd = SerDe()) const; /** * This method deserializes a sketch from a given stream. * @param is input stream * @param sd instance of a SerDe * @param comparator instance of a Comparator * @param allocator instance of an Allocator * @return an instance of a sketch */ template> static quantiles_sketch deserialize(std::istream& is, const SerDe& sd = SerDe(), const Comparator& comparator = Comparator(), const Allocator& allocator = Allocator()); /** * This method deserializes a sketch from a given array of bytes. * @param bytes pointer to the array of bytes * @param size the size of the array * @param sd instance of a SerDe * @param comparator instance of a Comparator * @param allocator instance of an Allocator * @return an instance of a sketch */ template> static quantiles_sketch deserialize(const void* bytes, size_t size, const SerDe& sd = SerDe(), const Comparator& comparator = Comparator(), const Allocator& allocator = Allocator()); /** * Gets the normalized rank error for this sketch. Constants were derived as the best fit to 99 percentile * empirically measured max error in thousands of trials. * @param is_pmf if true, returns the "double-sided" normalized rank error for the get_PMF() function. * Otherwise, it is the "single-sided" normalized rank error for all the other queries. * @return the normalized rank error for the sketch */ double get_normalized_rank_error(bool is_pmf) const; /** * Gets the normalized rank error given k and pmf. Constants were derived as the best fit to 99 percentile * empirically measured max error in thousands of trials. * @param k the configuration parameter * @param is_pmf if true, returns the "double-sided" normalized rank error for the get_PMF() function. * Otherwise, it is the "single-sided" normalized rank error for all the other queries. * @return the normalized rank error for the given parameters */ static double get_normalized_rank_error(uint16_t k, bool is_pmf); /** * Prints a summary of the sketch. * @param print_levels if true include information about levels * @param print_items if true include sketch data */ string to_string(bool print_levels = false, bool print_items = false) const; class const_iterator; /** * Iterator pointing to the first item in the sketch. * If the sketch is empty, the returned iterator must not be dereferenced or incremented. * @return iterator pointing to the first item in the sketch */ const_iterator begin() const; /** * Iterator pointing to the past-the-end item in the sketch. * The past-the-end item is the hypothetical item that would follow the last item. * It does not point to any item, and must not be dereferenced or incremented. * @return iterator pointing to the past-the-end item in the sketch */ const_iterator end() const; /** * Gets the sorted view of this sketch * @return the sorted view of this sketch */ quantiles_sorted_view get_sorted_view() const; private: using Level = std::vector; using VectorLevels = std::vector::template rebind_alloc>; /* Serialized sketch layout: * Long || Start Byte Addr: * Addr: * || 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | * 0 || Preamble_Longs | SerVer | FamID | Flags |----- K ---------|---- unused -----| * * || 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | * 1 ||---------------------------Items Seen Count (N)--------------------------------| * * Long 3 is the start of data, beginning with serialized min and max item, followed by * the sketch data buffers. */ static const size_t EMPTY_SIZE_BYTES = 8; static const uint8_t SERIAL_VERSION_1 = 1; static const uint8_t SERIAL_VERSION_2 = 2; static const uint8_t SERIAL_VERSION = 3; static const uint8_t FAMILY = 8; enum flags { RESERVED0, RESERVED1, IS_EMPTY, IS_COMPACT, IS_SORTED }; static const uint8_t PREAMBLE_LONGS_SHORT = 1; // for empty static const uint8_t PREAMBLE_LONGS_FULL = 2; static const size_t DATA_START = 16; Comparator comparator_; Allocator allocator_; bool is_base_buffer_sorted_; uint16_t k_; uint64_t n_; uint64_t bit_pattern_; Level base_buffer_; VectorLevels levels_; optional min_item_; optional max_item_; mutable quantiles_sorted_view* sorted_view_; void setup_sorted_view() const; // modifies mutable state void reset_sorted_view(); // for deserialization class items_deleter; quantiles_sketch(uint16_t k, uint64_t n, uint64_t bit_pattern, Level&& base_buffer, VectorLevels&& levels, optional&& min_item, optional&& max_item, bool is_sorted, const Comparator& comparator = Comparator(), const Allocator& allocator = Allocator()); void grow_base_buffer(); void process_full_base_buffer(); // returns true if size adjusted, else false bool grow_levels_if_needed(); // buffers should be pre-sized to target capacity as appropriate template static void in_place_propagate_carry(uint8_t starting_level, FwdV&& buf_size_k, Level& buf_size_2k, bool apply_as_update, quantiles_sketch& sketch); static void zip_buffer(Level& buf_in, Level& buf_out); static void merge_two_size_k_buffers(Level& arr_in_1, Level& arr_in_2, Level& arr_out, const Comparator& comparator); template static Level deserialize_array(std::istream& is, uint32_t num_items, uint32_t capcacity, const SerDe& serde, const Allocator& allocator); template static std::pair deserialize_array(const void* bytes, size_t size, uint32_t num_items, uint32_t capcacity, const SerDe& serde, const Allocator& allocator); static void check_k(uint16_t k); static void check_serial_version(uint8_t serial_version); static void check_header_validity(uint8_t preamble_longs, uint8_t flags_byte, uint8_t serial_version); static void check_family_id(uint8_t family_id); static uint32_t compute_retained_items(uint16_t k, uint64_t n); static uint32_t compute_base_buffer_items(uint16_t k, uint64_t n); static uint64_t compute_bit_pattern(uint16_t k, uint64_t n); static uint32_t count_valid_levels(uint64_t bit_pattern); static uint8_t compute_levels_needed(uint16_t k, uint64_t n); /** * Merges the src sketch into the tgt sketch with equal values of K. * src is modified only if elements can be moved out of it. */ template static void standard_merge(quantiles_sketch& tgt, FwdSk&& src); /** * Merges the src sketch into the tgt sketch with a smaller value of K. * However, it is required that the ratio of the two K values be a power of 2. * I.e., other.get_k() = this.get_k() * 2^(nonnegative integer). * src is modified only if elements can be moved out of it. */ template static void downsampling_merge(quantiles_sketch& tgt, FwdSk&& src); template static void zip_buffer_with_stride(FwdV&& buf_in, Level& buf_out, uint16_t stride); /** * Returns the zero-based bit position of the lowest zero bit of bits starting at * startingBit. If input is all ones, this returns 64. * @param bits the input bits as a long * @param starting_bit the zero-based starting bit position. Only the low 6 bits are used. * @return the zero-based bit position of the lowest zero bit starting at startingBit. */ static uint8_t lowest_zero_bit_starting_at(uint64_t bits, uint8_t starting_bit); template::value, int>::type = 0> static inline bool check_update_item(TT item) { return !std::isnan(item); } template::value, int>::type = 0> static inline bool check_update_item(TT) { return true; } // for type converting constructor template friend class quantiles_sketch; }; template class quantiles_sketch::const_iterator { public: using iterator_category = std::input_iterator_tag; using value_type = std::pair; using difference_type = void; using pointer = const return_value_holder; using reference = const value_type; const_iterator& operator++(); const_iterator& operator++(int); bool operator==(const const_iterator& other) const; bool operator!=(const const_iterator& other) const; reference operator*() const; pointer operator->() const; private: friend class quantiles_sketch; using Level = std::vector; using AllocLevel = typename std::allocator_traits::template rebind_alloc; Level base_buffer_; std::vector levels_; int level_; uint32_t index_; uint32_t bb_count_; uint64_t bit_pattern_; uint64_t weight_; uint16_t k_; const_iterator(const Level& base_buffer, const std::vector& levels, uint16_t k, uint64_t n, bool is_end); }; } /* namespace datasketches */ #include "quantiles_sketch_impl.hpp" #endif // _QUANTILES_SKETCH_HPP_