/* * Licensed to the Apache Software Foundation (ASF) under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information * regarding copyright ownership. The ASF licenses this file * to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, * software distributed under the License is distributed on an * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY * KIND, either express or implied. See the License for the * specific language governing permissions and limitations * under the License. */ #ifndef BOUNDS_ON_RATIOS_IN_THETA_SKETCHED_SETS_HPP_ #define BOUNDS_ON_RATIOS_IN_THETA_SKETCHED_SETS_HPP_ #include #include #include "bounds_on_ratios_in_sampled_sets.hpp" namespace datasketches { /** * Bounds on ratios in Theta sketched sets. * This is to compute the bounds on the estimate of the ratio B / A, where: * * Note: The theta of A cannot be greater than the theta of B. * If B is formed as an intersection of A and some other set C, * then the theta of B is guaranteed to be less than or equal to the theta of B. */ template class bounds_on_ratios_in_theta_sketched_sets { public: /** * Gets the approximate lower bound for B over A based on a 95% confidence interval * @param sketch_a the sketch A * @param sketch_b the sketch B * @return the approximate lower bound for B over A */ template static double lower_bound_for_b_over_a(const SketchA& sketch_a, const SketchB& sketch_b) { const uint64_t theta64_a = sketch_a.get_theta64(); const uint64_t theta64_b = sketch_b.get_theta64(); check_thetas(theta64_a, theta64_b); const uint64_t count_b = sketch_b.get_num_retained(); const uint64_t count_a = theta64_a == theta64_b ? sketch_a.get_num_retained() : count_less_than_theta64(sketch_a, theta64_b); if (count_a == 0) return 0; const double f = sketch_b.get_theta(); return bounds_on_ratios_in_sampled_sets::lower_bound_for_b_over_a(count_a, count_b, f); } /** * Gets the approximate upper bound for B over A based on a 95% confidence interval * @param sketch_a the sketch A * @param sketch_b the sketch B * @return the approximate upper bound for B over A */ template static double upper_bound_for_b_over_a(const SketchA& sketch_a, const SketchB& sketch_b) { const uint64_t theta64_a = sketch_a.get_theta64(); const uint64_t theta64_b = sketch_b.get_theta64(); check_thetas(theta64_a, theta64_b); const uint64_t count_b = sketch_b.get_num_retained(); const uint64_t count_a = (theta64_a == theta64_b) ? sketch_a.get_num_retained() : count_less_than_theta64(sketch_a, theta64_b); if (count_a == 0) return 1; const double f = sketch_b.get_theta(); return bounds_on_ratios_in_sampled_sets::upper_bound_for_b_over_a(count_a, count_b, f); } /** * Gets the estimate for B over A * @param sketch_a the sketch A * @param sketch_b the sketch B * @return the estimate for B over A */ template static double estimate_of_b_over_a(const SketchA& sketch_a, const SketchB& sketch_b) { const uint64_t theta64_a = sketch_a.get_theta64(); const uint64_t theta64_b = sketch_b.get_theta64(); check_thetas(theta64_a, theta64_b); const uint64_t count_b = sketch_b.get_num_retained(); const uint64_t count_a = (theta64_a == theta64_b) ? sketch_a.get_num_retained() : count_less_than_theta64(sketch_a, theta64_b); if (count_a == 0) return 0.5; return static_cast(count_b) / static_cast(count_a); } private: static inline void check_thetas(uint64_t theta_a, uint64_t theta_b) { if (theta_b > theta_a) { throw std::invalid_argument("theta_a must be <= theta_b"); } } template static uint64_t count_less_than_theta64(const Sketch& sketch, uint64_t theta) { uint64_t count = 0; for (const auto& entry: sketch) if (ExtractKey()(entry) < theta) ++count; return count; } }; } /* namespace datasketches */ # endif