/* * Licensed to the Apache Software Foundation (ASF) under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information * regarding copyright ownership. The ASF licenses this file * to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, * software distributed under the License is distributed on an * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY * KIND, either express or implied. See the License for the * specific language governing permissions and limitations * under the License. */ #include #include #include #include #include #include #include template class max_value_policy { public: max_value_policy(const T& initial_value): initial_value(initial_value) {} T create() const { return initial_value; } void update(T& summary, const T& update) const { summary = std::max(summary, update); } private: T initial_value; }; using max_float_update_tuple_sketch = datasketches::update_tuple_sketch>; template class always_one_policy { public: always_one_policy(): initial_value(1) {} T create() const { return 1; } void update(T&, const T&) const {} private: T initial_value; }; using always_one_tuple_sketch = datasketches::update_tuple_sketch>; template class update_sum_value_policy { public: update_sum_value_policy(): initial_value(0) {} T create() const { return initial_value; } void update(T& summary, const T& update) const { summary += update; } private: T initial_value; }; using sum_update_tuple_sketch = datasketches::update_tuple_sketch>; template struct union_sum_value_policy { void operator()(Summary& summary, const Summary& other) const { summary += other; } }; using sum_union_tuple_sketch = datasketches::tuple_union>; class EngagementTest { public: uint8_t num_std_dev = 2; void test_always_one_update() { /* * Tests that updates into an update_tuple_sketch sketch only keeps a 1 in the column for stored values. */ uint8_t lgK = 8; std::vector>> sketch_array; auto always_one_sketch = always_one_tuple_sketch::builder(always_one_policy()).set_lg_k(lgK).build(); always_one_sketch.update(1, 1); always_one_sketch.update(1, 2); always_one_sketch.update(2, 1); always_one_sketch.update(3, 3); always_one_sketch.update(3, 7); int num_retained = 0; int sum = 0; for (const auto& entry: always_one_sketch) { sum += entry.second; ++num_retained; } REQUIRE(num_retained == 3); REQUIRE(sum == 3); // we only keep 1 for every stored key. } void test_sum_update_policy() { /* * Tests that updates into an sum_update_tuple_sketch sum the stored values on updates. */ uint8_t lgK = 8; auto sum_sketch = sum_update_tuple_sketch::builder().set_lg_k(lgK).build(); sum_sketch.update(1, 1); sum_sketch.update(1, 2); sum_sketch.update(2, 1); sum_sketch.update(3, 3); sum_sketch.update(3, 7); int num_retained = 0; int sum = 0; for (const auto& entry: sum_sketch) { sum += entry.second; ++num_retained; } REQUIRE(num_retained == 3); REQUIRE(sum == 14); // (1+2) + 1 + (3 + 7) = 14 } void test_sum_union_policy(){ /* * Tests that updates into two sketches of sum_update_tuple_sketch flavour, which have been unioned, * cause the stored values of two of the same keys to be summed. */ auto sketch1 = sum_update_tuple_sketch::builder().build(); auto sketch2 = sum_update_tuple_sketch::builder().build(); sketch1.update(1, 1); sketch1.update(2, 1); sketch1.update(3, 3); sketch2.update(1, 2); sketch2.update(2, 1); sketch2.update(3, 7); auto union_sketch = sum_union_tuple_sketch::builder().build(); union_sketch.update(sketch1); union_sketch.update(sketch2); auto union_result = union_sketch.get_result(); int num_retained = 0; int sum = 0; for (const auto& entry: union_result) { sum += entry.second; ++num_retained; } REQUIRE(num_retained == 3); REQUIRE(sum == 15); // 1:(1+2) + 2:(1+1) + 3:(3+7) = 15 } void compute_engagement_histogram() { /* * Returns the estimated histogram from the synthetic data. * On inspection one can verify this agrees with the * https://github.com/apache/datasketches-java/blob/master/src/test/java/org/apache/datasketches/tuple/aninteger/EngagementTest.java */ uint8_t lgK = 8; const int days = 30; int v = 0; std::set set_array[days]; std::vector>> sketch_array; for (int i = 0; i < days; ++i) { auto builder = always_one_tuple_sketch::builder(always_one_policy()); builder.set_lg_k(lgK); auto sketch = builder.build(); sketch_array.push_back(sketch); } REQUIRE(sketch_array.size() == days); for (int i = 0; i <= days; ++i) { int32_t num_ids = get_num_ids(days, i); int32_t num_days = get_num_days(days, i); int my_v = v++; for (int d = 0; d < num_days; ++d) { for (int id = 0; id < num_ids; ++id) { set_array[d].insert(my_v + id); sketch_array[d].update(my_v + id, 1); } } v += num_ids; } union_ops(lgK, sketch_array); } private: int32_t get_num_ids(int total_days, int index) { /* * Generates power law distributed synthetic data */ double d = total_days; double i = index; return int(round(exp(i * log(d) / d))); } int32_t get_num_days(int total_days, int index) { double d = total_days; double i = index; return int(round(exp((d-i) * log(d) / d ))); } int32_t round_double_to_int(double x) { return int(std::round(x)); } void union_ops(uint8_t lgk, std::vector>> sketches) { auto num_sketches = sketches.size(); auto u = sum_union_tuple_sketch::builder().set_lg_k(lgk).build(); for (auto sk: sketches) { u.update(sk); } auto union_result = u.get_result(); std::vector num_days_arr(num_sketches+1); for (const auto& entry: union_result) { int num_days_visited = entry.second; ++num_days_arr[num_days_visited]; } uint64_t sum_visits = 0; double theta = union_result.get_theta(); std::cout <<"\t\tEngagement Histogram.\t\t\t\n"; std::cout << "Number of Unique Visitors by Number of Days Visited" << std::endl; std::cout << "---------------------------------------------------" << std::endl; std::cout << std::setw(12) << "Days Visited" << std::setw(12) << "Estimate" << std::setw(12) << "LB" << std::setw(12) << "UB" << std:: endl; for (size_t i = 0; i < num_days_arr.size(); ++i) { auto visitors_at_days_visited = num_days_arr[i]; if (visitors_at_days_visited == 0) continue; sum_visits += visitors_at_days_visited * i; double est_visitors_at_days_visited = visitors_at_days_visited / theta; double lower_bound_at_days_visited = union_result.get_lower_bound(num_std_dev, visitors_at_days_visited); double upper_bound_at_days_visited = union_result.get_upper_bound(num_std_dev, visitors_at_days_visited); std::cout << std::setw(12) << i << std::setw(12) << est_visitors_at_days_visited << std::setw(12) << lower_bound_at_days_visited << std::setw(12) << upper_bound_at_days_visited << std:: endl; } std::cout << std::endl << std::endl; std::cout << std::setw(12) << "Totals" << std::setw(12) << "Estimate" << std::setw(12) << "LB" << std::setw(12) << "UB" << std:: endl; std::cout << "---------------------------------------------------" << std::endl; const double total_visitors = union_result.get_estimate(); const double lb_visitors = union_result.get_lower_bound(num_std_dev); const double ub_visitors = union_result.get_upper_bound(num_std_dev); std::cout << std::setw(12) << "Visitors" << std::setw(12) << total_visitors << std::setw(12) << lb_visitors << std::setw(12) << ub_visitors << std:: endl; // The total number of visits, however, is a scaled metric and takes advantage of the fact that // the retained entries in the sketch is a uniform random sample of all unique visitors, and // the rest of the unique users will likely behave in the same way. const double est_visits = sum_visits / theta; const double lb_visits = est_visits * lb_visitors / total_visitors; const double ub_visits = est_visits * ub_visitors / total_visitors; std::cout << std::setw(12) << "Visits" << std::setw(12) << est_visits << std::setw(12) << lb_visits << std::setw(12) << ub_visits << std:: endl; } }; namespace datasketches { TEST_CASE("engagement", "[engagement]") { EngagementTest E; E.test_always_one_update(); E.test_sum_update_policy(); E.test_sum_union_policy(); E.compute_engagement_histogram(); } } /* namespace datasketches */