/* * Licensed to the Apache Software Foundation (ASF) under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information * regarding copyright ownership. The ASF licenses this file * to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, * software distributed under the License is distributed on an * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY * KIND, either express or implied. See the License for the * specific language governing permissions and limitations * under the License. */ #include #include #include #ifdef TEST_BINARY_INPUT_PATH static std::string testBinaryInputPath = TEST_BINARY_INPUT_PATH; #else static std::string testBinaryInputPath = "test/"; #endif namespace datasketches { class frequent_items_sketch_test: public CppUnit::TestFixture { CPPUNIT_TEST_SUITE(frequent_items_sketch_test); CPPUNIT_TEST(empty); CPPUNIT_TEST(one_item); CPPUNIT_TEST(several_items_no_resize_no_purge); CPPUNIT_TEST(several_items_with_resize_no_purge); CPPUNIT_TEST(estimation_mode); CPPUNIT_TEST(merge_exact_mode); CPPUNIT_TEST(merge_estimation_mode); CPPUNIT_TEST(deserialize_from_java_long); CPPUNIT_TEST(deserialize_from_java_string); CPPUNIT_TEST(deserialize_from_java_string_utf8); CPPUNIT_TEST(serialize_deserialize_long64_stream); CPPUNIT_TEST(serialize_deserialize_long64_bytes); CPPUNIT_TEST(serialize_deserialize_string_stream); CPPUNIT_TEST(serialize_deserialize_string_bytes); CPPUNIT_TEST(serialize_deserialize_string_utf8_stream); CPPUNIT_TEST_SUITE_END(); void empty() { frequent_items_sketch sketch(3); CPPUNIT_ASSERT(sketch.is_empty()); CPPUNIT_ASSERT_EQUAL(0U, sketch.get_num_active_items()); CPPUNIT_ASSERT_EQUAL(0, (int) sketch.get_total_weight()); } void one_item() { frequent_items_sketch sketch(3); sketch.update("a"); CPPUNIT_ASSERT(!sketch.is_empty()); CPPUNIT_ASSERT_EQUAL(1U, sketch.get_num_active_items()); CPPUNIT_ASSERT_EQUAL(1, (int) sketch.get_total_weight()); CPPUNIT_ASSERT_EQUAL(1, (int) sketch.get_estimate("a")); CPPUNIT_ASSERT_EQUAL(1, (int) sketch.get_lower_bound("a")); CPPUNIT_ASSERT_EQUAL(1, (int) sketch.get_upper_bound("a")); } void several_items_no_resize_no_purge() { frequent_items_sketch sketch(3); sketch.update("a"); sketch.update("b"); sketch.update("c"); sketch.update("d"); sketch.update("b"); sketch.update("c"); sketch.update("b"); CPPUNIT_ASSERT(!sketch.is_empty()); CPPUNIT_ASSERT_EQUAL(7, (int) sketch.get_total_weight()); CPPUNIT_ASSERT_EQUAL(4U, sketch.get_num_active_items()); CPPUNIT_ASSERT_EQUAL(1, (int) sketch.get_estimate("a")); CPPUNIT_ASSERT_EQUAL(3, (int) sketch.get_estimate("b")); CPPUNIT_ASSERT_EQUAL(2, (int) sketch.get_estimate("c")); CPPUNIT_ASSERT_EQUAL(1, (int) sketch.get_estimate("d")); } void several_items_with_resize_no_purge() { frequent_items_sketch sketch(4); sketch.update("a"); sketch.update("b"); sketch.update("c"); sketch.update("d"); sketch.update("b"); sketch.update("c"); sketch.update("b"); sketch.update("e"); sketch.update("f"); sketch.update("g"); sketch.update("h"); sketch.update("i"); sketch.update("j"); sketch.update("k"); sketch.update("l"); CPPUNIT_ASSERT(!sketch.is_empty()); CPPUNIT_ASSERT_EQUAL(15, (int) sketch.get_total_weight()); CPPUNIT_ASSERT_EQUAL(12U, sketch.get_num_active_items()); CPPUNIT_ASSERT_EQUAL(1, (int) sketch.get_estimate("a")); CPPUNIT_ASSERT_EQUAL(3, (int) sketch.get_estimate("b")); CPPUNIT_ASSERT_EQUAL(2, (int) sketch.get_estimate("c")); CPPUNIT_ASSERT_EQUAL(1, (int) sketch.get_estimate("d")); } void estimation_mode() { frequent_items_sketch sketch(3); sketch.update(1, 10); sketch.update(2); sketch.update(3); sketch.update(4); sketch.update(5); sketch.update(6); sketch.update(7, 15); sketch.update(8); sketch.update(9); sketch.update(10); sketch.update(11); sketch.update(12); CPPUNIT_ASSERT(sketch.get_maximum_error() > 0); // estimation mode CPPUNIT_ASSERT(!sketch.is_empty()); CPPUNIT_ASSERT_EQUAL(35, (int) sketch.get_total_weight()); auto items = sketch.get_frequent_items(frequent_items_error_type::NO_FALSE_POSITIVES); CPPUNIT_ASSERT_EQUAL(2, (int) items.size()); // only 2 items (1 and 7) should have counts more than 1 CPPUNIT_ASSERT_EQUAL(7, items[0].get_item()); CPPUNIT_ASSERT_EQUAL(15, (int) items[0].get_estimate()); CPPUNIT_ASSERT_EQUAL(1, items[1].get_item()); CPPUNIT_ASSERT_EQUAL(10, (int) items[1].get_estimate()); items = sketch.get_frequent_items(frequent_items_error_type::NO_FALSE_NEGATIVES); CPPUNIT_ASSERT(2 <= (int) items.size()); // at least 2 items CPPUNIT_ASSERT(12 >= (int) items.size()); // but not more than 12 items } void merge_exact_mode() { frequent_items_sketch sketch1(3); sketch1.update(1); sketch1.update(2); sketch1.update(3); sketch1.update(4); frequent_items_sketch sketch2(3); sketch1.update(2); sketch1.update(3); sketch1.update(2); sketch1.merge(sketch2); CPPUNIT_ASSERT(!sketch1.is_empty()); CPPUNIT_ASSERT_EQUAL(7, (int) sketch1.get_total_weight()); CPPUNIT_ASSERT_EQUAL(4U, sketch1.get_num_active_items()); CPPUNIT_ASSERT_EQUAL(1, (int) sketch1.get_estimate(1)); CPPUNIT_ASSERT_EQUAL(3, (int) sketch1.get_estimate(2)); CPPUNIT_ASSERT_EQUAL(2, (int) sketch1.get_estimate(3)); CPPUNIT_ASSERT_EQUAL(1, (int) sketch1.get_estimate(4)); } void merge_estimation_mode() { frequent_items_sketch sketch1(4); sketch1.update(1, 9); // to make sure it survives the purge sketch1.update(2); sketch1.update(3); sketch1.update(4); sketch1.update(5); sketch1.update(6); sketch1.update(7); sketch1.update(8); sketch1.update(9); sketch1.update(10); sketch1.update(11); sketch1.update(12); sketch1.update(13); sketch1.update(14); CPPUNIT_ASSERT(sketch1.get_maximum_error() > 0); // estimation mode frequent_items_sketch sketch2(4); sketch2.update(8); sketch2.update(9); sketch2.update(10); sketch2.update(11); sketch2.update(12); sketch2.update(13); sketch2.update(14); sketch2.update(15); sketch2.update(16); sketch2.update(17); sketch2.update(18); sketch2.update(19); sketch2.update(20); sketch2.update(21, 11); // to make sure it survives the purge CPPUNIT_ASSERT(sketch2.get_maximum_error() > 0); // estimation mode sketch1.merge(sketch2); CPPUNIT_ASSERT(!sketch1.is_empty()); CPPUNIT_ASSERT_EQUAL(46, (int) sketch1.get_total_weight()); CPPUNIT_ASSERT(2U <= sketch1.get_num_active_items()); auto items = sketch1.get_frequent_items(frequent_items_error_type::NO_FALSE_POSITIVES, 2); CPPUNIT_ASSERT_EQUAL(2, (int) items.size()); // only 2 items (1 and 21) should be above threshold CPPUNIT_ASSERT_EQUAL(21, items[0].get_item()); CPPUNIT_ASSERT(11 <= (int) items[0].get_estimate()); // always overestimated CPPUNIT_ASSERT_EQUAL(1, items[1].get_item()); CPPUNIT_ASSERT(9 <= (int) items[1].get_estimate()); // always overestimated } void deserialize_from_java_long() { std::ifstream is; is.exceptions(std::ios::failbit | std::ios::badbit); is.open(testBinaryInputPath + "longs_sketch_from_java.bin", std::ios::binary); auto sketch = frequent_items_sketch::deserialize(is); CPPUNIT_ASSERT(!sketch.is_empty()); CPPUNIT_ASSERT_EQUAL(4, (int) sketch.get_total_weight()); CPPUNIT_ASSERT_EQUAL(4U, sketch.get_num_active_items()); CPPUNIT_ASSERT_EQUAL(1, (int) sketch.get_estimate(1)); CPPUNIT_ASSERT_EQUAL(1, (int) sketch.get_estimate(2)); CPPUNIT_ASSERT_EQUAL(1, (int) sketch.get_estimate(3)); CPPUNIT_ASSERT_EQUAL(1, (int) sketch.get_estimate(4)); } void deserialize_from_java_string() { std::ifstream is; is.exceptions(std::ios::failbit | std::ios::badbit); is.open(testBinaryInputPath + "items_sketch_string_from_java.bin", std::ios::binary); auto sketch = frequent_items_sketch::deserialize(is); CPPUNIT_ASSERT(!sketch.is_empty()); CPPUNIT_ASSERT_EQUAL(4, (int) sketch.get_total_weight()); CPPUNIT_ASSERT_EQUAL(4U, sketch.get_num_active_items()); CPPUNIT_ASSERT_EQUAL(1, (int) sketch.get_estimate("aaaaaaaaaaaaaaaaaaaaaaaaaaaaa")); CPPUNIT_ASSERT_EQUAL(1, (int) sketch.get_estimate("bbbbbbbbbbbbbbbbbbbbbbbbbbbbb")); CPPUNIT_ASSERT_EQUAL(1, (int) sketch.get_estimate("ccccccccccccccccccccccccccccc")); CPPUNIT_ASSERT_EQUAL(1, (int) sketch.get_estimate("ddddddddddddddddddddddddddddd")); } void deserialize_from_java_string_utf8() { std::ifstream is; is.exceptions(std::ios::failbit | std::ios::badbit); is.open(testBinaryInputPath + "items_sketch_string_utf8_from_java.bin", std::ios::binary); auto sketch = frequent_items_sketch::deserialize(is); CPPUNIT_ASSERT(!sketch.is_empty()); CPPUNIT_ASSERT_EQUAL(10, (int) sketch.get_total_weight()); CPPUNIT_ASSERT_EQUAL(4U, sketch.get_num_active_items()); CPPUNIT_ASSERT_EQUAL(1, (int) sketch.get_estimate("абвгд")); CPPUNIT_ASSERT_EQUAL(2, (int) sketch.get_estimate("еёжзи")); CPPUNIT_ASSERT_EQUAL(3, (int) sketch.get_estimate("йклмн")); CPPUNIT_ASSERT_EQUAL(4, (int) sketch.get_estimate("опрст")); } void serialize_deserialize_long64_stream() { frequent_items_sketch sketch1(3); sketch1.update(1, 1); sketch1.update(2, 2); sketch1.update(3, 3); sketch1.update(4, 4); sketch1.update(5, 5); std::stringstream s(std::ios::in | std::ios::out | std::ios::binary); sketch1.serialize(s); auto sketch2 = frequent_items_sketch::deserialize(s); CPPUNIT_ASSERT(!sketch2.is_empty()); CPPUNIT_ASSERT_EQUAL(15, (int) sketch2.get_total_weight()); CPPUNIT_ASSERT_EQUAL(5U, sketch2.get_num_active_items()); CPPUNIT_ASSERT_EQUAL(1, (int) sketch2.get_estimate(1)); CPPUNIT_ASSERT_EQUAL(2, (int) sketch2.get_estimate(2)); CPPUNIT_ASSERT_EQUAL(3, (int) sketch2.get_estimate(3)); CPPUNIT_ASSERT_EQUAL(4, (int) sketch2.get_estimate(4)); CPPUNIT_ASSERT_EQUAL(5, (int) sketch2.get_estimate(5)); } void serialize_deserialize_long64_bytes() { frequent_items_sketch sketch1(3); sketch1.update(1, 1); sketch1.update(2, 2); sketch1.update(3, 3); sketch1.update(4, 4); sketch1.update(5, 5); auto p = sketch1.serialize(); auto sketch2 = frequent_items_sketch::deserialize(p.first.get(), p.second); CPPUNIT_ASSERT(!sketch2.is_empty()); CPPUNIT_ASSERT_EQUAL(15, (int) sketch2.get_total_weight()); CPPUNIT_ASSERT_EQUAL(5U, sketch2.get_num_active_items()); CPPUNIT_ASSERT_EQUAL(1, (int) sketch2.get_estimate(1)); CPPUNIT_ASSERT_EQUAL(2, (int) sketch2.get_estimate(2)); CPPUNIT_ASSERT_EQUAL(3, (int) sketch2.get_estimate(3)); CPPUNIT_ASSERT_EQUAL(4, (int) sketch2.get_estimate(4)); CPPUNIT_ASSERT_EQUAL(5, (int) sketch2.get_estimate(5)); } void serialize_deserialize_string_stream() { frequent_items_sketch sketch1(3); sketch1.update("aaaaaaaaaaaaaaaa", 1); sketch1.update("bbbbbbbbbbbbbbbb", 2); sketch1.update("cccccccccccccccc", 3); sketch1.update("dddddddddddddddd", 4); sketch1.update("eeeeeeeeeeeeeeee", 5); std::stringstream s(std::ios::in | std::ios::out | std::ios::binary); sketch1.serialize(s); auto sketch2 = frequent_items_sketch::deserialize(s); CPPUNIT_ASSERT(!sketch2.is_empty()); CPPUNIT_ASSERT_EQUAL(15, (int) sketch2.get_total_weight()); CPPUNIT_ASSERT_EQUAL(5U, sketch2.get_num_active_items()); CPPUNIT_ASSERT_EQUAL(1, (int) sketch2.get_estimate("aaaaaaaaaaaaaaaa")); CPPUNIT_ASSERT_EQUAL(2, (int) sketch2.get_estimate("bbbbbbbbbbbbbbbb")); CPPUNIT_ASSERT_EQUAL(3, (int) sketch2.get_estimate("cccccccccccccccc")); CPPUNIT_ASSERT_EQUAL(4, (int) sketch2.get_estimate("dddddddddddddddd")); CPPUNIT_ASSERT_EQUAL(5, (int) sketch2.get_estimate("eeeeeeeeeeeeeeee")); } void serialize_deserialize_string_bytes() { frequent_items_sketch sketch1(3); sketch1.update("aaaaaaaaaaaaaaaa", 1); sketch1.update("bbbbbbbbbbbbbbbb", 2); sketch1.update("cccccccccccccccc", 3); sketch1.update("dddddddddddddddd", 4); sketch1.update("eeeeeeeeeeeeeeee", 5); auto p = sketch1.serialize(); auto sketch2 = frequent_items_sketch::deserialize(p.first.get(), p.second); CPPUNIT_ASSERT(!sketch2.is_empty()); CPPUNIT_ASSERT_EQUAL(15, (int) sketch2.get_total_weight()); CPPUNIT_ASSERT_EQUAL(5U, sketch2.get_num_active_items()); CPPUNIT_ASSERT_EQUAL(1, (int) sketch2.get_estimate("aaaaaaaaaaaaaaaa")); CPPUNIT_ASSERT_EQUAL(2, (int) sketch2.get_estimate("bbbbbbbbbbbbbbbb")); CPPUNIT_ASSERT_EQUAL(3, (int) sketch2.get_estimate("cccccccccccccccc")); CPPUNIT_ASSERT_EQUAL(4, (int) sketch2.get_estimate("dddddddddddddddd")); CPPUNIT_ASSERT_EQUAL(5, (int) sketch2.get_estimate("eeeeeeeeeeeeeeee")); } void serialize_deserialize_string_utf8_stream() { frequent_items_sketch sketch1(3); sketch1.update("абвгд", 1); sketch1.update("еёжзи", 2); sketch1.update("йклмн", 3); sketch1.update("опрст", 4); sketch1.update("уфхцч", 5); std::stringstream s(std::ios::in | std::ios::out | std::ios::binary); sketch1.serialize(s); auto sketch2 = frequent_items_sketch::deserialize(s); CPPUNIT_ASSERT(!sketch2.is_empty()); CPPUNIT_ASSERT_EQUAL(15, (int) sketch2.get_total_weight()); CPPUNIT_ASSERT_EQUAL(5U, sketch2.get_num_active_items()); CPPUNIT_ASSERT_EQUAL(1, (int) sketch2.get_estimate("абвгд")); CPPUNIT_ASSERT_EQUAL(2, (int) sketch2.get_estimate("еёжзи")); CPPUNIT_ASSERT_EQUAL(3, (int) sketch2.get_estimate("йклмн")); CPPUNIT_ASSERT_EQUAL(4, (int) sketch2.get_estimate("опрст")); CPPUNIT_ASSERT_EQUAL(5, (int) sketch2.get_estimate("уфхцч")); } }; CPPUNIT_TEST_SUITE_REGISTRATION(frequent_items_sketch_test); } /* namespace datasketches */