/* * Licensed to the Apache Software Foundation (ASF) under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information * regarding copyright ownership. The ASF licenses this file * to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, * software distributed under the License is distributed on an * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY * KIND, either express or implied. See the License for the * specific language governing permissions and limitations * under the License. */ #ifndef THETA_A_SET_DIFFERENCE_BASE_IMPL_HPP_ #define THETA_A_SET_DIFFERENCE_BASE_IMPL_HPP_ #include #include #include "conditional_back_inserter.hpp" #include "conditional_forward.hpp" namespace datasketches { template theta_set_difference_base::theta_set_difference_base(uint64_t seed, const A& allocator): allocator_(allocator), seed_hash_(compute_seed_hash(seed)) {} template template CS theta_set_difference_base::compute(FwdSketch&& a, const Sketch& b, bool ordered) const { if (a.is_empty() || (a.get_num_retained() > 0 && b.is_empty())) return CS(a, ordered); if (a.get_seed_hash() != seed_hash_) throw std::invalid_argument("A seed hash mismatch"); if (b.get_seed_hash() != seed_hash_) throw std::invalid_argument("B seed hash mismatch"); const uint64_t theta = std::min(a.get_theta64(), b.get_theta64()); std::vector entries(allocator_); bool is_empty = a.is_empty(); if (b.get_num_retained() == 0) { std::copy_if(forward_begin(std::forward(a)), forward_end(std::forward(a)), std::back_inserter(entries), key_less_than(theta)); } else { if (a.is_ordered() && b.is_ordered()) { // sort-based std::set_difference(forward_begin(std::forward(a)), forward_end(std::forward(a)), b.begin(), b.end(), conditional_back_inserter(entries, key_less_than(theta)), comparator()); } else { // hash-based const uint8_t lg_size = lg_size_from_count(b.get_num_retained(), hash_table::REBUILD_THRESHOLD); hash_table table(lg_size, lg_size, hash_table::resize_factor::X1, 1, 0, 0, allocator_); // theta and seed are not used here for (const auto& entry: b) { const uint64_t hash = EK()(entry); if (hash < theta) { table.insert(table.find(hash).first, hash); } else if (b.is_ordered()) { break; // early stop } } // scan A lookup B for (auto& entry: a) { const uint64_t hash = EK()(entry); if (hash < theta) { auto result = table.find(hash); if (!result.second) entries.push_back(conditional_forward(entry)); } else if (a.is_ordered()) { break; // early stop } } } } if (entries.empty() && theta == theta_constants::MAX_THETA) is_empty = true; if (ordered && !a.is_ordered()) std::sort(entries.begin(), entries.end(), comparator()); return CS(is_empty, a.is_ordered() || ordered, seed_hash_, theta, std::move(entries)); } } /* namespace datasketches */ #endif