/* * Licensed to the Apache Software Foundation (ASF) under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information * regarding copyright ownership. The ASF licenses this file * to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, * software distributed under the License is distributed on an * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY * KIND, either express or implied. See the License for the * specific language governing permissions and limitations * under the License. */ #ifndef _COUPONHASHSET_INTERNAL_HPP_ #define _COUPONHASHSET_INTERNAL_HPP_ #include "CouponHashSet.hpp" #include #include #include namespace datasketches { template static int find(const int* array, const int lgArrInts, const int coupon); template CouponHashSet::CouponHashSet(const int lgConfigK, const target_hll_type tgtHllType) : CouponList(lgConfigK, tgtHllType, hll_mode::SET) { if (lgConfigK <= 7) { throw std::invalid_argument("CouponHashSet must be initialized with lgConfigK > 7. Found: " + std::to_string(lgConfigK)); } } template CouponHashSet::CouponHashSet(const CouponHashSet& that) : CouponList(that) {} template CouponHashSet::CouponHashSet(const CouponHashSet& that, const target_hll_type tgtHllType) : CouponList(that, tgtHllType) {} template CouponHashSet::~CouponHashSet() {} template std::function*)> CouponHashSet::get_deleter() const { return [](HllSketchImpl* ptr) { CouponHashSet* chs = static_cast*>(ptr); chs->~CouponHashSet(); chsAlloc().deallocate(chs, 1); }; } template CouponHashSet* CouponHashSet::newSet(const void* bytes, size_t len) { if (len < HllUtil::HASH_SET_INT_ARR_START) { // hard-coded throw std::invalid_argument("Input data length insufficient to hold CouponHashSet"); } const uint8_t* data = static_cast(bytes); if (data[HllUtil::PREAMBLE_INTS_BYTE] != HllUtil::HASH_SET_PREINTS) { throw std::invalid_argument("Incorrect number of preInts in input stream"); } if (data[HllUtil::SER_VER_BYTE] != HllUtil::SER_VER) { throw std::invalid_argument("Wrong ser ver in input stream"); } if (data[HllUtil::FAMILY_BYTE] != HllUtil::FAMILY_ID) { throw std::invalid_argument("Input stream is not an HLL sketch"); } const hll_mode mode = HllSketchImpl::extractCurMode(data[HllUtil::MODE_BYTE]); if (mode != SET) { throw std::invalid_argument("Calling set construtor with non-set mode data"); } const target_hll_type tgtHllType = HllSketchImpl::extractTgtHllType(data[HllUtil::MODE_BYTE]); const int lgK = data[HllUtil::LG_K_BYTE]; if (lgK <= 7) { throw std::invalid_argument("Attempt to deserialize invalid CouponHashSet with lgConfigK <= 7. Found: " + std::to_string(lgK)); } int lgArrInts = data[HllUtil::LG_ARR_BYTE]; const bool compactFlag = ((data[HllUtil::FLAGS_BYTE] & HllUtil::COMPACT_FLAG_MASK) ? true : false); int couponCount; std::memcpy(&couponCount, data + HllUtil::HASH_SET_COUNT_INT, sizeof(couponCount)); if (lgArrInts < HllUtil::LG_INIT_SET_SIZE) { lgArrInts = HllUtil::computeLgArrInts(SET, couponCount, lgK); } // Don't set couponCount in sketch here; // we'll set later if updatable, and increment with updates if compact const int couponsInArray = (compactFlag ? couponCount : (1 << lgArrInts)); const size_t expectedLength = HllUtil::HASH_SET_INT_ARR_START + (couponsInArray * sizeof(int)); if (len < expectedLength) { throw std::invalid_argument("Byte array too short for sketch. Expected " + std::to_string(expectedLength) + ", found: " + std::to_string(len)); } CouponHashSet* sketch = new (chsAlloc().allocate(1)) CouponHashSet(lgK, tgtHllType); sketch->putOutOfOrderFlag(true); if (compactFlag) { const uint8_t* curPos = data + HllUtil::HASH_SET_INT_ARR_START; int coupon; for (int i = 0; i < couponCount; ++i, curPos += sizeof(coupon)) { std::memcpy(&coupon, curPos, sizeof(coupon)); sketch->couponUpdate(coupon); } } else { int* oldArr = sketch->couponIntArr; const size_t oldArrLen = 1 << sketch->lgCouponArrInts; sketch->lgCouponArrInts = lgArrInts; typedef typename std::allocator_traits::template rebind_alloc intAlloc; sketch->couponIntArr = intAlloc().allocate(1 << lgArrInts); sketch->couponCount = couponCount; // only need to read valid coupons, unlike in stream case std::memcpy(sketch->couponIntArr, data + HllUtil::HASH_SET_INT_ARR_START, couponCount * sizeof(int)); intAlloc().deallocate(oldArr, oldArrLen); } return sketch; } template CouponHashSet* CouponHashSet::newSet(std::istream& is) { uint8_t listHeader[8]; is.read((char*)listHeader, 8 * sizeof(uint8_t)); if (listHeader[HllUtil::PREAMBLE_INTS_BYTE] != HllUtil::HASH_SET_PREINTS) { throw std::invalid_argument("Incorrect number of preInts in input stream"); } if (listHeader[HllUtil::SER_VER_BYTE] != HllUtil::SER_VER) { throw std::invalid_argument("Wrong ser ver in input stream"); } if (listHeader[HllUtil::FAMILY_BYTE] != HllUtil::FAMILY_ID) { throw std::invalid_argument("Input stream is not an HLL sketch"); } hll_mode mode = HllSketchImpl::extractCurMode(listHeader[HllUtil::MODE_BYTE]); if (mode != SET) { throw std::invalid_argument("Calling set construtor with non-set mode data"); } target_hll_type tgtHllType = HllSketchImpl::extractTgtHllType(listHeader[HllUtil::MODE_BYTE]); const int lgK = listHeader[HllUtil::LG_K_BYTE]; if (lgK <= 7) { throw std::invalid_argument("Attempt to deserialize invalid CouponHashSet with lgConfigK <= 7. Found: " + std::to_string(lgK)); } int lgArrInts = listHeader[HllUtil::LG_ARR_BYTE]; const bool compactFlag = ((listHeader[HllUtil::FLAGS_BYTE] & HllUtil::COMPACT_FLAG_MASK) ? true : false); int couponCount; is.read((char*)&couponCount, sizeof(couponCount)); if (lgArrInts < HllUtil::LG_INIT_SET_SIZE) { lgArrInts = HllUtil::computeLgArrInts(SET, couponCount, lgK); } CouponHashSet* sketch = new (chsAlloc().allocate(1)) CouponHashSet(lgK, tgtHllType); sketch->putOutOfOrderFlag(true); // Don't set couponCount here; // we'll set later if updatable, and increment with updates if compact if (compactFlag) { for (int i = 0; i < couponCount; ++i) { int coupon; is.read((char*)&coupon, sizeof(coupon)); sketch->couponUpdate(coupon); } } else { int* oldArr = sketch->couponIntArr; const size_t oldArrLen = 1 << sketch->lgCouponArrInts; sketch->lgCouponArrInts = lgArrInts; typedef typename std::allocator_traits::template rebind_alloc intAlloc; sketch->couponIntArr = intAlloc().allocate(1 << lgArrInts); sketch->couponCount = couponCount; // for stream processing, read entire list so read pointer ends up set correctly is.read((char*)sketch->couponIntArr, (1 << sketch->lgCouponArrInts) * sizeof(int)); intAlloc().deallocate(oldArr, oldArrLen); } return sketch; } template CouponHashSet* CouponHashSet::copy() const { return new (chsAlloc().allocate(1)) CouponHashSet(*this); } template CouponHashSet* CouponHashSet::copyAs(const target_hll_type tgtHllType) const { return new (chsAlloc().allocate(1)) CouponHashSet(*this, tgtHllType); } template HllSketchImpl* CouponHashSet::couponUpdate(int coupon) { const int index = find(this->couponIntArr, this->lgCouponArrInts, coupon); if (index >= 0) { return this; // found duplicate, ignore } this->couponIntArr[~index] = coupon; // found empty ++this->couponCount; if (checkGrowOrPromote()) { return this->promoteHeapListOrSetToHll(*this); } return this; } template int CouponHashSet::getMemDataStart() const { return HllUtil::HASH_SET_INT_ARR_START; } template int CouponHashSet::getPreInts() const { return HllUtil::HASH_SET_PREINTS; } template bool CouponHashSet::checkGrowOrPromote() { if ((HllUtil::RESIZE_DENOM * this->couponCount) > (HllUtil::RESIZE_NUMER * (1 << this->lgCouponArrInts))) { if (this->lgCouponArrInts == (this->lgConfigK - 3)) { // at max size return true; // promote to HLL } int tgtLgCoupArrSize = this->lgCouponArrInts + 1; growHashSet(this->lgCouponArrInts, tgtLgCoupArrSize); } return false; } template void CouponHashSet::growHashSet(const int srcLgCoupArrSize, const int tgtLgCoupArrSize) { const int tgtLen = 1 << tgtLgCoupArrSize; typedef typename std::allocator_traits::template rebind_alloc intAlloc; int* tgtCouponIntArr = intAlloc().allocate(tgtLen); std::fill(tgtCouponIntArr, tgtCouponIntArr + tgtLen, 0); const int srcLen = 1 << srcLgCoupArrSize; for (int i = 0; i < srcLen; ++i) { // scan existing array for non-zero values const int fetched = this->couponIntArr[i]; if (fetched != HllUtil::EMPTY) { const int idx = find(tgtCouponIntArr, tgtLgCoupArrSize, fetched); // search TGT array if (idx < 0) { // found EMPTY tgtCouponIntArr[~idx] = fetched; // insert continue; } throw std::runtime_error("Error: Found duplicate coupon"); } } intAlloc().deallocate(this->couponIntArr, 1 << this->lgCouponArrInts); this->couponIntArr = tgtCouponIntArr; this->lgCouponArrInts = tgtLgCoupArrSize; } template static int find(const int* array, const int lgArrInts, const int coupon) { const int arrMask = (1 << lgArrInts) - 1; int probe = coupon & arrMask; const int loopIndex = probe; do { const int couponAtIdx = array[probe]; if (couponAtIdx == HllUtil::EMPTY) { return ~probe; //empty } else if (coupon == couponAtIdx) { return probe; //duplicate } const int stride = ((coupon & HllUtil::KEY_MASK_26) >> lgArrInts) | 1; probe = (probe + stride) & arrMask; } while (probe != loopIndex); throw std::invalid_argument("Key not found and no empty slots!"); } } #endif // _COUPONHASHSET_INTERNAL_HPP_