import plpy import hashlib import struct from struct import pack, unpack from math import log import base64 # import numpy as np __ranges = 8*8 # INT64BITS __depth = 8 # magic # of hash functions __numcounters = 1024 # magic mod of hash function __countmin_sz = __depth*__numcounters __numsketches = __ranges total_size = __numsketches * __countmin_sz __max_int64 = (1L << 63) - 1 __min_int64 = __max_int64 * (-1) def count(b64sketch, val): if b64sketch is None: plpy.error("Sketch error: NULL sketch is provided!") try: return __do_count(base64.b64decode(b64sketch), val) except (TypeError, struct.error) as e: plpy.error("Sketch error: wrongly formatted cmsketch object provided!") def __do_count(all_sketch, val): rows = [ all_sketch[i*__countmin_sz:(i+1)*__countmin_sz] for i in range(0,__depth) ] return __do_count_rows(rows, val) def __do_count_rows(rows, val): m = hashlib.md5(pack('@q', val)).hexdigest() # we have to flip the bytes around here col_per_row = [int(m[i+2:i+4]+m[i:i+2],16) % __numcounters for i in range(0,__depth*4,4)] counts = [rows[i][col_per_row[i]*8:col_per_row[i]*8+8] for i in range(0,__depth)] return min([unpack('@q',x)[0] for x in counts]) def intlog2(x): i = 0 while (x > 0): x /= 2 i += 1 return i-1 #! # convert an arbitrary range [bot-top] into a rangelist of dyadic ranges. # E.g. convert 14-48 into [[14-15], [16-31], [32-47], [48-48]] # To manage overflow issues, do not generate a range larger than 2^63-1! # \param bot the bottom of the range (inclusive) # \param top the top of the range (inclusive) # \param r the list of ranges to be returned # def __find_ranges(bot, top): # kick off the recursion at power RANGES-1 return __find_ranges_recursive(bot, top, __ranges-1, []) #! # find the ranges via recursive calls to this routine, pulling out smaller and # smaller powers of 2 # To manage overflow issues, do not generate a range larger than 2^63-1! # \param bot the bottom of the range (inclusive) # \param top the top of the range (inclusive) # \param power the highest power to start with # \param r the rangelist to be returned def __find_ranges_recursive(bot, top, power, r): # Sanity check if (top < bot or power < 0): # skip. we get a bunch of off-by-one errors here right now return r if (top == bot): # base case of recursion, a range of the form [x-x]. r.append([bot, bot]) return r # a range that's 2^63 or larger will overflow int64's in the # code below, so recurse to the left and right by hand if (top >= 0 and bot < 0): # don't even try to figure out how wide this range is, just split r = __find_ranges_recursive(bot, -1, power-1, r) r = __find_ranges_recursive(0, top, power-1, r) return r width = top - bot + 1L # account for the fact that MIN and MAX are 1 off the true power of 2 if (top == __max_int64 or bot == __min_int64): width += 1 dyad = intlog2(width) if (dyad > 62): # dangerously big, so split. we know that we don't span 0. sign = -1 if (top < 0) else 1 r = __find_ranges_recursive(bot, (1L << 62)*sign - 1, 62, r) r = __find_ranges_recursive((1L << 62)*sign, top, 62, r) return r # if we get here, we have a range of size 2 or greater. # Find the largest dyadic range width in this range. pow_dyad = 1L << dyad # if this is a complete dyad if (pow_dyad == width): # if aligned, just append and return if (bot % pow_dyad == 0): r.append([bot,top]) return r else: # largest dyad that fits is 1 power-of-2 smaller pow_dyad /= 2 if ((bot == __min_int64) or (bot % pow_dyad == 0)): # our range is left-aligned on the dyad's min r.append([bot, None]) if (top == __max_int64): # special case: no need to recurse on right, and doing so # could overflow, so handle by hand r[-1][1] = __max_int64 else : # -1 on end of next line because range to the right will # start at the power-of-2 boundary. r[-1][1] = (bot + pow_dyad - 1) if (bot == __min_int64): # account for fact that __min_int64 is 1 bigger than -2^63 r[-1][1] -= 1 newbot = r[-1][1] + 1 # recurse on right at finer grain r = __find_ranges_recursive(newbot, top, power-1, r) elif (top == __max_int64 or ((top+1) % pow_dyad) == 0): # our range is right-aligned on the dyad's max. r.append([None, top]) if (bot == __min_int64): # special case: no need to recurse on left, and doing so # could overflow, so handle by hand r[-1][0] = __min_int64 else: r[-1][0] = (top - pow_dyad + 1) if (top == __max_int64): # account for fact that __max_int64 is 1 smaller than 2^63 r[-1][0] += 1 newtop = r[-1][0] - 1 # recurse on left at finer grain. r = __find_ranges_recursive(bot, newtop, power-1, r) else: # we straddle a power of 2 power_of_2 = pow_dyad*(top/pow_dyad) # recurse on right at finer grain r = __find_ranges_recursive(bot, power_of_2 - 1, power-1, r) # recurse on left at finer grain r = __find_ranges_recursive(power_of_2, top, power-1, r) return r def rangecount(b64sketch, bot, top): if b64sketch is None: plpy.error("Sketch error: NULL sketch is provided!") if bot > top: plpy.error("Sketch error: wrong range [{0}, {1}] provided!".format( str(bot), str(top))) try: return __do_rangecount(base64.b64decode(b64sketch), bot, top) except (TypeError, struct.error) as e: plpy.error("Sketch error: wrongly formatted cmsketch object provided!") def __do_rangecount(all_sketch, bot, top): cursum = 0 rows = [ all_sketch[i*__countmin_sz:(i+1)*__countmin_sz] for i in range(0,__depth*__ranges) ] r = __find_ranges(bot, top) # for obscure reasons, len(r) isn't working so use sum to compute lenny = sum([1 for i in r]) # __find_ranges will not generate a span larger than 2^63-1, so # we only need to consider 2^63 or less. for i in range(0, lenny): # What power of 2 is this range? if (r[i][0] == __min_int64 and r[i][1] == -1): # special case to avoid overflow: full range left of 0 dyad = 63 countval = -1 elif (r[i][0] == 0 and r[i][1] == __max_int64): # special case to avoid overflow: full range right of 0 dyad = 63 countval = 0 else: width = r[i][1] - r[i][0] + 1 if (r[i][0] == __min_int64): width += 1 if (r[i][1] == __max_int64): width += 1 # Divide min of range by 2^dyad and get count dyad = intlog2(width) countval = r[i][0] >> dyad val = __do_count_rows(rows[dyad*__depth:(dyad+1)*__depth], countval) cursum += val return cursum #! # find the approximate centile in a cm sketch by binary search in the domain of values # \param transval the current transition value # \param intcentile the centile to return # \param total the total count of items def centile(b64sketch, intcentile, total): if b64sketch is None: plpy.error("Sketch error: NULL sketch is provided!") if (intcentile <= 0 or intcentile >= 100): plpy.error("Sketch error: invalid centile {0} provided!".format( str(intcentile))) if total < 0: plpy.error("Sketch error: invalid count {0} is provided!".format( str(total))) try: return __do_centile(base64.b64decode(b64sketch), intcentile, total) except (TypeError, struct.error) as e: plpy.error("Sketch error: wrongly formatted cmsketch object provided!") def __do_centile(all_sketches, intcentile, total): # est_cnt = __do_rangecount(all_sketches, __min_int64, __max_int64) # if total != est_cnt: # plpy.error("Sketch error: cnt is {0}, but sketch has {1}".format( # str(total), str(est_cnt))) centile_cnt = (total * intcentile/100.0); loguess = __min_int64 higuess = __max_int64 while loguess < higuess: curguess = loguess + int((higuess - loguess) / 2) curcount = __do_rangecount(all_sketches, __min_int64, curguess) if (curcount >= centile_cnt): # overshot higuess = curguess else: # undershot loguess = curguess + 1 return loguess def width_histogram(b64sketch, themin, themax, buckets): if b64sketch is None: plpy.error("Sketch error: NULL sketch is provided!") if themin > themax: plpy.error("Sketch error: wrong range [{0}, {1}] provided!".format( str(themin), str(themax))) if buckets <= 0: plpy.error("Sketch error: negative nbuckets {0} provided!".format( str(buckets))) try: return __do_width_histo(base64.b64decode(b64sketch), themin, themax, buckets) except (TypeError, struct.error) as e: plpy.error("Sketch error: wrongly formatted cmsketch object provided!") def __do_width_histo(all_sketches, themin, themax, buckets): step = int(float(themax-themin+1) / float(buckets)) step = 1 if step < 1 else step histo = [] for i in range(0, buckets): binlo = themin + i*step; if (binlo > themax): break binhi = themax if (i == buckets-1) else (themin + (i+1)*step - 1) binval = __do_rangecount(all_sketches, binlo, binhi) histo.append([binlo,binhi,binval]) return histo def depth_histogram(b64sketch, buckets): if b64sketch is None: plpy.error("Sketch error: NULL sketch is provided!") if buckets <= 0: plpy.error("Sketch error: negative nbuckets {0} provided!".format( str(buckets))) try: return __do_depth_histo(base64.b64decode(b64sketch), buckets) except (TypeError, struct.error) as e: plpy.error("Sketch error: wrongly formatted cmsketch object provided!") def __do_depth_histo(all_sketches, buckets): step = int(100.0 / float(buckets)) step = 1 if step < 1 else step total = __do_rangecount(all_sketches, __min_int64, __max_int64) binlo = __min_int64 histo = [] for i in range(0, buckets): if (i < buckets - 1): cent = __do_centile(all_sketches, (i+1)*step, total); if (i > 0 and cent <= histo[-1][1]): # next centile is lower than previous; skip continue; histo.append([binlo,cent]) else: # this is the top bucket histo.append([binlo, __max_int64]) histo[-1].append(__do_rangecount(all_sketches,\ histo[-1][0], histo[-1][1])) binlo = histo[-1][1] + 1; return histo