#include "postgres.h"
/* This is an implementation of HyperLogLog algorithm as described in the
* paper "HyperLogLog: the analysis of near-optimal cardinality estimation
* algorithm", published by Flajolet, Fusy, Gandouet and Meunier in 2007.
* Generally it is an improved version of LogLog algorithm with the last
* step modified, to combine the parts using harmonic means.
*
* TODO The bitmap lengths are encoded in 1B each, which is much more than
* needed for 32bit hashes (5 bits would be enough, according to the paper).
* It might be improved (made more space efficient), but IMO it's not really
* worth the effort and it works with 64bit hashes too which is a good thing
* if you need to work with large cardinalities. So I'll keep it this for now.
*
* Computing the number of bits based on ndistinct seems rather simple - if
* 'b' is the number of bits, the estimator should handle 2^(2^b) distinct
* values. So with 5 bits, each bin can store values between 0 and 31.
* Apparently 31 is used as a special case 'value too high' so, there are
* only 30 values, so the bins count ~2^30 values.
*
* By reversing this, we can derive number of bits necessary.
*
* 4 bits -> 2^14 (~16k)
* 5 bits -> 2^30 (~1e9)
* 6 bits -> 2^62 (~4e18)
* 7 bits -> 2^126 ...
*
* So 4 bits seem to low and 6 bits unnecessarily high, and 5 bits is pretty
* much the best default option.
*
*
* TODO Implement merging two estimators (just as with adaptive estimator).
*/
typedef struct HyperLogLogCounterData {
/* length of the structure (varlena) */
int32 length;
/* Number of counters ('m' in the algorithm) - this is determined depending
* on the requested error rate - see hyperloglog_create() for details. */
int b; /* bits for bin index */
int m; /* m = 2^b */
/* number of bits for a single counter (1B=8bits for now, but may change) */
int binbits;
/* largest observed 'rho' for each of the 'm' bins (uses the very same trick
* as in the varlena type in include/c.h */
char data[1];
} HyperLogLogCounterData;
typedef HyperLogLogCounterData * HyperLogLogCounter;
/* creates an optimal bitmap able to count a multiset with the expected
* cardinality and the given error rate. */
HyperLogLogCounter hyperloglog_create(int64 ndistinct, float error);
int hyperloglog_get_size(int64 ndistinct, float error);
HyperLogLogCounter hyperloglog_copy(HyperLogLogCounter counter);
HyperLogLogCounter hyperloglog_merge(HyperLogLogCounter counter1, HyperLogLogCounter counter2, bool inplace);
/* add element existence */
void hyperloglog_add_element(HyperLogLogCounter hloglog, const char * element, int elen);
/* get an estimate from the hyperloglog counter */
int hyperloglog_estimate(HyperLogLogCounter hloglog);
void hyperloglog_reset_internal(HyperLogLogCounter hloglog);