/*! * \file countmin.h * * \brief header file for CM and MFV sketches */ #ifndef _COUNTMIN_H_ #define _COUNTMIN_H_ #include /* CHAR_BIT */ #define INT64BITS (sizeof(int64)*CHAR_BIT) #define RANGES INT64BITS #define DEPTH 8 /* magic tuning value: number of hash functions */ /* #define NUMCOUNTERS 65535 */ #define NUMCOUNTERS 1024 /* another magic tuning value: modulus of hash functions */ #ifdef INT64_IS_BUSTED #define MAX_INT64 (INT64CONST(0x7FFFFFFF)) #define MAX_UINT64 (UINT64CONST(0xFFFFFFFF)) #else #define MAX_INT64 (INT64CONST(0x7FFFFFFFFFFFFFFF)) #define MAX_UINT64 (UINT64CONST(0xFFFFFFFFFFFFFFFF)) #endif /* INT64_IS_BUSTED */ #define MID_INT64 (0) #define MIN_INT64 (~MAX_INT64) #define MID_UINT64 (MAX_UINT64 >> 1) #define MIN_UINT64 (0) /*! * \brief the CountMin sketch array * * a CountMin sketch is a set of DEPTH arrays of NUMCOUNTERS each. * It's like a "counting Bloom Filter" where instead of just hashing to * DEPTH bitmaps, we count up hash-collisions in DEPTH counter arrays */ typedef uint64 countmin[DEPTH][NUMCOUNTERS]; #define MAXARGS 3 /*! * \internal * \brief the transition value struct for CM sketches * * Holds the sketch counters * and a cache of handy metadata that we'll reuse across calls * \endinternal */ typedef struct { int64 args[MAXARGS]; /*! carry along additional args for finalizer */ int nargs; /*! number of args being carried for finalizer */ countmin sketches[RANGES]; } cmtransval; /*! base size of a cmtransval */ #define CM_TRANSVAL_SZ (VARHDRSZ + sizeof(cmtransval)) #define CM_TRANSVAL_INITIALIZED(t) (VARSIZE(t) >= CM_TRANSVAL_SZ) /*! * \internal * \brief array of ranges * * a data structure to hold the constituent dyadic (power-of-two) ranges * corresponding to an arbitrary range. * E.g. 14-48 becomes [[14-15], [16-31], [32-47], [48-48]] * \endinternal */ typedef struct { int64 spans[2*INT64BITS][2]; /*! the ranges */ uint32 emptyoffset; /*! offset of next empty span */ } rangelist; #define ADVANCE_OFFSET(r) \ if ((r).emptyoffset == 2*INT64BITS) { \ uint32 i; \ for (i = 0; i < 2*INT64BITS; i++) \ elog(NOTICE, \ "[" INT64_FORMAT ", " INT64_FORMAT "]", \ (r).spans[i][0], \ (r).spans[i][1]); \ elog(ERROR, "countmin error: rangelist overflow"); \ } \ else ((r).emptyoffset++); /*! * \internal * \brief offset/count pairs for MFV sketches * \endinternal */ typedef struct { unsigned offset; /*! memory offset to the value */ uint64 cnt; /*! counter */ } offsetcnt; /*! * \internal * \brief the transition value struct for MFV sketches. * * Holds a single * countmin sketch (no dyadic ranges) and an array of Most Frequent Values. * We are flexible with the number of mfvs, as well as the type. * Hence at the end of this struct is an array mfv[max_mfvs] of offsetcnt entries, * followed by an array of Postgres text objects with the output formats of * the mfvs. * Each mfv entry contains an offset from the top of the structure where * we can find a Postgres text object holding the output format of a * frequent value. * \endinternal */ typedef struct { unsigned max_mfvs; /*! number of frequent values */ unsigned next_mfv; /*! index of next mfv to insert into */ unsigned next_offset; /*! next memory offset to insert into */ Oid typOid; /*! Oid of the type being counted */ int typLen; /*! Length of the data type */ bool typByVal; /*! Whether type is by value or by reference */ Oid outFuncOid; /*! Oid of the outfunc for this type */ countmin sketch; /*! a single countmin sketch */ /*! * type-independent collection of Most Frequent Values * Holds an array of (counter,offset) pairs, which by * convention is followed by the values themselves as text Datums, * accessible via the offsets */ offsetcnt mfvs[]; } mfvtransval; /*! base size of an MFV transval */ #define MFV_TRANSVAL_SZ(i) (VARHDRSZ + sizeof(mfvtransval) + i*sizeof(offsetcnt)) /*! free space remaining for text values */ #define MFV_TRANSVAL_CAPACITY(transblob) (VARSIZE(transblob) - VARHDRSZ - \ ((mfvtransval *)VARDATA(transblob))-> \ next_offset) /* countmin aggregate protos */ Datum countmin_trans_c(countmin, Datum, Oid, Oid); bytea *cmsketch_check_transval(PG_FUNCTION_ARGS, bool); bytea *cmsketch_init_transval(); void countmin_dyadic_trans_c(cmtransval *, Datum); /* countmin scalar function protos */ int64 cmsketch_count_c(countmin, Datum, Oid, Oid); int64 cmsketch_count_md5_datum(countmin, bytea *, Oid); /* hash_counters_iterate and its lambdas */ int64 hash_counters_iterate(bytea *, countmin, int64, int64 (*lambdaptr)( uint32, uint32, countmin, int64)); int64 increment_counter(uint32, uint32, countmin, int64); int64 min_counter(uint32, uint32, countmin, int64); /* MFV protos */ bytea *mfv_transval_append(bytea *, Datum); int mfv_find(bytea *, Datum); bytea *mfv_transval_replace(bytea *, Datum, int); bytea *mfv_transval_insert_at(bytea *, Datum, uint32); void *mfv_transval_getval(bytea *, uint32); bytea *mfv_init_transval(int, Oid); bytea *mfvsketch_merge_c(bytea *, bytea *); void mfv_copy_datum(bytea *, int, Datum); int cnt_cmp_desc(const void *i, const void *j); /* UDF protos */ Datum __cmsketch_int8_trans(PG_FUNCTION_ARGS); Datum cmsketch_width_histogram(PG_FUNCTION_ARGS); Datum cmsketch_dhistogram(PG_FUNCTION_ARGS); Datum __cmsketch_base64_final(PG_FUNCTION_ARGS); Datum __cmsketch_merge(PG_FUNCTION_ARGS); Datum cmsketch_dump(PG_FUNCTION_ARGS); Datum __cmsketch_count_final(PG_FUNCTION_ARGS); Datum __cmsketch_rangecount_final(PG_FUNCTION_ARGS); Datum __cmsketch_centile_final(PG_FUNCTION_ARGS); Datum __cmsketch_median_final(PG_FUNCTION_ARGS); Datum __cmsketch_dhist_final(PG_FUNCTION_ARGS); Datum __mfvsketch_trans(PG_FUNCTION_ARGS); Datum __mfvsketch_final(PG_FUNCTION_ARGS); Datum __mfvsketch_merge(PG_FUNCTION_ARGS); #endif /* _COUNTMIN_H_ */