/* ----------------------------------------------------------------------- *//**
*
* @file sketch.sql_in
*
* @brief SQL functions for sketch-based approximations of descriptive statistics
* @date April 2011
*
* @sa For a brief introduction to sketches, see the module description grp_sketches
*
*//* ----------------------------------------------------------------------- */
m4_include(`SQLCommon.m4')
/**
@addtogroup grp_sketches
@brief A collection of methods to estimate the number of unique values contained in the data.
\warning These MADlib methods are still in early stage development. There may be some
issues that will be addressed in future versions. Interface and implementation
is subject to change.
Sketches (sometimes called "synopsis data structures") are small randomized
in-memory data structures that capture statistical properties of a large set
of values (e.g., a column of a table). Sketches can be formed in a single
pass of the data, and used to approximate a variety of descriptive statistics.
We implement sketches as SQL User-Defined Aggregates (UDAs). Because they
are single-pass, small-space and parallelized, a single query can
use many sketches to gather summary statistics on many columns of a table efficiently.
This module currently implements user-defined aggregates based on three main sketch methods:
- Count-Min (CM) sketches, which can be used to approximate a number of descriptive statistics including
-
fmsketch_dcount( col_name )@anchor examples @examp -# Generate some data.
CREATE TABLE data(class INT, a1 INT); INSERT INTO data SELECT 1,1 FROM generate_series(1,10000); INSERT INTO data SELECT 1,2 FROM generate_series(1,15000); INSERT INTO data SELECT 1,3 FROM generate_series(1,10000); INSERT INTO data SELECT 2,5 FROM generate_series(1,1000); INSERT INTO data SELECT 2,6 FROM generate_series(1,1000);-# Find the distinct number of values for each class.
SELECT class, fmsketch_dcount(a1) FROM data GROUP BY data.class;Result:
class | fmsketch_dcount ------+----------------- 2 | 2 1 | 3 (2 rows)@anchor literature @literature [1] P. Flajolet and N.G. Martin. Probabilistic counting algorithms for data base applications, Journal of Computer and System Sciences 31(2), pp 182-209, 1985. http://algo.inria.fr/flajolet/Publications/FlMa85.pdf @anchor related @par Related Topics File sketch.sql_in documenting the SQL function. */ /** @addtogroup grp_countmin
cmsketch( col_name )- Get the number of rows where col_name = p, computed from the sketch obtained from cmsketch.
cmsketch_count( cmsketch, p )- Get the number of rows where col_name is between m and n inclusive.
cmsketch_rangecount( cmsketch, m, n )- Get the kth percentile of col_name where count specifies number of rows. k should be an integer between 1 to 99.
cmsketch_centile( cmsketch, k, count )- Get the median of col_name where count specifies number of rows. This is equivalent to \ref cmsketch_centile(cmsketch,50,count).
cmsketch_median( cmsketch, count )- Get an n-bucket histogram for values between min and max for the column where each bucket has approximately the same width. The output is a text string containing triples {lo, hi, count} representing the buckets; counts are approximate.
cmsketch_width_histogram( cmsketch, min, max, n )- Get an n-bucket histogram for the column where each bucket has approximately the same count. The output is a text string containing triples {lo, hi, count} representing the buckets; counts are approximate. Note that an equi-depth histogram is equivalent to a spanning set of equi-spaced centiles.
cmsketch_depth_histogram( cmsketch, n )@anchor examples @examp -# Generate some data.
CREATE TABLE data(class INT, a1 INT); INSERT INTO data SELECT 1,1 FROM generate_series(1,10000); INSERT INTO data SELECT 1,2 FROM generate_series(1,15000); INSERT INTO data SELECT 1,3 FROM generate_series(1,10000); INSERT INTO data SELECT 2,5 FROM generate_series(1,1000); INSERT INTO data SELECT 2,6 FROM generate_series(1,1000);-# Count number of rows where a1 = 2 in each class.
SELECT class, cmsketch_count( cmsketch( a1 ), 2 ) FROM data GROUP BY data.class;Result:
class | cmsketch_count ------+---------------- 2 | 0 1 | 15000 (2 rows)-# Count number of rows where a1 is between 3 and 6.
SELECT class, cmsketch_rangecount( cmsketch(a1), 3, 6 ) FROM data GROUP BY data.class;Result:
class | cmsketch_rangecount ------+--------------------- 2 | 2000 1 | 10000 (2 rows)-# Compute the 90th percentile of all of a1.
SELECT cmsketch_centile( cmsketch( a1 ), 90, count(*) ) FROM data;Result:
cmsketch_centile ----------------- 3 (1 row)-# Produce an equi-width histogram with 2 bins between 0 and 10.
SELECT cmsketch_width_histogram( cmsketch( a1 ), 0, 10, 2 ) FROM data;Result:
cmsketch_width_histogram ----------------------------------- [[0L, 4L, 35000], [5L, 10L, 2000]] (1 row)-# Produce an equi-depth histogram of a1 with 2 bins of approximately equal depth.
SELECT cmsketch_depth_histogram( cmsketch( a1 ), 2 ) FROM data;Result:
cmsketch_depth_histogram ---------------------------------------------------------------------- [[-9223372036854775807L, 1, 10000], [2, 9223372036854775807L, 27000]] (1 row)@anchor literature @literature [1] G. Cormode and S. Muthukrishnan. An improved data stream summary: The count-min sketch and its applications. LATIN 2004, J. Algorithm 55(1): 58-75 (2005) . http://dimacs.rutgers.edu/~graham/pubs/html/CormodeMuthukrishnan04CMLatin.html [2] G. Cormode. Encyclopedia entry on 'Count-Min Sketch'. In L. Liu and M. T. Ozsu, editors, Encyclopedia of Database Systems, pages 511-516. Springer, 2009. http://dimacs.rutgers.edu/~graham/pubs/html/Cormode09b.html @anchor related File sketch.sql_in documenting the SQL functions. Module \ref grp_quantile for a different implementation of quantile function. */ /** @addtogroup grp_mfvsketch
mfvsketch_top_histogram( col_name, n )The MFV frequent-value UDA comes in two different versions: - a faithful implementation that preserves the approximation guarantees of Cormode/Muthukrishnan, - and a "quick and dirty" version that can do parallel aggregation in Greenplum at the expense of missing some of the most frequent values. In PostgreSQL the two UDAs are identical. In Greenplum, the quick version should produce good results unless the number of values requested is very small, or the distribution is very flat. @anchor examples @examp -# Generate some data.
CREATE TABLE data(class INT, a1 INT); INSERT INTO data SELECT 1,1 FROM generate_series(1,10000); INSERT INTO data SELECT 1,2 FROM generate_series(1,15000); INSERT INTO data SELECT 1,3 FROM generate_series(1,10000); INSERT INTO data SELECT 2,5 FROM generate_series(1,1000); INSERT INTO data SELECT 2,6 FROM generate_series(1,1000);-# Produce a histogram of 5 bins and return the most frequent value and associated count in each bin.
SELECT mfvsketch_top_histogram( a1, 5 ) FROM data;Result:
mfvsketch_top_histogram ------------------------------------------------------------- [0:4][0:1]={{2,15000},{1,10000},{3,10000},{5,1000},{6,1000}} (1 row)@anchor literature @literature This method is not usually called an MFV sketch in the literature; it is a natural extension of the CountMin sketch. @anchor related @par Related Topics File sketch.sql_in documenting the SQL functions. Module \ref grp_countmin. */ -- FM Sketch Functions DROP FUNCTION IF EXISTS MADLIB_SCHEMA.big_or(bitmap1 bytea, bitmap2 bytea) CASCADE; CREATE FUNCTION MADLIB_SCHEMA.big_or(bitmap1 bytea, bitmap2 bytea) RETURNS bytea AS 'MODULE_PATHNAME' LANGUAGE C STRICT m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `NO SQL', `'); DROP FUNCTION IF EXISTS MADLIB_SCHEMA.__fmsketch_trans(bitmaps bytea, input anyelement) CASCADE; CREATE FUNCTION MADLIB_SCHEMA.__fmsketch_trans(bitmaps bytea, input anyelement) RETURNS bytea AS 'MODULE_PATHNAME' LANGUAGE C STRICT m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `NO SQL', `'); DROP FUNCTION IF EXISTS MADLIB_SCHEMA.__fmsketch_count_distinct(bitmaps bytea) CASCADE; CREATE FUNCTION MADLIB_SCHEMA.__fmsketch_count_distinct(bitmaps bytea) RETURNS int8 AS 'MODULE_PATHNAME' LANGUAGE C STRICT m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `NO SQL', `'); DROP FUNCTION IF EXISTS MADLIB_SCHEMA.__fmsketch_merge(bitmaps1 bytea, bitmaps2 bytea) CASCADE; CREATE FUNCTION MADLIB_SCHEMA.__fmsketch_merge(bitmaps1 bytea, bitmaps2 bytea) RETURNS bytea AS 'MODULE_PATHNAME' LANGUAGE C STRICT m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `NO SQL', `'); DROP AGGREGATE IF EXISTS MADLIB_SCHEMA.fmsketch_dcount(anyelement); /** * @brief Flajolet-Martin's distinct count estimation * @param column name */ CREATE AGGREGATE MADLIB_SCHEMA.fmsketch_dcount(/*+ column */ anyelement) ( sfunc = MADLIB_SCHEMA.__fmsketch_trans, stype = bytea, finalfunc = MADLIB_SCHEMA.__fmsketch_count_distinct, m4_ifdef(`__POSTGRESQL__', `', `prefunc = MADLIB_SCHEMA.__fmsketch_merge,') initcond = '' ); -- CM Sketch Functions -- We register __cmsketch_int8_trans for varying numbers of arguments to support -- a variety of agg function signatures. The first 2 args are used to -- aggregate; the remaining args are carried along unchanged inside the -- return structure for the use of the UDA finalizer. DROP FUNCTION IF EXISTS MADLIB_SCHEMA.__cmsketch_int8_trans(bytea, int8) CASCADE; CREATE FUNCTION MADLIB_SCHEMA.__cmsketch_int8_trans(bitmaps bytea, input int8) RETURNS bytea AS 'MODULE_PATHNAME' LANGUAGE C STRICT m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `NO SQL', `'); DROP FUNCTION IF EXISTS MADLIB_SCHEMA.__cmsketch_int8_trans(bytea, int8, int8) CASCADE; CREATE FUNCTION MADLIB_SCHEMA.__cmsketch_int8_trans(bitmaps bytea, input int8, arg1 int8) RETURNS bytea AS 'MODULE_PATHNAME' LANGUAGE C STRICT m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `NO SQL', `'); DROP FUNCTION IF EXISTS MADLIB_SCHEMA.__cmsketch_int8_trans(bytea, int8, int8, int8) CASCADE; CREATE FUNCTION MADLIB_SCHEMA.__cmsketch_int8_trans(bitmaps bytea, input int8, arg1 int8, arg2 int8) RETURNS bytea AS 'MODULE_PATHNAME' LANGUAGE C STRICT m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `NO SQL', `'); DROP FUNCTION IF EXISTS MADLIB_SCHEMA.__cmsketch_int8_trans(bytea, int8, int8, int8, int8) CASCADE; CREATE FUNCTION MADLIB_SCHEMA.__cmsketch_int8_trans(bitmaps bytea, input int8, arg1 int8, arg2 int8, arg3 int8) RETURNS bytea AS 'MODULE_PATHNAME' LANGUAGE C STRICT m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `NO SQL', `'); DROP FUNCTION IF EXISTS MADLIB_SCHEMA.__cmsketch_base64_final(bytea) CASCADE; CREATE FUNCTION MADLIB_SCHEMA.__cmsketch_base64_final(sketch bytea) RETURNS text AS 'MODULE_PATHNAME' LANGUAGE C STRICT m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `NO SQL', `'); DROP FUNCTION IF EXISTS MADLIB_SCHEMA.__cmsketch_merge(bytea, bytea) CASCADE; CREATE FUNCTION MADLIB_SCHEMA.__cmsketch_merge(bytea, bytea) RETURNS bytea AS 'MODULE_PATHNAME' LANGUAGE C STRICT m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `NO SQL', `'); DROP AGGREGATE IF EXISTS MADLIB_SCHEMA.cmsketch(int8); /** *@brief