/* ----------------------------------------------------------------------- *//**
*
* @file correlation.sql_in
*
* @brief SQL functions for correlation computation
* @date April 2013
*
* @sa For a brief introduction to correlation, see the
* module description \ref grp_correlation
*
*//* ----------------------------------------------------------------------- */
m4_include(`SQLCommon.m4')
/**
@addtogroup grp_correlation
@brief Generates a cross-correlation matrix for all pairs of numeric columns in a table.
A correlation function is the degree and direction of association of two
variables—how well one random variable can be predicted from the other. The
coefficient of correlation varies from -1 to 1. A coefficient of 1 implies perfect
correlation, 0 means no correlation, and -1 means perfect anti-correlation.
This function provides a cross-correlation matrix for all pairs of numeric
columns in a source_table. A correlation matrix describes correlation
among \f$ M \f$ variables. It is a square symmetrical \f$ M \f$x \f$M \f$ matrix
with the \f$ (ij) \f$th element equal to the correlation coefficient between the
\f$i\f$th and the \f$j\f$th variable. The diagonal elements (correlations of
variables with themselves) are always equal to 1.0.
@anchor usage
@par Correlation Function
The correlation function has the following syntax:
correlation( source_table,
output_table,
target_cols,
verbose
)
The covariance function, with a similar syntax,
can be used to compute the covariance between features.
covariance( source_table,
output_table,
target_cols,
verbose
)
- source_table
- TEXT. The name of the data containing the input data.
- output_table
- TEXT. The name of the table where the cross-correlation matrix will be saved.
The output is a table with N+2 columns and N rows, where N is the number of
target columns. It contains the following columns.
| column_position |
The first column is a sequential counter indicating the position of the variable in the 'output_table'. |
| variable |
The second column contains the row-header for the variables. |
| <...> |
The remainder of the table is the NxN correlation matrix for the pairs of
numeric 'source_table' columns. |
The output table is arranged as a lower-triangular matrix with the upper
triangle set to NULL and the diagonal elements set to 1.0. To obtain the result
from the 'output_table' in this matrix format ensure to order the
elements using the 'column_position', as shown in the example below.
SELECT * FROM output_table ORDER BY column_position;
In addition to output table, a summary table named \_summary
is also created at the same time, which has the following columns:
| method | 'correlation' |
| source_table | VARCHAR. The data source table name. |
| output_table | VARCHAR. The output table name. |
| column_names | VARCHAR. Column names used for correlation computation, comma-separated string. |
| mean_vector | FLOAT8[]. Vector where each is the mean of a column. |
| total_rows_processed |
BIGINT. Total numbers of rows processed. |
- target_cols (optional)
- TEXT, default: '*'. A comma-separated list of the columns to correlate.
If NULL or '*', results are produced for all numeric columns.
- verbose (optional)
- BOOLEAN, default: FALSE. Print verbose debugging information if TRUE.
@anchor examples
@examp
-# View online help for the correlation function.
SELECT madlib.correlation();
-# Create an input data set.
DROP TABLE IF EXISTS example_data;
CREATE TABLE example_data(
id SERIAL, outlook TEXT,
temperature FLOAT8, humidity FLOAT8,
windy TEXT, class TEXT);
INSERT INTO example_data VALUES
(1, 'sunny', 85, 85, 'false', 'Dont Play'),
(2, 'sunny', 80, 90, 'true', 'Dont Play'),
(3, 'overcast', 83, 78, 'false', 'Play'),
(4, 'rain', 70, 96, 'false', 'Play'),
(5, 'rain', 68, 80, 'false', 'Play'),
(6, 'rain', 65, 70, 'true', 'Dont Play'),
(7, 'overcast', 64, 65, 'true', 'Play'),
(8, 'sunny', 72, 95, 'false', 'Dont Play'),
(9, 'sunny', 69, 70, 'false', 'Play'),
(10, 'rain', 75, 80, 'false', 'Play'),
(11, 'sunny', 75, 70, 'true', 'Play'),
(12, 'overcast', 72, 90, 'true', 'Play'),
(13, 'overcast', 81, 75, 'false', 'Play'),
(14, 'rain', 71, 80, 'true', 'Dont Play'),
(15, NULL, 100, 100, 'true', NULL),
(16, NULL, 110, 100, 'true', NULL);
-# Run the correlation() function on the data set.
-- Correlate all numeric columns
SELECT madlib.correlation( 'example_data',
'example_data_output'
);
-- Setting target_cols to NULL or '*' also correlates all numeric columns
SELECT madlib.correlation( 'example_data',
'example_data_output',
'*'
);
-- Correlate only the temperature and humidity columns
SELECT madlib.correlation( 'example_data',
'example_data_output',
'temperature, humidity'
);
-# View the correlation matrix.
SELECT * FROM example_data_output ORDER BY column_position;
Result:
column_position | variable | temperature | humidity
-----------------+-------------+-------------------+----------
1 | temperature | 1.0 |
2 | humidity | 0.616876934548786 | 1.0
(2 rows)
-# Compute the covariance of features in the data set.
SELECT madlib.covariance( 'example_data',
'cov_output'
);
-# View the covariance matrix.
SELECT * FROM cov_output ORDER BY column_position;
Result:
column_position | variable | temperature | humidity
-----------------+-------------+-------------------+----------
1 | temperature | 146.25 |
2 | humidity | 82.125 | 121.1875
(2 rows)
@par Notes
Null values will be replaced by the mean of their respective columns (Mean
imputation/substitution). Mean imputation is a method in which the missing
value on a certain variable is replaced by the mean of the available cases.
This method maintains the sample size and is easy to use, but the variability
in the data is reduced, so the standard deviations and the variance estimates
tend to be underestimated. Please refer to [1] and [2] for details.
If the mean imputation method is not suitable for the target use case, it is
advised to employ a view that handles the NULL values prior to calling the
correlation/covariance functions.
@anchor literature
@literature
[1] https://en.wikipedia.org/wiki/Imputation_(statistics)
[2] https://www.iriseekhout.com/missing-data/missing-data-methods/imputation-methods/
@anchor related
@par Related Topics
File correlation.sql_in documenting the SQL functions
\ref grp_summary for general descriptive statistics for a table
*/
-----------------------------------------------------------------------
-- Aggregate function for correlation
-----------------------------------------------------------------------
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.correlation_transition(
state double precision[],
x double precision[],
mean double precision[]
) RETURNS double precision[] AS
'MODULE_PATHNAME', 'correlation_transition'
LANGUAGE C IMMUTABLE
m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `NO SQL', `');
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.correlation_merge(
left_state double precision[],
right_state double precision[]
) RETURNS double precision[] AS
'MODULE_PATHNAME', 'correlation_merge_states'
LANGUAGE C IMMUTABLE
m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `NO SQL', `');
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.correlation_final(
state double precision[]
) RETURNS double precision[] AS
'MODULE_PATHNAME', 'correlation_final'
LANGUAGE C IMMUTABLE
m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `NO SQL', `');
DROP AGGREGATE IF EXISTS MADLIB_SCHEMA.correlation_agg(
double precision[], double precision[]);
CREATE AGGREGATE MADLIB_SCHEMA.correlation_agg(
/* x */ double precision[],
/* mean */ double precision[]
) (
SType = double precision[],
SFunc = MADLIB_SCHEMA.correlation_transition,
m4_ifdef(`__POSTGRESQL__', `', `prefunc=MADLIB_SCHEMA.correlation_merge,')
FinalFunc = MADLIB_SCHEMA.correlation_final
-- use NULL as the initial value
);
DROP AGGREGATE IF EXISTS MADLIB_SCHEMA.covariance_agg(
double precision[], double precision[]);
CREATE AGGREGATE MADLIB_SCHEMA.covariance_agg(
/* x */ double precision[],
/* mean */ double precision[]
) (
SType = double precision[],
m4_ifdef(`__POSTGRESQL__', `', `prefunc=MADLIB_SCHEMA.correlation_merge,')
SFunc = MADLIB_SCHEMA.correlation_transition
-- use NULL as the initial value
-- return the last transition or merge state as the final state
-- this aggregate does not divide by the number of samples
-- (hence it's sum of (x-mean)^2 instead of expectation)
);
-----------------------------------------------------------------------
-- Main function for correlation
-----------------------------------------------------------------------
/* @brief Compute a correlation matrix for a table with optional target columns specified
@param source_table Name of source relation containing the data
@param output_table Name of output table name to store the correlation
@param target_cols String with comma separated list of columns for which cross-correlation is desired
@param verbose Flag to determine verbosity
@usage
SELECT MADLIB_SCHEMA.correlation (
'source_table', 'output_table',
'target_cols'
);
SELECT * FROM 'output_table' ORDER BY 'colum_position';
*/
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.correlation(
source_table varchar, -- input table name
output_table varchar, -- output table name
target_cols varchar, -- comma separated list of output cols (default = '*')
verbose boolean -- flag to determine verbosity
) RETURNS TEXT AS $$
PythonFunction(stats, correlation, correlation)
$$ LANGUAGE plpythonu VOLATILE
m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `');
-----------------------------------------------------------------------
-- Overloaded functions
-----------------------------------------------------------------------
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.correlation(
source_table varchar, -- input table name
output_table varchar, -- output table name
target_cols varchar -- comma separated list of output cols (default = '*')
)
RETURNS TEXT AS $$
select MADLIB_SCHEMA.correlation($1, $2, $3, FALSE)
$$ LANGUAGE sql VOLATILE
m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `');
-----------------------------------------------------------------------
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.correlation(
source_table varchar, -- input table name
output_table varchar -- output table name
) RETURNS TEXT AS $$
select MADLIB_SCHEMA.correlation($1, $2, NULL)
$$ LANGUAGE sql VOLATILE
m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `');
-----------------------------------------------------------------------
-- Help functions
-----------------------------------------------------------------------
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.correlation(
input_message text
) RETURNS TEXT AS $$
PythonFunctionBodyOnly(`stats', `correlation')
return correlation.correlation_help_message(schema_madlib, input_message)
$$ LANGUAGE plpythonu IMMUTABLE
m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `NO SQL', `');
-----------------------------------------------------------------------
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.correlation()
RETURNS TEXT AS $$
PythonFunctionBodyOnly(`stats', `correlation')
return correlation.correlation_help_message(schema_madlib, None)
$$ LANGUAGE plpythonu IMMUTABLE
m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `NO SQL', `');
-------------------------------------------------------------------------
-----------------------------------------------------------------------
-- Main function for covariance
-----------------------------------------------------------------------
/* @brief Compute a covariance matrix for a table with optional target columns specified
@param source_table Name of source relation containing the data
@param output_table Name of output table name to store the correlation
@param target_cols String with comma separated list of columns for which cross-correlation is desired
@param verbose Flag to determine verbosity
@usage
SELECT MADLIB_SCHEMA.covariance (
'source_table', 'output_table',
'target_cols'
);
SELECT * FROM 'output_table' ORDER BY 'colum_position';
*/
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.covariance(
source_table varchar, -- input table name
output_table varchar, -- output table name
target_cols varchar, -- comma separated list of output cols (default = '*')
verbose boolean -- flag to determine verbosity
) RETURNS TEXT AS $$
PythonFunctionBodyOnly(`stats', `correlation')
return correlation.correlation(schema_madlib, source_table, output_table,
target_cols, True, verbose)
$$ LANGUAGE plpythonu VOLATILE
m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `');
-----------------------------------------------------------------------
-- Overloaded functions
-----------------------------------------------------------------------
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.covariance(
source_table varchar, -- input table name
output_table varchar, -- output table name
target_cols varchar -- comma separated list of output cols (default = '*')
)
RETURNS TEXT AS $$
select MADLIB_SCHEMA.covariance($1, $2, $3, FALSE)
$$ LANGUAGE sql VOLATILE
m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `');
-----------------------------------------------------------------------
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.covariance(
source_table varchar, -- input table name
output_table varchar -- output table name
) RETURNS TEXT AS $$
select MADLIB_SCHEMA.covariance($1, $2, NULL)
$$ LANGUAGE sql VOLATILE
m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `');
-----------------------------------------------------------------------
-- Help functions
-----------------------------------------------------------------------
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.covariance(
input_message text
) RETURNS TEXT AS $$
PythonFunctionBodyOnly(`stats', `correlation')
return correlation.correlation_help_message(schema_madlib, input_message, cov=True)
$$ LANGUAGE plpythonu IMMUTABLE
m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `NO SQL', `');
-----------------------------------------------------------------------
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.covariance()
RETURNS TEXT AS $$
PythonFunctionBodyOnly(`stats', `correlation')
return correlation.correlation_help_message(schema_madlib, None, cov=True)
$$ LANGUAGE plpythonu IMMUTABLE
m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `NO SQL', `');
-------------------------------------------------------------------------