_standardization
is also created. This is needed by the models that will use the
preprocessed data so is likely not of much interest to users.
It has the following columns:
| grouping columns |
If 'grouping_col' is specified,
a column for each grouping column
is created. |
| mean |
Mean of independent variables. |
| std |
Population standard deviation of
independent variables. |
@anchor example
@par Examples
-# Create an input data set based on the well known iris data set:
DROP TABLE IF EXISTS iris_data;
CREATE TABLE iris_data(
id serial,
attributes numeric[],
class_text varchar,
class integer,
state varchar
);
INSERT INTO iris_data(id, attributes, class_text, class, state) VALUES
(1,ARRAY[5.0,3.2,1.2,0.2],'Iris_setosa',1,'Alaska'),
(2,ARRAY[5.5,3.5,1.3,0.2],'Iris_setosa',1,'Alaska'),
(3,ARRAY[4.9,3.1,1.5,0.1],'Iris_setosa',1,'Alaska'),
(4,ARRAY[4.4,3.0,1.3,0.2],'Iris_setosa',1,'Alaska'),
(5,ARRAY[5.1,3.4,1.5,0.2],'Iris_setosa',1,'Alaska'),
(6,ARRAY[5.0,3.5,1.3,0.3],'Iris_setosa',1,'Alaska'),
(7,ARRAY[4.5,2.3,1.3,0.3],'Iris_setosa',1,'Alaska'),
(8,ARRAY[4.4,3.2,1.3,0.2],'Iris_setosa',1,'Alaska'),
(9,ARRAY[5.0,3.5,1.6,0.6],'Iris_setosa',1,'Alaska'),
(10,ARRAY[5.1,3.8,1.9,0.4],'Iris_setosa',1,'Alaska'),
(11,ARRAY[4.8,3.0,1.4,0.3],'Iris_setosa',1,'Alaska'),
(12,ARRAY[5.1,3.8,1.6,0.2],'Iris_setosa',1,'Alaska'),
(13,ARRAY[5.7,2.8,4.5,1.3],'Iris_versicolor',2,'Alaska'),
(14,ARRAY[6.3,3.3,4.7,1.6],'Iris_versicolor',2,'Alaska'),
(15,ARRAY[4.9,2.4,3.3,1.0],'Iris_versicolor',2,'Alaska'),
(16,ARRAY[6.6,2.9,4.6,1.3],'Iris_versicolor',2,'Alaska'),
(17,ARRAY[5.2,2.7,3.9,1.4],'Iris_versicolor',2,'Alaska'),
(18,ARRAY[5.0,2.0,3.5,1.0],'Iris_versicolor',2,'Alaska'),
(19,ARRAY[5.9,3.0,4.2,1.5],'Iris_versicolor',2,'Alaska'),
(20,ARRAY[6.0,2.2,4.0,1.0],'Iris_versicolor',2,'Alaska'),
(21,ARRAY[6.1,2.9,4.7,1.4],'Iris_versicolor',2,'Alaska'),
(22,ARRAY[5.6,2.9,3.6,1.3],'Iris_versicolor',2,'Alaska'),
(23,ARRAY[6.7,3.1,4.4,1.4],'Iris_versicolor',2,'Alaska'),
(24,ARRAY[5.6,3.0,4.5,1.5],'Iris_versicolor',2,'Alaska'),
(25,ARRAY[5.8,2.7,4.1,1.0],'Iris_versicolor',2,'Alaska'),
(26,ARRAY[6.2,2.2,4.5,1.5],'Iris_versicolor',2,'Alaska'),
(27,ARRAY[5.6,2.5,3.9,1.1],'Iris_versicolor',2,'Alaska'),
(28,ARRAY[5.0,3.4,1.5,0.2],'Iris_setosa',1,'Tennessee'),
(29,ARRAY[4.4,2.9,1.4,0.2],'Iris_setosa',1,'Tennessee'),
(30,ARRAY[4.9,3.1,1.5,0.1],'Iris_setosa',1,'Tennessee'),
(31,ARRAY[5.4,3.7,1.5,0.2],'Iris_setosa',1,'Tennessee'),
(32,ARRAY[4.8,3.4,1.6,0.2],'Iris_setosa',1,'Tennessee'),
(33,ARRAY[4.8,3.0,1.4,0.1],'Iris_setosa',1,'Tennessee'),
(34,ARRAY[4.3,3.0,1.1,0.1],'Iris_setosa',1,'Tennessee'),
(35,ARRAY[5.8,4.0,1.2,0.2],'Iris_setosa',1,'Tennessee'),
(36,ARRAY[5.7,4.4,1.5,0.4],'Iris_setosa',1,'Tennessee'),
(37,ARRAY[5.4,3.9,1.3,0.4],'Iris_setosa',1,'Tennessee'),
(38,ARRAY[6.0,2.9,4.5,1.5],'Iris_versicolor',2,'Tennessee'),
(39,ARRAY[5.7,2.6,3.5,1.0],'Iris_versicolor',2,'Tennessee'),
(40,ARRAY[5.5,2.4,3.8,1.1],'Iris_versicolor',2,'Tennessee'),
(41,ARRAY[5.5,2.4,3.7,1.0],'Iris_versicolor',2,'Tennessee'),
(42,ARRAY[5.8,2.7,3.9,1.2],'Iris_versicolor',2,'Tennessee'),
(43,ARRAY[6.0,2.7,5.1,1.6],'Iris_versicolor',2,'Tennessee'),
(44,ARRAY[5.4,3.0,4.5,1.5],'Iris_versicolor',2,'Tennessee'),
(45,ARRAY[6.0,3.4,4.5,1.6],'Iris_versicolor',2,'Tennessee'),
(46,ARRAY[6.7,3.1,4.7,1.5],'Iris_versicolor',2,'Tennessee'),
(47,ARRAY[6.3,2.3,4.4,1.3],'Iris_versicolor',2,'Tennessee'),
(48,ARRAY[5.6,3.0,4.1,1.3],'Iris_versicolor',2,'Tennessee'),
(49,ARRAY[5.5,2.5,4.0,1.3],'Iris_versicolor',2,'Tennessee'),
(50,ARRAY[5.5,2.6,4.4,1.2],'Iris_versicolor',2,'Tennessee'),
(51,ARRAY[6.1,3.0,4.6,1.4],'Iris_versicolor',2,'Tennessee'),
(52,ARRAY[5.8,2.6,4.0,1.2],'Iris_versicolor',2,'Tennessee');
-# Run the preprocessor:
DROP TABLE IF EXISTS iris_data_packed, iris_data_packed_summary, iris_data_packed_standardization;
SELECT madlib.minibatch_preprocessor('iris_data', -- Source table
'iris_data_packed', -- Output table
'class_text', -- Dependent variable
'attributes' -- Independent variables
);
For small datasets like in this example, buffer size is mainly
determined by the number of segments in the database.
This example is run on a Greenplum database with 2 segments,
so there are 2 rows with a buffer size of 26.
For PostgresSQL, there would be only one row with a buffer
size of 52 since it is a single node database.
For larger data sets, other factors go into
computing buffers size besides number of segments.
Also, note that the dependent variable has
been one-hot encoded since it is categorical.
Here is a sample of the packed output table:
\\x on
SELECT * FROM iris_data_packed;
-[ RECORD 1 ]-------+-------------------------------------
__id__ | 0
dependent_varname | {{1,0},{0,1},{1,0},{0,1},{1,0},{0,1},{0,1},{1,0},{1,0},{1,0},{1,0},{0,1},{0,1},{0,1},{1,0},{0,1},{0,1},{0,1},{1,0},{0,1},{1,0},{0,1},{1,0},{1,0},{1,0},{0,1}}
independent_varname | {{-0.767560815504508,0.806649237861967,-1.07515071152907,-1.18456909732025},{-0.0995580974152422,0.00385956572525086,1.03989986852812,1.17758048907675},...
...
-[ RECORD 2 ]-------+-------------------------------------
__id__ | 1
dependent_varname | {{1,0},{1,0},{1,0},{0,1},{0,1},{1,0},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{0,1},{1,0},{0,1},{0,1},{0,1},{0,1},{0,1},{1,0},{0,1},{1,0},{0,1},{1,0},{1,0},{0,1}}
independent_varname | {{0.568444620674023,2.01083374606704,-1.28665576953479,-1.18456909732025},{-1.76956489263841,0.405254401793609,-1.21615408353289,-1.18456909732025},...
...
Review the output summary table:
SELECT * FROM iris_data_packed_summary;
-[ RECORD 1 ]------------+------------------------------
source_table | iris_data
output_table | iris_data_packed
dependent_varname | class_text
independent_varname | attributes
buffer_size | 26
class_values | {Iris_setosa,Iris_versicolor}
num_rows_processed | 52
num_missing_rows_skipped | 0
grouping_cols |
Review the output standardization table:
SELECT * FROM iris_data_packed_standardization;
-[ RECORD 1 ]------------------------------------------------------
mean | {5.45961538462,2.99807692308,3.025,0.851923076923}
std | {0.598799958695,0.498262513686,1.41840579525,0.550346179381}
-# Generally the default buffer size will work well,
but if you have occasion to change it:
DROP TABLE IF EXISTS iris_data_packed, iris_data_packed_summary, iris_data_packed_standardization;
SELECT madlib.minibatch_preprocessor('iris_data', -- Source table
'iris_data_packed', -- Output table
'class_text', -- Dependent variable
'attributes', -- Independent variables
NULL, -- Grouping
10 -- Buffer size
);
Review the output summary table:
SELECT * FROM iris_data_packed_summary;
-[ RECORD 1 ]------------+------------------------------
source_table | iris_data
output_table | iris_data_packed
dependent_varname | class_text
independent_varname | attributes
buffer_size | 10
class_values | {Iris_setosa,Iris_versicolor}
num_rows_processed | 52
num_missing_rows_skipped | 0
grouping_cols |
-# Run the preprocessor with grouping by state:
DROP TABLE IF EXISTS iris_data_packed, iris_data_packed_summary, iris_data_packed_standardization;
SELECT madlib.minibatch_preprocessor('iris_data', -- Source table
'iris_data_packed', -- Output table
'class_text', -- Dependent variable
'attributes', -- Independent variables
'state' -- Grouping
);
Review the output table:
SELECT * FROM iris_data_packed ORDER BY state, __id__;
-[ RECORD 1 ]-------+-------------------------------------
__id__ | 0
state | Alaska
dependent_varname | {{0,1},{0,1},{1,0},{0,1},{0,1},{0,1},{1,0},{0,1},{0,1},{1,0},{1,0},{0,1},{0,1}}
independent_varname | {{0.306242850830503,-0.977074857057813,0.680489757142278 ...
...
-[ RECORD 2 ]-------+-------------------------------------
__id__ | 1
state | Alaska
dependent_varname | {{0,1},{1,0},{0,1},{0,1},{1,0},{1,0},{1,0},{0,1},{1,0},{0,1},{0,1},{1,0},{1,0}}
independent_varname | {{1.10129640587123,-0.126074175104234,1.2524188915498 ...
...
-[ RECORD 3 ]-------+-------------------------------------
__id__ | 2
state | Alaska
dependent_varname | {{1,0}}
independent_varname | {{-0.647821415218373,1.15042684782613,-1.17827992968215 ...
...
-[ RECORD 4 ]-------+-------------------------------------
__id__ | 0
state | Tennessee
dependent_varname | {{1,0},{0,1},{1,0},{1,0},{1,0},{0,1},{1,0},{0,1},{0,1},{0,1},{1,0},{1,0},{0,1}}
independent_varname | {{0.32912603663053,2.59625206429212,-1.12079945083087 ...
...
-[ RECORD 5 ]-------+-------------------------------------
__id__ | 1
state | Tennessee
dependent_varname | {{0,1},{0,1},{0,1},{1,0},{1,0},{0,1},{0,1},{1,0},{0,1},{0,1},{0,1},{0,1}}
independent_varname | {{0.865744574615085,-0.267261241912424,0.970244300719264 ...
...
Review the output summary table:
SELECT * FROM iris_data_packed_summary;
-[ RECORD 1 ]------------+------------------------------
source_table | iris_data
output_table | iris_data_packed
dependent_varname | class_text
independent_varname | attributes
buffer_size | 13
class_values | {Iris_setosa,Iris_versicolor}
num_rows_processed | 52
num_missing_rows_skipped | 0
grouping_cols | state
Review the output standardization table:
SELECT * FROM iris_data_packed_standardization;
-[ RECORD 1 ]-------------------------------------------------------------------
state | Alaska
mean | {5.40740740740741,2.95925925925926,2.94814814814815,0.833333333333333}
std | {0.628888452645665,0.470034875978888,1.39877469405147,0.536103914747325}
-[ RECORD 2 ]-------------------------------------------------------------------
state | Tennessee
mean | {5.516,3.04,3.108,0.872}
std | {0.55905634778617,0.523832034148353,1.43469021046357,0.564637937088893}
-# If the depedent variable is scalar integer,
and you have not already encoded it, you can ask
the preprocessor to encode it for you:
DROP TABLE IF EXISTS iris_data_packed, iris_data_packed_summary, iris_data_packed_standardization;
SELECT madlib.minibatch_preprocessor('iris_data', -- Source table
'iris_data_packed', -- Output table
'class', -- Integer dependent variable
'attributes', -- Independent variables
NULL, -- Grouping
NULL, -- Buffer size
TRUE -- Encode scalar int dependent variable
);
Review the output summary table:
SELECT * FROM iris_data_packed_summary;
-[ RECORD 1 ]------------+-----------------
source_table | iris_data
output_table | iris_data_packed
dependent_varname | class
independent_varname | attributes
dependent_vartype | integer
buffer_size | 26
class_values | {1,2}
num_rows_processed | 52
num_missing_rows_skipped | 0
grouping_cols |
@anchor literature
@literature
[1] "Neural Networks for Machine Learning", Lectures 6a and 6b on mini-batch gradient descent,
Geoffrey Hinton with Nitish Srivastava and Kevin Swersky,
http://www.cs.toronto.edu/~tijmen/csc321/slides/lecture_slides_lec6.pdf
@anchor related
@par Related Topics
minibatch_preprocessing.sql_in
Neural Networks
*/
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.minibatch_preprocessor(
source_table VARCHAR,
output_table VARCHAR,
dependent_varname VARCHAR,
independent_varname VARCHAR,
grouping_cols VARCHAR,
buffer_size INTEGER,
one_hot_encode_int_dep_var BOOLEAN
) RETURNS VOID AS $$
PythonFunctionBodyOnly(utilities, minibatch_preprocessing)
minibatch_preprocessor_obj = minibatch_preprocessing.MiniBatchPreProcessor(**globals())
minibatch_preprocessor_obj.minibatch_preprocessor()
$$ LANGUAGE plpythonu VOLATILE
m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `');
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.minibatch_preprocessor(
source_table VARCHAR,
output_table VARCHAR,
dependent_varname VARCHAR,
independent_varname VARCHAR,
grouping_cols VARCHAR,
buffer_size INTEGER
) RETURNS VOID AS $$
SELECT MADLIB_SCHEMA.minibatch_preprocessor($1, $2, $3, $4, $5, $6, FALSE);
$$ LANGUAGE sql VOLATILE
m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `');
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.minibatch_preprocessor(
source_table VARCHAR,
output_table VARCHAR,
dependent_varname VARCHAR,
independent_varname VARCHAR,
grouping_cols VARCHAR
) RETURNS VOID AS $$
SELECT MADLIB_SCHEMA.minibatch_preprocessor($1, $2, $3, $4, $5, NULL, FALSE);
$$ LANGUAGE sql VOLATILE
m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `');
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.minibatch_preprocessor(
source_table VARCHAR,
output_table VARCHAR,
dependent_varname VARCHAR,
independent_varname VARCHAR
) RETURNS VOID AS $$
SELECT MADLIB_SCHEMA.minibatch_preprocessor($1, $2, $3, $4, NULL, NULL, FALSE);
$$ LANGUAGE sql VOLATILE
m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `');
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.minibatch_preprocessor(
message VARCHAR
) RETURNS VARCHAR AS $$
PythonFunctionBodyOnly(utilities, minibatch_preprocessing)
return minibatch_preprocessing.MiniBatchDocumentation.minibatch_preprocessor_help(schema_madlib, message)
$$ LANGUAGE plpythonu VOLATILE
m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `');
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.minibatch_preprocessor()
RETURNS VARCHAR AS $$
PythonFunctionBodyOnly(utilities, minibatch_preprocessing)
return minibatch_preprocessing.MiniBatchDocumentation.minibatch_preprocessor_help(schema_madlib, '')
$$ LANGUAGE plpythonu VOLATILE
m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `');