/* ----------------------------------------------------------------------- *//** * * Licensed to the Apache Software Foundation (ASF) under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information * regarding copyright ownership. The ASF licenses this file * to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, * software distributed under the License is distributed on an * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY * KIND, either express or implied. See the License for the * specific language governing permissions and limitations * under the License. * * * @file encode_categorical.sql_in * * @brief SQL functions for encoding categorical variables to numerical values * @date Dec 2016 * * @sa Encodes categorical variables to numerical values * *//* ----------------------------------------------------------------------- */ m4_include(`SQLCommon.m4') /** @addtogroup grp_encode_categorical
@brief Provides functions to encode categorical variables @anchor categorical @par Coding Systems for Categorical Variables Categorical variables [1] require special attention in regression analysis because, unlike dichotomous or continuous variables, they cannot be entered into the regression equation just as they are. For example, if you have a variable called race that is coded with 1=Hispanic, 2=Asian, 3=Black, 4=White, then entering race in your regression will look at the linear effect of the race variable, which is probably not what you intended. Instead, categorical variables like this need to be coded into a series of indicator variables which can then be entered into the regression model. There are a variety of coding systems that can be used for coding categorical variables, including one-hot, dummy, effects, orthogonal, and Helmert. We currently support one-hot and dummy coding techniques. Dummy coding is used when a researcher wants to compare other groups of the predictor variable with one specific group of the predictor variable. Often, the specific group to compare with is called the reference group. One-hot encoding is similar to dummy coding except it builds indicator (0/1) columns (cast as numeric) for each value of each category. Only one of these columns could take on the value 1 for each row (data point). There is no reference category for this function.
encode_categorical_variables (
source_table,
output_table,
categorical_cols,
categorical_cols_to_exclude, -- Optional
row_id, -- Optional
top, -- Optional
value_to_drop, -- Optional
encode_null, -- Optional
output_type, -- Optional
output_dictionary, -- Optional
distributed_by -- Optional
)
\b Arguments
DROP TABLE IF EXISTS abalone;
CREATE TABLE abalone (
id serial,
sex character varying,
length double precision,
diameter double precision,
height double precision,
rings int
);
INSERT INTO abalone (sex, length, diameter, height, rings) VALUES
('M', 0.455, 0.365, 0.095, 15),
('M', 0.35, 0.265, 0.09, 7),
('F', 0.53, 0.42, 0.135, 9),
('M', 0.44, 0.365, 0.125, 10),
('I', 0.33, 0.255, 0.08, 7),
('I', 0.425, 0.3, 0.095, 8),
('F', 0.53, 0.415, 0.15, 20),
('F', 0.545, 0.425, 0.125, 16),
('M', 0.475, 0.37, 0.125, 9),
(NULL, 0.55, 0.44, 0.15, 19),
('F', 0.525, 0.38, 0.14, 14),
('M', 0.43, 0.35, 0.11, 10),
('M', 0.49, 0.38, 0.135, 11),
('F', 0.535, 0.405, 0.145, 10),
('F', 0.47, 0.355, 0.1, 10),
('M', 0.5, 0.4, 0.13, 12),
('I', 0.355, 0.28, 0.085, 7),
('F', 0.44, 0.34, 0.1, 10),
('M', 0.365, 0.295, 0.08, 7),
(NULL, 0.45, 0.32, 0.1, 9);
-# Create new table with one-hot encoding.
The column 'sex' is replaced by three columns encoding the
values 'F', 'M' and 'I'. Null values are not encoded by default:
DROP TABLE IF EXISTS abalone_out, abalone_out_dictionary;
SELECT madlib.encode_categorical_variables (
'abalone', -- Source table
'abalone_out', -- Output table
'sex' -- Categorical columns
);
SELECT * FROM abalone_out ORDER BY id;
id | length | diameter | height | rings | sex_F | sex_I | sex_M ----+--------+----------+--------+-------+-------+-------+------- 1 | 0.455 | 0.365 | 0.095 | 15 | 0 | 0 | 1 2 | 0.35 | 0.265 | 0.09 | 7 | 0 | 0 | 1 3 | 0.53 | 0.42 | 0.135 | 9 | 1 | 0 | 0 4 | 0.44 | 0.365 | 0.125 | 10 | 0 | 0 | 1 5 | 0.33 | 0.255 | 0.08 | 7 | 0 | 1 | 0 6 | 0.425 | 0.3 | 0.095 | 8 | 0 | 1 | 0 7 | 0.53 | 0.415 | 0.15 | 20 | 1 | 0 | 0 8 | 0.545 | 0.425 | 0.125 | 16 | 1 | 0 | 0 9 | 0.475 | 0.37 | 0.125 | 9 | 0 | 0 | 1 10 | 0.55 | 0.44 | 0.15 | 19 | 0 | 0 | 0 11 | 0.525 | 0.38 | 0.14 | 14 | 1 | 0 | 0 12 | 0.43 | 0.35 | 0.11 | 10 | 0 | 0 | 1 13 | 0.49 | 0.38 | 0.135 | 11 | 0 | 0 | 1 14 | 0.535 | 0.405 | 0.145 | 10 | 1 | 0 | 0 15 | 0.47 | 0.355 | 0.1 | 10 | 1 | 0 | 0 16 | 0.5 | 0.4 | 0.13 | 12 | 0 | 0 | 1 17 | 0.355 | 0.28 | 0.085 | 7 | 0 | 1 | 0 18 | 0.44 | 0.34 | 0.1 | 10 | 1 | 0 | 0 19 | 0.365 | 0.295 | 0.08 | 7 | 0 | 0 | 1 20 | 0.45 | 0.32 | 0.1 | 9 | 0 | 0 | 0 (20 rows)-# Now include NULL values in encoding (note the additional column 'sex_null'):
DROP TABLE IF EXISTS abalone_out, abalone_out_dictionary;
SELECT madlib.encode_categorical_variables (
'abalone', -- Source table
'abalone_out', -- Output table
'sex', -- Categorical columns
NULL, -- Categorical columns to exclude
NULL, -- Index columns
NULL, -- Top values
NULL, -- Value to drop for dummy encoding
TRUE -- Encode nulls
);
SELECT * FROM abalone_out ORDER BY id;
id | length | diameter | height | rings | sex_F | sex_I | sex_M | sex_null ----+--------+----------+--------+-------+-------+-------+-------+---------- 1 | 0.455 | 0.365 | 0.095 | 15 | 0 | 0 | 1 | 0 2 | 0.35 | 0.265 | 0.09 | 7 | 0 | 0 | 1 | 0 3 | 0.53 | 0.42 | 0.135 | 9 | 1 | 0 | 0 | 0 4 | 0.44 | 0.365 | 0.125 | 10 | 0 | 0 | 1 | 0 5 | 0.33 | 0.255 | 0.08 | 7 | 0 | 1 | 0 | 0 6 | 0.425 | 0.3 | 0.095 | 8 | 0 | 1 | 0 | 0 7 | 0.53 | 0.415 | 0.15 | 20 | 1 | 0 | 0 | 0 8 | 0.545 | 0.425 | 0.125 | 16 | 1 | 0 | 0 | 0 9 | 0.475 | 0.37 | 0.125 | 9 | 0 | 0 | 1 | 0 10 | 0.55 | 0.44 | 0.15 | 19 | 0 | 0 | 0 | 1 11 | 0.525 | 0.38 | 0.14 | 14 | 1 | 0 | 0 | 0 12 | 0.43 | 0.35 | 0.11 | 10 | 0 | 0 | 1 | 0 13 | 0.49 | 0.38 | 0.135 | 11 | 0 | 0 | 1 | 0 14 | 0.535 | 0.405 | 0.145 | 10 | 1 | 0 | 0 | 0 15 | 0.47 | 0.355 | 0.1 | 10 | 1 | 0 | 0 | 0 16 | 0.5 | 0.4 | 0.13 | 12 | 0 | 0 | 1 | 0 17 | 0.355 | 0.28 | 0.085 | 7 | 0 | 1 | 0 | 0 18 | 0.44 | 0.34 | 0.1 | 10 | 1 | 0 | 0 | 0 19 | 0.365 | 0.295 | 0.08 | 7 | 0 | 0 | 1 | 0 20 | 0.45 | 0.32 | 0.1 | 9 | 0 | 0 | 0 | 1 (20 rows)-# Encode all categorical variables in the source table. Also, specify the column 'id' as the index (primary key) which changes the output table to include only the index and the encoded variables:
DROP TABLE IF EXISTS abalone_out, abalone_out_dictionary;
SELECT madlib.encode_categorical_variables (
'abalone', -- Source table
'abalone_out', -- Output table
'*', -- Categorical columns
NULL, -- Categorical columns to exclude
'id' -- Index columns
);
SELECT * FROM abalone_out ORDER BY id;
id | sex_F | sex_I | sex_M | rings_7 | rings_8 | rings_9 | rings_10 | rings_11 | rings_12 | rings_14 | rings_15 | rings_16 | rings_19 | rings_20 ----+-------+-------+-------+---------+---------+---------+----------+----------+----------+----------+----------+----------+----------+---------- 1 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 2 | 0 | 0 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 3 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 4 | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 5 | 0 | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 6 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 7 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 8 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 9 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 10 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 11 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 12 | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 13 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 14 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 15 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 16 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 17 | 0 | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 18 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 19 | 0 | 0 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 20 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 (20 rows)-# Now let's encode only the top values and group others into a miscellaneous bucket column. Top values can be global across all columns or specified by column. As an example of the latter, here are the top 2 'sex' values and the top 50% of 'rings' values:
DROP TABLE IF EXISTS abalone_out, abalone_out_dictionary;
SELECT madlib.encode_categorical_variables (
'abalone', -- Source table
'abalone_out', -- Output table
'*', -- Categorical columns
NULL, -- Categorical columns to exclude
'id', -- Index columns
'sex=2, rings=0.5' -- Top values
);
SELECT * FROM abalone_out ORDER BY id;
id | sex_M | sex_F | sex__misc__ | rings_10 | rings_7 | rings_9 | rings__misc__ ----+-------+-------+-------------+----------+---------+---------+--------------- 1 | 1 | 0 | 0 | 0 | 0 | 0 | 1 2 | 1 | 0 | 0 | 0 | 1 | 0 | 0 3 | 0 | 1 | 0 | 0 | 0 | 1 | 0 4 | 1 | 0 | 0 | 1 | 0 | 0 | 0 5 | 0 | 0 | 1 | 0 | 1 | 0 | 0 6 | 0 | 0 | 1 | 0 | 0 | 0 | 1 7 | 0 | 1 | 0 | 0 | 0 | 0 | 1 8 | 0 | 1 | 0 | 0 | 0 | 0 | 1 9 | 1 | 0 | 0 | 0 | 0 | 1 | 0 10 | 0 | 0 | 0 | 0 | 0 | 0 | 1 11 | 0 | 1 | 0 | 0 | 0 | 0 | 1 12 | 1 | 0 | 0 | 1 | 0 | 0 | 0 13 | 1 | 0 | 0 | 0 | 0 | 0 | 1 14 | 0 | 1 | 0 | 1 | 0 | 0 | 0 15 | 0 | 1 | 0 | 1 | 0 | 0 | 0 16 | 1 | 0 | 0 | 0 | 0 | 0 | 1 17 | 0 | 0 | 1 | 0 | 1 | 0 | 0 18 | 0 | 1 | 0 | 1 | 0 | 0 | 0 19 | 1 | 0 | 0 | 0 | 1 | 0 | 0 20 | 0 | 0 | 0 | 0 | 0 | 1 | 0 (20 rows)-# If you want to see both the raw categorical variable and its encoded form in the output_table, then include the categorical variable(s) in the index parameter. (Remember that this will not work if you specify '*' for the parameter 'categorical_cols', because in this case 'row_id' columns will not be encoded at all.)
DROP TABLE IF EXISTS abalone_out, abalone_out_dictionary;
SELECT madlib.encode_categorical_variables (
'abalone', -- Source table
'abalone_out', -- Output table
'sex, rings', -- Categorical columns
NULL, -- Categorical columns to exclude
'id, sex, rings' -- Index columns
);
SELECT * FROM abalone_out ORDER BY id;
id | sex | rings | sex_F | sex_I | sex_M | rings_7 | rings_8 | rings_9 | rings_10 | rings_11 | rings_12 | rings_14 | rings_15 | rings_16 | rings_19 | rings_20 ----+-----+-------+-------+-------+-------+---------+---------+---------+----------+----------+----------+----------+----------+----------+----------+---------- 1 | M | 15 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 2 | M | 7 | 0 | 0 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 3 | F | 9 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 4 | M | 10 | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 5 | I | 7 | 0 | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 6 | I | 8 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 7 | F | 20 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 8 | F | 16 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 9 | M | 9 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 10 | | 19 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 11 | F | 14 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 12 | M | 10 | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 13 | M | 11 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 14 | F | 10 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 15 | F | 10 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 16 | M | 12 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 17 | I | 7 | 0 | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 18 | F | 10 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 19 | M | 7 | 0 | 0 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 20 | | 9 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 (20 rows)-# For dummy encoding, let's make the 'I' value from the 'sex' variable as the reference. Here we use the 'value_to_drop' parameter:
DROP TABLE IF EXISTS abalone_out, abalone_out_dictionary;
SELECT madlib.encode_categorical_variables (
'abalone', -- Source table
'abalone_out', -- Output table
'*', -- Categorical columns
'rings', -- Categorical columns to exclude
'id', -- Index columns
NULL, -- Top value
'sex=I' -- Value to drop for dummy encoding
);
SELECT * FROM abalone_out ORDER BY id;
id | sex_F | sex_M ----+-------+------- 1 | 0 | 1 2 | 0 | 1 3 | 1 | 0 4 | 0 | 1 5 | 0 | 0 6 | 0 | 0 7 | 1 | 0 8 | 1 | 0 9 | 0 | 1 10 | 0 | 0 11 | 1 | 0 12 | 0 | 1 13 | 0 | 1 14 | 1 | 0 15 | 1 | 0 16 | 0 | 1 17 | 0 | 0 18 | 1 | 0 19 | 0 | 1 20 | 0 | 0 (20 rows)-# Create an array output for the two categorical variables in the source table:
DROP TABLE IF EXISTS abalone_out, abalone_out_dictionary;
SELECT madlib.encode_categorical_variables (
'abalone', -- Source table
'abalone_out', -- Output table
'*', -- Categorical columns
NULL, -- Categorical columns to exclude
'id', -- Index columns
NULL, -- Top values
NULL, -- Value to drop for dummy encoding
NULL, -- Encode nulls
'array' -- Array output type
);
SELECT * FROM abalone_out ORDER BY id;
id | __encoded_variables__
----+-------------------------------
1 | {0,0,1,0,0,0,0,0,0,0,1,0,0,0}
2 | {0,0,1,1,0,0,0,0,0,0,0,0,0,0}
3 | {1,0,0,0,0,1,0,0,0,0,0,0,0,0}
4 | {0,0,1,0,0,0,1,0,0,0,0,0,0,0}
5 | {0,1,0,1,0,0,0,0,0,0,0,0,0,0}
6 | {0,1,0,0,1,0,0,0,0,0,0,0,0,0}
7 | {1,0,0,0,0,0,0,0,0,0,0,0,0,1}
8 | {1,0,0,0,0,0,0,0,0,0,0,1,0,0}
9 | {0,0,1,0,0,1,0,0,0,0,0,0,0,0}
10 | {0,0,0,0,0,0,0,0,0,0,0,0,1,0}
11 | {1,0,0,0,0,0,0,0,0,1,0,0,0,0}
12 | {0,0,1,0,0,0,1,0,0,0,0,0,0,0}
13 | {0,0,1,0,0,0,0,1,0,0,0,0,0,0}
14 | {1,0,0,0,0,0,1,0,0,0,0,0,0,0}
15 | {1,0,0,0,0,0,1,0,0,0,0,0,0,0}
16 | {0,0,1,0,0,0,0,0,1,0,0,0,0,0}
17 | {0,1,0,1,0,0,0,0,0,0,0,0,0,0}
18 | {1,0,0,0,0,0,1,0,0,0,0,0,0,0}
19 | {0,0,1,1,0,0,0,0,0,0,0,0,0,0}
20 | {0,0,0,0,0,1,0,0,0,0,0,0,0,0}
(20 rows)
View the dictionary table that gives the index into the array:
SELECT * FROM abalone_out_dictionary;
encoded_column_name | index | variable | value -----------------------+-------+----------+------- __encoded_variables__ | 1 | sex | F __encoded_variables__ | 2 | sex | I __encoded_variables__ | 3 | sex | M __encoded_variables__ | 4 | rings | 7 __encoded_variables__ | 5 | rings | 8 __encoded_variables__ | 6 | rings | 9 __encoded_variables__ | 7 | rings | 10 __encoded_variables__ | 8 | rings | 11 __encoded_variables__ | 9 | rings | 12 __encoded_variables__ | 10 | rings | 14 __encoded_variables__ | 11 | rings | 15 __encoded_variables__ | 12 | rings | 16 __encoded_variables__ | 13 | rings | 19 __encoded_variables__ | 14 | rings | 20 (14 rows)-# Create a dictionary output:
DROP TABLE IF EXISTS abalone_out, abalone_out_dictionary;
SELECT madlib.encode_categorical_variables (
'abalone', -- Source table
'abalone_out', -- Output table
'*', -- Categorical columns
NULL, -- Categorical columns to exclude
'id', -- Index columns
NULL, -- Top values
NULL, -- Value to drop for dummy encoding
NULL, -- Encode nulls
NULL, -- Output type
TRUE -- Dictionary output
);
SELECT * FROM abalone_out ORDER BY id;
id | sex_1 | sex_2 | sex_3 | rings_1 | rings_2 | rings_3 | rings_4 | rings_5 | rings_6 | rings_7 | rings_8 | rings_9 | rings_10 | rings_11 ----+-------+-------+-------+---------+---------+---------+---------+---------+---------+---------+---------+---------+----------+---------- 1 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 2 | 0 | 0 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 3 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 4 | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 5 | 0 | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 6 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 7 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 8 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 9 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 10 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 11 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 12 | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 13 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 14 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 15 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 16 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 17 | 0 | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 18 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 19 | 0 | 0 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 20 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 (20 rows)View the dictionary table that defines the numerical columns in the output table:
SELECT * FROM abalone_out_dictionary ORDER BY encoded_column_name;
encoded_column_name | index | variable | value ---------------------+-------+----------+------- "rings_1" | 1 | rings | 7 "rings_10" | 10 | rings | 19 "rings_11" | 11 | rings | 20 "rings_2" | 2 | rings | 8 "rings_3" | 3 | rings | 9 "rings_4" | 4 | rings | 10 "rings_5" | 5 | rings | 11 "rings_6" | 6 | rings | 12 "rings_7" | 7 | rings | 14 "rings_8" | 8 | rings | 15 "rings_9" | 9 | rings | 16 "sex_1" | 1 | sex | F "sex_2" | 2 | sex | I "sex_3" | 3 | sex | M (14 rows)-# We can chose from various distribution policies of the output table, for examply RANDOMLY:
DROP TABLE IF EXISTS abalone_out, abalone_out_dictionary;
SELECT madlib.encode_categorical_variables (
'abalone', -- Source table
'abalone_out', -- Output table
'*', -- Categorical columns
NULL, -- Categorical columns to exclude
'id', -- Index columns
NULL, -- Top values
NULL, -- Value to drop for dummy encoding
NULL, -- Encode nulls
NULL, -- Output type
NULL, -- Dictionary output
'RANDOMLY' -- Distribution policy
);
SELECT * FROM abalone_out ORDER BY id;
id | sex_F | sex_I | sex_M | rings_10 | rings_11 | rings_12 | rings_14 | rings_15 | rings_16 | rings_19 | rings_20 | rings_7 | rings_8 | rings_9 ----+-------+-------+-------+----------+----------+----------+----------+----------+----------+----------+----------+---------+---------+--------- 1 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 2 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 3 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 4 | 0 | 0 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 5 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 6 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 7 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 8 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 9 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 10 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 11 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 12 | 0 | 0 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 13 | 0 | 0 | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 14 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 15 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 16 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 17 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 18 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 19 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 20 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 (20 rows)-# If you have a reason to encode FLOAT variables, you can cast them as TEXT in the following way within the function call:
DROP TABLE IF EXISTS abalone_out, abalone_out_dictionary;
SELECT madlib.encode_categorical_variables (
'abalone', -- Source table
'abalone_out', -- Output table
'height::TEXT' -- Categorical columns
);
SELECT * FROM abalone_out ORDER BY id;
id | sex | length | diameter | height | rings | height::TEXT_0.08 | height::TEXT_0.085 | height::TEXT_0.09 | height::TEXT_0.095 | height::TEXT_0.1 | height::TEXT_0.11 | height::TEXT_0.125 | height::TEXT_0.13 | height::TEXT_0.135 | height::TEXT_0.14 | height::TEXT_0.145 | height::TEXT_0.15 ----+-----+--------+----------+--------+-------+-------------------+--------------------+-------------------+--------------------+------------------+-------------------+--------------------+-------------------+--------------------+-------------------+--------------------+------------------- 1 | M | 0.455 | 0.365 | 0.095 | 15 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 2 | M | 0.35 | 0.265 | 0.09 | 7 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 3 | F | 0.53 | 0.42 | 0.135 | 9 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 4 | M | 0.44 | 0.365 | 0.125 | 10 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 5 | I | 0.33 | 0.255 | 0.08 | 7 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 6 | I | 0.425 | 0.3 | 0.095 | 8 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 7 | F | 0.53 | 0.415 | 0.15 | 20 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 8 | F | 0.545 | 0.425 | 0.125 | 16 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 9 | M | 0.475 | 0.37 | 0.125 | 9 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 10 | | 0.55 | 0.44 | 0.15 | 19 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 11 | F | 0.525 | 0.38 | 0.14 | 14 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 12 | M | 0.43 | 0.35 | 0.11 | 10 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 13 | M | 0.49 | 0.38 | 0.135 | 11 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 14 | F | 0.535 | 0.405 | 0.145 | 10 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 15 | F | 0.47 | 0.355 | 0.1 | 10 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 16 | M | 0.5 | 0.4 | 0.13 | 12 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 17 | I | 0.355 | 0.28 | 0.085 | 7 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 18 | F | 0.44 | 0.34 | 0.1 | 10 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 19 | M | 0.365 | 0.295 | 0.08 | 7 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 20 | | 0.45 | 0.32 | 0.1 | 9 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 (20 rows)-# You can also use a logical expression in the categorical columns, which will be passed as boolean, and in the output table there will be two columns with name logical_expression_true and logical_expression_false:
DROP TABLE IF EXISTS abalone_out, abalone_out_dictionary;
SELECT madlib.encode_categorical_variables (
'abalone', -- Source table
'abalone_out', -- Output table
'height>.10' -- Categorical columns
);
SELECT * FROM abalone_out ORDER BY id;
id | sex | length | diameter | height | rings | height>.10_false | height>.10_true ----+-----+--------+----------+--------+-------+------------------+----------------- 1 | M | 0.455 | 0.365 | 0.095 | 15 | 1 | 0 2 | M | 0.35 | 0.265 | 0.09 | 7 | 1 | 0 3 | F | 0.53 | 0.42 | 0.135 | 9 | 0 | 1 4 | M | 0.44 | 0.365 | 0.125 | 10 | 0 | 1 5 | I | 0.33 | 0.255 | 0.08 | 7 | 1 | 0 6 | I | 0.425 | 0.3 | 0.095 | 8 | 1 | 0 7 | F | 0.53 | 0.415 | 0.15 | 20 | 0 | 1 8 | F | 0.545 | 0.425 | 0.125 | 16 | 0 | 1 9 | M | 0.475 | 0.37 | 0.125 | 9 | 0 | 1 10 | | 0.55 | 0.44 | 0.15 | 19 | 0 | 1 11 | F | 0.525 | 0.38 | 0.14 | 14 | 0 | 1 12 | M | 0.43 | 0.35 | 0.11 | 10 | 0 | 1 13 | M | 0.49 | 0.38 | 0.135 | 11 | 0 | 1 14 | F | 0.535 | 0.405 | 0.145 | 10 | 0 | 1 15 | F | 0.47 | 0.355 | 0.1 | 10 | 1 | 0 16 | M | 0.5 | 0.4 | 0.13 | 12 | 0 | 1 17 | I | 0.355 | 0.28 | 0.085 | 7 | 1 | 0 18 | F | 0.44 | 0.34 | 0.1 | 10 | 1 | 0 19 | M | 0.365 | 0.295 | 0.08 | 7 | 1 | 0 20 | | 0.45 | 0.32 | 0.1 | 9 | 1 | 0 (20 rows)@anchor literature @literature @anchor svm-lit-1 [1] https://en.wikipedia.org/wiki/Categorical_variable [2] https://archive.ics.uci.edu/ml/datasets/Abalone */ ------------------------------------------------------------------------- /** * @brief Encode categorical columns using either one-hot encoding or dummy coding * * @param source_table Name of table containing categorical variable * @param output_table Name of table to output dummy variables * @param categorical_cols Comma-separated list of column names to dummy code (can be '*') * @param categorical_cols_to_exclude Comma-separated list of column names to exclude (if categorical_cols = '*') * @param row_id Columns from source table to index output table * @param top Parameter to include only top values of a categorical variable * @param value_to_drop Parameter to set reference column in dummy coding * @param encode_null Boolean to determine the behavior for rows with NULL value * @param output_type Parameter to set output data type: 'column', 'array' or 'svec' * @param output_dictionary Boolean to simplify column naming and with a separate * mapping table to actual values * @param distributed_by Comma-separated list of column names to use for distribution of output * * @return Void * */ -- Create the below function for PostgreSQL but ensure that distributed_by is a no-op CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.encode_categorical_variables( source_table VARCHAR, output_table VARCHAR, categorical_cols VARCHAR, categorical_cols_to_exclude VARCHAR, row_id VARCHAR, top VARCHAR, value_to_drop VARCHAR, encode_null BOOLEAN, output_type VARCHAR, output_dictionary BOOLEAN, distributed_by VARCHAR ) RETURNS VOID AS $$ PythonFunction(utilities, encode_categorical, encode_categorical_variables) $$ LANGUAGE plpythonu m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `'); CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.encode_categorical_variables( source_table VARCHAR, output_table VARCHAR, categorical_cols VARCHAR, categorical_cols_to_exclude VARCHAR, row_id VARCHAR, top VARCHAR, value_to_drop VARCHAR, encode_null BOOLEAN, output_type VARCHAR, output_dictionary BOOLEAN ) RETURNS VOID AS $$ PythonFunction(utilities, encode_categorical, encode_categorical_variables) $$ LANGUAGE plpythonu m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `'); -- Overloaded functions -------------------------------------------------------- -- Default values are set by underlying Python function -------------------------------------------------------------------------------- CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.encode_categorical_variables( source_table VARCHAR, output_table VARCHAR, categorical_cols VARCHAR, categorical_cols_to_exclude VARCHAR, row_id VARCHAR, top VARCHAR, value_to_drop VARCHAR, encode_null BOOLEAN, output_type VARCHAR ) RETURNS VOID AS $$ PythonFunction(utilities, encode_categorical, encode_categorical_variables) $$ LANGUAGE plpythonu m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `'); CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.encode_categorical_variables( source_table VARCHAR, output_table VARCHAR, categorical_cols VARCHAR, categorical_cols_to_exclude VARCHAR, row_id VARCHAR, top VARCHAR, value_to_drop VARCHAR, encode_null BOOLEAN ) RETURNS VOID AS $$ PythonFunction(utilities, encode_categorical, encode_categorical_variables) $$ LANGUAGE plpythonu m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `'); CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.encode_categorical_variables( source_table VARCHAR, output_table VARCHAR, categorical_cols VARCHAR, categorical_cols_to_exclude VARCHAR, row_id VARCHAR, top VARCHAR, value_to_drop VARCHAR ) RETURNS VOID AS $$ PythonFunction(utilities, encode_categorical, encode_categorical_variables) $$ LANGUAGE plpythonu m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `'); CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.encode_categorical_variables( source_table VARCHAR, output_table VARCHAR, categorical_cols VARCHAR, categorical_cols_to_exclude VARCHAR, row_id VARCHAR, top VARCHAR ) RETURNS VOID AS $$ PythonFunction(utilities, encode_categorical, encode_categorical_variables) $$ LANGUAGE plpythonu m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `'); CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.encode_categorical_variables( source_table VARCHAR, output_table VARCHAR, categorical_cols VARCHAR, categorical_cols_to_exclude VARCHAR, row_id VARCHAR ) RETURNS VOID AS $$ PythonFunction(utilities, encode_categorical, encode_categorical_variables) $$ LANGUAGE plpythonu m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `'); CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.encode_categorical_variables( source_table VARCHAR, output_table VARCHAR, categorical_cols VARCHAR, categorical_cols_to_exclude VARCHAR ) RETURNS VOID AS $$ PythonFunction(utilities, encode_categorical, encode_categorical_variables) $$ LANGUAGE plpythonu m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `'); CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.encode_categorical_variables( source_table VARCHAR, output_table VARCHAR, categorical_cols VARCHAR ) RETURNS VOID AS $$ PythonFunction(utilities, encode_categorical, encode_categorical_variables) $$ LANGUAGE plpythonu m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `'); -- Online help ----------------------------------------------------------------- CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.encode_categorical_variables( message VARCHAR ) RETURNS VARCHAR AS $$ PythonFunction(utilities, encode_categorical, encode_categorical_help) $$ LANGUAGE plpythonu IMMUTABLE m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `CONTAINS SQL', `'); -------------------------------------------------------------------------------- CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.encode_categorical_variables() RETURNS VARCHAR AS $$ SELECT MADLIB_SCHEMA.encode_categorical_variables(''); $$ LANGUAGE sql IMMUTABLE m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `CONTAINS SQL', `'); --------------------------------------------------------------------------------