/* ----------------------------------------------------------------------- */ /** * Licensed to the Apache Software Foundation (ASF) under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information * regarding copyright ownership. The ASF licenses this file * to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, * software distributed under the License is distributed on an * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY * KIND, either express or implied. See the License for the * specific language governing permissions and limitations * under the License. * * @file sessionize.sql_in * * @brief SQL functions for sessionization functions * @date May 2016 * */ /* ----------------------------------------------------------------------- */ m4_include(`SQLCommon.m4') /** @addtogroup grp_sessionize
sessionize( source_table, output_table, partition_expr, time_stamp, max_time, output_cols, create_view )\b Arguments
DROP TABLE IF EXISTS eventlog CASCADE; -- Using CASCADE in case you are running through this example more than once (views used below)
CREATE TABLE eventlog (event_timestamp TIMESTAMP,
user_id INT,
page TEXT,
revenue FLOAT);
INSERT INTO eventlog VALUES
('04/15/2015 02:19:00', 101331, 'CHECKOUT', 16),
('04/15/2015 02:17:00', 202201, 'WINE', 0),
('04/15/2015 03:18:00', 202201, 'BEER', 0),
('04/15/2015 01:03:00', 100821, 'LANDING', 0),
('04/15/2015 01:04:00', 100821, 'WINE', 0),
('04/15/2015 01:05:00', 100821, 'CHECKOUT', 39),
('04/15/2015 02:06:00', 100821, 'WINE', 0),
('04/15/2015 02:09:00', 100821, 'WINE', 0),
('04/15/2015 02:15:00', 101331, 'LANDING', 0),
('04/15/2015 02:16:00', 101331, 'WINE', 0),
('04/15/2015 02:17:00', 101331, 'HELP', 0),
('04/15/2015 02:18:00', 101331, 'WINE', 0),
('04/15/2015 02:29:00', 201881, 'LANDING', 0),
('04/15/2015 02:30:00', 201881, 'BEER', 0),
('04/15/2015 01:05:00', 202201, 'LANDING', 0),
('04/15/2015 01:06:00', 202201, 'HELP', 0),
('04/15/2015 01:09:00', 202201, 'LANDING', 0),
('04/15/2015 02:15:00', 202201, 'WINE', 0),
('04/15/2015 02:16:00', 202201, 'BEER', 0),
('04/15/2015 03:19:00', 202201, 'WINE', 0),
('04/15/2015 03:22:00', 202201, 'CHECKOUT', 21);
Sessionize the table by each user_id:
DROP VIEW IF EXISTS sessionize_output_view;
SELECT madlib.sessionize(
'eventlog', -- Name of input table
'sessionize_output_view', -- View to store sessionize results
'user_id', -- Partition input table by user id
'event_timestamp', -- Time column used to compute sessions
'0:30:0' -- Use 30 minute time out to define sessions
);
SELECT * FROM sessionize_output_view ORDER BY user_id, event_timestamp;
Result:
event_timestamp | user_id | page | revenue | session_id ---------------------+---------+----------+---------+------------ 2015-04-15 01:03:00 | 100821 | LANDING | 0 | 1 2015-04-15 01:04:00 | 100821 | WINE | 0 | 1 2015-04-15 01:05:00 | 100821 | CHECKOUT | 39 | 1 2015-04-15 02:06:00 | 100821 | WINE | 0 | 2 2015-04-15 02:09:00 | 100821 | WINE | 0 | 2 2015-04-15 02:15:00 | 101331 | LANDING | 0 | 1 2015-04-15 02:16:00 | 101331 | WINE | 0 | 1 2015-04-15 02:17:00 | 101331 | HELP | 0 | 1 2015-04-15 02:18:00 | 101331 | WINE | 0 | 1 2015-04-15 02:19:00 | 101331 | CHECKOUT | 16 | 1 2015-04-15 02:29:00 | 201881 | LANDING | 0 | 1 2015-04-15 02:30:00 | 201881 | BEER | 0 | 1 2015-04-15 01:05:00 | 202201 | LANDING | 0 | 1 2015-04-15 01:06:00 | 202201 | HELP | 0 | 1 2015-04-15 01:09:00 | 202201 | LANDING | 0 | 1 2015-04-15 02:15:00 | 202201 | WINE | 0 | 2 2015-04-15 02:16:00 | 202201 | BEER | 0 | 2 2015-04-15 02:17:00 | 202201 | WINE | 0 | 2 2015-04-15 03:18:00 | 202201 | BEER | 0 | 3 2015-04-15 03:19:00 | 202201 | WINE | 0 | 3 2015-04-15 03:22:00 | 202201 | CHECKOUT | 21 | 3 (21 rows)Now let's say we want to see 3 minute sessions by a group of users with a certain range of user IDs. To do this, we need to sessionize the table based on a partition expression. Also, we want to persist a table output with a reduced set of columns in the table.
DROP TABLE IF EXISTS sessionize_output_table;
SELECT madlib.sessionize(
'eventlog', -- Name of input table
'sessionize_output_table', -- Table to store sessionize results
'user_id < 200000', -- Partition input table by subset of users
'event_timestamp', -- Order partitions in input table by time
'180', -- Use 180 second time out to define sessions (same as '0:03:0')
'event_timestamp, user_id, user_id < 200000 AS "Department-A1"', -- Select only user_id and event_timestamp columns, along with the session id as output
'f' -- create a table
);
SELECT * FROM sessionize_output_table WHERE "Department-A1"='TRUE' ORDER BY event_timestamp;
Result showing 2 users and 3 total sessions across the group:
event_timestamp | user_id | Department-A1 | session_id ---------------------+---------+---------------+------------ 2015-04-15 01:03:00 | 100821 | t | 1 2015-04-15 01:04:00 | 100821 | t | 1 2015-04-15 01:05:00 | 100821 | t | 1 2015-04-15 02:06:00 | 100821 | t | 2 2015-04-15 02:09:00 | 100821 | t | 2 2015-04-15 02:15:00 | 101331 | t | 3 2015-04-15 02:16:00 | 101331 | t | 3 2015-04-15 02:17:00 | 101331 | t | 3 2015-04-15 02:18:00 | 101331 | t | 3 2015-04-15 02:19:00 | 101331 | t | 3 (10 rows)@anchor literature @par Literature NOTE: The following PostgreSQL link refers to documentation resources for the current PostgreSQL database version. Depending upon your database platform version, you may need to change "current" reference in the link to your database version. If your database platform uses the Greenplum Database (or related variants), please check with the project community and/or your database vendor to identify the PostgreSQL version it is based on. [1] Sesssions in web analytics https://en.wikipedia.org/wiki/Session_(web_analytics) [2] PostgreSQL date/time types https://www.postgresql.org/docs/current/static/datatype-datetime.html */ ------------------------------------------------------------------------- CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.sessionize( source_table VARCHAR, output_table VARCHAR, partition_expr VARCHAR, time_stamp VARCHAR, max_time INTERVAL, output_cols VARCHAR, create_view BOOLEAN ) RETURNS void AS $$ PythonFunction(utilities, sessionize, sessionize) $$ LANGUAGE plpythonu m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `'); CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.sessionize( source_table VARCHAR, output_table VARCHAR, partition_expr VARCHAR, time_stamp VARCHAR, max_time INTERVAL, output_cols VARCHAR ) RETURNS void AS $$ SELECT MADLIB_SCHEMA.sessionize($1, $2, $3, $4, $5, $6, True); $$ LANGUAGE SQL m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `'); CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.sessionize( source_table VARCHAR, output_table VARCHAR, partition_expr VARCHAR, time_stamp VARCHAR, max_time INTERVAL ) RETURNS void AS $$ SELECT MADLIB_SCHEMA.sessionize($1, $2, $3, $4, $5, '*', True); $$ LANGUAGE SQL m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `'); ------------------------------------------------------------------------- CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.sessionize(message TEXT) RETURNS text AS $$ PythonFunction(utilities, sessionize, sessionize_help_message) $$ language plpythonu m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `CONTAINS SQL', `'); CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.sessionize() RETURNS text AS $$ SELECT MADLIB_SCHEMA.sessionize(''); $$ language SQL m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `CONTAINS SQL', `');