-- -- multi subquery behavioral analytics queries aims to expand existing subquery pushdown -- regression tests to cover more cases -- the tables that are used depends to multi_insert_select_behavioral_analytics_create_table.sql -- --- We don't need shard id sequence here given that we're not creating any shards, so not writing it at all -- The following line is intended to force Citus to NOT use router planner for the tests in this -- file. The motivation for doing this is to make sure that single-task queries can be planned -- by non-router code-paths. Thus, this flag should NOT be used in production. Otherwise, the actual -- router queries would fail. SET citus.enable_router_execution TO FALSE; --------------------------------------------------------------------- -- Vanilla funnel query --------------------------------------------------------------------- SELECT user_id, array_length(events_table, 1) FROM ( SELECT user_id, array_agg(event ORDER BY time) AS events_table FROM ( SELECT u.user_id, e.event_type::text AS event, e.time FROM users_table AS u, events_table AS e WHERE u.user_id = e.user_id AND u.user_id >= 1 AND u.user_id <= 3 AND e.event_type IN (1, 2) ) t GROUP BY user_id ) q ORDER BY 2 DESC, 1; user_id | array_length --------------------------------------------------------------------- 3 | 187 2 | 180 1 | 28 (3 rows) --------------------------------------------------------------------- -- Funnel grouped by whether or not a user has done an event -- This has multiple subqueries joinin at the top level --------------------------------------------------------------------- SELECT user_id, sum(array_length(events_table, 1)), length(hasdone_event), hasdone_event FROM ( SELECT t1.user_id, array_agg(event ORDER BY time) AS events_table, COALESCE(hasdone_event, 'Has not done event') AS hasdone_event FROM ( ( SELECT u.user_id, 'step=>1'::text AS event, e.time FROM users_table AS u, events_table AS e WHERE u.user_id = e.user_id AND u.user_id >= 1 AND u.user_id <= 3 AND e.event_type IN (1, 2) ) UNION ( SELECT u.user_id, 'step=>2'::text AS event, e.time FROM users_table AS u, events_table AS e WHERE u.user_id = e.user_id AND u.user_id >= 1 AND u.user_id <= 3 AND e.event_type IN (3, 4) ) ) t1 LEFT JOIN ( SELECT DISTINCT user_id, 'Has done event'::TEXT AS hasdone_event FROM events_table AS e WHERE e.user_id >= 1 AND e.user_id <= 3 AND e.event_type IN (5, 6) ) t2 ON (t1.user_id = t2.user_id) GROUP BY t1.user_id, hasdone_event ) t GROUP BY user_id, hasdone_event ORDER BY user_id; user_id | sum | length | hasdone_event --------------------------------------------------------------------- 1 | 12 | 14 | Has done event 2 | 20 | 14 | Has done event 3 | 20 | 14 | Has done event (3 rows) -- same query but multiple joins are one level below, returns count of row instead of actual rows SELECT count(*) FROM ( SELECT user_id, sum(array_length(events_table, 1)), length(hasdone_event), hasdone_event FROM ( SELECT t1.user_id, array_agg(event ORDER BY time) AS events_table, COALESCE(hasdone_event, 'Has not done event') AS hasdone_event FROM ( ( SELECT u.user_id, 'step=>1'::text AS event, e.time FROM users_table AS u, events_table AS e WHERE u.user_id = e.user_id AND u.user_id >= 1 AND u.user_id <= 3 AND e.event_type IN (1, 2) ) UNION ( SELECT u.user_id, 'step=>2'::text AS event, e.time FROM users_table AS u, events_table AS e WHERE u.user_id = e.user_id AND u.user_id >= 1 AND u.user_id <= 3 AND e.event_type IN (3, 4) ) ) t1 LEFT JOIN ( SELECT DISTINCT user_id, 'Has done event'::TEXT AS hasdone_event FROM events_table AS e WHERE e.user_id >= 1 AND e.user_id <= 3 AND e.event_type IN (5, 6) ) t2 ON (t1.user_id = t2.user_id) GROUP BY t1.user_id, hasdone_event ) t GROUP BY user_id, hasdone_event ORDER BY user_id) u; count --------------------------------------------------------------------- 3 (1 row) -- Same queries written without unions SELECT user_id, sum(array_length(events_table, 1)), length(hasdone_event), hasdone_event FROM ( SELECT t1.user_id, array_agg(event ORDER BY time) AS events_table, COALESCE(hasdone_event, 'Has not done event') AS hasdone_event FROM ( SELECT u.user_id, CASE WHEN e.event_type IN (1, 2) THEN 'step=>1'::text else 'step==>2'::text END AS event, e.time FROM users_table AS u, events_table AS e WHERE u.user_id = e.user_id AND u.user_id >= 1 AND u.user_id <= 3 AND e.event_type IN (1, 2, 3, 4) GROUP BY 1,2,3 ) t1 LEFT JOIN ( SELECT DISTINCT user_id, 'Has done event'::TEXT AS hasdone_event FROM events_table AS e WHERE e.user_id >= 1 AND e.user_id <= 3 AND e.event_type IN (5, 6) ) t2 ON (t1.user_id = t2.user_id) GROUP BY t1.user_id, hasdone_event ) t GROUP BY user_id, hasdone_event ORDER BY user_id; user_id | sum | length | hasdone_event --------------------------------------------------------------------- 1 | 12 | 14 | Has done event 2 | 20 | 14 | Has done event 3 | 20 | 14 | Has done event (3 rows) -- same query but multiple joins are one level below, returns count of row instead of actual rows SELECT count(*) FROM ( SELECT user_id, sum(array_length(events_table, 1)), length(hasdone_event), hasdone_event FROM ( SELECT t1.user_id, array_agg(event ORDER BY time) AS events_table, COALESCE(hasdone_event, 'Has not done event') AS hasdone_event FROM ( SELECT u.user_id, CASE WHEN e.event_type in (1, 2) then 'step=>1'::text else 'step==>2'::text END AS event, e.time FROM users_table AS u, events_table AS e WHERE u.user_id = e.user_id AND u.user_id >= 1 AND u.user_id <= 3 AND e.event_type IN (1, 2, 3, 4) GROUP BY 1,2,3 ) t1 LEFT JOIN ( SELECT DISTINCT user_id, 'Has done event'::TEXT AS hasdone_event FROM events_table AS e WHERE e.user_id >= 1 AND e.user_id <= 3 AND e.event_type IN (5, 6) ) t2 ON (t1.user_id = t2.user_id) GROUP BY t1.user_id, hasdone_event ) t GROUP BY user_id, hasdone_event ORDER BY user_id) u; count --------------------------------------------------------------------- 3 (1 row) --------------------------------------------------------------------- -- Funnel, grouped by the number of times a user has done an event --------------------------------------------------------------------- SELECT user_id, avg(array_length(events_table, 1)) AS event_average, count_pay FROM ( SELECT subquery_1.user_id, array_agg(event ORDER BY time) AS events_table, COALESCE(count_pay, 0) AS count_pay FROM ( (SELECT users_table.user_id, 'action=>1'AS event, events_table.time FROM users_table, events_table WHERE users_table.user_id = events_table.user_id AND users_table.user_id >= 1 AND users_table.user_id <= 3 AND events_table.event_type > 1 AND events_table.event_type < 3 ) UNION (SELECT users_table.user_id, 'action=>2'AS event, events_table.time FROM users_table, events_table WHERE users_table.user_id = events_table.user_id AND users_table.user_id >= 1 AND users_table.user_id <= 3 AND events_table.event_type > 1 AND events_table.event_type < 4 ) ) AS subquery_1 LEFT JOIN (SELECT user_id, COUNT(*) AS count_pay FROM users_table WHERE user_id >= 1 AND user_id <= 3 AND users_table.value_1 > 2 AND users_table.value_1 < 5 GROUP BY user_id HAVING COUNT(*) > 1) AS subquery_2 ON subquery_1.user_id = subquery_2.user_id GROUP BY subquery_1.user_id, count_pay) AS subquery_top WHERE array_ndims(events_table) > 0 GROUP BY count_pay, user_id ORDER BY event_average DESC, count_pay DESC, user_id DESC; user_id | event_average | count_pay --------------------------------------------------------------------- 3 | 19.0000000000000000 | 7 2 | 12.0000000000000000 | 9 1 | 7.0000000000000000 | 5 (3 rows) SELECT user_id, avg(array_length(events_table, 1)) AS event_average, count_pay FROM ( SELECT subquery_1.user_id, array_agg(event ORDER BY time) AS events_table, COALESCE(count_pay, 0) AS count_pay FROM ( (SELECT users_table.user_id, 'action=>1'AS event, events_table.time FROM users_table, events_table WHERE users_table.user_id = events_table.user_id AND users_table.user_id >= 1 AND users_table.user_id <= 3 AND events_table.event_type > 1 AND events_table.event_type < 3 ) UNION (SELECT users_table.user_id, 'action=>2'AS event, events_table.time FROM users_table, events_table WHERE users_table.user_id = events_table.user_id AND users_table.user_id >= 1 AND users_table.user_id <= 3 AND events_table.event_type > 1 AND events_table.event_type < 4 ) ) AS subquery_1 LEFT JOIN (SELECT user_id, COUNT(*) AS count_pay FROM users_table WHERE user_id >= 1 AND user_id <= 3 AND users_table.value_1 > 2 AND users_table.value_1 < 4 GROUP BY user_id HAVING COUNT(*) > 1) AS subquery_2 ON subquery_1.user_id = subquery_2.user_id GROUP BY subquery_1.user_id, count_pay) AS subquery_top WHERE array_ndims(events_table) > 0 GROUP BY count_pay, user_id HAVING avg(array_length(events_table, 1)) > 0 ORDER BY event_average DESC, count_pay DESC, user_id DESC; user_id | event_average | count_pay --------------------------------------------------------------------- 3 | 19.0000000000000000 | 3 2 | 12.0000000000000000 | 4 1 | 7.0000000000000000 | 3 (3 rows) -- Same queries rewritten without using unions SELECT user_id, avg(array_length(events_table, 1)) AS event_average, count_pay FROM ( SELECT subquery_1.user_id, array_agg(event ORDER BY time) AS events_table, COALESCE(count_pay, 0) AS count_pay FROM ( SELECT users_table.user_id, CASE WHEN events_table.event_type > 1 AND events_table.event_type < 3 THEN 'action=>1' ELSE 'action=>2' END AS event, events_table.time FROM users_table, events_table WHERE users_table.user_id = events_table.user_id AND users_table.user_id >= 1 AND users_table.user_id <= 3 AND (events_table.event_type > 1 AND events_table.event_type < 3 OR events_table.event_type > 2 AND events_table.event_type < 4) GROUP BY 1, 2, 3 ) AS subquery_1 LEFT JOIN (SELECT user_id, COUNT(*) AS count_pay FROM users_table WHERE user_id >= 1 AND user_id <= 3 AND users_table.value_1 > 3 AND users_table.value_1 < 5 GROUP BY user_id HAVING COUNT(*) > 1) AS subquery_2 ON subquery_1.user_id = subquery_2.user_id GROUP BY subquery_1.user_id, count_pay) AS subquery_top WHERE array_ndims(events_table) > 0 GROUP BY count_pay, user_id ORDER BY event_average DESC, count_pay DESC, user_id DESC; user_id | event_average | count_pay --------------------------------------------------------------------- 3 | 12.0000000000000000 | 4 2 | 9.0000000000000000 | 5 1 | 5.0000000000000000 | 2 (3 rows) SELECT user_id, avg(array_length(events_table, 1)) AS event_average, count_pay FROM ( SELECT subquery_1.user_id, array_agg(event ORDER BY time) AS events_table, COALESCE(count_pay, 0) AS count_pay FROM ( SELECT users_table.user_id, CASE WHEN events_table.event_type > 1 AND events_table.event_type < 3 THEN 'action=>1' ELSE 'action=>2' END AS event, events_table.time FROM users_table, events_table WHERE users_table.user_id = events_table.user_id AND users_table.user_id >= 1 AND users_table.user_id <= 3 AND (events_table.event_type > 1 AND events_table.event_type < 3 OR events_table.event_type > 2 AND events_table.event_type < 4) GROUP BY 1, 2, 3 ) AS subquery_1 LEFT JOIN (SELECT user_id, COUNT(*) AS count_pay FROM users_table WHERE user_id >= 1 AND user_id <= 3 AND users_table.value_1 > 3 AND users_table.value_1 < 5 GROUP BY user_id HAVING COUNT(*) > 1) AS subquery_2 ON subquery_1.user_id = subquery_2.user_id GROUP BY subquery_1.user_id, count_pay) AS subquery_top WHERE array_ndims(events_table) > 0 GROUP BY count_pay, user_id HAVING avg(array_length(events_table, 1)) > 0 ORDER BY event_average DESC, count_pay DESC, user_id DESC; user_id | event_average | count_pay --------------------------------------------------------------------- 3 | 12.0000000000000000 | 4 2 | 9.0000000000000000 | 5 1 | 5.0000000000000000 | 2 (3 rows) --------------------------------------------------------------------- -- Most recently seen users_table events_table --------------------------------------------------------------------- -- Note that we don't use ORDER BY/LIMIT yet --------------------------------------------------------------------- SELECT user_id, user_lastseen, array_length(event_array, 1) FROM ( SELECT user_id, max(u.time) as user_lastseen, array_agg(event_type ORDER BY u.time) AS event_array FROM ( SELECT user_id, time FROM users_table WHERE user_id >= 1 AND user_id <= 3 AND users_table.value_1 > 1 AND users_table.value_1 < 3 ) u LEFT JOIN LATERAL ( SELECT event_type, time FROM events_table WHERE user_id = u.user_id AND events_table.event_type > 1 AND events_table.event_type < 3 ) t ON true GROUP BY user_id ) AS shard_union ORDER BY user_lastseen DESC, user_id; user_id | user_lastseen | array_length --------------------------------------------------------------------- 2 | Thu Nov 23 11:47:26.900284 2017 | 12 3 | Thu Nov 23 11:18:53.114408 2017 | 14 (2 rows) --------------------------------------------------------------------- -- Count the number of distinct users_table who are in segment X and Y and Z --------------------------------------------------------------------- SELECT user_id FROM users_table WHERE user_id IN (SELECT user_id FROM users_table WHERE value_1 >= 1 AND value_1 <= 2) AND user_id IN (SELECT user_id FROM users_table WHERE value_1 >= 3 AND value_1 <= 4) AND user_id IN (SELECT user_id FROM users_table WHERE value_1 >= 5 AND value_1 <= 6) GROUP BY user_id ORDER BY user_id DESC LIMIT 5; user_id --------------------------------------------------------------------- 6 5 4 3 1 (5 rows) --------------------------------------------------------------------- -- Find customers who have done X, and satisfy other customer specific criteria --------------------------------------------------------------------- SELECT user_id, value_2 FROM users_table WHERE value_1 > 1 AND value_1 < 3 AND value_2 >= 1 AND EXISTS (SELECT user_id FROM events_table WHERE event_type > 1 AND event_type < 3 AND value_3 > 1 AND user_id = users_table.user_id) ORDER BY 2 DESC, 1 DESC LIMIT 5; user_id | value_2 --------------------------------------------------------------------- 6 | 4 6 | 4 2 | 4 6 | 3 4 | 3 (5 rows) --------------------------------------------------------------------- -- Customers who haven’t done X, and satisfy other customer specific criteria --------------------------------------------------------------------- SELECT user_id, value_2 FROM users_table WHERE value_1 = 2 AND value_2 >= 1 AND NOT EXISTS (SELECT user_id FROM events_table WHERE event_type=2 AND value_3 > 1 AND user_id = users_table.user_id) ORDER BY 1 DESC, 2 DESC LIMIT 3; user_id | value_2 --------------------------------------------------------------------- 5 | 5 5 | 5 5 | 2 (3 rows) --------------------------------------------------------------------- -- Customers who have done X and Y, and satisfy other customer specific criteria --------------------------------------------------------------------- SELECT user_id, sum(value_2) as cnt FROM users_table WHERE value_1 > 1 AND value_2 >= 1 AND EXISTS (SELECT user_id FROM events_table WHERE event_type != 1 AND value_3 > 1 AND user_id = users_table.user_id) AND EXISTS (SELECT user_id FROM events_table WHERE event_type = 2 AND value_3 > 1 AND user_id = users_table.user_id) GROUP BY user_id ORDER BY cnt DESC, user_id DESC LIMIT 5; user_id | cnt --------------------------------------------------------------------- 4 | 43 2 | 37 3 | 34 1 | 17 6 | 15 (5 rows) --------------------------------------------------------------------- -- Customers who have done X and haven’t done Y, and satisfy other customer specific criteria --------------------------------------------------------------------- SELECT user_id, value_2 FROM users_table WHERE value_2 >= 1 AND EXISTS (SELECT user_id FROM events_table WHERE event_type > 1 AND event_type <= 3 AND value_3 > 1 AND user_id = users_table.user_id) AND NOT EXISTS (SELECT user_id FROM events_table WHERE event_type > 3 AND event_type <= 4 AND value_3 > 1 AND user_id = users_table.user_id) ORDER BY 2 DESC, 1 DESC LIMIT 4; user_id | value_2 --------------------------------------------------------------------- 5 | 5 5 | 5 5 | 5 5 | 4 (4 rows) --------------------------------------------------------------------- -- Customers who have done X more than 2 times, and satisfy other customer specific criteria --------------------------------------------------------------------- SELECT user_id, avg(value_2) FROM users_table WHERE value_1 > 1 AND value_1 < 3 AND value_2 >= 1 AND EXISTS (SELECT user_id FROM events_table WHERE event_type > 1 AND event_type < 3 AND value_3 > 1 AND user_id = users_table.user_id GROUP BY user_id HAVING Count(*) > 2) GROUP BY user_id ORDER BY 1 DESC, 2 DESC LIMIT 5; user_id | avg --------------------------------------------------------------------- 4 | 2.0000000000000000 3 | 2.0000000000000000 (2 rows) --------------------------------------------------------------------- -- Find me all users_table who logged in more than once --------------------------------------------------------------------- SELECT user_id, value_1 from ( SELECT user_id, value_1 From users_table WHERE value_2 > 1 and user_id = 2 GROUP BY value_1, user_id HAVING count(*) > 1 ) AS a ORDER BY user_id ASC, value_1 ASC; user_id | value_1 --------------------------------------------------------------------- 2 | 0 2 | 2 2 | 3 2 | 4 (4 rows) -- same query with additional filter to make it not router plannable SELECT user_id, value_1 from ( SELECT user_id, value_1 From users_table WHERE value_2 > 1 and (user_id = 2 OR user_id = 3) GROUP BY value_1, user_id HAVING count(*) > 1 ) AS a ORDER BY user_id ASC, value_1 ASC; user_id | value_1 --------------------------------------------------------------------- 2 | 0 2 | 2 2 | 3 2 | 4 3 | 1 3 | 2 3 | 3 3 | 4 (8 rows) --------------------------------------------------------------------- -- Find me all users_table who has done some event and has filters --------------------------------------------------------------------- SELECT user_id FROM events_table WHERE event_type = 3 AND value_2 > 2 AND user_id IN (SELECT user_id FROM users_table WHERE value_1 = 1 AND value_2 > 2 ) ORDER BY 1; user_id --------------------------------------------------------------------- 1 2 2 3 5 (5 rows) --------------------------------------------------------------------- -- Which events_table did people who has done some specific events_table --------------------------------------------------------------------- SELECT user_id, event_type FROM events_table WHERE user_id in (SELECT user_id from events_table WHERE event_type > 3 and event_type < 5) GROUP BY user_id, event_type ORDER BY 2 DESC, 1 LIMIT 3; user_id | event_type --------------------------------------------------------------------- 1 | 6 2 | 5 3 | 5 (3 rows) --------------------------------------------------------------------- -- Find me all the users_table who has done some event more than three times --------------------------------------------------------------------- SELECT user_id FROM ( SELECT user_id FROM events_table WHERE event_type = 2 GROUP BY user_id HAVING count(*) > 1 ) AS a ORDER BY user_id; user_id --------------------------------------------------------------------- 1 2 3 4 6 (5 rows) --------------------------------------------------------------------- -- Find my assets that have the highest probability and fetch their metadata --------------------------------------------------------------------- CREATE TEMP TABLE assets AS SELECT users_table.user_id, users_table.value_1, prob FROM users_table JOIN (SELECT ma.user_id, (GREATEST(coalesce(ma.value_4 / 250, 0.0) + GREATEST(1.0))) / 2 AS prob FROM users_table AS ma, events_table as short_list WHERE short_list.user_id = ma.user_id and ma.value_1 < 2 and short_list.event_type < 2 ) temp ON users_table.user_id = temp.user_id WHERE users_table.value_1 < 2; -- get some statistics from the aggregated results to ensure the results are correct SELECT count(*), count(DISTINCT user_id), avg(user_id) FROM assets; count | count | avg --------------------------------------------------------------------- 732 | 6 | 3.3934426229508197 (1 row) DROP TABLE assets; -- count number of distinct users who have value_1 equal to 5 or 13 but not 3 -- is recusrively planned SET client_min_messages TO DEBUG1; SELECT count(*) FROM ( SELECT user_id FROM users_table WHERE (value_1 = '1' OR value_1 = '3') AND user_id NOT IN (select user_id from users_table where value_1 = '4') GROUP BY user_id HAVING count(distinct value_1) = 2 ) as foo; DEBUG: generating subplan XXX_1 for subquery SELECT user_id FROM public.users_table WHERE (value_1 OPERATOR(pg_catalog.=) 4) DEBUG: Plan XXX query after replacing subqueries and CTEs: SELECT count(*) AS count FROM (SELECT users_table.user_id FROM public.users_table WHERE (((users_table.value_1 OPERATOR(pg_catalog.=) 1) OR (users_table.value_1 OPERATOR(pg_catalog.=) 3)) AND (NOT (users_table.user_id OPERATOR(pg_catalog.=) ANY (SELECT intermediate_result.user_id FROM read_intermediate_result('XXX_1'::text, 'binary'::citus_copy_format) intermediate_result(user_id integer))))) GROUP BY users_table.user_id HAVING (count(DISTINCT users_table.value_1) OPERATOR(pg_catalog.=) 2)) foo count --------------------------------------------------------------------- 1 (1 row) RESET client_min_messages; -- previous push down query SELECT subquery_count FROM (SELECT count(*) as subquery_count FROM (SELECT user_id FROM users_table WHERE (value_1 = '1' OR value_1 = '3') GROUP BY user_id HAVING count(distinct value_1) = 2) as a LEFT JOIN (SELECT user_id FROM users_table WHERE (value_1 = '2') GROUP BY user_id) as b ON a.user_id = b.user_id WHERE b.user_id IS NULL GROUP BY a.user_id ) AS inner_subquery; subquery_count --------------------------------------------------------------------- 1 (1 row) -- new pushdown query without single range table entry at top requirement SELECT count(*) as subquery_count FROM ( SELECT user_id FROM users_table WHERE (value_1 = '1' OR value_1 = '3') GROUP BY user_id HAVING count(distinct value_1) = 2 ) as a LEFT JOIN ( SELECT user_id FROM users_table WHERE (value_1 = '2') GROUP BY user_id) AS b ON a.user_id = b.user_id WHERE b.user_id IS NULL GROUP BY a.user_id; subquery_count --------------------------------------------------------------------- 1 (1 row) -- most queries below has limit clause -- therefore setting subquery_pushdown flag for all SET citus.subquery_pushdown to ON; NOTICE: Setting citus.subquery_pushdown flag is discouraged becuase it forces the planner to pushdown certain queries, skipping relevant correctness checks. DETAIL: When enabled, the planner skips many correctness checks for subqueries and pushes down the queries to shards as-is. It means that the queries are likely to return wrong results unless the user is absolutely sure that pushing down the subquery is safe. This GUC is maintained only for backward compatibility, no new users are supposed to use it. The planner is capable of pushing down as much computation as possible to the shards depending on the query. -- multi-subquery-join -- The first query has filters on partition column to make it router plannable -- but it is processed by logical planner since we disabled router execution SELECT e1.user_id, sum(view_homepage) AS viewed_homepage, sum(use_demo) AS use_demo, sum(enter_credit_card) AS entered_credit_card, sum(submit_card_info) as submit_card_info, sum(see_bought_screen) as see_bought_screen FROM ( -- Get the first time each user viewed the homepage. SELECT user_id, 1 AS view_homepage, min(time) AS view_homepage_time FROM events_table WHERE user_id = 1 and event_type IN (1, 2) GROUP BY user_id ) e1 LEFT JOIN LATERAL ( SELECT user_id, 1 AS use_demo, time AS use_demo_time FROM events_table WHERE user_id = e1.user_id AND user_id = 1 and event_type IN (2, 3) ORDER BY time LIMIT 1 ) e2 ON true LEFT JOIN LATERAL ( SELECT user_id, 1 AS enter_credit_card, time AS enter_credit_card_time FROM events_table WHERE user_id = e2.user_id AND user_id = 1 and event_type IN (3, 4) ORDER BY time LIMIT 1 ) e3 ON true LEFT JOIN LATERAL ( SELECT 1 AS submit_card_info, user_id, time AS enter_credit_card_time FROM events_table WHERE user_id = e3.user_id AND user_id = 1 and event_type IN (4, 5) ORDER BY time LIMIT 1 ) e4 ON true LEFT JOIN LATERAL ( SELECT 1 AS see_bought_screen FROM events_table WHERE user_id = e4.user_id AND user_id = 1 and event_type IN (5, 6) ORDER BY time LIMIT 1 ) e5 ON true WHERE e1.user_id = 1 GROUP BY e1.user_id LIMIT 1; user_id | viewed_homepage | use_demo | entered_credit_card | submit_card_info | see_bought_screen --------------------------------------------------------------------- 1 | 1 | 1 | 1 | 1 | 1 (1 row) -- Same query without all limitations SELECT e1.user_id, sum(view_homepage) AS viewed_homepage, sum(use_demo) AS use_demo, sum(enter_credit_card) AS entered_credit_card, sum(submit_card_info) as submit_card_info, sum(see_bought_screen) as see_bought_screen FROM ( -- Get the first time each user viewed the homepage. SELECT user_id, 1 AS view_homepage, min(time) AS view_homepage_time FROM events_table WHERE event_type IN (1, 2) GROUP BY user_id ) e1 LEFT JOIN LATERAL ( SELECT user_id, 1 AS use_demo, time AS use_demo_time FROM events_table WHERE user_id = e1.user_id AND event_type IN (2, 3) ORDER BY time ) e2 ON true LEFT JOIN LATERAL ( SELECT user_id, 1 AS enter_credit_card, time AS enter_credit_card_time FROM events_table WHERE user_id = e2.user_id AND event_type IN (3, 4) ORDER BY time ) e3 ON true LEFT JOIN LATERAL ( SELECT 1 AS submit_card_info, user_id, time AS enter_credit_card_time FROM events_table WHERE user_id = e3.user_id AND event_type IN (4, 5) ORDER BY time ) e4 ON true LEFT JOIN LATERAL ( SELECT 1 AS see_bought_screen FROM events_table WHERE user_id = e4.user_id AND event_type IN (5, 6) ORDER BY time ) e5 ON true GROUP BY e1.user_id ORDER BY 6 DESC NULLS LAST, 5 DESC NULLS LAST, 4 DESC NULLS LAST, 3 DESC NULLS LAST, 2 DESC NULLS LAST, 1 LIMIT 15; user_id | viewed_homepage | use_demo | entered_credit_card | submit_card_info | see_bought_screen --------------------------------------------------------------------- 2 | 1080 | 1080 | 1080 | 1080 | 1080 3 | 540 | 540 | 540 | 540 | 540 4 | 252 | 252 | 252 | 252 | 252 1 | 200 | 200 | 200 | 200 | 200 6 | 128 | 128 | 128 | 128 | 128 5 | 72 | 72 | 72 | 72 | 72 (6 rows) -- Same query without all limitations but uses having() to show only those submitted their credit card info SELECT e1.user_id, sum(view_homepage) AS viewed_homepage, sum(use_demo) AS use_demo, sum(enter_credit_card) AS entered_credit_card, sum(submit_card_info) as submit_card_info, sum(see_bought_screen) as see_bought_screen FROM ( -- Get the first time each user viewed the homepage. SELECT user_id, 1 AS view_homepage, min(time) AS view_homepage_time FROM events_table WHERE event_type IN (1, 2) GROUP BY user_id ) e1 LEFT JOIN LATERAL ( SELECT user_id, 1 AS use_demo, time AS use_demo_time FROM events_table WHERE user_id = e1.user_id AND event_type IN (2, 3) ORDER BY time ) e2 ON true LEFT JOIN LATERAL ( SELECT user_id, 1 AS enter_credit_card, time AS enter_credit_card_time FROM events_table WHERE user_id = e2.user_id AND event_type IN (3, 4) ORDER BY time ) e3 ON true LEFT JOIN LATERAL ( SELECT 1 AS submit_card_info, user_id, time AS enter_credit_card_time FROM events_table WHERE user_id = e3.user_id AND event_type IN (4, 5) ORDER BY time ) e4 ON true LEFT JOIN LATERAL ( SELECT 1 AS see_bought_screen FROM events_table WHERE user_id = e4.user_id AND event_type IN (5, 6) ORDER BY time ) e5 ON true group by e1.user_id HAVING sum(submit_card_info) > 0 ORDER BY 6 DESC NULLS LAST, 5 DESC NULLS LAST, 4 DESC NULLS LAST, 3 DESC NULLS LAST, 2 DESC NULLS LAST, 1 LIMIT 15; user_id | viewed_homepage | use_demo | entered_credit_card | submit_card_info | see_bought_screen --------------------------------------------------------------------- 2 | 1080 | 1080 | 1080 | 1080 | 1080 3 | 540 | 540 | 540 | 540 | 540 4 | 252 | 252 | 252 | 252 | 252 1 | 200 | 200 | 200 | 200 | 200 6 | 128 | 128 | 128 | 128 | 128 5 | 72 | 72 | 72 | 72 | 72 (6 rows) -- Explain analyze on this query fails due to #756 -- avg expression used on order by SELECT a.user_id, avg(b.value_2) as subquery_avg FROM ( SELECT user_id FROM users_table WHERE (value_1 > 2) GROUP BY user_id HAVING count(distinct value_1) > 2 ) as a LEFT JOIN ( SELECT user_id, value_2, value_3 FROM users_table WHERE (value_1 > 3)) AS b ON a.user_id = b.user_id WHERE b.user_id IS NOT NULL GROUP BY a.user_id ORDER BY avg(b.value_3), 2, 1 LIMIT 5; user_id | subquery_avg --------------------------------------------------------------------- 3 | 3.6000000000000000 5 | 2.1666666666666667 4 | 2.6666666666666667 1 | 2.3333333333333333 (4 rows) -- add having to the same query SELECT a.user_id, avg(b.value_2) as subquery_avg FROM ( SELECT user_id FROM users_table WHERE (value_1 > 2) GROUP BY user_id HAVING count(distinct value_1) > 2 ) as a LEFT JOIN ( SELECT user_id, value_2, value_3 FROM users_table WHERE (value_1 > 3)) AS b ON a.user_id = b.user_id WHERE b.user_id IS NOT NULL GROUP BY a.user_id HAVING sum(b.value_3) > 5 ORDER BY avg(b.value_3), 2, 1 LIMIT 5; user_id | subquery_avg --------------------------------------------------------------------- 3 | 3.6000000000000000 5 | 2.1666666666666667 4 | 2.6666666666666667 1 | 2.3333333333333333 (4 rows) -- avg on the value_3 is not a resjunk SELECT a.user_id, avg(b.value_2) as subquery_avg, avg(b.value_3) FROM (SELECT user_id FROM users_table WHERE (value_1 > 2) GROUP BY user_id HAVING count(distinct value_1) > 2 ) as a LEFT JOIN ( SELECT user_id, value_2, value_3 FROM users_table WHERE (value_1 > 3) ) AS b ON a.user_id = b.user_id WHERE b.user_id IS NOT NULL GROUP BY a.user_id ORDER BY avg(b.value_3) DESC, 2, 1 LIMIT 5; user_id | subquery_avg | avg --------------------------------------------------------------------- 1 | 2.3333333333333333 | 3.33333333333333 4 | 2.6666666666666667 | 2.55555555555556 5 | 2.1666666666666667 | 2.16666666666667 3 | 3.6000000000000000 | 1.6 (4 rows) -- a powerful query structure that analyzes users/events -- using (relation JOIN subquery JOIN relation) SELECT u.user_id, sub.value_2, sub.value_3, COUNT(e2.user_id) counts FROM users_table u LEFT OUTER JOIN LATERAL (SELECT * FROM events_table e1 WHERE e1.user_id = u.user_id ORDER BY e1.value_3 DESC LIMIT 1 ) sub ON true LEFT OUTER JOIN events_table e2 ON e2.user_id = sub.user_id WHERE e2.value_2 > 1 AND e2.value_2 < 5 AND u.value_2 > 1 AND u.value_2 < 5 GROUP BY u.user_id, sub.value_2, sub.value_3 ORDER BY 4 DESC, 1 DESC, 2 ASC, 3 ASC LIMIT 10; user_id | value_2 | value_3 | counts --------------------------------------------------------------------- 5 | 3 | 4 | 160 2 | 3 | 5 | 156 3 | 2 | 5 | 108 4 | 2 | 4 | 90 1 | 2 | 5 | 60 6 | 2 | 5 | 48 (6 rows) -- distinct users joined with events SELECT avg(events_table.event_type) as avg_type, count(*) as users_count FROM events_table JOIN (SELECT DISTINCT user_id FROM users_table ) as distinct_users ON distinct_users.user_id = events_table.user_id GROUP BY distinct_users.user_id ORDER BY users_count desc, avg_type DESC LIMIT 5; avg_type | users_count --------------------------------------------------------------------- 2.3750000000000000 | 24 2.5714285714285714 | 21 2.5294117647058824 | 17 2.7333333333333333 | 15 2.2142857142857143 | 14 (5 rows) -- reduce the data set, aggregate and join SELECT events_table.event_type, users_count.ct FROM events_table JOIN (SELECT distinct_users.user_id, count(1) as ct FROM (SELECT user_id FROM users_table ) as distinct_users GROUP BY distinct_users.user_id ) as users_count ON users_count.user_id = events_table.user_id ORDER BY users_count.ct desc, event_type DESC LIMIT 5; event_type | ct --------------------------------------------------------------------- 5 | 26 4 | 26 3 | 26 3 | 26 3 | 26 (5 rows) --- now, test (subquery JOIN subquery) SELECT n1.user_id, count_1, total_count FROM (SELECT user_id, count(1) as count_1 FROM users_table GROUP BY user_id ) n1 INNER JOIN ( SELECT user_id, count(1) as total_count FROM events_table GROUP BY user_id, event_type ) n2 ON (n2.user_id = n1.user_id) ORDER BY total_count DESC, count_1 DESC, 1 DESC LIMIT 10; user_id | count_1 | total_count --------------------------------------------------------------------- 2 | 18 | 7 3 | 17 | 7 2 | 18 | 6 5 | 26 | 5 5 | 26 | 5 4 | 23 | 5 3 | 17 | 5 1 | 7 | 5 4 | 23 | 4 4 | 23 | 4 (10 rows) SELECT a.user_id, avg(b.value_2) as subquery_avg FROM (SELECT user_id FROM users_table WHERE (value_1 > 2) GROUP BY user_id HAVING count(distinct value_1) > 2 ) as a LEFT JOIN (SELECT DISTINCT ON (user_id) user_id, value_2, value_3 FROM users_table WHERE (value_1 > 3) ORDER BY 1,2,3 ) AS b ON a.user_id = b.user_id WHERE b.user_id IS NOT NULL GROUP BY a.user_id ORDER BY avg(b.value_3), 2, 1 LIMIT 5; user_id | subquery_avg --------------------------------------------------------------------- 5 | 0.00000000000000000000 3 | 2.0000000000000000 4 | 1.00000000000000000000 1 | 0.00000000000000000000 (4 rows) -- distinct clause must include partition column -- when used in target list SELECT a.user_id, avg(b.value_2) as subquery_avg FROM (SELECT user_id FROM users_table WHERE (value_1 > 2) GROUP BY user_id HAVING count(distinct value_1) > 2 ) as a LEFT JOIN (SELECT DISTINCT ON (value_2) value_2 , user_id, value_3 FROM users_table WHERE (value_1 > 3) ORDER BY 1,2,3 ) AS b USING (user_id) GROUP BY user_id; ERROR: cannot push down this subquery DETAIL: Distinct on columns without partition column is currently unsupported SELECT a.user_id, avg(b.value_2) as subquery_avg FROM (SELECT user_id FROM users_table WHERE (value_1 > 2) GROUP BY user_id HAVING count(distinct value_1) > 2 ) as a LEFT JOIN (SELECT DISTINCT ON (value_2, user_id) value_2 , user_id, value_3 FROM users_table WHERE (value_1 > 3) ORDER BY 1,2,3 ) AS b ON a.user_id = b.user_id WHERE b.user_id IS NOT NULL GROUP BY a.user_id ORDER BY avg(b.value_3), 2, 1 LIMIT 5; user_id | subquery_avg --------------------------------------------------------------------- 3 | 3.3333333333333333 5 | 2.2000000000000000 4 | 3.2500000000000000 1 | 2.3333333333333333 (4 rows) SELECT user_id, event_type FROM (SELECT * FROM ( (SELECT event_type, user_id as a_user_id FROM events_table) AS a JOIN (SELECT ma.user_id AS user_id, ma.value_2 AS value_2, (GREATEST(coalesce((ma.value_3 * ma.value_2) / 20, 0.0) + GREATEST(1.0))) / 2 AS prob FROM users_table AS ma WHERE (ma.value_2 > 1) ORDER BY prob DESC, value_2 DESC, user_id DESC LIMIT 10 ) AS ma ON (a.a_user_id = ma.user_id) ) AS inner_sub ORDER BY prob DESC, value_2 DESC, user_id DESC, event_type DESC LIMIT 10 ) AS outer_sub ORDER BY prob DESC, value_2 DESC, user_id DESC, event_type DESC LIMIT 10; user_id | event_type --------------------------------------------------------------------- 3 | 5 3 | 4 3 | 4 3 | 4 3 | 4 3 | 3 3 | 3 3 | 3 3 | 3 3 | 3 (10 rows) -- very similar query but produces different result due to -- ordering difference in the previous one's inner query SELECT user_id, event_type FROM (SELECT event_type, user_id as a_user_id FROM events_table) AS a JOIN (SELECT ma.user_id AS user_id, ma.value_2 AS value_2, (GREATEST(coalesce((ma.value_3 * ma.value_2) / 20, 0.0) + GREATEST(1.0))) / 2 AS prob FROM users_table AS ma WHERE (ma.value_2 > 1) ORDER BY prob DESC, user_id DESC LIMIT 10 ) AS ma ON (a.a_user_id = ma.user_id) ORDER BY prob DESC, event_type DESC, user_id DESC LIMIT 10; user_id | event_type --------------------------------------------------------------------- 3 | 5 2 | 5 2 | 5 3 | 4 3 | 4 3 | 4 3 | 4 2 | 4 2 | 4 2 | 4 (10 rows) -- now they produce the same result when ordering fixed in 'outer_sub' SELECT user_id, event_type FROM (SELECT * FROM ( (SELECT event_type, user_id as a_user_id FROM events_table ) AS a JOIN (SELECT ma.user_id AS user_id, ma.value_2 AS value_2, (GREATEST(coalesce((ma.value_3 * ma.value_2) / 20, 0.0) + GREATEST(1.0))) / 2 AS prob FROM users_table AS ma WHERE (ma.value_2 > 1) ORDER BY prob DESC, user_id DESC LIMIT 10 ) AS ma ON (a.a_user_id = ma.user_id) ) AS inner_sub ORDER BY prob DESC, event_type DESC, user_id DESC LIMIT 10 ) AS outer_sub ORDER BY prob DESC, event_type DESC, user_id DESC LIMIT 10; user_id | event_type --------------------------------------------------------------------- 3 | 5 2 | 5 2 | 5 3 | 4 3 | 4 3 | 4 3 | 4 2 | 4 2 | 4 2 | 4 (10 rows) -- this is one complex join query derived from a user's production query -- declare the function on workers and master -- With array_index: CREATE OR REPLACE FUNCTION array_index(ANYARRAY, ANYELEMENT) RETURNS INT AS $$ SELECT i FROM (SELECT generate_series(array_lower($1, 1), array_upper($1, 1))) g(i) WHERE $1 [i] = $2 LIMIT 1; $$ LANGUAGE sql; SELECT * FROM (SELECT * FROM ( (SELECT user_id AS user_id_e, event_type AS event_type_e FROM events_table ) AS ma_e JOIN (SELECT value_2, value_3, user_id FROM (SELECT * FROM ( (SELECT user_id_p AS user_id FROM (SELECT * FROM ( (SELECT user_id AS user_id_p FROM events_table WHERE (event_type IN (1,2,3,4,5)) ) AS ma_p JOIN (SELECT user_id AS user_id_a FROM users_table WHERE (value_2 % 5 = 1) ) AS a ON (a.user_id_a = ma_p.user_id_p) ) ) AS a_ma_p ) AS inner_filter_q JOIN (SELECT value_2, value_3, user_id AS user_id_ck FROM events_table WHERE event_type = ANY(ARRAY [4, 5, 6]) ORDER BY value_3 ASC, user_id_ck DESC, array_index(ARRAY [1, 2, 3], (value_2 % 3)) ASC LIMIT 10 ) AS ma_ck ON (ma_ck.user_id_ck = inner_filter_q.user_id) ) AS inner_sub_q ORDER BY value_3 ASC, user_id_ck DESC, array_index(ARRAY [1, 2, 3], (value_2 % 3)) ASC LIMIT 10 ) AS outer_sub_q ORDER BY value_3 ASC, user_id DESC, array_index(ARRAY [1, 2, 3], (value_2 % 3)) ASC LIMIT 10) AS inner_search_q ON (ma_e.user_id_e = inner_search_q.user_id) ) AS outer_inner_sub_q ORDER BY value_3 ASC, user_id DESC, array_index(ARRAY [1, 2, 3], (value_2 % 3)) ASC, event_type_e DESC LIMIT 10) AS outer_outer_sub_q ORDER BY value_3 ASC, user_id DESC, array_index(ARRAY [1, 2, 3], (value_2 % 3)) ASC, event_type_e DESC LIMIT 10; user_id_e | event_type_e | value_2 | value_3 | user_id --------------------------------------------------------------------- 5 | 5 | 2 | 0 | 5 5 | 5 | 2 | 0 | 5 5 | 5 | 2 | 0 | 5 5 | 5 | 2 | 0 | 5 5 | 5 | 2 | 0 | 5 5 | 5 | 2 | 0 | 5 5 | 5 | 2 | 0 | 5 5 | 5 | 2 | 0 | 5 5 | 5 | 2 | 0 | 5 5 | 5 | 2 | 0 | 5 (10 rows) -- top level select * is removed now there is -- a join at top level. SELECT * FROM ( (SELECT user_id AS user_id_e, event_type as event_type_e FROM events_table ) AS ma_e JOIN (SELECT value_2, value_3, user_id FROM (SELECT * FROM ( (SELECT user_id_p AS user_id FROM (SELECT * FROM ( (SELECT user_id AS user_id_p FROM events_table WHERE (event_type IN (1, 2, 3, 4, 5)) ) AS ma_p JOIN (SELECT user_id AS user_id_a FROM users_table WHERE (value_2 % 5 = 1) ) AS a ON (a.user_id_a = ma_p.user_id_p) ) ) AS a_ma_p ) AS inner_filter_q JOIN (SELECT value_2, value_3, user_id AS user_id_ck FROM events_table WHERE event_type = ANY(ARRAY [4, 5, 6]) ORDER BY value_3 ASC, user_id_ck DESC, array_index(ARRAY [1, 2, 3], (value_2 % 3)) ASC LIMIT 10 ) AS ma_ck ON (ma_ck.user_id_ck = inner_filter_q.user_id) ) AS inner_sub_q ORDER BY value_3 ASC, user_id_ck DESC, array_index(ARRAY [1, 2, 3], (value_2 % 3)) ASC LIMIT 10 ) AS outer_sub_q ORDER BY value_3 ASC, user_id DESC, array_index(ARRAY [1, 2, 3], (value_2 % 3)) ASC LIMIT 10) AS inner_search_q ON (ma_e.user_id_e = inner_search_q.user_id) ) AS outer_inner_sub_q ORDER BY value_3 ASC, user_id DESC, array_index(ARRAY [1, 2, 3], (value_2 % 3)) ASC, event_type_e DESC LIMIT 10; user_id_e | event_type_e | value_2 | value_3 | user_id --------------------------------------------------------------------- 5 | 5 | 2 | 0 | 5 5 | 5 | 2 | 0 | 5 5 | 5 | 2 | 0 | 5 5 | 5 | 2 | 0 | 5 5 | 5 | 2 | 0 | 5 5 | 5 | 2 | 0 | 5 5 | 5 | 2 | 0 | 5 5 | 5 | 2 | 0 | 5 5 | 5 | 2 | 0 | 5 5 | 5 | 2 | 0 | 5 (10 rows) -- drop created functions DROP FUNCTION array_index(ANYARRAY, ANYELEMENT); -- a query with a constant subquery SELECT count(*) as subquery_count FROM ( SELECT user_id FROM users_table WHERE (value_1 = '1' OR value_1 = '3') GROUP BY user_id HAVING count(distinct value_1) = 2 ) as a LEFT JOIN ( SELECT 1 as user_id ) AS b ON a.user_id = b.user_id WHERE b.user_id IS NULL GROUP BY a.user_id; subquery_count --------------------------------------------------------------------- 1 1 1 1 1 (5 rows) -- volatile function in the subquery SELECT count(*) as subquery_count FROM ( SELECT user_id FROM users_table WHERE (value_1 = '1' OR value_1 = '3') GROUP BY user_id HAVING count(distinct value_1) = 2 ) as a INNER JOIN ( SELECT random()::int as user_id ) AS b ON a.user_id = b.user_id WHERE b.user_id IS NULL GROUP BY a.user_id; ERROR: cannot push down this subquery DETAIL: Subqueries without a FROM clause can only contain immutable functions -- this is slightly different, we use RTE_VALUEs here SELECT Count(*) AS subquery_count FROM (SELECT user_id FROM users_table WHERE (value_1 = '1' OR value_1 = '3' ) GROUP BY user_id HAVING Count(DISTINCT value_1) = 2) AS a INNER JOIN (SELECT * FROM (VALUES (1, 'one'), (2, 'two'), (3, 'three')) AS t (user_id, letter)) AS b ON a.user_id = b.user_id WHERE b.user_id IS NULL GROUP BY a.user_id; subquery_count --------------------------------------------------------------------- (0 rows) -- same query without LIMIT/OFFSET returns 30 rows SET client_min_messages TO DEBUG1; -- now, lets use a simple expression on the LIMIT and explicit coercion on the OFFSET SELECT user_id, array_length(events_table, 1) FROM ( SELECT user_id, array_agg(event ORDER BY time) AS events_table FROM ( SELECT u.user_id, e.event_type::text AS event, e.time FROM users_table AS u, events_table AS e WHERE u.user_id = e.user_id AND e.event_type IN (1, 2) ) t GROUP BY user_id ) q ORDER BY 2 DESC, 1 LIMIT 1+1 OFFSET 1::smallint; DEBUG: push down of limit count: 3 user_id | array_length --------------------------------------------------------------------- 4 | 184 2 | 180 (2 rows) -- now, lets use implicit coersion in LIMIT and a simple expressions on OFFSET SELECT user_id, array_length(events_table, 1) FROM ( SELECT user_id, array_agg(event ORDER BY time) AS events_table FROM ( SELECT u.user_id, e.event_type::text AS event, e.time FROM users_table AS u, events_table AS e WHERE u.user_id = e.user_id AND e.event_type IN (1, 2) ) t GROUP BY user_id ) q ORDER BY 2 DESC, 1 LIMIT '3' OFFSET 2+1; DEBUG: push down of limit count: 6 user_id | array_length --------------------------------------------------------------------- 5 | 156 6 | 40 1 | 28 (3 rows) -- create a test function which is marked as volatile CREATE OR REPLACE FUNCTION volatile_func_test() RETURNS INT AS $$ SELECT 1; $$ LANGUAGE sql VOLATILE; DEBUG: switching to sequential query execution mode DETAIL: A command for a distributed function is run. To make sure subsequent commands see the function correctly we need to make sure to use only one connection for all future commands -- Citus should be able to evalute functions/row comparisons on the LIMIT/OFFSET SELECT user_id, array_length(events_table, 1) FROM ( SELECT user_id, array_agg(event ORDER BY time) AS events_table FROM ( SELECT u.user_id, e.event_type::text AS event, e.time FROM users_table AS u, events_table AS e WHERE u.user_id = e.user_id AND e.event_type IN (1, 2, 3, 4) ) t GROUP BY user_id ) q ORDER BY 2 DESC, 1 LIMIT volatile_func_test() + (ROW(1,2,NULL) < ROW(1,3,0))::int OFFSET volatile_func_test() + volatile_func_test(); DEBUG: push down of limit count: 4 user_id | array_length --------------------------------------------------------------------- 3 | 340 5 | 312 (2 rows) -- now, lets use expressions on both the LIMIT and OFFSET SELECT user_id, array_length(events_table, 1) FROM ( SELECT user_id, array_agg(event ORDER BY time) AS events_table FROM ( SELECT u.user_id, e.event_type::text AS event, e.time FROM users_table AS u, events_table AS e WHERE u.user_id = e.user_id AND e.event_type IN (1, 2) ) t GROUP BY user_id ) q ORDER BY 2 DESC, 1 LIMIT (5 > 4)::int OFFSET CASE WHEN 5 != 5 THEN 27 WHEN 1 > 5 THEN 28 ELSE 2 END; DEBUG: push down of limit count: 3 user_id | array_length --------------------------------------------------------------------- 2 | 180 (1 row) -- we don't allow parameters on the LIMIT/OFFSET clauses PREPARE parametrized_limit AS SELECT user_id, array_length(events_table, 1) FROM ( SELECT user_id, array_agg(event ORDER BY time) AS events_table FROM ( SELECT u.user_id, e.event_type::text AS event, e.time FROM users_table AS u, events_table AS e WHERE u.user_id = e.user_id AND e.event_type IN (1, 2) ) t GROUP BY user_id ) q ORDER BY 2 DESC, 1 LIMIT $1 OFFSET $2; EXECUTE parametrized_limit(1,1); DEBUG: push down of limit count: 2 user_id | array_length --------------------------------------------------------------------- 4 | 184 (1 row) PREPARE parametrized_offset AS SELECT user_id, array_length(events_table, 1) FROM ( SELECT user_id, array_agg(event ORDER BY time) AS events_table FROM ( SELECT u.user_id, e.event_type::text AS event, e.time FROM users_table AS u, events_table AS e WHERE u.user_id = e.user_id AND e.event_type IN (1, 2) ) t GROUP BY user_id ) q ORDER BY 2 DESC, 1 LIMIT 1 OFFSET $1; EXECUTE parametrized_offset(1); DEBUG: push down of limit count: 2 user_id | array_length --------------------------------------------------------------------- 4 | 184 (1 row) SET client_min_messages TO DEFAULT; DROP FUNCTION volatile_func_test(); CREATE FUNCTION test_join_function_2(integer, integer) RETURNS bool AS 'select $1 > $2;' LANGUAGE SQL IMMUTABLE RETURNS NULL ON NULL INPUT; -- we don't support joins via functions SELECT user_id, array_length(events_table, 1) FROM ( SELECT user_id, array_agg(event ORDER BY time) AS events_table FROM ( SELECT u.user_id, e.event_type::text AS event, e.time FROM users_table AS u, events_table AS e WHERE test_join_function_2(u.user_id, e.user_id) ) t GROUP BY user_id ) q ORDER BY 2 DESC, 1; ERROR: complex joins are only supported when all distributed tables are co-located and joined on their distribution columns -- note that the following query has both equi-joins on the partition keys -- and non-equi-joins on other columns. We now support query filters -- having non-equi-joins as long as they have equi-joins on partition keys. SELECT users_table.user_id, users_table.value_1, prob FROM users_table JOIN (SELECT ma.user_id, (GREATEST(coalesce(ma.value_4 / 250, 0.0) + GREATEST(1.0))) / 2 AS prob FROM users_table AS ma, events_table as short_list WHERE short_list.user_id = ma.user_id and ma.value_1 < 3 and short_list.event_type < 3 ) temp ON users_table.user_id = temp.user_id WHERE users_table.value_1 < 3 AND test_join_function_2(users_table.user_id, temp.user_id); user_id | value_1 | prob --------------------------------------------------------------------- (0 rows) -- we do support the following since there is already an equality on the partition -- key and we have an additional join via a function SELECT temp.user_id, users_table.value_1, prob FROM users_table JOIN (SELECT ma.user_id, (GREATEST(coalesce(ma.value_4 / 250, 0.0) + GREATEST(1.0))) / 2 AS prob, random() FROM users_table AS ma, events_table as short_list WHERE short_list.user_id = ma.user_id and ma.value_1 < 3 and short_list.event_type < 4 AND test_join_function_2(ma.value_1, short_list.value_2) ) temp ON users_table.user_id = temp.user_id WHERE users_table.value_1 < 3 ORDER BY 2 DESC, 1 DESC LIMIT 10; user_id | value_1 | prob --------------------------------------------------------------------- 6 | 2 | 0.50000000000000000000 6 | 2 | 0.50000000000000000000 6 | 2 | 0.50000000000000000000 6 | 2 | 0.50000000000000000000 6 | 2 | 0.50000000000000000000 6 | 2 | 0.50000000000000000000 6 | 2 | 0.50000000000000000000 6 | 2 | 0.50000000000000000000 6 | 2 | 0.50000000000000000000 5 | 2 | 0.50000000000000000000 (10 rows) -- similarly we do support non equi joins on columns if there is aready -- an equality join SELECT count(*) FROM (SELECT event_type, random() FROM events_table, users_table WHERE events_table.user_id = users_table.user_id AND events_table.time > users_table.time AND events_table.value_2 IN (0, 4) ) as foo; count --------------------------------------------------------------------- 180 (1 row) -- the other way around is not supported SELECT count(*) FROM (SELECT event_type, random() FROM events_table, users_table WHERE events_table.user_id > users_table.user_id AND events_table.time = users_table.time AND events_table.value_2 IN (0, 4) ) as foo; ERROR: complex joins are only supported when all distributed tables are co-located and joined on their distribution columns -- we can even allow that on top level joins SELECT count(*) FROM (SELECT event_type, random(), events_table.user_id FROM events_table, users_table WHERE events_table.user_id = users_table.user_id AND events_table.value_2 IN (0, 4) ) as foo, (SELECT event_type, random(), events_table.user_id FROM events_table, users_table WHERE events_table.user_id = users_table.user_id AND events_table.value_2 IN (1, 5) ) as bar WHERE foo.event_type > bar.event_type AND foo.user_id = bar.user_id; count --------------------------------------------------------------------- 11971 (1 row) -- note that the following is not supported -- since the top level join is not on the distribution key SELECT count(*) FROM (SELECT event_type, random() FROM events_table, users_table WHERE events_table.user_id = users_table.user_id AND events_table.value_2 IN (0, 4) ) as foo, (SELECT event_type, random() FROM events_table, users_table WHERE events_table.user_id = users_table.user_id AND events_table.value_2 IN (1, 5) ) as bar WHERE foo.event_type = bar.event_type; ERROR: complex joins are only supported when all distributed tables are co-located and joined on their distribution columns -- DISTINCT in the outer query and DISTINCT in the subquery SELECT DISTINCT users_ids.user_id FROM (SELECT DISTINCT user_id FROM users_table) as users_ids JOIN (SELECT ma.user_id, ma.value_1, (GREATEST(coalesce(ma.value_4 / 250, 0.0) + GREATEST(1.0))) / 2 AS prob FROM users_table AS ma, events_table as short_list WHERE short_list.user_id = ma.user_id and ma.value_1 < 3 and short_list.event_type < 3 ) temp ON users_ids.user_id = temp.user_id WHERE temp.value_1 < 3 ORDER BY 1 LIMIT 5; user_id --------------------------------------------------------------------- 1 2 3 4 5 (5 rows) -- DISTINCT ON in the outer query and DISTINCT in the subquery SELECT DISTINCT ON (users_ids.user_id) users_ids.user_id, temp.value_1, prob FROM (SELECT DISTINCT user_id FROM users_table) as users_ids JOIN (SELECT ma.user_id, ma.value_1, (GREATEST(coalesce(ma.value_4 / 250, 0.0) + GREATEST(1.0))) / 2 AS prob FROM users_table AS ma, events_table as short_list WHERE short_list.user_id = ma.user_id and ma.value_1 < 3 and short_list.event_type < 2 ) temp ON users_ids.user_id = temp.user_id WHERE temp.value_1 < 3 ORDER BY 1, 2 LIMIT 5; user_id | value_1 | prob --------------------------------------------------------------------- 1 | 1 | 0.50000000000000000000 2 | 0 | 0.50000000000000000000 3 | 0 | 0.50000000000000000000 4 | 0 | 0.50000000000000000000 5 | 0 | 0.50000000000000000000 (5 rows) -- DISTINCT ON in the outer query and DISTINCT ON in the subquery SELECT DISTINCT ON (users_ids.user_id) users_ids.user_id, temp.value_1, prob FROM (SELECT DISTINCT ON (user_id) user_id, value_1 FROM users_table ORDER BY 1,2) as users_ids JOIN (SELECT ma.user_id, ma.value_1, (GREATEST(coalesce(ma.value_4 / 250, 0.0) + GREATEST(1.0))) / 2 AS prob FROM users_table AS ma, events_table as short_list WHERE short_list.user_id = ma.user_id and ma.value_1 < 2 and short_list.event_type < 3 ) temp ON users_ids.user_id = temp.user_id ORDER BY 1,2 LIMIT 5; user_id | value_1 | prob --------------------------------------------------------------------- 1 | 1 | 0.50000000000000000000 2 | 0 | 0.50000000000000000000 3 | 0 | 0.50000000000000000000 4 | 0 | 0.50000000000000000000 5 | 0 | 0.50000000000000000000 (5 rows) -- Getting aggregation of value which is created by aggregation in subquery SELECT count(1), avg(agg_value) FROM (SELECT users_table.user_id, avg(users_table.value_1 / events_table.value_4) AS agg_value FROM users_table, events_table WHERE users_table.user_id = events_table.user_id GROUP BY 1 ) AS temp; count | avg --------------------------------------------------------------------- 6 | (1 row) -- Test the case when a subquery has a lateral reference to two levels upper SELECT b.user_id, b.value_2, b.cnt FROM ( SELECT user_id, value_2 FROM events_table WHERE events_table.user_id BETWEEN 2 AND 5 ) a, LATERAL ( SELECT user_id, value_2, count(*) as cnt FROM ( SELECT value_2, time, user_id FROM events_table WHERE user_id BETWEEN 2 AND 5 AND events_table.user_id = a.user_id AND events_table.value_2 = a.value_2 ORDER BY time DESC ) events GROUP BY user_id, value_2 ) b ORDER BY user_id, value_2, cnt LIMIT 1; user_id | value_2 | cnt --------------------------------------------------------------------- 2 | 0 | 1 (1 row) DROP FUNCTION test_join_function_2(integer, integer); SET citus.enable_router_execution TO TRUE; SET citus.subquery_pushdown to OFF;