-- =================================================================== -- test recursive planning functionality with subqueries in WHERE -- =================================================================== CREATE SCHEMA subquery_in_where; SET search_path TO subquery_in_where, public; SET client_min_messages TO DEBUG1; --CTEs can be used as a recurring tuple with subqueries in WHERE -- prevent PG 11 - PG 12 outputs to diverge SET citus.enable_cte_inlining TO false; WITH event_id AS (SELECT user_id AS events_user_id, time AS events_time, event_type FROM events_table) SELECT Count(*) FROM event_id WHERE events_user_id IN (SELECT user_id FROM users_table); DEBUG: generating subplan XXX_1 for CTE event_id: SELECT user_id AS events_user_id, "time" AS events_time, event_type FROM public.events_table DEBUG: generating subplan XXX_2 for subquery SELECT user_id FROM public.users_table DEBUG: Plan XXX query after replacing subqueries and CTEs: SELECT count(*) AS count FROM (SELECT intermediate_result.events_user_id, intermediate_result.events_time, intermediate_result.event_type FROM read_intermediate_result('XXX_1'::text, 'binary'::citus_copy_format) intermediate_result(events_user_id integer, events_time timestamp without time zone, event_type integer)) event_id WHERE (events_user_id OPERATOR(pg_catalog.=) ANY (SELECT intermediate_result.user_id FROM read_intermediate_result('XXX_2'::text, 'binary'::citus_copy_format) intermediate_result(user_id integer))) count --------------------------------------------------------------------- 101 (1 row) SET citus.enable_cte_inlining TO true; --Correlated subqueries can not be used in WHERE clause WITH event_id AS (SELECT user_id AS events_user_id, time AS events_time, event_type FROM events_table) SELECT Count(*) FROM event_id WHERE events_user_id IN (SELECT user_id FROM users_table WHERE users_table.time = events_time); DEBUG: CTE event_id is going to be inlined via distributed planning DEBUG: generating subplan XXX_1 for CTE event_id: SELECT user_id AS events_user_id, "time" AS events_time, event_type FROM public.events_table DEBUG: Plan XXX query after replacing subqueries and CTEs: SELECT count(*) AS count FROM (SELECT intermediate_result.events_user_id, intermediate_result.events_time, intermediate_result.event_type FROM read_intermediate_result('XXX_1'::text, 'binary'::citus_copy_format) intermediate_result(events_user_id integer, events_time timestamp without time zone, event_type integer)) event_id WHERE (events_user_id OPERATOR(pg_catalog.=) ANY (SELECT users_table.user_id FROM public.users_table WHERE (users_table."time" OPERATOR(pg_catalog.=) event_id.events_time))) ERROR: correlated subqueries are not supported when the FROM clause contains a CTE or subquery -- Recurring tuples as empty join tree SELECT * FROM (SELECT 1 AS id, 2 AS value_1, 3 AS value_3 UNION ALL SELECT 2 as id, 3 as value_1, 4 as value_3) AS tt1 WHERE id IN (SELECT user_id FROM events_table); DEBUG: generating subplan XXX_1 for subquery SELECT 1 AS id, 2 AS value_1, 3 AS value_3 UNION ALL SELECT 2 AS id, 3 AS value_1, 4 AS value_3 DEBUG: generating subplan XXX_2 for subquery SELECT user_id FROM public.events_table DEBUG: Plan XXX query after replacing subqueries and CTEs: SELECT id, value_1, value_3 FROM (SELECT intermediate_result.id, intermediate_result.value_1, intermediate_result.value_3 FROM read_intermediate_result('XXX_1'::text, 'binary'::citus_copy_format) intermediate_result(id integer, value_1 integer, value_3 integer)) tt1 WHERE (id OPERATOR(pg_catalog.=) ANY (SELECT intermediate_result.user_id FROM read_intermediate_result('XXX_2'::text, 'binary'::citus_copy_format) intermediate_result(user_id integer))) id | value_1 | value_3 --------------------------------------------------------------------- 1 | 2 | 3 2 | 3 | 4 (2 rows) -- Recurring tuples in from clause as CTE and SET operation in WHERE clause SELECT Count(*) FROM (WITH event_id AS (SELECT user_id AS events_user_id, time AS events_time, event_type FROM events_table) SELECT events_user_id, events_time, event_type FROM event_id ORDER BY 1,2,3 LIMIT 10) AS sub_table WHERE events_user_id IN ( (SELECT user_id FROM users_table ORDER BY 1 LIMIT 10) UNION ALL (SELECT value_1 FROM users_table ORDER BY 1 limit 10)); DEBUG: CTE event_id is going to be inlined via distributed planning DEBUG: push down of limit count: 10 DEBUG: generating subplan XXX_1 for subquery SELECT user_id FROM public.users_table ORDER BY user_id LIMIT 10 DEBUG: push down of limit count: 10 DEBUG: generating subplan XXX_2 for subquery SELECT value_1 FROM public.users_table ORDER BY value_1 LIMIT 10 DEBUG: generating subplan XXX_3 for subquery SELECT intermediate_result.user_id FROM read_intermediate_result('XXX_1'::text, 'binary'::citus_copy_format) intermediate_result(user_id integer) UNION ALL SELECT intermediate_result.value_1 FROM read_intermediate_result('XXX_2'::text, 'binary'::citus_copy_format) intermediate_result(value_1 integer) DEBUG: push down of limit count: 10 DEBUG: generating subplan XXX_4 for subquery SELECT events_user_id, events_time, event_type FROM (SELECT events_table.user_id AS events_user_id, events_table."time" AS events_time, events_table.event_type FROM public.events_table) event_id ORDER BY events_user_id, events_time, event_type LIMIT 10 DEBUG: Plan XXX query after replacing subqueries and CTEs: SELECT count(*) AS count FROM (SELECT intermediate_result.events_user_id, intermediate_result.events_time, intermediate_result.event_type FROM read_intermediate_result('XXX_4'::text, 'binary'::citus_copy_format) intermediate_result(events_user_id integer, events_time timestamp without time zone, event_type integer)) sub_table WHERE (events_user_id OPERATOR(pg_catalog.=) ANY (SELECT intermediate_result.user_id FROM read_intermediate_result('XXX_3'::text, 'binary'::citus_copy_format) intermediate_result(user_id integer))) count --------------------------------------------------------------------- 10 (1 row) -- Recurring tuples in from clause as SET operation on recursively plannable -- queries and CTE in WHERE clause SELECT * FROM ( (SELECT user_id FROM users_table ORDER BY user_id ASC LIMIT 10 ) UNION ALL (SELECT value_1 FROM users_table ORDER BY value_1 ASC LIMIT 10 ) ) as SUB_TABLE WHERE user_id IN ( WITH event_id AS ( SELECT user_id as events_user_id, time as events_time, event_type FROM events_table ) SELECT events_user_id FROM event_id ORDER BY events_user_id LIMIT 10 ); DEBUG: CTE event_id is going to be inlined via distributed planning DEBUG: push down of limit count: 10 DEBUG: generating subplan XXX_1 for subquery SELECT events_user_id FROM (SELECT events_table.user_id AS events_user_id, events_table."time" AS events_time, events_table.event_type FROM public.events_table) event_id ORDER BY events_user_id LIMIT 10 DEBUG: push down of limit count: 10 DEBUG: generating subplan XXX_2 for subquery SELECT user_id FROM public.users_table ORDER BY user_id LIMIT 10 DEBUG: push down of limit count: 10 DEBUG: generating subplan XXX_3 for subquery SELECT value_1 FROM public.users_table ORDER BY value_1 LIMIT 10 DEBUG: generating subplan XXX_4 for subquery SELECT intermediate_result.user_id FROM read_intermediate_result('XXX_2'::text, 'binary'::citus_copy_format) intermediate_result(user_id integer) UNION ALL SELECT intermediate_result.value_1 FROM read_intermediate_result('XXX_3'::text, 'binary'::citus_copy_format) intermediate_result(value_1 integer) DEBUG: Plan XXX query after replacing subqueries and CTEs: SELECT user_id FROM (SELECT intermediate_result.user_id FROM read_intermediate_result('XXX_4'::text, 'binary'::citus_copy_format) intermediate_result(user_id integer)) sub_table WHERE (user_id OPERATOR(pg_catalog.=) ANY (SELECT intermediate_result.events_user_id FROM read_intermediate_result('XXX_1'::text, 'binary'::citus_copy_format) intermediate_result(events_user_id integer))) user_id --------------------------------------------------------------------- 1 1 1 1 1 1 1 (7 rows) -- Complex target list in WHERE clause SELECT COUNT(*) FROM (SELECT user_id as events_user_id, time as events_time, event_type FROM events_table ORDER BY 1,2 LIMIT 10 ) as SUB_TABLE WHERE events_user_id <=ANY ( SELECT max(abs(user_id * 1) + mod(user_id, 3)) as val_1 FROM users_table GROUP BY user_id ); DEBUG: push down of limit count: 10 DEBUG: generating subplan XXX_1 for subquery SELECT user_id AS events_user_id, "time" AS events_time, event_type FROM public.events_table ORDER BY user_id, "time" LIMIT 10 DEBUG: generating subplan XXX_2 for subquery SELECT max((abs((user_id OPERATOR(pg_catalog.*) 1)) OPERATOR(pg_catalog.+) mod(user_id, 3))) AS val_1 FROM public.users_table GROUP BY user_id DEBUG: Plan XXX query after replacing subqueries and CTEs: SELECT count(*) AS count FROM (SELECT intermediate_result.events_user_id, intermediate_result.events_time, intermediate_result.event_type FROM read_intermediate_result('XXX_1'::text, 'binary'::citus_copy_format) intermediate_result(events_user_id integer, events_time timestamp without time zone, event_type integer)) sub_table WHERE (events_user_id OPERATOR(pg_catalog.<=) ANY (SELECT intermediate_result.val_1 FROM read_intermediate_result('XXX_2'::text, 'binary'::citus_copy_format) intermediate_result(val_1 integer))) count --------------------------------------------------------------------- 10 (1 row) -- DISTINCT clause in WHERE SELECT COUNT(*) FROM (SELECT user_id as events_user_id, time as events_time, event_type FROM events_table LIMIT 10 ) as SUB_TABLE WHERE events_user_id IN ( SELECT distinct user_id FROM users_table GROUP BY user_id ); DEBUG: push down of limit count: 10 DEBUG: generating subplan XXX_1 for subquery SELECT user_id AS events_user_id, "time" AS events_time, event_type FROM public.events_table LIMIT 10 DEBUG: generating subplan XXX_2 for subquery SELECT DISTINCT user_id FROM public.users_table GROUP BY user_id DEBUG: Plan XXX query after replacing subqueries and CTEs: SELECT count(*) AS count FROM (SELECT intermediate_result.events_user_id, intermediate_result.events_time, intermediate_result.event_type FROM read_intermediate_result('XXX_1'::text, 'binary'::citus_copy_format) intermediate_result(events_user_id integer, events_time timestamp without time zone, event_type integer)) sub_table WHERE (events_user_id OPERATOR(pg_catalog.=) ANY (SELECT intermediate_result.user_id FROM read_intermediate_result('XXX_2'::text, 'binary'::citus_copy_format) intermediate_result(user_id integer))) count --------------------------------------------------------------------- 10 (1 row) -- AND in WHERE clause SELECT COUNT(*) FROM (SELECT user_id as events_user_id, time as events_time, event_type FROM events_table ORDER BY 1,2,3 LIMIT 10 ) as SUB_TABLE WHERE events_user_id >=ANY ( SELECT min(user_id) FROM users_table GROUP BY user_id ) AND events_user_id <=ANY ( SELECT max(user_id) FROM users_table GROUP BY user_id ); DEBUG: push down of limit count: 10 DEBUG: generating subplan XXX_1 for subquery SELECT user_id AS events_user_id, "time" AS events_time, event_type FROM public.events_table ORDER BY user_id, "time", event_type LIMIT 10 DEBUG: generating subplan XXX_2 for subquery SELECT min(user_id) AS min FROM public.users_table GROUP BY user_id DEBUG: generating subplan XXX_3 for subquery SELECT max(user_id) AS max FROM public.users_table GROUP BY user_id DEBUG: Plan XXX query after replacing subqueries and CTEs: SELECT count(*) AS count FROM (SELECT intermediate_result.events_user_id, intermediate_result.events_time, intermediate_result.event_type FROM read_intermediate_result('XXX_1'::text, 'binary'::citus_copy_format) intermediate_result(events_user_id integer, events_time timestamp without time zone, event_type integer)) sub_table WHERE ((events_user_id OPERATOR(pg_catalog.>=) ANY (SELECT intermediate_result.min FROM read_intermediate_result('XXX_2'::text, 'binary'::citus_copy_format) intermediate_result(min integer))) AND (events_user_id OPERATOR(pg_catalog.<=) ANY (SELECT intermediate_result.max FROM read_intermediate_result('XXX_3'::text, 'binary'::citus_copy_format) intermediate_result(max integer)))) count --------------------------------------------------------------------- 10 (1 row) -- AND in WHERE clause, part of the AND is pushdownable other is not SELECT COUNT(*) FROM (SELECT user_id as events_user_id, time as events_time, event_type FROM events_table ORDER BY 1,2,3 LIMIT 10 ) as SUB_TABLE WHERE events_user_id >=ANY ( SELECT min(user_id) FROM users_table GROUP BY user_id ) AND events_user_id <=ANY ( SELECT max(value_2) FROM users_table GROUP BY user_id ); DEBUG: push down of limit count: 10 DEBUG: generating subplan XXX_1 for subquery SELECT user_id AS events_user_id, "time" AS events_time, event_type FROM public.events_table ORDER BY user_id, "time", event_type LIMIT 10 DEBUG: generating subplan XXX_2 for subquery SELECT min(user_id) AS min FROM public.users_table GROUP BY user_id DEBUG: generating subplan XXX_3 for subquery SELECT max(value_2) AS max FROM public.users_table GROUP BY user_id DEBUG: Plan XXX query after replacing subqueries and CTEs: SELECT count(*) AS count FROM (SELECT intermediate_result.events_user_id, intermediate_result.events_time, intermediate_result.event_type FROM read_intermediate_result('XXX_1'::text, 'binary'::citus_copy_format) intermediate_result(events_user_id integer, events_time timestamp without time zone, event_type integer)) sub_table WHERE ((events_user_id OPERATOR(pg_catalog.>=) ANY (SELECT intermediate_result.min FROM read_intermediate_result('XXX_2'::text, 'binary'::citus_copy_format) intermediate_result(min integer))) AND (events_user_id OPERATOR(pg_catalog.<=) ANY (SELECT intermediate_result.max FROM read_intermediate_result('XXX_3'::text, 'binary'::citus_copy_format) intermediate_result(max integer)))) count --------------------------------------------------------------------- 10 (1 row) -- Planning subqueries in WHERE clause in CTE recursively WITH cte AS ( SELECT * FROM (SELECT * FROM users_table ORDER BY user_id ASC, value_2 DESC LIMIT 10 ) as sub_table WHERE user_id IN (SELECT value_2 FROM events_table ) ) SELECT COUNT(*) FROM cte; DEBUG: CTE cte is going to be inlined via distributed planning DEBUG: push down of limit count: 10 DEBUG: generating subplan XXX_1 for subquery SELECT user_id, "time", value_1, value_2, value_3, value_4 FROM public.users_table ORDER BY user_id, value_2 DESC LIMIT 10 DEBUG: generating subplan XXX_2 for subquery SELECT value_2 FROM public.events_table DEBUG: generating subplan XXX_3 for subquery SELECT user_id, "time", value_1, value_2, value_3, value_4 FROM (SELECT intermediate_result.user_id, intermediate_result."time", intermediate_result.value_1, intermediate_result.value_2, intermediate_result.value_3, intermediate_result.value_4 FROM read_intermediate_result('XXX_1'::text, 'binary'::citus_copy_format) intermediate_result(user_id integer, "time" timestamp without time zone, value_1 integer, value_2 integer, value_3 double precision, value_4 bigint)) sub_table WHERE (user_id OPERATOR(pg_catalog.=) ANY (SELECT intermediate_result.value_2 FROM read_intermediate_result('XXX_2'::text, 'binary'::citus_copy_format) intermediate_result(value_2 integer))) DEBUG: Plan XXX query after replacing subqueries and CTEs: SELECT count(*) AS count FROM (SELECT intermediate_result.user_id, intermediate_result."time", intermediate_result.value_1, intermediate_result.value_2, intermediate_result.value_3, intermediate_result.value_4 FROM read_intermediate_result('XXX_3'::text, 'binary'::citus_copy_format) intermediate_result(user_id integer, "time" timestamp without time zone, value_1 integer, value_2 integer, value_3 double precision, value_4 bigint)) cte count --------------------------------------------------------------------- 10 (1 row) -- Planing subquery in WHERE clause in FROM clause of a subquery recursively SELECT COUNT(*) FROM (SELECT * FROM (SELECT * FROM users_table ORDER BY user_id ASC, value_2 DESC LIMIT 10 ) as sub_table_1 WHERE user_id IN (SELECT value_2 FROM events_table ) ) as sub_table_2; DEBUG: push down of limit count: 10 DEBUG: generating subplan XXX_1 for subquery SELECT user_id, "time", value_1, value_2, value_3, value_4 FROM public.users_table ORDER BY user_id, value_2 DESC LIMIT 10 DEBUG: generating subplan XXX_2 for subquery SELECT value_2 FROM public.events_table DEBUG: generating subplan XXX_3 for subquery SELECT user_id, "time", value_1, value_2, value_3, value_4 FROM (SELECT intermediate_result.user_id, intermediate_result."time", intermediate_result.value_1, intermediate_result.value_2, intermediate_result.value_3, intermediate_result.value_4 FROM read_intermediate_result('XXX_1'::text, 'binary'::citus_copy_format) intermediate_result(user_id integer, "time" timestamp without time zone, value_1 integer, value_2 integer, value_3 double precision, value_4 bigint)) sub_table_1 WHERE (user_id OPERATOR(pg_catalog.=) ANY (SELECT intermediate_result.value_2 FROM read_intermediate_result('XXX_2'::text, 'binary'::citus_copy_format) intermediate_result(value_2 integer))) DEBUG: Plan XXX query after replacing subqueries and CTEs: SELECT count(*) AS count FROM (SELECT intermediate_result.user_id, intermediate_result."time", intermediate_result.value_1, intermediate_result.value_2, intermediate_result.value_3, intermediate_result.value_4 FROM read_intermediate_result('XXX_3'::text, 'binary'::citus_copy_format) intermediate_result(user_id integer, "time" timestamp without time zone, value_1 integer, value_2 integer, value_3 double precision, value_4 bigint)) sub_table_2 count --------------------------------------------------------------------- 10 (1 row) -- Recurring table in the FROM clause of a subquery in the FROM clause -- Recurring table is created by joining a two recurrign table SELECT SUM(user_id) FROM (SELECT * FROM (SELECT user_id FROM users_table ORDER BY user_id LIMIT 10) as t1 INNER JOIN (SELECT user_id as user_id_2 FROM users_table ORDER BY user_id LIMIT 10) as t2 ON t1.user_id = t2.user_id_2 WHERE t1.user_id IN (SELECT value_2 FROM events_table) ) as t3 WHERE user_id >ANY (SELECT min(user_id) FROM events_table GROUP BY user_id); DEBUG: push down of limit count: 10 DEBUG: generating subplan XXX_1 for subquery SELECT user_id FROM public.users_table ORDER BY user_id LIMIT 10 DEBUG: push down of limit count: 10 DEBUG: generating subplan XXX_2 for subquery SELECT user_id AS user_id_2 FROM public.users_table ORDER BY user_id LIMIT 10 DEBUG: generating subplan XXX_3 for subquery SELECT value_2 FROM public.events_table DEBUG: generating subplan XXX_4 for subquery SELECT t1.user_id, t2.user_id_2 FROM ((SELECT intermediate_result.user_id FROM read_intermediate_result('XXX_1'::text, 'binary'::citus_copy_format) intermediate_result(user_id integer)) t1 JOIN (SELECT intermediate_result.user_id_2 FROM read_intermediate_result('XXX_2'::text, 'binary'::citus_copy_format) intermediate_result(user_id_2 integer)) t2 ON ((t1.user_id OPERATOR(pg_catalog.=) t2.user_id_2))) WHERE (t1.user_id OPERATOR(pg_catalog.=) ANY (SELECT intermediate_result.value_2 FROM read_intermediate_result('XXX_3'::text, 'binary'::citus_copy_format) intermediate_result(value_2 integer))) DEBUG: generating subplan XXX_5 for subquery SELECT min(user_id) AS min FROM public.events_table GROUP BY user_id DEBUG: Plan XXX query after replacing subqueries and CTEs: SELECT sum(user_id) AS sum FROM (SELECT intermediate_result.user_id, intermediate_result.user_id_2 FROM read_intermediate_result('XXX_4'::text, 'binary'::citus_copy_format) intermediate_result(user_id integer, user_id_2 integer)) t3 WHERE (user_id OPERATOR(pg_catalog.>) ANY (SELECT intermediate_result.min FROM read_intermediate_result('XXX_5'::text, 'binary'::citus_copy_format) intermediate_result(min integer))) sum --------------------------------------------------------------------- 18 (1 row) -- Same example with the above query, but now check the rows with EXISTS SELECT SUM(user_id) FROM (SELECT * FROM (SELECT user_id FROM users_table ORDER BY user_id LIMIT 10) as t1 INNER JOIN (SELECT user_id as user_id_2 FROM users_table ORDER BY user_id LIMIT 10) as t2 ON t1.user_id = t2.user_id_2 WHERE t1.user_id IN (SELECT value_2 FROM events_table) ) as t3 WHERE EXISTS (SELECT 1,2 FROM events_table WHERE events_table.value_2 = events_table.user_id); DEBUG: push down of limit count: 10 DEBUG: generating subplan XXX_1 for subquery SELECT user_id FROM public.users_table ORDER BY user_id LIMIT 10 DEBUG: push down of limit count: 10 DEBUG: generating subplan XXX_2 for subquery SELECT user_id AS user_id_2 FROM public.users_table ORDER BY user_id LIMIT 10 DEBUG: generating subplan XXX_3 for subquery SELECT value_2 FROM public.events_table DEBUG: generating subplan XXX_4 for subquery SELECT t1.user_id, t2.user_id_2 FROM ((SELECT intermediate_result.user_id FROM read_intermediate_result('XXX_1'::text, 'binary'::citus_copy_format) intermediate_result(user_id integer)) t1 JOIN (SELECT intermediate_result.user_id_2 FROM read_intermediate_result('XXX_2'::text, 'binary'::citus_copy_format) intermediate_result(user_id_2 integer)) t2 ON ((t1.user_id OPERATOR(pg_catalog.=) t2.user_id_2))) WHERE (t1.user_id OPERATOR(pg_catalog.=) ANY (SELECT intermediate_result.value_2 FROM read_intermediate_result('XXX_3'::text, 'binary'::citus_copy_format) intermediate_result(value_2 integer))) DEBUG: generating subplan XXX_5 for subquery SELECT 1, 2 FROM public.events_table WHERE (value_2 OPERATOR(pg_catalog.=) user_id) DEBUG: Plan XXX query after replacing subqueries and CTEs: SELECT sum(user_id) AS sum FROM (SELECT intermediate_result.user_id, intermediate_result.user_id_2 FROM read_intermediate_result('XXX_4'::text, 'binary'::citus_copy_format) intermediate_result(user_id integer, user_id_2 integer)) t3 WHERE (EXISTS (SELECT intermediate_result."?column?", intermediate_result."?column?_1" AS "?column?" FROM read_intermediate_result('XXX_5'::text, 'binary'::citus_copy_format) intermediate_result("?column?" integer, "?column?_1" integer))) sum --------------------------------------------------------------------- 67 (1 row) -- Same query with the above one, yet now we check the row's NON-existence -- by NOT EXISTS. Note that, max value_2 of events_table is 5 SELECT SUM(user_id) FROM (SELECT * FROM (SELECT user_id FROM users_table ORDER BY user_id LIMIT 10) as t1 INNER JOIN (SELECT user_id as user_id_2 FROM users_table ORDER BY user_id LIMIT 10) as t2 ON t1.user_id = t2.user_id_2 WHERE t1.user_id IN (SELECT value_2 FROM events_table) ) as t3 WHERE NOT EXISTS (SELECT 1,2 FROM events_table WHERE events_table.value_2 = events_table.user_id + 6); DEBUG: push down of limit count: 10 DEBUG: generating subplan XXX_1 for subquery SELECT user_id FROM public.users_table ORDER BY user_id LIMIT 10 DEBUG: push down of limit count: 10 DEBUG: generating subplan XXX_2 for subquery SELECT user_id AS user_id_2 FROM public.users_table ORDER BY user_id LIMIT 10 DEBUG: generating subplan XXX_3 for subquery SELECT value_2 FROM public.events_table DEBUG: generating subplan XXX_4 for subquery SELECT t1.user_id, t2.user_id_2 FROM ((SELECT intermediate_result.user_id FROM read_intermediate_result('XXX_1'::text, 'binary'::citus_copy_format) intermediate_result(user_id integer)) t1 JOIN (SELECT intermediate_result.user_id_2 FROM read_intermediate_result('XXX_2'::text, 'binary'::citus_copy_format) intermediate_result(user_id_2 integer)) t2 ON ((t1.user_id OPERATOR(pg_catalog.=) t2.user_id_2))) WHERE (t1.user_id OPERATOR(pg_catalog.=) ANY (SELECT intermediate_result.value_2 FROM read_intermediate_result('XXX_3'::text, 'binary'::citus_copy_format) intermediate_result(value_2 integer))) DEBUG: generating subplan XXX_5 for subquery SELECT 1, 2 FROM public.events_table WHERE (value_2 OPERATOR(pg_catalog.=) (user_id OPERATOR(pg_catalog.+) 6)) DEBUG: Plan XXX query after replacing subqueries and CTEs: SELECT sum(user_id) AS sum FROM (SELECT intermediate_result.user_id, intermediate_result.user_id_2 FROM read_intermediate_result('XXX_4'::text, 'binary'::citus_copy_format) intermediate_result(user_id integer, user_id_2 integer)) t3 WHERE (NOT (EXISTS (SELECT intermediate_result."?column?", intermediate_result."?column?_1" AS "?column?" FROM read_intermediate_result('XXX_5'::text, 'binary'::citus_copy_format) intermediate_result("?column?" integer, "?column?_1" integer)))) sum --------------------------------------------------------------------- 67 (1 row) -- Check the existence of row by comparing it with the result of subquery in -- WHERE clause. Note that subquery is planned recursively since there is no -- distributed table in the from SELECT * FROM (SELECT user_id, value_1 FROM users_table ORDER BY user_id ASC, value_1 ASC LIMIT 10) as t3 WHERE row(user_id, value_1) = (SELECT min(user_id) + 1, min(user_id) + 1 FROM events_table); DEBUG: generating subplan XXX_1 for subquery SELECT (min(user_id) OPERATOR(pg_catalog.+) 1), (min(user_id) OPERATOR(pg_catalog.+) 1) FROM public.events_table DEBUG: push down of limit count: 10 DEBUG: generating subplan XXX_2 for subquery SELECT user_id, value_1 FROM public.users_table ORDER BY user_id, value_1 LIMIT 10 DEBUG: Plan XXX query after replacing subqueries and CTEs: SELECT user_id, value_1 FROM (SELECT intermediate_result.user_id, intermediate_result.value_1 FROM read_intermediate_result('XXX_2'::text, 'binary'::citus_copy_format) intermediate_result(user_id integer, value_1 integer)) t3 WHERE ((user_id, value_1) OPERATOR(pg_catalog.=) (SELECT intermediate_result."?column?", intermediate_result."?column?_1" AS "?column?" FROM read_intermediate_result('XXX_1'::text, 'binary'::citus_copy_format) intermediate_result("?column?" integer, "?column?_1" integer))) user_id | value_1 --------------------------------------------------------------------- (0 rows) -- Recursively plan subquery in WHERE clause when the FROM clause has a subquery -- generated by generate_series function SELECT * FROM (SELECT * FROM generate_series(1,10) ) as gst WHERE generate_series IN (SELECT value_2 FROM events_table ) ORDER BY generate_series ASC; DEBUG: generating subplan XXX_1 for subquery SELECT value_2 FROM public.events_table DEBUG: Plan XXX query after replacing subqueries and CTEs: SELECT generate_series FROM (SELECT generate_series.generate_series FROM generate_series(1, 10) generate_series(generate_series)) gst WHERE (generate_series OPERATOR(pg_catalog.=) ANY (SELECT intermediate_result.value_2 FROM read_intermediate_result('XXX_1'::text, 'binary'::citus_copy_format) intermediate_result(value_2 integer))) ORDER BY generate_series generate_series --------------------------------------------------------------------- 1 2 3 4 5 (5 rows) -- Similar to the test above, now we also have a generate_series in WHERE clause SELECT * FROM (SELECT * FROM generate_series(1,10) ) as gst WHERE generate_series IN (SELECT user_id FROM users_table WHERE user_id IN (SELECT * FROM generate_series(1,3) ) ) ORDER BY generate_series ASC; DEBUG: generating subplan XXX_1 for subquery SELECT user_id FROM public.users_table WHERE (user_id OPERATOR(pg_catalog.=) ANY (SELECT generate_series.generate_series FROM generate_series(1, 3) generate_series(generate_series))) DEBUG: Plan XXX query after replacing subqueries and CTEs: SELECT generate_series FROM (SELECT generate_series.generate_series FROM generate_series(1, 10) generate_series(generate_series)) gst WHERE (generate_series OPERATOR(pg_catalog.=) ANY (SELECT intermediate_result.user_id FROM read_intermediate_result('XXX_1'::text, 'binary'::citus_copy_format) intermediate_result(user_id integer))) ORDER BY generate_series generate_series --------------------------------------------------------------------- 1 2 3 (3 rows) -- non-colocated subquery in WHERE clause ANDed with false SELECT count(*) FROM users_Table WHERE (FALSE AND EXISTS (SELECT * FROM events_table)); count --------------------------------------------------------------------- 0 (1 row) -- multiple non-colocated subqueries in WHERE clause ANDed with false SELECT count(*) FROM users_Table WHERE value_1 IN (SELECT value_1 FROM users_Table) OR (FALSE AND EXISTS (SELECT * FROM events_table)); DEBUG: generating subplan XXX_1 for subquery SELECT value_1 FROM public.users_table DEBUG: Plan XXX query after replacing subqueries and CTEs: SELECT count(*) AS count FROM public.users_table WHERE ((value_1 OPERATOR(pg_catalog.=) ANY (SELECT intermediate_result.value_1 FROM read_intermediate_result('XXX_1'::text, 'binary'::citus_copy_format) intermediate_result(value_1 integer))) OR (false AND (EXISTS (SELECT events_table.user_id, events_table."time", events_table.event_type, events_table.value_2, events_table.value_3, events_table.value_4 FROM public.events_table)))) count --------------------------------------------------------------------- 101 (1 row) -- multiple non-colocated subqueries in WHERE clause ANDed with false SELECT count(*) FROM users_Table WHERE value_1 IN (SELECT value_1 FROM users_Table) AND (FALSE AND EXISTS (SELECT * FROM events_table)); count --------------------------------------------------------------------- 0 (1 row) -- non-colocated subquery in WHERE clause ANDed with true SELECT count(*) FROM users_Table WHERE (TRUE AND EXISTS (SELECT * FROM events_table)); DEBUG: generating subplan XXX_1 for subquery SELECT user_id, "time", event_type, value_2, value_3, value_4 FROM public.events_table DEBUG: Plan XXX query after replacing subqueries and CTEs: SELECT count(*) AS count FROM public.users_table WHERE (true AND (EXISTS (SELECT intermediate_result.user_id, intermediate_result."time", intermediate_result.event_type, intermediate_result.value_2, intermediate_result.value_3, intermediate_result.value_4 FROM read_intermediate_result('XXX_1'::text, 'binary'::citus_copy_format) intermediate_result(user_id integer, "time" timestamp without time zone, event_type integer, value_2 integer, value_3 double precision, value_4 bigint)))) count --------------------------------------------------------------------- 101 (1 row) -- multiple non-colocated subqueries in WHERE clause ANDed with true SELECT count(*) FROM users_Table WHERE value_1 IN (SELECT value_1 FROM users_Table) OR (EXISTS (SELECT * FROM events_table)); DEBUG: generating subplan XXX_1 for subquery SELECT value_1 FROM public.users_table DEBUG: generating subplan XXX_2 for subquery SELECT user_id, "time", event_type, value_2, value_3, value_4 FROM public.events_table DEBUG: Plan XXX query after replacing subqueries and CTEs: SELECT count(*) AS count FROM public.users_table WHERE ((value_1 OPERATOR(pg_catalog.=) ANY (SELECT intermediate_result.value_1 FROM read_intermediate_result('XXX_1'::text, 'binary'::citus_copy_format) intermediate_result(value_1 integer))) OR (EXISTS (SELECT intermediate_result.user_id, intermediate_result."time", intermediate_result.event_type, intermediate_result.value_2, intermediate_result.value_3, intermediate_result.value_4 FROM read_intermediate_result('XXX_2'::text, 'binary'::citus_copy_format) intermediate_result(user_id integer, "time" timestamp without time zone, event_type integer, value_2 integer, value_3 double precision, value_4 bigint)))) count --------------------------------------------------------------------- 101 (1 row) -- correlated subquery with aggregate in WHERE SELECT * FROM users_table WHERE user_id IN ( SELECT SUM(events_table.user_id) FROM events_table WHERE users_table.user_id = events_table.user_id ) ; user_id | time | value_1 | value_2 | value_3 | value_4 --------------------------------------------------------------------- (0 rows) -- correlated subquery with aggregate in HAVING SELECT * FROM users_table WHERE user_id IN ( SELECT SUM(events_table.user_id) FROM events_table WHERE events_table.user_id = users_table.user_id HAVING MIN(value_2) > 2 ) ; user_id | time | value_1 | value_2 | value_3 | value_4 --------------------------------------------------------------------- (0 rows) -- Local tables also planned recursively, so using it as part of the FROM clause -- make the clause recurring CREATE TABLE local_table(id int, value_1 int); INSERT INTO local_table VALUES(1,1), (2,2); SELECT * FROM (SELECT * FROM local_table) as sub_table WHERE id IN (SELECT user_id FROM users_table); DEBUG: generating subplan XXX_1 for subquery SELECT id, value_1 FROM subquery_in_where.local_table DEBUG: generating subplan XXX_2 for subquery SELECT user_id FROM public.users_table DEBUG: Plan XXX query after replacing subqueries and CTEs: SELECT id, value_1 FROM (SELECT intermediate_result.id, intermediate_result.value_1 FROM read_intermediate_result('XXX_1'::text, 'binary'::citus_copy_format) intermediate_result(id integer, value_1 integer)) sub_table WHERE (id OPERATOR(pg_catalog.=) ANY (SELECT intermediate_result.user_id FROM read_intermediate_result('XXX_2'::text, 'binary'::citus_copy_format) intermediate_result(user_id integer))) id | value_1 --------------------------------------------------------------------- 1 | 1 2 | 2 (2 rows) -- Use local table in WHERE clause SELECT COUNT(*) FROM (SELECT * FROM users_table ORDER BY user_id LIMIT 10) as sub_table WHERE user_id IN (SELECT id FROM local_table); DEBUG: generating subplan XXX_1 for subquery SELECT id FROM subquery_in_where.local_table DEBUG: push down of limit count: 10 DEBUG: generating subplan XXX_2 for subquery SELECT user_id, "time", value_1, value_2, value_3, value_4 FROM public.users_table ORDER BY user_id LIMIT 10 DEBUG: Plan XXX query after replacing subqueries and CTEs: SELECT count(*) AS count FROM (SELECT intermediate_result.user_id, intermediate_result."time", intermediate_result.value_1, intermediate_result.value_2, intermediate_result.value_3, intermediate_result.value_4 FROM read_intermediate_result('XXX_2'::text, 'binary'::citus_copy_format) intermediate_result(user_id integer, "time" timestamp without time zone, value_1 integer, value_2 integer, value_3 double precision, value_4 bigint)) sub_table WHERE (user_id OPERATOR(pg_catalog.=) ANY (SELECT intermediate_result.id FROM read_intermediate_result('XXX_1'::text, 'binary'::citus_copy_format) intermediate_result(id integer))) count --------------------------------------------------------------------- 10 (1 row) -- basic NOT IN correlated subquery SELECT count(*) FROM events_table e WHERE value_2 NOT IN (SELECT value_2 FROM users_table WHERE user_id = e.user_id); count --------------------------------------------------------------------- 7 (1 row) -- correlated subquery with limit SELECT count(*) FROM events_table e WHERE value_2 IN (SELECT value_2 FROM users_table WHERE user_id = e.user_id ORDER BY value_2 LIMIT 1); count --------------------------------------------------------------------- 10 (1 row) -- correlated subquery with distinct SELECT count(*) FROM events_table e WHERE value_2 IN (SELECT DISTINCT (value_3) FROM users_table WHERE user_id = e.user_id); count --------------------------------------------------------------------- 90 (1 row) -- correlated subquery with aggregate SELECT count(*) FROM events_table e WHERE value_2 = (SELECT max(value_2) FROM users_table WHERE user_id = e.user_id); count --------------------------------------------------------------------- 11 (1 row) -- correlated subquery with window function SELECT count(*) FROM events_table e WHERE value_2 IN (SELECT row_number() OVER () FROM users_table WHERE user_id = e.user_id); count --------------------------------------------------------------------- 94 (1 row) -- correlated subquery with group by SELECT count(*) FROM events_table e WHERE value_3 IN (SELECT min(value_3) FROM users_table WHERE user_id = e.user_id GROUP BY value_2); count --------------------------------------------------------------------- 72 (1 row) SELECT count(*) FROM events_table e WHERE value_3 IN (SELECT min(value_3) FROM users_table WHERE user_id = e.user_id GROUP BY value_2); count --------------------------------------------------------------------- 72 (1 row) -- correlated subquery with group by SELECT count(*) FROM events_table e WHERE value_3 IN (SELECT min(value_3) v FROM users_table WHERE user_id = e.user_id GROUP BY e.value_2); count --------------------------------------------------------------------- 10 (1 row) -- correlated subquery with having SELECT count(*) FROM events_table e WHERE value_3 IN (SELECT min(value_3) v FROM users_table WHERE user_id = e.user_id GROUP BY e.value_2 HAVING min(value_3) > (SELECT 1)); count --------------------------------------------------------------------- 0 (1 row) SELECT count(*) FROM events_table e WHERE value_3 IN (SELECT min(value_3) v FROM users_table WHERE user_id = e.user_id GROUP BY e.value_2 HAVING min(value_3) > (SELECT e.value_3)); ERROR: Subqueries in HAVING cannot refer to outer query -- nested correlated subquery SELECT count(*) FROM events_table e WHERE value_3 IN ( SELECT min(r.value_3) v FROM users_reference_table r JOIN (SELECT * FROM users_table WHERE user_id = e.user_id) u USING (user_id) WHERE u.value_2 > 3 GROUP BY e.value_2 HAVING min(r.value_3) > e.value_3); count --------------------------------------------------------------------- 0 (1 row) -- not co-located correlated subquery SELECT count(*) FROM events_table e WHERE value_3 IN ( SELECT min(r.value_3) v FROM users_reference_table r JOIN (SELECT * FROM users_table WHERE value_2 = e.user_id) u USING (user_id) WHERE u.value_2 > 3 GROUP BY e.value_2 HAVING min(r.value_3) > e.value_3); ERROR: complex joins are only supported when all distributed tables are co-located and joined on their distribution columns -- cartesian correlated subquery SELECT count(*) FROM events_table e WHERE value_3 IN ( SELECT min(r.value_3) v FROM users_reference_table r JOIN users_table u USING (user_id) WHERE u.value_2 > 3 GROUP BY e.value_2 HAVING min(r.value_3) > e.value_3); ERROR: complex joins are only supported when all distributed tables are co-located and joined on their distribution columns -- even more subtle cartesian correlated subquery SELECT count(*) FROM events_table e WHERE value_3 IN ( SELECT min(r.value_3) v FROM users_reference_table r JOIN users_table u USING (user_id) WHERE u.value_2 > 3 GROUP BY u.value_2 HAVING min(r.value_3) > e.value_3); ERROR: complex joins are only supported when all distributed tables are co-located and joined on their distribution columns -- not a correlated subquery, uses recursive planning SELECT count(*) FROM events_table e WHERE value_3 IN ( SELECT min(r.value_3) v FROM users_reference_table r JOIN users_table u USING (user_id) WHERE u.value_2 > 3 GROUP BY r.value_2 HAVING min(r.value_3) > 0); DEBUG: generating subplan XXX_1 for subquery SELECT min(r.value_3) AS v FROM (public.users_reference_table r JOIN public.users_table u USING (user_id)) WHERE (u.value_2 OPERATOR(pg_catalog.>) 3) GROUP BY r.value_2 HAVING (min(r.value_3) OPERATOR(pg_catalog.>) (0)::double precision) DEBUG: Plan XXX query after replacing subqueries and CTEs: SELECT count(*) AS count FROM public.events_table e WHERE (value_3 OPERATOR(pg_catalog.=) ANY (SELECT intermediate_result.v FROM read_intermediate_result('XXX_1'::text, 'binary'::citus_copy_format) intermediate_result(v double precision))) count --------------------------------------------------------------------- 24 (1 row) -- two levels of correlation should also allow -- merge step in the subquery SELECT sum(value_1) FROM users_table u WHERE EXISTS (SELECT 1 FROM events_table e WHERE u.user_id = e.user_id AND EXISTS (SELECT 1 FROM users_table u2 WHERE u2.user_id = u.user_id AND u2.value_1 = 5 LIMIT 1)); sum --------------------------------------------------------------------- 216 (1 row) -- correlated subquery in WHERE, with a slightly -- different syntax that the result of the subquery -- is compared with a constant SELECT sum(value_1) FROM users_table u1 WHERE (SELECT COUNT(DISTINCT e1.value_2) FROM events_table e1 WHERE e1.user_id = u1.user_id ) > 115; sum --------------------------------------------------------------------- (1 row) -- a correlated subquery which requires merge step -- can be pushed down on UPDATE/DELETE queries as well -- rollback to keep the rest of the tests unchanged BEGIN; UPDATE users_table u1 SET value_1 = (SELECT count(DISTINCT value_2) FROM events_table e1 WHERE e1.user_id = u1.user_id); DELETE FROM users_table u1 WHERE (SELECT count(DISTINCT value_2) FROM events_table e1 WHERE e1.user_id = u1.user_id) > 10; ROLLBACK; -- a correlated anti-join can also be pushed down even if the subquery -- has a LIMIT SELECT avg(value_1) FROM users_table u WHERE NOT EXISTS (SELECT 'XXX' FROM events_table e WHERE u.user_id = e.user_id and e.value_2 > 10000 LIMIT 1); avg --------------------------------------------------------------------- 2.5544554455445545 (1 row) -- a [correlated] lateral join can also be pushed down even if the subquery -- has an aggregate wout a GROUP BY SELECT max(min_of_val_2), max(u1.value_1) FROM users_table u1 LEFT JOIN LATERAL (SELECT min(e1.value_2) as min_of_val_2 FROM events_table e1 WHERE e1.user_id = u1.user_id) as foo ON (true); max | max --------------------------------------------------------------------- 1 | 5 (1 row) -- a self join is followed by a correlated subquery EXPLAIN (COSTS OFF) SELECT * FROM users_table u1 JOIN users_table u2 USING (user_id) WHERE u1.value_1 < u2.value_1 AND (SELECT count(*) FROM events_table e1 WHERE e1.user_id = u2.user_id) > 10; QUERY PLAN --------------------------------------------------------------------- Custom Scan (Citus Adaptive) Task Count: 4 Tasks Shown: One of 4 -> Task Node: host=localhost port=xxxxx dbname=regression -> Hash Join Hash Cond: (u2.user_id = u1.user_id) Join Filter: (u1.value_1 < u2.value_1) -> Seq Scan on users_table_1400256 u2 Filter: ((SubPlan 1) > 10) SubPlan 1 -> Aggregate -> Seq Scan on events_table_1400260 e1 Filter: (user_id = u2.user_id) -> Hash -> Seq Scan on users_table_1400256 u1 (16 rows) -- when the colocated join of the FROM clause -- entries happen on WHERE clause, Citus cannot -- pushdown -- Likely that the colocation checks should be -- improved SELECT u1.user_id, u2.user_id FROM users_table u1, users_table u2 WHERE u1.value_1 < u2.value_1 AND (SELECT count(*) FROM events_table e1 WHERE e1.user_id = u2.user_id AND u1.user_id = u2.user_id) > 10 ORDER BY 1,2; ERROR: complex joins are only supported when all distributed tables are co-located and joined on their distribution columns -- create a view that contains correlated subquery CREATE TEMPORARY VIEW correlated_subquery_view AS SELECT u1.user_id FROM users_table u1 WHERE (SELECT COUNT(DISTINCT e1.value_2) FROM events_table e1 WHERE e1.user_id = u1.user_id ) > 0; SELECT sum(user_id) FROM correlated_subquery_view; sum --------------------------------------------------------------------- 376 (1 row) -- now, join the view with another correlated subquery SELECT sum(mx) FROM correlated_subquery_view LEFT JOIN LATERAL (SELECT max(value_2) as mx FROM events_table WHERE correlated_subquery_view.user_id = events_table.user_id) as foo ON (true); sum --------------------------------------------------------------------- 459 (1 row) -- as an edge case, JOIN is on false SELECT sum(mx) FROM correlated_subquery_view LEFT JOIN LATERAL (SELECT max(value_2) as mx FROM events_table WHERE correlated_subquery_view.user_id = events_table.user_id) as foo ON (false); sum --------------------------------------------------------------------- (1 row) SELECT sum(value_1) FROM users_table u1 WHERE (SELECT COUNT(DISTINCT e1.value_2) FROM events_table e1 WHERE e1.user_id = u1.user_id AND false ) > 115; ERROR: complex joins are only supported when all distributed tables are co-located and joined on their distribution columns SELECT sum(value_1) FROM users_table u1 WHERE (SELECT COUNT(DISTINCT e1.value_2) FROM events_table e1 WHERE e1.user_id = u1.user_id ) > 115 AND false; sum --------------------------------------------------------------------- (1 row) SET client_min_messages TO DEFAULT; DROP TABLE local_table; DROP SCHEMA subquery_in_where CASCADE; SET search_path TO public;