-- -- COMPLEX_COUNT_DISTINCT -- SET citus.next_shard_id TO 240000; SET citus.shard_count TO 8; SET citus.shard_replication_factor TO 1; CREATE TABLE lineitem_hash ( l_orderkey bigint not null, l_partkey integer not null, l_suppkey integer not null, l_linenumber integer not null, l_quantity decimal(15, 2) not null, l_extendedprice decimal(15, 2) not null, l_discount decimal(15, 2) not null, l_tax decimal(15, 2) not null, l_returnflag char(1) not null, l_linestatus char(1) not null, l_shipdate date not null, l_commitdate date not null, l_receiptdate date not null, l_shipinstruct char(25) not null, l_shipmode char(10) not null, l_comment varchar(44) not null, PRIMARY KEY(l_orderkey, l_linenumber) ); SELECT create_distributed_table('lineitem_hash', 'l_orderkey', 'hash'); create_distributed_table -------------------------- (1 row) \copy lineitem_hash FROM '@abs_srcdir@/data/lineitem.1.data' with delimiter '|' \copy lineitem_hash FROM '@abs_srcdir@/data/lineitem.2.data' with delimiter '|' ANALYZE lineitem_hash; SET citus.task_executor_type to "task-tracker"; -- count(distinct) is supported on top level query if there -- is a grouping on the partition key SELECT l_orderkey, count(DISTINCT l_partkey) FROM lineitem_hash GROUP BY l_orderkey ORDER BY 2 DESC, 1 DESC LIMIT 10; l_orderkey | count ------------+------- 14885 | 7 14884 | 7 14821 | 7 14790 | 7 14785 | 7 14755 | 7 14725 | 7 14694 | 7 14627 | 7 14624 | 7 (10 rows) EXPLAIN (COSTS false, VERBOSE true) SELECT l_orderkey, count(DISTINCT l_partkey) FROM lineitem_hash GROUP BY l_orderkey ORDER BY 2 DESC, 1 DESC LIMIT 10; QUERY PLAN ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- Limit Output: remote_scan.l_orderkey, COALESCE((pg_catalog.sum((COALESCE((pg_catalog.sum((COALESCE((pg_catalog.sum(remote_scan.count))::bigint, '0'::bigint))))::bigint, '0'::bigint))))::bigint, '0'::bigint) -> Sort Output: remote_scan.l_orderkey, COALESCE((pg_catalog.sum((COALESCE((pg_catalog.sum(remote_scan.count))::bigint, '0'::bigint))))::bigint, '0'::bigint) Sort Key: COALESCE((pg_catalog.sum((COALESCE((pg_catalog.sum(remote_scan.count))::bigint, '0'::bigint))))::bigint, '0'::bigint) DESC, remote_scan.l_orderkey DESC -> HashAggregate Output: remote_scan.l_orderkey, COALESCE((pg_catalog.sum(remote_scan.count))::bigint, '0'::bigint) Group Key: remote_scan.l_orderkey -> Custom Scan (Citus Task-Tracker) Output: remote_scan.l_orderkey, remote_scan.count Task Count: 8 Tasks Shown: One of 8 -> Task Node: host=localhost port=57637 dbname=regression -> Limit Output: l_orderkey, (count(DISTINCT l_partkey)) -> Sort Output: l_orderkey, (count(DISTINCT l_partkey)) Sort Key: (count(DISTINCT lineitem_hash.l_partkey)) DESC, lineitem_hash.l_orderkey DESC -> GroupAggregate Output: l_orderkey, count(DISTINCT l_partkey) Group Key: lineitem_hash.l_orderkey -> Index Scan Backward using lineitem_hash_pkey_240000 on public.lineitem_hash_240000 lineitem_hash Output: l_orderkey, l_partkey, l_suppkey, l_linenumber, l_quantity, l_extendedprice, l_discount, l_tax, l_returnflag, l_linestatus, l_shipdate, l_commitdate, l_receiptdate, l_shipinstruct, l_shipmode, l_comment (24 rows) -- it is also supported if there is no grouping or grouping is on non-partition field SELECT count(DISTINCT l_partkey) FROM lineitem_hash ORDER BY 1 DESC LIMIT 10; count ------- 11661 (1 row) EXPLAIN (COSTS false, VERBOSE true) SELECT count(DISTINCT l_partkey) FROM lineitem_hash ORDER BY 1 DESC LIMIT 10; QUERY PLAN ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- Limit Output: count(DISTINCT (count(DISTINCT (count(DISTINCT remote_scan.count))))) -> Sort Output: count(DISTINCT (count(DISTINCT remote_scan.count))) Sort Key: count(DISTINCT (count(DISTINCT remote_scan.count))) DESC -> Aggregate Output: count(DISTINCT remote_scan.count) -> Custom Scan (Citus Task-Tracker) Output: remote_scan.count Task Count: 8 Tasks Shown: One of 8 -> Task Node: host=localhost port=57637 dbname=regression -> HashAggregate Output: l_partkey Group Key: lineitem_hash.l_partkey -> Seq Scan on public.lineitem_hash_240000 lineitem_hash Output: l_orderkey, l_partkey, l_suppkey, l_linenumber, l_quantity, l_extendedprice, l_discount, l_tax, l_returnflag, l_linestatus, l_shipdate, l_commitdate, l_receiptdate, l_shipinstruct, l_shipmode, l_comment (18 rows) SELECT l_shipmode, count(DISTINCT l_partkey) FROM lineitem_hash GROUP BY l_shipmode ORDER BY 2 DESC, 1 DESC LIMIT 10; l_shipmode | count ------------+------- TRUCK | 1757 MAIL | 1730 AIR | 1702 FOB | 1700 RAIL | 1696 SHIP | 1684 REG AIR | 1676 (7 rows) EXPLAIN (COSTS false, VERBOSE true) SELECT l_shipmode, count(DISTINCT l_partkey) FROM lineitem_hash GROUP BY l_shipmode ORDER BY 2 DESC, 1 DESC LIMIT 10; QUERY PLAN ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- Limit Output: remote_scan.l_shipmode, count(DISTINCT (count(DISTINCT (count(DISTINCT remote_scan.count))))) -> Sort Output: remote_scan.l_shipmode, count(DISTINCT (count(DISTINCT remote_scan.count))) Sort Key: count(DISTINCT (count(DISTINCT remote_scan.count))) DESC, remote_scan.l_shipmode DESC -> GroupAggregate Output: remote_scan.l_shipmode, count(DISTINCT remote_scan.count) Group Key: remote_scan.l_shipmode -> Sort Output: remote_scan.l_shipmode, remote_scan.count Sort Key: remote_scan.l_shipmode DESC -> Custom Scan (Citus Task-Tracker) Output: remote_scan.l_shipmode, remote_scan.count Task Count: 8 Tasks Shown: One of 8 -> Task Node: host=localhost port=57637 dbname=regression -> HashAggregate Output: l_shipmode, l_partkey Group Key: lineitem_hash.l_shipmode, lineitem_hash.l_partkey -> Seq Scan on public.lineitem_hash_240000 lineitem_hash Output: l_orderkey, l_partkey, l_suppkey, l_linenumber, l_quantity, l_extendedprice, l_discount, l_tax, l_returnflag, l_linestatus, l_shipdate, l_commitdate, l_receiptdate, l_shipinstruct, l_shipmode, l_comment (22 rows) -- mixed mode count distinct, grouped by partition column SELECT l_orderkey, count(distinct l_partkey), count(distinct l_shipmode) FROM lineitem_hash GROUP BY l_orderkey ORDER BY 3 DESC, 2 DESC, 1 LIMIT 10; l_orderkey | count | count ------------+-------+------- 226 | 7 | 7 1316 | 7 | 7 1477 | 7 | 7 3555 | 7 | 7 12258 | 7 | 7 12835 | 7 | 7 768 | 7 | 6 1121 | 7 | 6 1153 | 7 | 6 1281 | 7 | 6 (10 rows) EXPLAIN (COSTS false, VERBOSE true) SELECT l_orderkey, count(distinct l_partkey), count(distinct l_shipmode) FROM lineitem_hash GROUP BY l_orderkey ORDER BY 3 DESC, 2 DESC, 1 LIMIT 10; QUERY PLAN ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- Limit Output: remote_scan.l_orderkey, COALESCE((pg_catalog.sum((COALESCE((pg_catalog.sum((COALESCE((pg_catalog.sum(remote_scan.count))::bigint, '0'::bigint))))::bigint, '0'::bigint))))::bigint, '0'::bigint), COALESCE((pg_catalog.sum((COALESCE((pg_catalog.sum((COALESCE((pg_catalog.sum(remote_scan.count_1))::bigint, '0'::bigint))))::bigint, '0'::bigint))))::bigint, '0'::bigint) -> Sort Output: remote_scan.l_orderkey, COALESCE((pg_catalog.sum((COALESCE((pg_catalog.sum(remote_scan.count))::bigint, '0'::bigint))))::bigint, '0'::bigint), COALESCE((pg_catalog.sum((COALESCE((pg_catalog.sum(remote_scan.count_1))::bigint, '0'::bigint))))::bigint, '0'::bigint) Sort Key: COALESCE((pg_catalog.sum((COALESCE((pg_catalog.sum(remote_scan.count_1))::bigint, '0'::bigint))))::bigint, '0'::bigint) DESC, COALESCE((pg_catalog.sum((COALESCE((pg_catalog.sum(remote_scan.count))::bigint, '0'::bigint))))::bigint, '0'::bigint) DESC, remote_scan.l_orderkey -> HashAggregate Output: remote_scan.l_orderkey, COALESCE((pg_catalog.sum(remote_scan.count))::bigint, '0'::bigint), COALESCE((pg_catalog.sum(remote_scan.count_1))::bigint, '0'::bigint) Group Key: remote_scan.l_orderkey -> Custom Scan (Citus Task-Tracker) Output: remote_scan.l_orderkey, remote_scan.count, remote_scan.count_1 Task Count: 8 Tasks Shown: One of 8 -> Task Node: host=localhost port=57637 dbname=regression -> Limit Output: l_orderkey, (count(DISTINCT l_partkey)), (count(DISTINCT l_shipmode)) -> Sort Output: l_orderkey, (count(DISTINCT l_partkey)), (count(DISTINCT l_shipmode)) Sort Key: (count(DISTINCT lineitem_hash.l_shipmode)) DESC, (count(DISTINCT lineitem_hash.l_partkey)) DESC, lineitem_hash.l_orderkey -> GroupAggregate Output: l_orderkey, count(DISTINCT l_partkey), count(DISTINCT l_shipmode) Group Key: lineitem_hash.l_orderkey -> Index Scan using lineitem_hash_pkey_240000 on public.lineitem_hash_240000 lineitem_hash Output: l_orderkey, l_partkey, l_suppkey, l_linenumber, l_quantity, l_extendedprice, l_discount, l_tax, l_returnflag, l_linestatus, l_shipdate, l_commitdate, l_receiptdate, l_shipinstruct, l_shipmode, l_comment (24 rows) -- partition/non-partition column count distinct no grouping SELECT count(distinct l_orderkey), count(distinct l_partkey), count(distinct l_shipmode) FROM lineitem_hash; count | count | count -------+-------+------- 2985 | 11661 | 7 (1 row) EXPLAIN (COSTS false, VERBOSE true) SELECT count(distinct l_orderkey), count(distinct l_partkey), count(distinct l_shipmode) FROM lineitem_hash; QUERY PLAN ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- Aggregate Output: count(DISTINCT remote_scan.count), count(DISTINCT remote_scan.count_1), count(DISTINCT remote_scan.count_2) -> Custom Scan (Citus Task-Tracker) Output: remote_scan.count, remote_scan.count_1, remote_scan.count_2 Task Count: 8 Tasks Shown: One of 8 -> Task Node: host=localhost port=57637 dbname=regression -> HashAggregate Output: l_orderkey, l_partkey, l_shipmode Group Key: lineitem_hash.l_orderkey, lineitem_hash.l_partkey, lineitem_hash.l_shipmode -> Seq Scan on public.lineitem_hash_240000 lineitem_hash Output: l_orderkey, l_partkey, l_suppkey, l_linenumber, l_quantity, l_extendedprice, l_discount, l_tax, l_returnflag, l_linestatus, l_shipdate, l_commitdate, l_receiptdate, l_shipinstruct, l_shipmode, l_comment (13 rows) -- distinct/non-distinct on partition and non-partition columns SELECT count(distinct l_orderkey), count(l_orderkey), count(distinct l_partkey), count(l_partkey), count(distinct l_shipmode), count(l_shipmode) FROM lineitem_hash; count | count | count | count | count | count -------+-------+-------+-------+-------+------- 2985 | 12000 | 11661 | 12000 | 7 | 12000 (1 row) -- mixed mode count distinct, grouped by non-partition column SELECT l_shipmode, count(distinct l_partkey), count(distinct l_orderkey) FROM lineitem_hash GROUP BY l_shipmode ORDER BY 1, 2 DESC, 3 DESC; l_shipmode | count | count ------------+-------+------- AIR | 1702 | 1327 FOB | 1700 | 1276 MAIL | 1730 | 1299 RAIL | 1696 | 1265 REG AIR | 1676 | 1275 SHIP | 1684 | 1289 TRUCK | 1757 | 1333 (7 rows) -- mixed mode count distinct, grouped by non-partition column -- having on partition column SELECT l_shipmode, count(distinct l_partkey), count(distinct l_orderkey) FROM lineitem_hash GROUP BY l_shipmode HAVING count(distinct l_orderkey) > 1300 ORDER BY 1, 2 DESC; l_shipmode | count | count ------------+-------+------- AIR | 1702 | 1327 TRUCK | 1757 | 1333 (2 rows) -- same but having clause is not on target list SELECT l_shipmode, count(distinct l_partkey) FROM lineitem_hash GROUP BY l_shipmode HAVING count(distinct l_orderkey) > 1300 ORDER BY 1, 2 DESC; l_shipmode | count ------------+------- AIR | 1702 TRUCK | 1757 (2 rows) -- mixed mode count distinct, grouped by non-partition column -- having on non-partition column SELECT l_shipmode, count(distinct l_partkey), count(distinct l_suppkey) FROM lineitem_hash GROUP BY l_shipmode HAVING count(distinct l_suppkey) > 1550 ORDER BY 1, 2 DESC; l_shipmode | count | count ------------+-------+------- AIR | 1702 | 1564 FOB | 1700 | 1571 MAIL | 1730 | 1573 RAIL | 1696 | 1581 REG AIR | 1676 | 1557 SHIP | 1684 | 1554 TRUCK | 1757 | 1602 (7 rows) -- same but having clause is not on target list SELECT l_shipmode, count(distinct l_partkey) FROM lineitem_hash GROUP BY l_shipmode HAVING count(distinct l_suppkey) > 1550 ORDER BY 1, 2 DESC; l_shipmode | count ------------+------- AIR | 1702 FOB | 1700 MAIL | 1730 RAIL | 1696 REG AIR | 1676 SHIP | 1684 TRUCK | 1757 (7 rows) EXPLAIN (COSTS false, VERBOSE true) SELECT l_shipmode, count(distinct l_partkey) FROM lineitem_hash GROUP BY l_shipmode HAVING count(distinct l_suppkey) > 1550 ORDER BY 1, 2 DESC; QUERY PLAN ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- Sort Output: remote_scan.l_shipmode, count(DISTINCT (count(DISTINCT remote_scan.count))) Sort Key: remote_scan.l_shipmode, count(DISTINCT (count(DISTINCT remote_scan.count))) DESC -> GroupAggregate Output: remote_scan.l_shipmode, count(DISTINCT remote_scan.count) Group Key: remote_scan.l_shipmode Filter: (count(DISTINCT remote_scan.worker_column_3) > 1550) -> Sort Output: remote_scan.l_shipmode, remote_scan.count, remote_scan.worker_column_3 Sort Key: remote_scan.l_shipmode -> Custom Scan (Citus Task-Tracker) Output: remote_scan.l_shipmode, remote_scan.count, remote_scan.worker_column_3 Task Count: 8 Tasks Shown: One of 8 -> Task Node: host=localhost port=57637 dbname=regression -> HashAggregate Output: l_shipmode, l_partkey, l_suppkey Group Key: lineitem_hash.l_shipmode, lineitem_hash.l_partkey, lineitem_hash.l_suppkey -> Seq Scan on public.lineitem_hash_240000 lineitem_hash Output: l_orderkey, l_partkey, l_suppkey, l_linenumber, l_quantity, l_extendedprice, l_discount, l_tax, l_returnflag, l_linestatus, l_shipdate, l_commitdate, l_receiptdate, l_shipinstruct, l_shipmode, l_comment (21 rows) -- count distinct is supported on single table subqueries SELECT * FROM ( SELECT l_orderkey, count(DISTINCT l_partkey) FROM lineitem_hash GROUP BY l_orderkey) sub ORDER BY 2 DESC, 1 DESC LIMIT 10; l_orderkey | count ------------+------- 14885 | 7 14884 | 7 14821 | 7 14790 | 7 14785 | 7 14755 | 7 14725 | 7 14694 | 7 14627 | 7 14624 | 7 (10 rows) SELECT * FROM ( SELECT l_partkey, count(DISTINCT l_orderkey) FROM lineitem_hash GROUP BY l_partkey) sub ORDER BY 2 DESC, 1 DESC LIMIT 10; l_partkey | count -----------+------- 199146 | 3 188804 | 3 177771 | 3 160895 | 3 149926 | 3 136884 | 3 87761 | 3 15283 | 3 6983 | 3 1927 | 3 (10 rows) EXPLAIN (COSTS false, VERBOSE true) SELECT * FROM ( SELECT l_partkey, count(DISTINCT l_orderkey) FROM lineitem_hash GROUP BY l_partkey) sub ORDER BY 2 DESC, 1 DESC LIMIT 10; QUERY PLAN ------------------------------------------------------------------------- Limit Output: remote_scan.l_partkey, remote_scan.count -> Sort Output: remote_scan.l_partkey, remote_scan.count Sort Key: remote_scan.count DESC, remote_scan.l_partkey DESC -> Custom Scan (Citus Task-Tracker) Output: remote_scan.l_partkey, remote_scan.count Task Count: 4 Tasks Shown: None, not supported for re-partition queries -> MapMergeJob Map Task Count: 8 Merge Task Count: 4 (12 rows) -- count distinct with filters SELECT l_orderkey, count(DISTINCT l_suppkey) FILTER (WHERE l_shipmode = 'AIR'), count(DISTINCT l_suppkey) FROM lineitem_hash GROUP BY l_orderkey ORDER BY 2 DESC, 3 DESC, 1 LIMIT 10; l_orderkey | count | count ------------+-------+------- 4964 | 4 | 7 12005 | 4 | 7 5409 | 4 | 6 164 | 3 | 7 322 | 3 | 7 871 | 3 | 7 1156 | 3 | 7 1574 | 3 | 7 2054 | 3 | 7 2309 | 3 | 7 (10 rows) EXPLAIN (COSTS false, VERBOSE true) SELECT l_orderkey, count(DISTINCT l_suppkey) FILTER (WHERE l_shipmode = 'AIR'), count(DISTINCT l_suppkey) FROM lineitem_hash GROUP BY l_orderkey ORDER BY 2 DESC, 3 DESC, 1 LIMIT 10; QUERY PLAN ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- Limit Output: remote_scan.l_orderkey, COALESCE((pg_catalog.sum((COALESCE((pg_catalog.sum((COALESCE((pg_catalog.sum(remote_scan.count))::bigint, '0'::bigint))))::bigint, '0'::bigint))))::bigint, '0'::bigint), COALESCE((pg_catalog.sum((COALESCE((pg_catalog.sum((COALESCE((pg_catalog.sum(remote_scan.count_1))::bigint, '0'::bigint))))::bigint, '0'::bigint))))::bigint, '0'::bigint) -> Sort Output: remote_scan.l_orderkey, COALESCE((pg_catalog.sum((COALESCE((pg_catalog.sum(remote_scan.count))::bigint, '0'::bigint))))::bigint, '0'::bigint), COALESCE((pg_catalog.sum((COALESCE((pg_catalog.sum(remote_scan.count_1))::bigint, '0'::bigint))))::bigint, '0'::bigint) Sort Key: COALESCE((pg_catalog.sum((COALESCE((pg_catalog.sum(remote_scan.count))::bigint, '0'::bigint))))::bigint, '0'::bigint) DESC, COALESCE((pg_catalog.sum((COALESCE((pg_catalog.sum(remote_scan.count_1))::bigint, '0'::bigint))))::bigint, '0'::bigint) DESC, remote_scan.l_orderkey -> HashAggregate Output: remote_scan.l_orderkey, COALESCE((pg_catalog.sum(remote_scan.count))::bigint, '0'::bigint), COALESCE((pg_catalog.sum(remote_scan.count_1))::bigint, '0'::bigint) Group Key: remote_scan.l_orderkey -> Custom Scan (Citus Task-Tracker) Output: remote_scan.l_orderkey, remote_scan.count, remote_scan.count_1 Task Count: 8 Tasks Shown: One of 8 -> Task Node: host=localhost port=57637 dbname=regression -> Limit Output: l_orderkey, (count(DISTINCT l_suppkey) FILTER (WHERE (l_shipmode = 'AIR'::bpchar))), (count(DISTINCT l_suppkey)) -> Sort Output: l_orderkey, (count(DISTINCT l_suppkey) FILTER (WHERE (l_shipmode = 'AIR'::bpchar))), (count(DISTINCT l_suppkey)) Sort Key: (count(DISTINCT lineitem_hash.l_suppkey) FILTER (WHERE (lineitem_hash.l_shipmode = 'AIR'::bpchar))) DESC, (count(DISTINCT lineitem_hash.l_suppkey)) DESC, lineitem_hash.l_orderkey -> GroupAggregate Output: l_orderkey, count(DISTINCT l_suppkey) FILTER (WHERE (l_shipmode = 'AIR'::bpchar)), count(DISTINCT l_suppkey) Group Key: lineitem_hash.l_orderkey -> Index Scan using lineitem_hash_pkey_240000 on public.lineitem_hash_240000 lineitem_hash Output: l_orderkey, l_partkey, l_suppkey, l_linenumber, l_quantity, l_extendedprice, l_discount, l_tax, l_returnflag, l_linestatus, l_shipdate, l_commitdate, l_receiptdate, l_shipinstruct, l_shipmode, l_comment (24 rows) -- group by on non-partition column SELECT l_suppkey, count(DISTINCT l_partkey) FILTER (WHERE l_shipmode = 'AIR') FROM lineitem_hash GROUP BY l_suppkey ORDER BY 2 DESC, 1 DESC LIMIT 10; l_suppkey | count -----------+------- 7680 | 4 7703 | 3 7542 | 3 7072 | 3 6335 | 3 5873 | 3 1318 | 3 1042 | 3 160 | 3 9872 | 2 (10 rows) -- explaining the same query fails EXPLAIN (COSTS false, VERBOSE true) SELECT l_suppkey, count(DISTINCT l_partkey) FILTER (WHERE l_shipmode = 'AIR') FROM lineitem_hash GROUP BY l_suppkey ORDER BY 2 DESC, 1 DESC LIMIT 10; ERROR: bogus varattno for OUTER_VAR var: 3 -- without group by, on partition column SELECT count(DISTINCT l_orderkey) FILTER (WHERE l_shipmode = 'AIR') FROM lineitem_hash; count ------- 1327 (1 row) -- without group by, on non-partition column SELECT count(DISTINCT l_partkey) FILTER (WHERE l_shipmode = 'AIR') FROM lineitem_hash; count ------- 1702 (1 row) SELECT count(DISTINCT l_partkey) FILTER (WHERE l_shipmode = 'AIR'), count(DISTINCT l_partkey), count(DISTINCT l_shipdate) FROM lineitem_hash; count | count | count -------+-------+------- 1702 | 11661 | 2470 (1 row) -- filter column already exists in target list SELECT * FROM ( SELECT l_orderkey, count(DISTINCT l_partkey) FILTER (WHERE l_orderkey > 100) FROM lineitem_hash GROUP BY l_orderkey) sub ORDER BY 2 DESC, 1 DESC LIMIT 10; l_orderkey | count ------------+------- 14885 | 7 14884 | 7 14821 | 7 14790 | 7 14785 | 7 14755 | 7 14725 | 7 14694 | 7 14627 | 7 14624 | 7 (10 rows) -- filter column does not exist in target list SELECT * FROM ( SELECT l_orderkey, count(DISTINCT l_partkey) FILTER (WHERE l_shipmode = 'AIR') FROM lineitem_hash GROUP BY l_orderkey) sub ORDER BY 2 DESC, 1 DESC LIMIT 10; l_orderkey | count ------------+------- 12005 | 4 5409 | 4 4964 | 4 14848 | 3 14496 | 3 13473 | 3 13122 | 3 12929 | 3 12645 | 3 12417 | 3 (10 rows) -- case expr in count distinct is supported. -- count orders partkeys if l_shipmode is air SELECT * FROM ( SELECT l_orderkey, count(DISTINCT CASE WHEN l_shipmode = 'AIR' THEN l_partkey ELSE NULL END) as count FROM lineitem_hash GROUP BY l_orderkey) sub WHERE count > 0 ORDER BY 2 DESC, 1 DESC LIMIT 10; l_orderkey | count ------------+------- 12005 | 4 5409 | 4 4964 | 4 14848 | 3 14496 | 3 13473 | 3 13122 | 3 12929 | 3 12645 | 3 12417 | 3 (10 rows) -- text like operator is also supported SELECT * FROM ( SELECT l_orderkey, count(DISTINCT CASE WHEN l_shipmode like '%A%' THEN l_partkey ELSE NULL END) as count FROM lineitem_hash GROUP BY l_orderkey) sub WHERE count > 0 ORDER BY 2 DESC, 1 DESC LIMIT 10; l_orderkey | count ------------+------- 14275 | 7 14181 | 7 13605 | 7 12707 | 7 12384 | 7 11746 | 7 10727 | 7 10467 | 7 5636 | 7 4614 | 7 (10 rows) -- count distinct is rejected if it does not reference any columns SELECT * FROM ( SELECT l_linenumber, count(DISTINCT 1) FROM lineitem_hash GROUP BY l_linenumber) sub ORDER BY 2 DESC, 1 DESC LIMIT 10; ERROR: cannot compute aggregate (distinct) DETAIL: aggregate (distinct) with no columns is unsupported HINT: You can load the hll extension from contrib packages and enable distinct approximations. -- count distinct is rejected if it does not reference any columns SELECT * FROM ( SELECT l_linenumber, count(DISTINCT (random() * 5)::int) FROM lineitem_hash GROUP BY l_linenumber) sub ORDER BY 2 DESC, 1 DESC LIMIT 10; ERROR: cannot compute aggregate (distinct) DETAIL: aggregate (distinct) with no columns is unsupported HINT: You can load the hll extension from contrib packages and enable distinct approximations. -- even non-const function calls are supported within count distinct SELECT * FROM ( SELECT l_orderkey, count(DISTINCT (random() * 5)::int = l_linenumber) FROM lineitem_hash GROUP BY l_orderkey) sub ORDER BY 2 DESC, 1 DESC LIMIT 0; l_orderkey | count ------------+------- (0 rows) -- multiple nested subquery SELECT total, avg(avg_count) as total_avg_count FROM ( SELECT number_sum, count(DISTINCT l_suppkey) as total, avg(total_count) avg_count FROM ( SELECT l_suppkey, sum(l_linenumber) as number_sum, count(DISTINCT l_shipmode) as total_count FROM lineitem_hash WHERE l_partkey > 100 and l_quantity > 2 and l_orderkey < 10000 GROUP BY l_suppkey) as distributed_table WHERE number_sum >= 10 GROUP BY number_sum) as distributed_table_2 GROUP BY total ORDER BY total_avg_count DESC; total | total_avg_count -------+-------------------- 1 | 3.6000000000000000 6 | 2.8333333333333333 10 | 2.6000000000000000 27 | 2.5555555555555556 32 | 2.4687500000000000 77 | 2.1948051948051948 57 | 2.1754385964912281 (7 rows) -- multiple cases query SELECT * FROM ( SELECT count(DISTINCT CASE WHEN l_shipmode = 'TRUCK' THEN l_partkey WHEN l_shipmode = 'AIR' THEN l_quantity WHEN l_shipmode = 'SHIP' THEN l_discount ELSE l_suppkey END) as count, l_shipdate FROM lineitem_hash GROUP BY l_shipdate) sub WHERE count > 0 ORDER BY 1 DESC, 2 DESC LIMIT 10; count | l_shipdate -------+------------ 14 | 07-30-1997 13 | 05-26-1998 13 | 08-08-1997 13 | 11-17-1995 13 | 01-09-1993 12 | 01-15-1998 12 | 10-15-1997 12 | 09-07-1997 12 | 06-02-1997 12 | 03-14-1997 (10 rows) -- count DISTINCT expression SELECT * FROM ( SELECT l_quantity, count(DISTINCT ((l_orderkey / 1000) * 1000 )) as count FROM lineitem_hash GROUP BY l_quantity) sub WHERE count > 0 ORDER BY 2 DESC, 1 DESC LIMIT 10; l_quantity | count ------------+------- 48.00 | 13 47.00 | 13 37.00 | 13 33.00 | 13 26.00 | 13 25.00 | 13 23.00 | 13 21.00 | 13 15.00 | 13 12.00 | 13 (10 rows) -- count DISTINCT is part of an expression which includes another aggregate SELECT * FROM ( SELECT sum(((l_partkey * l_tax) / 100)) / count(DISTINCT CASE WHEN l_shipmode = 'TRUCK' THEN l_partkey ELSE l_suppkey END) as avg, l_shipmode FROM lineitem_hash GROUP BY l_shipmode) sub ORDER BY 1 DESC, 2 DESC LIMIT 10; avg | l_shipmode -------------------------+------------ 44.82904609027336300064 | MAIL 44.80704536679536679537 | SHIP 44.68891732736572890026 | AIR 44.34106724470134874759 | REG AIR 43.12739987269255251432 | FOB 43.07299253636938646426 | RAIL 40.50298377916903813318 | TRUCK (7 rows) -- count DISTINCT CASE WHEN expression SELECT * FROM ( SELECT count(DISTINCT CASE WHEN l_shipmode = 'TRUCK' THEN l_linenumber WHEN l_shipmode = 'AIR' THEN l_linenumber + 10 ELSE 2 END) as avg FROM lineitem_hash GROUP BY l_shipdate) sub ORDER BY 1 DESC LIMIT 10; avg ----- 7 6 6 6 6 6 6 6 5 5 (10 rows) -- COUNT DISTINCT (c1, c2) SELECT * FROM (SELECT l_shipmode, count(DISTINCT (l_shipdate, l_tax)) FROM lineitem_hash GROUP BY l_shipmode) t ORDER BY 2 DESC,1 DESC LIMIT 10; l_shipmode | count ------------+------- TRUCK | 1689 MAIL | 1683 FOB | 1655 AIR | 1650 SHIP | 1644 RAIL | 1636 REG AIR | 1607 (7 rows) -- distinct on non-var (type cast/field select) columns are also -- supported if grouped on distribution column -- random is added to prevent flattening by postgresql SELECT l_orderkey, count(a::int), count(distinct a::int) FROM ( SELECT l_orderkey, l_orderkey * 1.5 a, random() b FROM lineitem_hash) sub GROUP BY 1 ORDER BY 1 DESC LIMIT 5; l_orderkey | count | count ------------+-------+------- 14947 | 2 | 1 14946 | 2 | 1 14945 | 6 | 1 14944 | 2 | 1 14919 | 1 | 1 (5 rows) SELECT user_id, count(sub.a::int), count(DISTINCT sub.a::int), count(DISTINCT (sub).a) FROM (SELECT user_id, unnest(ARRAY[user_id * 1.5])a, random() b FROM users_table ) sub GROUP BY 1 ORDER BY 1 DESC LIMIT 5; user_id | count | count | count ---------+-------+-------+------- 6 | 11 | 1 | 1 5 | 27 | 1 | 1 4 | 24 | 1 | 1 3 | 18 | 1 | 1 2 | 19 | 1 | 1 (5 rows) CREATE TYPE test_item AS ( id INTEGER, duration INTEGER ); SELECT * FROM run_command_on_workers($$CREATE TYPE test_item AS ( id INTEGER, duration INTEGER )$$) ORDER BY nodeport; nodename | nodeport | success | result -----------+----------+---------+------------- localhost | 57637 | t | CREATE TYPE localhost | 57638 | t | CREATE TYPE (2 rows) CREATE TABLE test_count_distinct_array (key int, value int , value_arr test_item[]); SELECT create_distributed_table('test_count_distinct_array', 'key'); create_distributed_table -------------------------- (1 row) INSERT INTO test_count_distinct_array SELECT i, i, ARRAY[(i,i)::test_item] FROM generate_Series(0, 1000) i; SELECT key, count(DISTINCT value), count(DISTINCT (item)."id"), count(DISTINCT (item)."id" * 3) FROM ( SELECT key, unnest(value_arr) as item, value FROM test_count_distinct_array ) as sub GROUP BY 1 ORDER BY 1 DESC LIMIT 5; key | count | count | count ------+-------+-------+------- 1000 | 1 | 1 | 1 999 | 1 | 1 | 1 998 | 1 | 1 | 1 997 | 1 | 1 | 1 996 | 1 | 1 | 1 (5 rows) DROP TABLE test_count_distinct_array; DROP TYPE test_item; SELECT * FROM run_command_on_workers($$DROP TYPE test_item$$) ORDER BY nodeport; nodename | nodeport | success | result -----------+----------+---------+----------- localhost | 57637 | t | DROP TYPE localhost | 57638 | t | DROP TYPE (2 rows) -- other distinct aggregate are not supported SELECT * FROM ( SELECT l_linenumber, sum(DISTINCT l_partkey) FROM lineitem_hash GROUP BY l_linenumber) sub ORDER BY 2 DESC, 1 DESC LIMIT 10; ERROR: cannot compute aggregate (distinct) DETAIL: Only count(distinct) aggregate is supported in subqueries SELECT * FROM ( SELECT l_linenumber, avg(DISTINCT l_partkey) FROM lineitem_hash GROUP BY l_linenumber) sub ORDER BY 2 DESC, 1 DESC LIMIT 10; ERROR: cannot compute aggregate (distinct) DETAIL: Only count(distinct) aggregate is supported in subqueries -- whole row references, oid, and ctid are not supported in count distinct -- test table does not have oid or ctid enabled, so tests for them are skipped SELECT * FROM ( SELECT l_linenumber, count(DISTINCT lineitem_hash) FROM lineitem_hash GROUP BY l_linenumber) sub ORDER BY 2 DESC, 1 DESC LIMIT 10; ERROR: cannot compute count (distinct) DETAIL: Non-column references are not supported yet SELECT * FROM ( SELECT l_linenumber, count(DISTINCT lineitem_hash.*) FROM lineitem_hash GROUP BY l_linenumber) sub ORDER BY 2 DESC, 1 DESC LIMIT 10; ERROR: cannot compute count (distinct) DETAIL: Non-column references are not supported yet -- count distinct pushdown is enabled SELECT * FROM ( SELECT l_shipdate, count(DISTINCT CASE WHEN l_shipmode = 'TRUCK' THEN l_partkey ELSE NULL END) as distinct_part, extract(year from l_shipdate) as year FROM lineitem_hash GROUP BY l_shipdate, year) sub WHERE year = 1995 ORDER BY 2 DESC, 1 LIMIT 10; l_shipdate | distinct_part | year ------------+---------------+------ 11-29-1995 | 5 | 1995 03-24-1995 | 4 | 1995 09-18-1995 | 4 | 1995 01-17-1995 | 3 | 1995 04-02-1995 | 3 | 1995 05-23-1995 | 3 | 1995 08-11-1995 | 3 | 1995 09-27-1995 | 3 | 1995 10-27-1995 | 3 | 1995 10-30-1995 | 3 | 1995 (10 rows) RESET citus.task_executor_type; -- count distinct pushdown is enabled SELECT * FROM ( SELECT l_shipdate, count(DISTINCT CASE WHEN l_shipmode = 'TRUCK' THEN l_partkey ELSE NULL END) as distinct_part, extract(year from l_shipdate) as year FROM lineitem_hash GROUP BY l_shipdate, year) sub WHERE year = 1995 ORDER BY 2 DESC, 1 LIMIT 10; l_shipdate | distinct_part | year ------------+---------------+------ 11-29-1995 | 5 | 1995 03-24-1995 | 4 | 1995 09-18-1995 | 4 | 1995 01-17-1995 | 3 | 1995 04-02-1995 | 3 | 1995 05-23-1995 | 3 | 1995 08-11-1995 | 3 | 1995 09-27-1995 | 3 | 1995 10-27-1995 | 3 | 1995 10-30-1995 | 3 | 1995 (10 rows) SELECT * FROM ( SELECT l_shipdate, count(DISTINCT CASE WHEN l_shipmode = 'TRUCK' THEN l_partkey ELSE NULL END) as distinct_part, extract(year from l_shipdate) as year FROM lineitem_hash GROUP BY l_shipdate) sub WHERE year = 1995 ORDER BY 2 DESC, 1 LIMIT 10; l_shipdate | distinct_part | year ------------+---------------+------ 11-29-1995 | 5 | 1995 03-24-1995 | 4 | 1995 09-18-1995 | 4 | 1995 01-17-1995 | 3 | 1995 04-02-1995 | 3 | 1995 05-23-1995 | 3 | 1995 08-11-1995 | 3 | 1995 09-27-1995 | 3 | 1995 10-27-1995 | 3 | 1995 10-30-1995 | 3 | 1995 (10 rows) DROP TABLE lineitem_hash;