\set ECHO none
\pset format unaligned
SET search_path TO provsql_test,provsql;

-- ----------------------------------------------------------------------
-- HAVING COUNT(DISTINCT y) probability.  Regression for the bug where a
-- DISTINCT aggregate in a HAVING clause was evaluated as a plain COUNT
-- over the raw tuples (the agg-distinct GROUP-BY rewrite only handled the
-- target list, never q->havingQual), so duplicate same-value tuples were
-- counted more than once in the provenance circuit.
--
-- Setup (all tuples independent, p = 0.5):
--   g=1 : y in {10 (twice), 20, 30}   g=2 : y in {40, 50}
-- Per-distinct-value EXISTS probabilities:
--   g=1: P(10)=1-0.5^2=0.75, P(20)=0.5, P(30)=0.5;  g=2: P(40)=P(50)=0.5.
-- COUNT(DISTINCT y) is the number of distinct values present, so its
-- distribution is the (independent) sum of those per-value indicators.
-- ----------------------------------------------------------------------

DROP TABLE IF EXISTS hcd_t CASCADE;
CREATE TABLE hcd_t(g int, y int);
INSERT INTO hcd_t VALUES (1,10),(1,10),(1,20),(1,30),(2,40),(2,50);
SELECT add_provenance('hcd_t');
DO $$ BEGIN PERFORM set_prob(provenance(), 0.5) FROM hcd_t; END $$;

-- COUNT(DISTINCT y) >= 2
-- g=1: P(>=2 of {0.75,0.5,0.5}) = 0.625 ; g=2: P(both of {0.5,0.5}) = 0.25
CREATE TEMP TABLE r AS SELECT g, probability_evaluate(provenance()) AS p
  FROM hcd_t GROUP BY g HAVING count(DISTINCT y) >= 2;
SELECT remove_provenance('r');
SELECT 'count(distinct) >= 2' AS shape, g, ROUND(p::numeric,4) AS p FROM r ORDER BY g;
DROP TABLE r;

-- COUNT(DISTINCT y) >= 3
-- g=1: P(all 3) = 0.75*0.5*0.5 = 0.1875 ; g=2: impossible (only 2 distinct)
CREATE TEMP TABLE r AS SELECT g, probability_evaluate(provenance()) AS p
  FROM hcd_t GROUP BY g HAVING count(DISTINCT y) >= 3;
SELECT remove_provenance('r');
SELECT 'count(distinct) >= 3' AS shape, g, ROUND(p::numeric,4) AS p FROM r ORDER BY g;
DROP TABLE r;

-- COUNT(DISTINCT y) = 2
-- g=1: P(exactly 2) = 0.4375 ; g=2: P(2) = 0.25
CREATE TEMP TABLE r AS SELECT g, probability_evaluate(provenance()) AS p
  FROM hcd_t GROUP BY g HAVING count(DISTINCT y) = 2;
SELECT remove_provenance('r');
SELECT 'count(distinct) = 2' AS shape, g, ROUND(p::numeric,4) AS p FROM r ORDER BY g;
DROP TABLE r;

-- COUNT(DISTINCT y) <= 1  (empty group excluded: exactly one distinct present)
-- g=1: P(exactly 1) = 0.3125 ; g=2: P(1) = 0.5
CREATE TEMP TABLE r AS SELECT g, probability_evaluate(provenance()) AS p
  FROM hcd_t GROUP BY g HAVING count(DISTINCT y) <= 1;
SELECT remove_provenance('r');
SELECT 'count(distinct) <= 1' AS shape, g, ROUND(p::numeric,4) AS p FROM r ORDER BY g;
DROP TABLE r;

-- DISTINCT aggregate in both SELECT list and HAVING: value and probability
-- must both be correct (the value path was always correct; this pins that
-- the HAVING fix does not disturb it).
CREATE TEMP TABLE r AS
  SELECT g, count(DISTINCT y) AS cd, probability_evaluate(provenance()) AS p
  FROM hcd_t GROUP BY g HAVING count(DISTINCT y) >= 2;
SELECT remove_provenance('r');
SELECT 'select+having' AS shape, g, cd, ROUND(p::numeric,4) AS p FROM r ORDER BY g;
DROP TABLE r;

-- The off-path (exact possible-worlds enumeration) must agree with the
-- default path: the fix is in the circuit the rewrite builds, not in the
-- closed-form cmp evaluator, so both see the corrected contributors.
SET provsql.cmp_probability_evaluation = off;
CREATE TEMP TABLE off AS SELECT g, probability_evaluate(provenance()) AS p
  FROM hcd_t GROUP BY g HAVING count(DISTINCT y) >= 2;
SELECT remove_provenance('off');
SET provsql.cmp_probability_evaluation = on;
CREATE TEMP TABLE oncf AS SELECT g, probability_evaluate(provenance()) AS p
  FROM hcd_t GROUP BY g HAVING count(DISTINCT y) >= 2;
SELECT remove_provenance('oncf');
SELECT 'off vs on' AS shape, o.g,
       ROUND(o.p::numeric,4) AS p_off, ROUND(n.p::numeric,4) AS p_on,
       ROUND(ABS(o.p-n.p)::numeric,6) AS diff
FROM off o JOIN oncf n USING (g) ORDER BY o.g;
DROP TABLE off, oncf;

DROP TABLE hcd_t CASCADE;

-- Regression: COUNT(DISTINCT) over an OUTER join.  The agg-distinct GROUP-BY
-- rewrite appends its deduped subqueries to the range table but indexed them by
-- from-list position -- wrong when the FROM is a JoinExpr (an outer join is one
-- from-list item spanning several rtable slots), so a base arm was referenced
-- instead ("rel N already exists" / "no relation entry for relid N").
CREATE TABLE hcd_r(g int);
CREATE TABLE hcd_q(g int, y int);
INSERT INTO hcd_r VALUES (1),(2),(3);
INSERT INTO hcd_q VALUES (1,10),(2,20),(2,20),(3,30),(3,31);  -- g=2 same value; g=3 distinct
SELECT add_provenance('hcd_r');
SELECT add_provenance('hcd_q');
DO $$ BEGIN
  PERFORM set_prob(provenance(), 1.0) FROM hcd_r;
  PERFORM set_prob(provenance(), 0.5) FROM hcd_q;
END $$;

-- count(DISTINCT y) per g over the LEFT JOIN: g=1 -> 1, g=2 -> 1 (same value), g=3 -> 2.
CREATE TEMP TABLE ljc AS
  SELECT hcd_r.g AS g, count(DISTINCT hcd_q.y) AS c
  FROM hcd_r LEFT JOIN hcd_q ON hcd_q.g = hcd_r.g GROUP BY hcd_r.g;
SELECT remove_provenance('ljc');
SELECT 'lj count' AS shape, g, c FROM ljc ORDER BY g;
DROP TABLE ljc;

-- HAVING count(DISTINCT y) <= 1 over the LEFT JOIN: g=2 (same value) is admitted
-- (p=1); g=3 (two distinct) is gated when both present -> p = 1 - 0.25 = 0.75.
CREATE TEMP TABLE lj AS
  SELECT hcd_r.g AS g, probability_evaluate(provenance()) AS p
  FROM hcd_r LEFT JOIN hcd_q ON hcd_q.g = hcd_r.g
  GROUP BY hcd_r.g HAVING count(DISTINCT hcd_q.y) <= 1;
SELECT remove_provenance('lj');
SELECT 'lj having' AS shape, g, ROUND(p::numeric,4) AS p FROM lj ORDER BY g;
DROP TABLE lj;
DROP TABLE hcd_q; DROP TABLE hcd_r;