\set ECHO none
\pset format unaligned
SET search_path TO provsql_test, provsql;

-- End-to-end exercise of provsql.boolean_provenance for a two-atom
-- self-join-free hierarchical conjunctive query of the form
-- q(x) :- A(x), B(x).  The rewrite is only sound for global /
-- existential interrogations of the result (per-row semantics
-- change because each atom is wrapped in SELECT DISTINCT), so the
-- canonical user-facing pattern is GROUP BY the root variable.
--
-- Multiple-match rows on each side make the read-once vs
-- shared-input distinction visible after GROUP BY:
--
--   GUC off, GROUP BY id  -> per-id Boolean expression is the OR
--      of all join-pair products, with every leaf appearing
--      multiple times.  Not read-once;
--      probability_evaluate('independent') rejects.
--   GUC on,  GROUP BY id  -> rewriter wraps each atom as
--      SELECT DISTINCT id; the per-id provenance becomes
--      gate_times(gate_plus(left ids), gate_plus(right ids));
--      each leaf appears once; 'independent' succeeds and the
--      probabilities match the baseline computed by the default
--      evaluator.

CREATE TABLE sqb_left (id int);
CREATE TABLE sqb_right(id int);
INSERT INTO sqb_left  VALUES (1),(3),(3);
INSERT INTO sqb_right VALUES (1),(3),(3);

SELECT add_provenance('sqb_left');
SELECT add_provenance('sqb_right');

DO $$ BEGIN
  PERFORM set_prob(provsql, 0.5) FROM sqb_left;
  PERFORM set_prob(provsql, 0.4) FROM sqb_right;
END $$;

-- (1) Without the rewrite: the unrewritten per-id circuit has shared
--     input gates (each leaf appears in multiple join pairs), so
--     'independent' rejects on id=3.
SET provsql.boolean_provenance = off;
DO $$
DECLARE raised boolean := false;
BEGIN
  BEGIN
    PERFORM probability_evaluate(provenance(), 'independent')
      FROM sqb_left a, sqb_right b
     WHERE a.id = b.id AND a.id = 3
     GROUP BY a.id;
  EXCEPTION WHEN OTHERS THEN raised := true;
  END;
  IF NOT raised THEN
    RAISE EXCEPTION 'expected ''independent'' to reject the unrewritten '
                    'id=3 circuit (shared input gates)';
  END IF;
END $$;

-- (2) With the rewrite: per-id provenance is read-once, so
--     'independent' succeeds and returns the analytical value:
--       id=1: 0.5 * 0.4                            = 0.20
--       id=3: (1-(1-0.5)^2) * (1-(1-0.4)^2) = 0.75 * 0.64 = 0.48
SET provsql.boolean_provenance = on;
CREATE TEMP TABLE sqb_prob AS
  SELECT a.id,
         probability_evaluate(provenance(), 'independent') AS p
    FROM sqb_left a, sqb_right b
   WHERE a.id = b.id
   GROUP BY a.id;
SELECT remove_provenance('sqb_prob');
SELECT id, ROUND(p::numeric, 6) AS prob FROM sqb_prob ORDER BY id;

-- (3) Cross-check: rewritten probabilities must match the default
--     baseline (which uses dDNNF / tree-decomposition / d4 fallback
--     on the unrewritten circuit).
SET provsql.boolean_provenance = off;
CREATE TEMP TABLE sqb_baseline AS
  SELECT a.id, probability_evaluate(provenance()) AS p
    FROM sqb_left a, sqb_right b
   WHERE a.id = b.id
   GROUP BY a.id;
SELECT remove_provenance('sqb_baseline');
SELECT b.id, ROUND((b.p - r.p)::numeric, 9) AS diff_baseline_vs_rewritten
  FROM sqb_baseline b JOIN sqb_prob r ON b.id = r.id
 ORDER BY b.id;

-- (4) Rewritten root gate shape: gate_times over two gate_plus children.
SET provsql.boolean_provenance = on;
CREATE TEMP TABLE sqb_shape AS
  SELECT a.id,
         get_gate_type(provenance())                  AS root,
         array_length(get_children(provenance()), 1)  AS root_nchildren
    FROM sqb_left a, sqb_right b
   WHERE a.id = b.id
   GROUP BY a.id;
SELECT remove_provenance('sqb_shape');
SELECT id, root, root_nchildren FROM sqb_shape ORDER BY id;

-- (5) The per-atom SELECT DISTINCT wraps collapse duplicate source
--     tuples on their projection slots, which would shrink the
--     user-visible row count of a query without an outer GROUP BY or
--     top-level DISTINCT.  The candidate gate refuses such queries
--     so the GUC stays observationally transparent.  Cardinality
--     under the GUC must match the GUC-off baseline: 1 row at id=1
--     (1x1 self-join) plus 4 rows at id=3 (2x2 self-join) = 5 rows.
SET provsql.boolean_provenance = off;
CREATE TEMP TABLE sqb_nogb_off AS
  SELECT a.id FROM sqb_left a, sqb_right b WHERE a.id = b.id;
SELECT remove_provenance('sqb_nogb_off');
SELECT count(*) AS rows_off FROM sqb_nogb_off;

SET provsql.boolean_provenance = on;
CREATE TEMP TABLE sqb_nogb_on AS
  SELECT a.id FROM sqb_left a, sqb_right b WHERE a.id = b.id;
SELECT remove_provenance('sqb_nogb_on');
SELECT count(*) AS rows_on FROM sqb_nogb_on;

-- (6) SELECT DISTINCT 1 is the standard Boolean-query idiom: project
--     every body variable out and return a single tuple whose
--     provenance is the OR over all matching join tuples.  The
--     rewriter must accept it (distinctClause is non-NIL) and produce
--     a circuit whose probability matches the baseline.  With
--     duplicates on both sides (id=3 contributes 2x2 matching
--     pairs), the baseline circuit shares input gates and the
--     rewritten one does not, so 'independent' should succeed on the
--     rewritten circuit while still agreeing with the baseline
--     probability.  Expected value with p_a=0.5, p_b=0.4:
--       P(matched at id=1) = 0.5 * 0.4                    = 0.20
--       P(at least one A-id=3) = 1 - 0.5^2                = 0.75
--       P(at least one B-id=3) = 1 - 0.6^2                = 0.64
--       P(matched at id=3) = 0.75 * 0.64                  = 0.48
--       P(non-empty answer) = 1 - (1-0.20) * (1-0.48)
--                           = 1 - 0.8 * 0.52              = 0.584
SET provsql.boolean_provenance = off;
CREATE TEMP TABLE sqb_dist1_off AS
  SELECT DISTINCT 1 AS one FROM sqb_left a, sqb_right b WHERE a.id = b.id;
CREATE TEMP TABLE sqb_dist1_off_p AS
  SELECT one, ROUND(probability_evaluate(provsql)::numeric, 6) AS p
    FROM sqb_dist1_off;
SELECT remove_provenance('sqb_dist1_off_p');
SELECT count(*) AS rows_off, p AS p_off FROM sqb_dist1_off_p GROUP BY p;

SET provsql.boolean_provenance = on;
CREATE TEMP TABLE sqb_dist1_on AS
  SELECT DISTINCT 1 AS one FROM sqb_left a, sqb_right b WHERE a.id = b.id;
CREATE TEMP TABLE sqb_dist1_on_p AS
  SELECT one, ROUND(probability_evaluate(provsql, 'independent')::numeric, 6) AS p
    FROM sqb_dist1_on;
SELECT remove_provenance('sqb_dist1_on_p');
SELECT count(*) AS rows_on, p AS p_on FROM sqb_dist1_on_p GROUP BY p;

SET provsql.boolean_provenance = off;
DROP TABLE sqb_left;
DROP TABLE sqb_right;