\set ECHO none \pset format unaligned SET search_path TO provsql_test,provsql; -- Determinism for every Monte Carlo call below. SET provsql.monte_carlo_seed = 42; CREATE TABLE sensors(id text, reading provsql.random_variable); INSERT INTO sensors VALUES ('s1', provsql.normal(2.5, 0.5)), ('s2', provsql.uniform(1, 3)); SELECT add_provenance('sensors'); -- ==================================================================== -- A: provenance-tracked table with WHERE on a random_variable column. -- The TODO's anchor: WHERE reading > 2 lifts into provenance, -- expect P(s1)=0.84..., P(s2)=0.5 (within 0.02 of truth at n=100k). -- The standard idiom is: capture into a table, remove_provenance, -- then SELECT to see clean output. Without this, the planner hook -- adds a provsql column to the outer SELECT and the UUIDs in the -- output are non-deterministic. -- ==================================================================== CREATE TABLE result_a AS SELECT id, abs(probability_evaluate(provenance(), 'monte-carlo', '100000') - CASE id WHEN 's1' THEN 0.8413 WHEN 's2' THEN 0.5 END) < 0.02 AS within_tolerance FROM sensors WHERE reading > 2; SELECT remove_provenance('result_a'); SELECT * FROM result_a ORDER BY id; DROP TABLE result_a; -- ==================================================================== -- B: mixed WHERE -- non-RV conjunct stays in WHERE and filters rows; -- RV conjunct lifts into provenance. Only s1 survives the id filter. -- ==================================================================== CREATE TABLE result_b AS SELECT id, abs(probability_evaluate(provenance(), 'monte-carlo', '100000') - 0.8413) < 0.02 AS within_tolerance FROM sensors WHERE id = 's1' AND reading > 2; SELECT remove_provenance('result_b'); SELECT * FROM result_b ORDER BY id; DROP TABLE result_b; -- ==================================================================== -- C: NOT(reading > 2) -- De Morgan flips the comparator at the gate -- level. P(N(2.5,0.5) <= 2) = 1 - 0.8413 = 0.1587. -- ==================================================================== CREATE TABLE result_c AS SELECT id, abs(probability_evaluate(provenance(), 'monte-carlo', '100000') - CASE id WHEN 's1' THEN 0.1587 WHEN 's2' THEN 0.5 END) < 0.02 AS within_tolerance FROM sensors WHERE NOT (reading > 2); SELECT remove_provenance('result_c'); SELECT * FROM result_c ORDER BY id; DROP TABLE result_c; -- ==================================================================== -- D: OR over two RV comparisons. The De Morgan walker emits -- provenance_plus when not negated; this is a Boolean OR (with the -- usual circuit-level inclusion-exclusion handled by downstream -- evaluators). -- For s1 ~ N(2.5, 0.5): -- P(reading > 3) = 0.1587, P(reading < 1) ~= 0.0013, OR ~ 0.16. -- For s2 ~ U(1, 3): -- P(reading > 3) = 0, P(reading < 1) = 0, OR = 0. -- The events are disjoint here, so monte-carlo on the gate_plus is -- exact for s2 and within MC noise for s1. -- ==================================================================== CREATE TABLE result_d AS SELECT id, abs(probability_evaluate(provenance(), 'monte-carlo', '100000') - CASE id WHEN 's1' THEN 0.1600 WHEN 's2' THEN 0.0 END) < 0.03 AS within_tolerance FROM sensors WHERE reading > 3 OR reading < 1; SELECT remove_provenance('result_d'); SELECT * FROM result_d ORDER BY id; DROP TABLE result_d; -- ==================================================================== -- E: WHERE on RV arithmetic -- reading + 1 > 3 is reading > 2 in -- disguise. The lifted gate_cmp's left child is a gate_arith. -- ==================================================================== CREATE TABLE result_e AS SELECT id, abs(probability_evaluate(provenance(), 'monte-carlo', '100000') - CASE id WHEN 's1' THEN 0.8413 WHEN 's2' THEN 0.5 END) < 0.02 AS within_tolerance FROM sensors WHERE reading + 1 > 3; SELECT remove_provenance('result_e'); SELECT * FROM result_e ORDER BY id; DROP TABLE result_e; DROP TABLE sensors; -- ==================================================================== -- F: untracked table (no add_provenance) -- WHERE on the RV column -- still produces a result with a provenance column, synthesised from -- gate_one * gate_cmp = gate_cmp. -- ==================================================================== CREATE TABLE untracked(id text, reading provsql.random_variable); INSERT INTO untracked VALUES ('u1', provsql.normal(0, 1)), ('u2', provsql.normal(0, 1)); CREATE TABLE result_f AS SELECT id, abs(probability_evaluate(provenance(), 'monte-carlo', '10000') - 0.5) < 0.02 AS within_tolerance FROM untracked WHERE reading > 0; SELECT remove_provenance('result_f'); SELECT * FROM result_f ORDER BY id; DROP TABLE result_f; DROP TABLE untracked; -- ==================================================================== -- G: full-stack composition -- WHERE on RV + GROUP BY (duplicate -- elimination) + HAVING on the aggregate count. Verifies the -- planner-hook RV rewrite slots cleanly between the existing -- aggregation rewriting (replaces aggregates with -- provenance_aggregate, builds an agg_token), the array_agg combiner -- that GROUP BY uses to fold within-group provenances, and the -- HAVING walker that emits a provenance_cmp on the agg result. The -- resulting provsql is a multi-layer circuit: -- cmp(HAVING) -> agg(count over array_agg(times(row.provsql, -- cmp(val > 1)))) -- We verify this STRUCTURE rather than running probability_evaluate -- because the RV-aware Monte Carlo sampler does not yet pre-resolve -- HAVING gate_agg children (the legacy boolean MC path does this via -- @c provsql_having in @c CircuitFromMMap.cpp; the RV path can grow -- the equivalent later). Probability over WHERE + GROUP BY without -- HAVING is exercised by section H below. -- ==================================================================== CREATE TABLE readings(category text, val provsql.random_variable); INSERT INTO readings VALUES ('hot', provsql.normal(2, 1)), ('hot', provsql.normal(2, 1)), ('warm', provsql.uniform(0, 2)), ('warm', provsql.uniform(0, 2)); SELECT add_provenance('readings'); -- Capture the per-group provsql so we can probe its shape. CREATE TABLE result_g AS SELECT category FROM readings WHERE val > 1 GROUP BY category HAVING count(*) > 1; -- The added provsql column is the HAVING cmp at root. Walk down: -- (a) root is gate_cmp; -- (b) the cmp's left child is gate_agg (the HAVING aggregate); -- (c) at the bottom of the agg's semimod children, the per-row token -- is a gate_times with a gate_cmp(val > 1) factor folded in by -- the WHERE rewrite (proves the RV cmp lifted into per-row -- provenance, not stranded at group level). SELECT (SELECT bool_and(get_gate_type(provsql) = 'cmp') FROM result_g) AS root_is_cmp, (SELECT bool_and(get_gate_type((get_children(provsql))[1]) = 'agg') FROM result_g) AS having_lhs_is_agg, -- Drill down: the agg's first child is a semimod whose k_gate -- (semimod_extract_M_and_K's k) is a gate_times wrapping the -- per-row provsql AND the gate_cmp built by the WHERE rewrite. (SELECT bool_and(get_gate_type( (get_children( (get_children((get_children(provsql))[1]))[1]) )[1]) = 'times') FROM result_g) AS per_row_token_is_times, -- One of the times's children is the gate_cmp from `val > 1`. (SELECT bool_and('cmp' = ANY( ARRAY(SELECT get_gate_type(c) FROM unnest(get_children( (get_children( (get_children((get_children(provsql))[1]))[1]))[1]) ) c))) FROM result_g) AS row_times_has_cmp_factor; SELECT remove_provenance('result_g'); SELECT category FROM result_g ORDER BY category; DROP TABLE result_g; -- ==================================================================== -- H: WHERE on RV + GROUP BY (no HAVING). Probability check for the -- full pipeline excluding HAVING, since HAVING + RV does not yet run -- under MC (see section G). P(group fires) = P(at least one row -- survives WHERE): -- hot: 1 - (1 - 0.8413)^2 ~= 0.9748 -- warm: 1 - (1 - 0.5)^2 = 0.75 -- ==================================================================== CREATE TABLE result_h AS SELECT category, abs(probability_evaluate(provenance(), 'monte-carlo', '100000') - CASE category WHEN 'hot' THEN 0.9748 WHEN 'warm' THEN 0.75 END) < 0.03 AS within_tolerance FROM readings WHERE val > 1 GROUP BY category; SELECT remove_provenance('result_h'); SELECT * FROM result_h ORDER BY category; DROP TABLE result_h; DROP TABLE readings; -- ==================================================================== -- I: mixed RV / non-RV inside a single Boolean expression is -- explicitly rejected with a clear error -- the planner cannot lift -- the RV part without lifting the deterministic part too, and the -- latter would require synthesising a Boolean RV gate (CASE WHEN ... -- THEN gate_one() ELSE gate_zero() END), which is deferred work. -- This mirrors agg_token's check_expr_on_aggregate behaviour for -- mixed WHERE clauses. -- ==================================================================== CREATE TABLE mix(id text, reading provsql.random_variable); INSERT INTO mix VALUES ('m1', provsql.normal(0, 1)); SELECT add_provenance('mix'); \set VERBOSITY terse SELECT id FROM mix WHERE reading > 0 OR id = 'm1'; \set VERBOSITY default DROP TABLE mix; -- ==================================================================== -- J: WHERE on a random_variable with NO add_provenance()'d source -- AND NO provenance() in the SELECT. Before the has_provenance -- walker recursed with itself (rather than the limited -- provenance_function_walker), the rv-comparison OpExpr inside the -- WHERE clause was invisible to the gate, so the planner hook was -- skipped and the executor reached random_variable_cmp_placeholder. -- The fix is the smallest case that exercises pure-RV provenance -- synthesis: no tracked relation, no provenance() call, just a CTE -- producing an RV and a WHERE comparing it to a constant. -- ==================================================================== CREATE TABLE result_j AS WITH x(u) AS (SELECT uniform(0, 1)) SELECT 1 AS one FROM x WHERE u > 0.5; -- Two structural checks before removing the provenance column: -- (a) row count: the rewriter must let the row through (WHERE rewritten -- to TRUE, the lifted comparison goes to the provsql column). -- (b) the root gate is the cmp itself, NOT a single-child times wrapping -- it. provenance_times(gate_one(), cmp) used to leave that useless -- wrapper because the CASE dispatched on the unfiltered token count -- and missed the [one, cmp] -> [cmp] collapse. -- SET LOCAL provsql.active=off so the rewriter doesn't auto-add a fresh -- provsql column to this very SELECT and wrap count(*) in agg_token. BEGIN; SET LOCAL provsql.active = off; SELECT count(*) AS rows_returned, bool_and(get_gate_type(provsql) = 'cmp') AS root_is_cmp FROM result_j; COMMIT; SELECT remove_provenance('result_j'); DROP TABLE result_j; -- ==================================================================== -- K: FROM-less SELECT with an rv_cmp in WHERE. The planner hook used -- to short-circuit on `q->commandType == CMD_SELECT && q->rtable`, so -- a query with no FROM never even reached has_provenance and the -- placeholder fired. After widening the gate to drop the rtable -- check, the rewriter lifts the comparison into the synthesised -- provsql column the same way it does for FROM-bearing queries. -- ==================================================================== CREATE TABLE result_k AS SELECT 1 AS one WHERE normal(0, 1) > 2; BEGIN; SET LOCAL provsql.active = off; SELECT count(*) AS rows_returned, bool_and(get_gate_type(provsql) = 'cmp') AS root_is_cmp FROM result_k; COMMIT; SELECT remove_provenance('result_k'); DROP TABLE result_k; RESET provsql.monte_carlo_seed; SELECT 'ok'::text AS continuous_selection_done;