\set ECHO none \pset format unaligned SET search_path TO provsql_test, provsql; -- General FD closure (detector-only; no rewriter change). -- -- The textbook framework (Dalvi & Suciu 2007 §5.1) envisions a -- fixpoint over the @c DETERMINED matrix: collect every FD in -- Γ_p(q) (constant-selection FDs, PK FDs, deterministic-relation -- FDs) and iteratively apply each FD until no new -- FD-determined-in-RTE tags are set. The hierarchicality check -- then runs on the FD-reduced atom-sets. -- -- In the current rule set (constant-pinning + schema PK/UNIQUE + -- deterministic transparency), every FD application is -- @em independent: the PK-FD pass already fires every FD in a -- single walk over the RTE list, and constant-selection and -- deterministic transparency act through orthogonal mechanisms -- (residual-cleanup multi-component for constant pinning, -- @c DETERMINED-all-classes for deterministic atoms). No -- iteration unlocks a new tag. The fixpoint reduces to a no-op on -- the canonical motivating case -- the triangle CQ with two PKs on -- different relations. An FD-induced nested rewrite -- (function/free split) is required to compose constant-pinning -- with a PK on the same relation; that is documented in -- @c doc/TODO/safe-query-followups.md and deferred. -- -- This file regression-tests the canonical triangle example to -- confirm the cumulative passes deliver the closure guarantee: -- "accept any query whose FD-reduced atom sets are hierarchical -- and whose existing single-level wrap is read-once". -- --------------------------------------------------------------------- -- (1) Triangle CQ with PKs on R(a) and T(c) (Dalvi & Suciu 2007 -- §5.1). Raw atom-sets are pairwise neither nested nor -- disjoint -- non-hierarchical without the FDs. Applying both -- PKs gives @c atoms(a) @c = @c {R}, @c atoms(b) @c = @c {S}, -- @c atoms(c) @c = @c {S, T}: pairwise nested-or-disjoint -- (a disjoint from b and c; b nested in c). -- --------------------------------------------------------------------- CREATE TABLE fdc_r (a int PRIMARY KEY, b int); CREATE TABLE fdc_s (b int, c int); CREATE TABLE fdc_t (c int PRIMARY KEY, a int); INSERT INTO fdc_r VALUES (1, 10); INSERT INTO fdc_s VALUES (10, 100); INSERT INTO fdc_t VALUES (100, 1); SELECT add_provenance('fdc_r'); SELECT add_provenance('fdc_s'); SELECT add_provenance('fdc_t'); DO $$ BEGIN PERFORM set_prob(provsql, 0.5) FROM fdc_r; PERFORM set_prob(provsql, 0.5) FROM fdc_s; PERFORM set_prob(provsql, 0.5) FROM fdc_t; END $$; SET provsql.boolean_provenance = off; CREATE TEMP TABLE fdc_baseline AS SELECT q.x, probability_evaluate(provenance()) AS p FROM (SELECT DISTINCT 1 AS x FROM fdc_r r, fdc_s s, fdc_t t WHERE r.b = s.b AND s.c = t.c AND t.a = r.a) q GROUP BY q.x; SELECT remove_provenance('fdc_baseline'); SELECT x, ROUND(p::numeric, 6) AS prob_baseline FROM fdc_baseline; SET provsql.boolean_provenance = on; CREATE TEMP TABLE fdc_rewritten AS SELECT q.x, probability_evaluate(provenance(), 'independent') AS p FROM (SELECT DISTINCT 1 AS x FROM fdc_r r, fdc_s s, fdc_t t WHERE r.b = s.b AND s.c = t.c AND t.a = r.a) q GROUP BY q.x; SELECT remove_provenance('fdc_rewritten'); SELECT x, ROUND(p::numeric, 6) AS prob_rewritten FROM fdc_rewritten; SELECT b.x, ROUND((b.p - r.p)::numeric, 9) AS diff_baseline_vs_rewritten FROM fdc_baseline b JOIN fdc_rewritten r ON b.x = r.x; -- --------------------------------------------------------------------- -- (2) PK-FD + deterministic-transparency composition (star schema -- with PKs on every dimension). Transparency drops the -- dimension atoms from atom-set membership; the PK on each -- dimension is then irrelevant (the dimension's FDs would have -- been redundant once it became transparent), but the rewrite -- still uses the PK FD on the fact table if one is present. -- Cross-check that this composition works and that no FD -- interaction regression occurs. -- --------------------------------------------------------------------- CREATE TABLE fdc_fact (pid int, cid int); CREATE TABLE fdc_dim_p (pid int PRIMARY KEY, cat text); CREATE TABLE fdc_dim_c (cid int PRIMARY KEY, region text); INSERT INTO fdc_fact VALUES (1, 10), (1, 10), (2, 11); INSERT INTO fdc_dim_p VALUES (1, 'A'), (2, 'B'); INSERT INTO fdc_dim_c VALUES (10, 'EU'), (11, 'US'); SELECT add_provenance('fdc_fact'); DO $$ BEGIN PERFORM set_prob(provsql, 0.5) FROM fdc_fact; END $$; SET provsql.boolean_provenance = off; CREATE TEMP TABLE fdc_star_baseline AS SELECT q.x, probability_evaluate(provenance()) AS p FROM (SELECT DISTINCT 1 AS x FROM fdc_fact f, fdc_dim_p p, fdc_dim_c c WHERE f.pid = p.pid AND f.cid = c.cid AND p.cat = 'A' AND c.region = 'EU') q GROUP BY q.x; SELECT remove_provenance('fdc_star_baseline'); SET provsql.boolean_provenance = on; CREATE TEMP TABLE fdc_star_rewritten AS SELECT q.x, probability_evaluate(provenance(), 'independent') AS p FROM (SELECT DISTINCT 1 AS x FROM fdc_fact f, fdc_dim_p p, fdc_dim_c c WHERE f.pid = p.pid AND f.cid = c.cid AND p.cat = 'A' AND c.region = 'EU') q GROUP BY q.x; SELECT remove_provenance('fdc_star_rewritten'); SELECT b.x, ROUND((b.p - r.p)::numeric, 9) AS diff_star_baseline_vs_rewritten FROM fdc_star_baseline b JOIN fdc_star_rewritten r ON b.x = r.x; DROP TABLE fdc_r, fdc_s, fdc_t, fdc_fact, fdc_dim_p, fdc_dim_c CASCADE;