\set ECHO none
\pset format unaligned
SET search_path TO provsql_test,provsql;

-- Case Study 6: The City Air-Quality Sensor Network
-- Tests the random_variable surface end-to-end: gate_rv leaves for the
-- four distribution families, the planner-hook lifting of WHERE
-- comparators on RV columns (gate_cmp), Bernoulli mixtures via
-- provsql.mixture, RV aggregates (sum / avg) lowered through
-- rv_aggregate_semimod, expected() applied to a grouped RV via the
-- subquery + outer-WHERE idiom, UNION ALL on RV columns, and an
-- independent / tree-decomposition / monte-carlo cross-check.  Backs
-- doc/source/user/casestudy6.rst.

-- Pin the MC RNG and sample budget so cases that fall back to MC
-- (probabilities, conditional moments) are reproducible at a tolerance
-- loose enough to absorb sampler noise but tight enough to be
-- meaningful.
SET provsql.monte_carlo_seed = 42;
SET provsql.rv_mc_samples    = 50000;

-- ---------------------------------------------------------------------
-- Setup: schema and seed, mirroring doc/casestudy6/setup.sql.
-- ---------------------------------------------------------------------
CREATE TABLE stations (
  id        text PRIMARY KEY,
  name      text NOT NULL,
  district  text NOT NULL
);
INSERT INTO stations VALUES
  ('s1', 'City Centre',        'centre'),
  ('s2', 'Riverside Park',     'centre'),
  ('s3', 'Industrial Estate',  'east'),
  ('s4', 'Suburban Reference', 'east');
SELECT add_provenance('stations');

CREATE TABLE calibration_status (
  station_id  text PRIMARY KEY REFERENCES stations(id),
  p           double precision NOT NULL
);
INSERT INTO calibration_status VALUES
  ('s1', 0.95),
  ('s2', 0.70),
  ('s3', 0.60),
  ('s4', 1.00);
SELECT add_provenance('calibration_status');

CREATE TABLE readings (
  id          integer PRIMARY KEY,
  station_id  text NOT NULL REFERENCES stations(id),
  ts          timestamp NOT NULL,
  pm25        random_variable NOT NULL
);
INSERT INTO readings (id, station_id, ts, pm25) VALUES
  (1, 's1', '2026-05-12 08:00', provsql.normal(28.0, 2.0)),
  (2, 's2', '2026-05-12 08:00', provsql.uniform(10.0, 22.0)),
  (3, 's3', '2026-05-12 08:00', provsql.exponential(0.04)),
  (4, 's4', '2026-05-12 08:00', 15.0),
  (5, 's1', '2026-05-12 09:00', provsql.normal(40.0, 4.0)),
  (6, 's2', '2026-05-12 09:00', provsql.uniform(12.0, 24.0)),
  (7, 's3', '2026-05-12 09:00', provsql.erlang(3, 0.1)),
  (8, 's4', '2026-05-12 09:00', 16.5);
SELECT add_provenance('readings');

CREATE TABLE historical_readings (
  id          integer PRIMARY KEY,
  station_id  text NOT NULL REFERENCES stations(id),
  ts          timestamp NOT NULL,
  pm25        random_variable NOT NULL
);
INSERT INTO historical_readings (id, station_id, ts, pm25) VALUES
  (1, 's1', '2026-05-11 08:00', provsql.normal(34.0, 2.5)),
  (2, 's2', '2026-05-11 08:00', provsql.uniform(15.0, 28.0)),
  (3, 's3', '2026-05-11 08:00', provsql.exponential(0.03)),
  (4, 's4', '2026-05-11 08:00', 18.0),
  (5, 's1', '2026-05-11 09:00', provsql.normal(42.0, 3.0)),
  (6, 's2', '2026-05-11 09:00', provsql.uniform(20.0, 35.0)),
  (7, 's3', '2026-05-11 09:00', provsql.erlang(3, 0.08)),
  (8, 's4', '2026-05-11 09:00', 19.5);
SELECT add_provenance('historical_readings');

-- ---------------------------------------------------------------------
-- Step 1: per-row pm25 gates.  Rows 1 and 5 are Normal; row 3 is
-- Exponential; row 4 is the deterministic reference station lifted via
-- the implicit numeric -> random_variable cast (gate_value); row 7 is
-- Erlang.  One representative pick per shape.  The materialise +
-- remove_provenance dance hides the row's own auto-added provsql uuid
-- column from the output (it's non-deterministic).
-- ---------------------------------------------------------------------
CREATE TABLE result_cs6_pm25_kind AS
  SELECT id, get_gate_type(pm25::uuid) AS pm25_kind
    FROM readings
   WHERE id IN (1, 3, 4, 7);
SELECT remove_provenance('result_cs6_pm25_kind');
SELECT id, pm25_kind FROM result_cs6_pm25_kind ORDER BY id;
DROP TABLE result_cs6_pm25_kind;

-- ---------------------------------------------------------------------
-- Step 2: WHERE pm25 > 35 lifts the comparator into a gate_cmp
-- conjoined with each row's input via gate_times; the procedure body
-- of random_variable_gt is never executed, so every row survives the
-- WHERE.  The pm25 column kind on rows 5 and 7 confirms that the lift
-- did not strip the underlying distribution.
-- ---------------------------------------------------------------------
CREATE TABLE result_cs6_thresh AS
  SELECT id, station_id,
         get_gate_type(provenance())  AS prov_kind,
         get_gate_type(pm25::uuid)    AS pm25_kind
    FROM readings
   WHERE pm25 > 35;
SELECT remove_provenance('result_cs6_thresh');
SELECT id, station_id, prov_kind, pm25_kind
  FROM result_cs6_thresh ORDER BY id;
DROP TABLE result_cs6_thresh;

-- Probability that the event pm25 > 35 fires on each tracked row.  For
-- the s1 readings (Normal):
--   row 1, N(28, 2):  P(X > 35) = 1 - Phi( 3.5) ~= 2.3e-4
--   row 5, N(40, 4):  P(X > 35) = 1 - Phi(-1.25) ~= 0.8944
-- The analytic 'independent' path is exact for a single gate_cmp under
-- a deterministic input gate; MC at seed=42 must sit within 0.02 of it.
CREATE TABLE result_cs6_thresh_prob AS
  SELECT id,
         abs(probability_evaluate(provenance(), 'monte-carlo', '50000')
             - probability_evaluate(provenance(), 'independent')) < 0.02
           AS mc_matches_independent
    FROM readings
   WHERE pm25 > 35
     AND station_id = 's1';
SELECT remove_provenance('result_cs6_thresh_prob');
SELECT id, mc_matches_independent
  FROM result_cs6_thresh_prob ORDER BY id;
DROP TABLE result_cs6_thresh_prob;

-- ---------------------------------------------------------------------
-- Step 3: provsql.mixture(p, x, y) returns a gate_mixture; the
-- pm25 / 1.2 arm (the calibration correction) is a gate_arith(DIV)
-- over the per-row pm25 and a gate_value.  The constructor result
-- type is random_variable, so ::uuid grabs the gate root.
-- ---------------------------------------------------------------------
CREATE TABLE result_cs6_mixture AS
  SELECT r.id,
         get_gate_type(
           provsql.mixture(cs.p, r.pm25, r.pm25 / 1.2)::uuid
         ) AS mixture_kind,
         get_gate_type(
           (r.pm25 / 1.2)::uuid
         ) AS scaled_kind
    FROM readings r JOIN calibration_status cs USING (station_id)
   WHERE r.station_id = 's1';
SELECT remove_provenance('result_cs6_mixture');
SELECT id, mixture_kind, scaled_kind
  FROM result_cs6_mixture ORDER BY id;
DROP TABLE result_cs6_mixture;

-- ---------------------------------------------------------------------
-- Step 4: sum(pm25) and avg(pm25) over a tracked RV column lower
-- through rv_aggregate_semimod into gate_arith roots.  Aggregate
-- dispatch on the random_variable result type picks the provsql.sum /
-- provsql.avg overloads from plain sum() / avg(), so the case study's
-- SQL uses the bare names.
-- ---------------------------------------------------------------------
CREATE TABLE result_cs6_agg AS
  SELECT s.district,
         sum(r.pm25) AS total_pm25,
         avg(r.pm25) AS avg_pm25
    FROM readings r JOIN stations s ON s.id = r.station_id
   GROUP BY s.district;
SELECT remove_provenance('result_cs6_agg');

-- Structure: sum -> gate_arith(PLUS); avg -> gate_arith(DIV) over two
-- gate_arith(PLUS) subtrees (numerator / denominator).
SELECT district,
       get_gate_type(total_pm25::uuid) AS sum_root,
       get_gate_type(avg_pm25::uuid)   AS avg_root
  FROM result_cs6_agg ORDER BY district;

-- The 'centre' district has rows 1, 2, 5, 6 (N(28,2), U(10,22),
-- N(40,4), U(12,24)) with E[sum] = 28 + 16 + 40 + 18 = 102.
-- The 'east' district has rows 3, 4, 7, 8 (Exp(0.04), 15, Erl(3,0.1),
-- 16.5) with E[sum] = 25 + 15 + 30 + 16.5 = 86.5.  Linearity of
-- expectation hits exactly under the analytical evaluator.
SELECT district,
       abs(provsql.expected(total_pm25) - CASE district
             WHEN 'centre' THEN 102.0
             WHEN 'east'   THEN  86.5
                                              END) < 1e-9
         AS total_mean_exact
  FROM result_cs6_agg ORDER BY district;

DROP TABLE result_cs6_agg;

-- ---------------------------------------------------------------------
-- Step 8: UNION across the two batches dedups by (station_id, id) and
-- combines the two row provenances via gate_plus.  Both batches share
-- the same id space (1..8), so the keys line up directly.  WHERE pm25
-- > 35 lifts the comparator into each contributing row's provenance,
-- so the resulting gate_plus wraps two gate_times(input, gate_cmp)
-- subtrees.
-- ---------------------------------------------------------------------
CREATE TABLE result_cs6_union AS
SELECT station_id, id, get_gate_type(provenance()) AS prov_kind
FROM (
    (SELECT station_id, id FROM readings            WHERE pm25 > 35)
  UNION
    (SELECT station_id, id FROM historical_readings WHERE pm25 > 35)
) t;
SELECT remove_provenance('result_cs6_union');
SELECT station_id, id, prov_kind
  FROM result_cs6_union
  ORDER BY station_id, id;
DROP TABLE result_cs6_union;

-- ---------------------------------------------------------------------
-- Step 9: filter the per-district aggregates by their expected average.
-- The HAVING qual is deterministic from the planner-hook's perspective
-- (no agg_token Var, no provenance_aggregate wrapper -- avg(rv)
-- returns random_variable and expected() collapses it to a double), so
-- needs_having_lift leaves it for PostgreSQL to evaluate natively while
-- the per-group provenance still gets a delta(gate_agg) wrapper.
-- E[avg(pm25)] is 102/4 = 25.5 for centre and 86.5/4 = 21.625 for
-- east, both above 20.
-- ---------------------------------------------------------------------
CREATE TABLE result_cs6_having AS
  SELECT s.district, get_gate_type(provenance()) AS prov_kind
    FROM readings r JOIN stations s ON s.id = r.station_id
   GROUP BY s.district
  HAVING expected(avg(r.pm25)) > 20;
SELECT remove_provenance('result_cs6_having');
SELECT district, prov_kind FROM result_cs6_having ORDER BY district;
DROP TABLE result_cs6_having;

-- ---------------------------------------------------------------------
-- Step 10: independent vs monte-carlo on the threshold query.  For
-- each surviving row the closed-form 'independent' result is exact;
-- MC at n=50000 must match within 0.02.  tree-decomposition is also
-- exact and must agree with 'independent' to full precision.
-- ---------------------------------------------------------------------
CREATE TABLE result_cs6_methods AS
  SELECT id,
         abs(probability_evaluate(provenance(), 'independent')
             - probability_evaluate(provenance(), 'tree-decomposition')) < 1e-9
           AS ind_equals_td,
         abs(probability_evaluate(provenance(), 'independent')
             - probability_evaluate(provenance(), 'monte-carlo', '50000')) < 0.02
           AS ind_close_to_mc
    FROM readings WHERE pm25 > 35;
SELECT remove_provenance('result_cs6_methods');
SELECT id, ind_equals_td, ind_close_to_mc
  FROM result_cs6_methods ORDER BY id;
DROP TABLE result_cs6_methods;

-- ---------------------------------------------------------------------
-- Cleanup.
-- ---------------------------------------------------------------------
DROP TABLE historical_readings;
DROP TABLE readings;
DROP TABLE calibration_status;
DROP TABLE stations;

RESET provsql.monte_carlo_seed;
RESET provsql.rv_mc_samples;