-- City air-quality sensor-network fixture for ProvSQL Studio's
-- continuous-distribution demo.  Backs the worked example in
-- doc/source/user/casestudy6.rst (*The City Air-Quality Sensor
-- Network*).
--
-- Exercises every code path the continuous-RV feature touches:
--
--   * gate_rv leaves with all four distribution kinds
--     (normal / uniform / exponential / erlang);
--   * a deterministic reference station that uses the implicit
--     numeric -> random_variable cast (gate_value);
--   * gate_arith (calibration-scaled reading: pm25 * 1.2);
--   * gate_mixture for calibration uncertainty
--     (mixture(p_cal, pm25, pm25 * 1.2));
--   * gate_cmp via WHERE pm25 > 35 (planner-hook rewrite);
--   * UNION ALL across today's batch and the historical batch;
--   * sum / avg over random_variable (semimodule lowering through
--     rv_aggregate_semimod);
--   * HAVING expected(avg(pm25)) > 20 (gate_agg + gate_cmp).
--
-- Run via:
--     psql -d air_quality_demo -f studio/scripts/demo_continuous.sql
-- or, for a one-shot "drop, recreate, load, launch Studio" loader:
--     python3 studio/scripts/demo_continuous.py

\set ECHO none
\pset format unaligned

CREATE EXTENSION IF NOT EXISTS provsql CASCADE;
CREATE EXTENSION IF NOT EXISTS "uuid-ossp";

SET search_path TO public, provsql;

DROP TABLE IF EXISTS readings CASCADE;
DROP TABLE IF EXISTS historical_readings CASCADE;
DROP TABLE IF EXISTS stations CASCADE;
DROP TABLE IF EXISTS calibration_status CASCADE;
DROP TABLE IF EXISTS categories CASCADE;

-- Four monitoring stations across two districts.
CREATE TABLE stations (
  id        TEXT PRIMARY KEY,
  name      TEXT NOT NULL,
  district  TEXT NOT NULL
);

INSERT INTO stations VALUES
  ('s1', 'City Centre',         'centre'),
  ('s2', 'Riverside Park',      'centre'),
  ('s3', 'Industrial Estate',   'east'),
  ('s4', 'Suburban Reference',  'east');

SELECT add_provenance('stations');

-- Per-station calibration probability (Bernoulli).  The high-end and
-- multi-pass units are well calibrated; the low-cost and drift-prone
-- ones less so; the reference station is always in-spec.
CREATE TABLE calibration_status (
  station_id  TEXT PRIMARY KEY REFERENCES stations(id),
  p           DOUBLE PRECISION NOT NULL
);

INSERT INTO calibration_status VALUES
  ('s1', 0.95),
  ('s2', 0.70),
  ('s3', 0.60),
  ('s4', 1.00);

SELECT add_provenance('calibration_status');

-- Regulatory categories (deterministic, no provenance).
CREATE TABLE categories (
  name  TEXT PRIMARY KEY,
  lo    DOUBLE PRECISION NOT NULL,
  hi    DOUBLE PRECISION NOT NULL
);

INSERT INTO categories VALUES
  ('Good',       0.0,    12.0),
  ('Moderate',   12.1,   35.0),
  ('Unhealthy',  35.1,   1000.0);

-- Today's readings.  Five rows of pm25, one per (station, sample),
-- exercising every distribution family plus the implicit numeric cast.
CREATE TABLE readings (
  id          INTEGER PRIMARY KEY,
  station_id  TEXT NOT NULL REFERENCES stations(id),
  ts          TIMESTAMP NOT NULL,
  pm25        provsql.random_variable NOT NULL
);

INSERT INTO readings (id, station_id, ts, pm25) VALUES
  (1, 's1', '2026-05-12 08:00', provsql.normal(28.0, 2.0)),       -- high-end Gaussian
  (2, 's2', '2026-05-12 08:00', provsql.uniform(10.0, 22.0)),     -- low-cost uniform window
  (3, 's3', '2026-05-12 08:00', provsql.exponential(0.04)),       -- drift-prone, mean = 25
  (4, 's4', '2026-05-12 08:00', 15.0),                            -- reference (implicit cast)
  (5, 's1', '2026-05-12 09:00', provsql.normal(40.0, 4.0)),       -- high-end, into Unhealthy
  (6, 's2', '2026-05-12 09:00', provsql.uniform(12.0, 24.0)),     -- low-cost, into Moderate
  (7, 's3', '2026-05-12 09:00', provsql.erlang(3, 0.1)),          -- multi-pass Erlang, mean = 30
  (8, 's4', '2026-05-12 09:00', 16.5);                            -- reference

SELECT add_provenance('readings');

-- Yesterday's batch, used for the UNION ALL step.  Same shape, slightly
-- different distributions (a heatwave bumped the means).
CREATE TABLE historical_readings (
  id          INTEGER PRIMARY KEY,
  station_id  TEXT NOT NULL REFERENCES stations(id),
  ts          TIMESTAMP NOT NULL,
  pm25        provsql.random_variable NOT NULL
);

INSERT INTO historical_readings (id, station_id, ts, pm25) VALUES
  (1, 's1', '2026-05-11 08:00', provsql.normal(34.0, 2.5)),
  (2, 's2', '2026-05-11 08:00', provsql.uniform(15.0, 28.0)),
  (3, 's3', '2026-05-11 08:00', provsql.exponential(0.03)),
  (4, 's4', '2026-05-11 08:00', 18.0),
  (5, 's1', '2026-05-11 09:00', provsql.normal(42.0, 3.0)),
  (6, 's2', '2026-05-11 09:00', provsql.uniform(20.0, 35.0)),
  (7, 's3', '2026-05-11 09:00', provsql.erlang(3, 0.08)),
  (8, 's4', '2026-05-11 09:00', 19.5);

SELECT add_provenance('historical_readings');

-- A provenance mapping so the Studio eval-strip's :sqlfunc:`sr_formula`
-- and PROV-XML export can label leaves with station names rather than
-- raw UUIDs.
DROP TABLE IF EXISTS station_mapping;
CREATE TABLE station_mapping AS
  SELECT s.name AS value, r.provsql AS provenance
  FROM readings r JOIN stations s ON s.id = r.station_id;
SELECT remove_provenance('station_mapping');
CREATE INDEX ON station_mapping(provenance);

\echo ''
\echo 'Loaded the air-quality sensor fixture.  Open Studio against this'
\echo 'database, then walk through the case study:'
\echo ''
\echo '  doc/source/user/casestudy6.rst -- The City Air-Quality Sensor Network'
\echo ''
\echo 'Starter queries (paste into the Studio query box):'
\echo ''
\echo '  -- Step 1: inspect a noisy reading'
\echo '  SELECT id, ts, pm25, provsql FROM readings'
\echo '   WHERE station_id = ''s1'' ORDER BY ts;'
\echo ''
\echo '  -- Step 2: probabilistic threshold (Unhealthy)'
\echo '  SELECT id, station_id, ts, provsql FROM readings'
\echo '   WHERE pm25 > 35;'
\echo ''
\echo '  -- Step 3: per-district aggregates'
\echo '  SELECT s.district, avg(r.pm25) AS avg_pm25, sum(r.pm25) AS total_pm25, provsql'
\echo '    FROM readings r JOIN stations s ON s.id = r.station_id'
\echo '   GROUP BY s.district;'
\echo ''