\set ECHO none
\pset format unaligned
SET search_path TO provsql_test,provsql;

CREATE TYPE study_quality AS ENUM ('no_evidence', 'case_report', 'observational', 'rct', 'meta_analysis', 'perfect_evidence');

CREATE TABLE study (
    id integer PRIMARY KEY,
    title text NOT NULL,
    year integer,
    study_type study_quality NOT NULL,
    reliability double precision NOT NULL
);
CREATE TABLE exposure (id integer PRIMARY KEY, name text NOT NULL);
CREATE TABLE outcome  (id integer PRIMARY KEY, name text NOT NULL);
CREATE TABLE finding (
    id integer PRIMARY KEY,
    study_id integer NOT NULL,
    exposure_id integer NOT NULL,
    outcome_id integer NOT NULL,
    effect text NOT NULL,
    effect_size double precision
);

COPY study (id, title, year, study_type, reliability) FROM stdin;
1	Smith2018	2018	rct	0.92
2	Johnson2020	2020	meta_analysis	0.98
3	Chen2019	2019	observational	0.76
4	Williams2021	2021	rct	0.88
5	Garcia2017	2017	observational	0.65
6	Brown2022	2022	meta_analysis	0.95
7	Martinez2020	2020	case_report	0.45
8	Park2021	2021	rct	0.85
\.

COPY exposure (id, name) FROM stdin;
1	Coffee
2	Exercise
3	Red Meat
4	Aspirin
5	Omega-3
6	Alcohol
7	Processed Food
\.

COPY outcome (id, name) FROM stdin;
1	Cardiovascular Disease
2	Type 2 Diabetes
3	Inflammation
4	Colorectal Cancer
5	Cognitive Decline
\.

-- 25 findings; contradictions on Coffee→CVD and Alcohol→CVD;
-- single-study claims on Aspirin→Cognitive Decline, Omega-3→CVD, Exercise→Inflammation.
COPY finding (id, study_id, exposure_id, outcome_id, effect, effect_size) FROM stdin;
1	5	1	1	harmful	1.3
2	6	1	1	beneficial	0.85
3	3	1	1	neutral	1.0
4	1	2	1	beneficial	0.60
5	2	2	1	beneficial	0.55
6	4	2	1	beneficial	0.65
7	2	3	4	harmful	1.50
8	3	3	4	harmful	1.40
9	1	4	1	beneficial	0.75
10	4	4	1	beneficial	0.78
11	6	5	3	beneficial	0.70
12	8	5	3	beneficial	0.72
13	5	6	1	beneficial	0.90
14	7	6	1	harmful	1.20
15	2	7	2	harmful	1.80
16	3	7	2	harmful	1.60
17	6	1	5	beneficial	0.80
18	8	1	5	beneficial	0.75
19	3	3	3	harmful	1.30
20	7	3	3	harmful	1.25
21	4	6	2	harmful	1.40
22	5	6	2	harmful	1.35
23	8	4	5	beneficial	0.60
24	6	5	1	beneficial	0.80
25	1	2	3	beneficial	0.70
\.

-- Step 2: Enable provenance on finding, then join with lookup tables;
-- ProvSQL propagates the finding token through the join into f.
SELECT add_provenance('finding');

CREATE VIEW f AS
  SELECT study.title    AS study,
         study.study_type,
         study.reliability,
         exposure.name  AS exposure,
         outcome.name   AS outcome,
         finding.effect
  FROM finding
    JOIN study    ON finding.study_id    = study.id
    JOIN exposure ON finding.exposure_id = exposure.id
    JOIN outcome  ON finding.outcome_id  = outcome.id;

-- Step 3: Create provenance mapping on f (tokens trace back to finding rows)
SELECT create_provenance_mapping('study_mapping', 'f', 'study');

-- Step 4: sr_formula for three single-study findings
CREATE TABLE result_single AS
SELECT exposure, outcome, effect,
    sr_formula(provenance(), 'study_mapping') AS formula
FROM f
WHERE (exposure = 'Coffee'   AND outcome = 'Cardiovascular Disease' AND effect = 'harmful')
   OR (exposure = 'Alcohol'  AND outcome = 'Cardiovascular Disease' AND effect = 'beneficial')
   OR (exposure = 'Exercise' AND outcome = 'Inflammation'           AND effect = 'beneficial')
GROUP BY exposure, outcome, effect
ORDER BY exposure, outcome, effect;

SELECT remove_provenance('result_single');
SELECT * FROM result_single ORDER BY exposure, outcome, effect;
DROP TABLE result_single;

-- Step 5: Why-provenance for two replicated multi-study findings
CREATE TABLE result_why AS
SELECT exposure, outcome, effect,
    sr_why(provenance(), 'study_mapping') AS witnesses
FROM f
WHERE (exposure = 'Exercise' AND outcome = 'Cardiovascular Disease')
   OR (exposure = 'Aspirin'  AND outcome = 'Cardiovascular Disease')
GROUP BY exposure, outcome, effect
ORDER BY exposure, outcome, effect;

SELECT remove_provenance('result_why');
SELECT * FROM result_why ORDER BY exposure, outcome, effect;
DROP TABLE result_why;

-- Step 6: Evidence grade semiring — best study type supporting a finding.
CREATE FUNCTION quality_plus_state(state study_quality, q study_quality)
  RETURNS study_quality AS $$
    SELECT GREATEST(state, q)
$$ LANGUAGE SQL IMMUTABLE;

CREATE FUNCTION quality_times_state(state study_quality, q study_quality)
  RETURNS study_quality AS $$
    SELECT LEAST(state, q)
$$ LANGUAGE SQL IMMUTABLE;

CREATE AGGREGATE quality_plus(study_quality) (
  sfunc = quality_plus_state, stype = study_quality, initcond = 'no_evidence'
);
CREATE AGGREGATE quality_times(study_quality) (
  sfunc = quality_times_state, stype = study_quality, initcond = 'perfect_evidence'
);

CREATE FUNCTION evidence_grade(token UUID, token2value regclass)
  RETURNS study_quality AS $$
BEGIN
  RETURN provenance_evaluate(
    token, token2value,
    'perfect_evidence'::study_quality,
    'quality_plus', 'quality_times'
  );
END
$$ LANGUAGE plpgsql;

SELECT create_provenance_mapping('quality_mapping', 'f', 'study_type');

CREATE TABLE result_grade AS
SELECT exposure, outcome, effect,
    evidence_grade(provenance(), 'quality_mapping') AS grade
FROM f
GROUP BY exposure, outcome, effect
ORDER BY exposure, outcome, effect;

SELECT remove_provenance('result_grade');
SELECT * FROM result_grade ORDER BY exposure, outcome, effect;
DROP TABLE result_grade;

-- Step 7: Where-provenance on f — only finding columns are tracked
SET provsql.where_provenance = on;

CREATE TABLE result_where_f AS
SELECT study, study_type, exposure, outcome, effect,
    regexp_replace(
      where_provenance(provenance()), ':[0-9a-f-]*:', '::', 'g') AS source
FROM f
WHERE exposure = 'Exercise' AND outcome = 'Cardiovascular Disease'
  AND study = 'Smith2018';

SELECT remove_provenance('result_where_f');
SELECT study, study_type, exposure, outcome, effect, source FROM result_where_f;
DROP TABLE result_where_f;

SET provsql.where_provenance = off;

-- Step 8: Where-provenance on the base table — all finding columns tracked
SET provsql.where_provenance = on;

CREATE TABLE result_where AS
SELECT finding.study_id, finding.exposure_id, finding.outcome_id, finding.effect,
    regexp_replace(
      where_provenance(provenance()), ':[0-9a-f-]*:', '::', 'g') AS source
FROM finding
JOIN study    ON finding.study_id    = study.id    AND study.title    = 'Smith2018'
JOIN exposure ON finding.exposure_id = exposure.id AND exposure.name  = 'Exercise'
JOIN outcome  ON finding.outcome_id  = outcome.id  AND outcome.name   = 'Cardiovascular Disease'
WHERE finding.effect = 'beneficial';

SELECT remove_provenance('result_where');
SELECT study_id, exposure_id, outcome_id, effect, source FROM result_where;
DROP TABLE result_where;

SET provsql.where_provenance = off;

-- Step 9: Assign probabilities from reliability scores
DO $$ BEGIN
  PERFORM set_prob(provenance(), reliability) FROM f;
END $$;

-- Probability that at least one study reports a finding (no replication requirement)
CREATE TABLE result_prob AS
SELECT exposure, outcome, effect,
    ROUND(probability_evaluate(provenance())::numeric, 4) AS prob
FROM f
WHERE (exposure = 'Exercise' AND outcome = 'Cardiovascular Disease' AND effect = 'beneficial')
   OR (exposure = 'Coffee'   AND outcome = 'Cardiovascular Disease' AND effect = 'harmful')
   OR (exposure = 'Aspirin'  AND outcome = 'Cognitive Decline'      AND effect = 'beneficial')
GROUP BY exposure, outcome, effect
ORDER BY exposure, outcome, effect;

SELECT remove_provenance('result_prob');
SELECT * FROM result_prob ORDER BY exposure, outcome, effect;
DROP TABLE result_prob;

-- Step 10: Build f_replicated view with HAVING replication threshold
CREATE VIEW f_replicated AS
SELECT exposure, outcome, effect FROM f
GROUP BY exposure, outcome, effect
HAVING COUNT(*) >= 2;

-- Step 11: sr_counting to inspect zero vs. non-zero provenance
ALTER TABLE finding ADD COLUMN cnt int DEFAULT 1;
SELECT create_provenance_mapping('count_mapping', 'finding', 'cnt');

CREATE TABLE result_replicated AS
SELECT exposure, outcome, effect,
    sr_counting(provenance(), 'count_mapping') AS replicated
FROM f_replicated
ORDER BY exposure, outcome, effect;

SELECT remove_provenance('result_replicated');
SELECT * FROM result_replicated ORDER BY exposure, outcome, effect;
DROP TABLE result_replicated;

-- Step 12: Probability of replication (single-study findings → 0 via f_replicated)
CREATE TABLE result_replication AS
SELECT exposure, outcome, effect,
    ROUND(probability_evaluate(provenance())::numeric, 4) AS prob
FROM f_replicated
ORDER BY exposure, outcome, effect;

SELECT remove_provenance('result_replication');
SELECT * FROM result_replication ORDER BY exposure, outcome, effect;
DROP TABLE result_replication;

-- Step 13: Shapley values — expected contribution to replication probability
CREATE TABLE result_shapley AS
SELECT fin.study,
       shapley(target.prov, fin.prov) AS sv
FROM (
  SELECT provenance() AS prov
  FROM f_replicated
  WHERE exposure = 'Exercise' AND outcome = 'Cardiovascular Disease'
    AND effect = 'beneficial'
) target,
(
  SELECT study, provenance() AS prov
  FROM f
  WHERE exposure = 'Exercise' AND outcome = 'Cardiovascular Disease'
    AND effect = 'beneficial'
) fin;

SELECT remove_provenance('result_shapley');
SELECT study, ROUND(sv::numeric, 4) AS sv
FROM result_shapley ORDER BY sv DESC, study;
DROP TABLE result_shapley;

-- Step 14: Banzhaf values — alternative game-theoretic contribution measure
CREATE TABLE result_banzhaf AS
SELECT fin.study,
       banzhaf(target.prov, fin.prov) AS bv
FROM (
  SELECT provenance() AS prov
  FROM f_replicated
  WHERE exposure = 'Exercise' AND outcome = 'Cardiovascular Disease'
    AND effect = 'beneficial'
) target,
(
  SELECT study, provenance() AS prov
  FROM f
  WHERE exposure = 'Exercise' AND outcome = 'Cardiovascular Disease'
    AND effect = 'beneficial'
) fin;

SELECT remove_provenance('result_banzhaf');
SELECT study, ROUND(bv::numeric, 4) AS bv
FROM result_banzhaf ORDER BY bv DESC, study;
DROP TABLE result_banzhaf;

-- Clean up
DROP VIEW f_replicated;
DROP VIEW f;
DROP TABLE finding;
DROP TABLE outcome;
DROP TABLE exposure;
DROP TABLE study;
DROP TABLE quality_mapping;
DROP TABLE count_mapping;
DROP TABLE study_mapping;
DROP AGGREGATE quality_plus(study_quality);
DROP AGGREGATE quality_times(study_quality);
DROP FUNCTION evidence_grade(UUID, regclass);
DROP FUNCTION quality_plus_state(study_quality, study_quality);
DROP FUNCTION quality_times_state(study_quality, study_quality);
DROP TYPE study_quality;