/** * @file * @brief ProvSQL PL/pgSQL extension code * * This file contains the PL/pgSQL code of the ProvSQL extension. This * extension requires the standard uuid-ossp extension. */ /** * @brief provsql schema * * All types and functions introduced by ProvSQL are defined in the * provsql schema, requiring prefixing them by provsql. or * using PostgreSQL's search_path variable with a command such * as \code{.sql}SET search_path TO public, provsql;\endcode */ CREATE SCHEMA provsql; SET search_path TO provsql; /** * @brief Provenance circuit gate types * * Each gate in the provenance circuit has a type that determines * its semantics during semiring evaluation. */ CREATE TYPE provenance_gate AS ENUM( 'input', -- Input (variable) gate of the circuit 'plus', -- Semiring plus 'times', -- Semiring times 'monus', -- M-Semiring monus 'project', -- Project gate (for where provenance) 'zero', -- Semiring zero 'one', -- Semiring one 'eq', -- Equijoin gate (for where provenance) 'agg', -- Aggregation operator (for aggregate provenance) 'semimod', -- Semimodule scalar multiplication (for aggregate provenance) 'cmp', -- Comparison of aggregate values (HAVING-clause provenance) 'delta', -- δ-semiring operator (see Amsterdamer, Deutch, Tannen, PODS 2011) 'value', -- Scalar value (for aggregate provenance) 'mulinput',-- Multivalued input (for Boolean provenance) 'update', -- Update operation 'rv', -- Continuous random-variable leaf 'arith', -- n-ary arithmetic gate over scalar-valued children 'mixture' -- Probabilistic mixture of two scalar RV roots with a Bernoulli weight ); /** @defgroup gate_manipulation Circuit gate manipulation * Low-level functions for creating and querying provenance circuit gates. * @{ */ /** * @brief Create a new gate in the provenance circuit * * @param token UUID identifying the new gate * @param type gate type (see provenance_gate) * @param children optional array of child gate UUIDs */ CREATE OR REPLACE FUNCTION create_gate( token UUID, type provenance_gate, children uuid[] DEFAULT NULL) RETURNS void AS 'provsql','create_gate' LANGUAGE C PARALLEL SAFE; /** * @brief Return the gate type of a provenance token * * Returns @c 'input' for any token not yet materialized in the circuit, * since input is the default semantics of an unmaterialized provenance token. */ CREATE OR REPLACE FUNCTION get_gate_type( token UUID) RETURNS provenance_gate AS 'provsql','get_gate_type' LANGUAGE C IMMUTABLE PARALLEL SAFE; /** @brief Return the children of a provenance gate */ CREATE OR REPLACE FUNCTION get_children( token UUID) RETURNS uuid[] AS 'provsql','get_children' LANGUAGE C IMMUTABLE PARALLEL SAFE; /** * @brief Set the probability of an input gate * * @param token UUID of the input gate * @param p probability value in [0,1] */ CREATE OR REPLACE FUNCTION set_prob( token UUID, p DOUBLE PRECISION) RETURNS void AS 'provsql','set_prob' LANGUAGE C PARALLEL SAFE; /** @brief Get the probability associated with an input gate */ CREATE OR REPLACE FUNCTION get_prob( token UUID) RETURNS DOUBLE PRECISION AS 'provsql','get_prob' LANGUAGE C STABLE PARALLEL SAFE; /** * @brief Set additional integer values on provenance circuit gate * * This function sets two integer values associated to a circuit gate, used in * different ways by different gate types: * - for mulinput, info1 indicates the value of this multivalued variable * - for eq, info1 and info2 indicate the attribute index of the equijoin in, respectively, the first and second columns * - for agg, info1 is the oid of the aggregate function and info2 the oid of the aggregate result type * - for cmp, info1 is the oid of the comparison operator * * @param token UUID of the circuit gate * @param info1 first integer value * @param info2 second integer value */ CREATE OR REPLACE FUNCTION set_infos( token UUID, info1 INT, info2 INT DEFAULT NULL) RETURNS void AS 'provsql','set_infos' LANGUAGE C PARALLEL SAFE; /** @brief Get the integer info values associated with a circuit gate */ CREATE OR REPLACE FUNCTION get_infos( token UUID, OUT info1 INT, OUT info2 INT) RETURNS record AS 'provsql','get_infos' LANGUAGE C STABLE PARALLEL SAFE; /** * @brief Set extra text information on provenance circuit gate * * This function sets text-encoded data associated to a circuit gate, used in * different ways by different gate types: * - for project, it is a text-encoded ARRAY of two-element ARRAYs that * indicate mappings between input attribute (first element) and output * attribute (second element) * - for value and agg, it is the text-encoded (base for value, computed * for agg) scalar value * * @param token UUID of the circuit gate * @param data text-encoded information */ CREATE OR REPLACE FUNCTION set_extra( token UUID, data TEXT) RETURNS void AS 'provsql','set_extra' LANGUAGE C PARALLEL SAFE STRICT; /** @brief Get the text-encoded extra data associated with a circuit gate */ CREATE OR REPLACE FUNCTION get_extra(token UUID) RETURNS TEXT AS 'provsql','get_extra' LANGUAGE C STABLE PARALLEL SAFE RETURNS NULL ON NULL INPUT; /** * @brief Return the total number of materialized gates in the provenance circuit * * Input gates for provenance-tracked table rows are created lazily on * first reference; rows that have never appeared in a query result are * not counted. */ CREATE OR REPLACE FUNCTION get_nb_gates() RETURNS BIGINT AS 'provsql', 'get_nb_gates' LANGUAGE C PARALLEL SAFE; /** @} */ /** @defgroup table_management Provenance table management * Functions for enabling, disabling, and configuring provenance * tracking on user tables. * @{ */ /** * @brief Trigger function for DELETE statement provenance tracking * * Records the deletion and applies monus to provenance tokens of * deleted rows. This is the version for PostgreSQL < 14. */ CREATE OR REPLACE FUNCTION delete_statement_trigger() RETURNS TRIGGER AS $$ DECLARE query_text TEXT; delete_token UUID; old_token UUID; new_token UUID; r RECORD; BEGIN delete_token := public.uuid_generate_v4(); PERFORM create_gate(delete_token, 'input'); SELECT query INTO query_text FROM pg_stat_activity WHERE pid = pg_backend_pid(); INSERT INTO delete_provenance (delete_token, query, deleted_by, deleted_at) VALUES (delete_token, query_text, current_user, CURRENT_TIMESTAMP); EXECUTE format('INSERT INTO %I.%I SELECT * FROM OLD_TABLE;', TG_TABLE_SCHEMA, TG_TABLE_NAME); FOR r IN (SELECT * FROM OLD_TABLE) LOOP old_token := r.provsql; new_token := provenance_monus(old_token, delete_token); EXECUTE format('UPDATE %I.%I SET provsql = $1 WHERE provsql = $2;', TG_TABLE_SCHEMA, TG_TABLE_NAME) USING new_token, old_token; END LOOP; RETURN NULL; END $$ LANGUAGE plpgsql SET search_path=provsql,pg_temp SECURITY DEFINER; /** * @brief Enable provenance tracking on an existing table * * Adds a provsql UUID column to the table. Input gates for * existing rows are created lazily when first referenced by a query. * * @param _tbl the table to add provenance tracking to */ CREATE OR REPLACE FUNCTION add_provenance(_tbl regclass) RETURNS void AS $$ BEGIN EXECUTE format('ALTER TABLE %s ADD COLUMN provsql UUID UNIQUE DEFAULT public.uuid_generate_v4()', _tbl); END $$ LANGUAGE plpgsql SECURITY DEFINER; /** * @brief Remove provenance tracking from a table * * Drops the provsql column and associated triggers. * * @param _tbl the table to remove provenance tracking from */ CREATE OR REPLACE FUNCTION remove_provenance(_tbl regclass) RETURNS void AS $$ DECLARE BEGIN EXECUTE format('ALTER TABLE %s DROP COLUMN provsql', _tbl); BEGIN EXECUTE format('DROP TRIGGER add_gate on %s', _tbl); EXCEPTION WHEN undefined_object THEN END; BEGIN EXECUTE format('DROP TRIGGER insert_statement on %s', _tbl); EXECUTE format('DROP TRIGGER update_statement on %s', _tbl); EXECUTE format('DROP TRIGGER delete_statement on %s', _tbl); EXCEPTION WHEN undefined_object THEN END; END $$ LANGUAGE plpgsql; /** * @brief Set up provenance for a table with duplicate key values * * When a table has duplicate rows for a given key, this function * replaces simple input gates with multivalued input (mulinput) gates * that model a uniform distribution over duplicates. * * @param _tbl the table to repair * @param key_att the key attribute(s) as a comma-separated string, or * empty string if the whole table is one group */ CREATE OR REPLACE FUNCTION repair_key(_tbl regclass, key_att text) RETURNS void AS $$ DECLARE key RECORD; key_token uuid; token uuid; record RECORD; nb_rows INTEGER; ind INTEGER; select_key_att TEXT; where_condition TEXT; BEGIN IF key_att = '' THEN key_att := '()'; select_key_att := '1'; ELSE select_key_att := key_att; END IF; EXECUTE format('ALTER TABLE %s ADD COLUMN provsql_temp UUID UNIQUE DEFAULT public.uuid_generate_v4()', _tbl); FOR key IN EXECUTE format('SELECT %s AS key FROM %s GROUP BY %s', select_key_att, _tbl, key_att) LOOP IF key_att = '()' THEN where_condition := ''; ELSE where_condition := format('WHERE %s = %L', key_att, key.key); END IF; EXECUTE format('SELECT COUNT(*) FROM %s %s', _tbl, where_condition) INTO nb_rows; key_token := public.uuid_generate_v4(); PERFORM provsql.create_gate(key_token, 'input'); ind := 1; FOR record IN EXECUTE format('SELECT provsql_temp FROM %s %s', _tbl, where_condition) LOOP token:=record.provsql_temp; PERFORM provsql.create_gate(token, 'mulinput', ARRAY[key_token]); PERFORM provsql.set_prob(token, 1./nb_rows); PERFORM provsql.set_infos(token, ind); ind := ind + 1; END LOOP; END LOOP; EXECUTE format('ALTER TABLE %s RENAME COLUMN provsql_temp TO provsql', _tbl); END $$ LANGUAGE plpgsql; /** * @brief Create a provenance mapping table from an attribute * * Creates a new table mapping provenance tokens to values of a given * attribute, for use with semiring evaluation functions. * * @param newtbl name of the mapping table to create * @param oldtbl source table with provenance tracking * @param att attribute whose values populate the mapping * @param preserve_case if true, quote the table name to preserve case */ CREATE OR REPLACE FUNCTION create_provenance_mapping( newtbl text, oldtbl regclass, att text, preserve_case bool DEFAULT 'f' ) RETURNS void AS $$ DECLARE BEGIN EXECUTE format('CREATE TEMP TABLE tmp_provsql ON COMMIT DROP AS TABLE %s', oldtbl); ALTER TABLE tmp_provsql RENAME provsql TO provenance; IF preserve_case THEN EXECUTE format('CREATE TABLE %I AS SELECT %s AS value, provenance FROM tmp_provsql', newtbl, att); EXECUTE format('CREATE INDEX ON %I(provenance)', newtbl); ELSE EXECUTE format('CREATE TABLE %s AS SELECT %s AS value, provenance FROM tmp_provsql', newtbl, att); EXECUTE format('CREATE INDEX ON %s(provenance)', newtbl); END IF; END $$ LANGUAGE plpgsql; /** * @brief Create a view mapping provenance tokens to attribute values * * Like create_provenance_mapping but creates a view instead of a table, * so it always reflects the current state of the source table. * * @param newview name of the view to create * @param oldtbl source table with provenance tracking * @param att attribute whose values populate the mapping * @param preserve_case if true, quote the view name to preserve case */ CREATE OR REPLACE FUNCTION create_provenance_mapping_view( newview text, oldtbl regclass, att text, preserve_case bool DEFAULT false ) RETURNS void LANGUAGE plpgsql AS $$ BEGIN IF preserve_case THEN EXECUTE format( 'CREATE OR REPLACE VIEW %I AS SELECT %s AS value, provsql AS provenance FROM %s', newview, att, oldtbl ); ELSE EXECUTE format( 'CREATE OR REPLACE VIEW %s AS SELECT %s AS value, provsql AS provenance FROM %s', newview, att, oldtbl ); END IF; END; $$; /** @} */ /** @defgroup internal_constants Internal constants * UUID namespace and identity element functions used for * deterministic gate generation. * @{ */ /** @brief Return the ProvSQL UUID namespace (used for deterministic gate UUIDs) */ CREATE OR REPLACE FUNCTION uuid_ns_provsql() RETURNS uuid AS $$ -- uuid_generate_v5(uuid_ns_url(),'http://pierre.senellart.com/software/provsql/') SELECT '920d4f02-8718-5319-9532-d4ab83a64489'::uuid $$ LANGUAGE SQL IMMUTABLE PARALLEL SAFE; /** @brief Return the UUID of the semiring zero gate */ CREATE OR REPLACE FUNCTION gate_zero() RETURNS uuid AS $$ SELECT public.uuid_generate_v5(provsql.uuid_ns_provsql(),'zero'); $$ LANGUAGE SQL IMMUTABLE PARALLEL SAFE; /** @brief Return the UUID of the semiring one gate */ CREATE OR REPLACE FUNCTION gate_one() RETURNS uuid AS $$ SELECT public.uuid_generate_v5(provsql.uuid_ns_provsql(),'one'); $$ LANGUAGE SQL IMMUTABLE PARALLEL SAFE; /** @brief Return the epsilon threshold used for probability comparisons */ CREATE OR REPLACE FUNCTION epsilon() RETURNS DOUBLE PRECISION AS $$ SELECT CAST(0.001 AS DOUBLE PRECISION) $$ LANGUAGE SQL IMMUTABLE PARALLEL SAFE; /** @} */ /** @defgroup semiring_operations Semiring operations * Functions that build provenance circuit gates for semiring operations. * These are called internally by the query rewriter. * @{ */ /** * @brief Create a times (product) gate from multiple provenance tokens * * Filters out NULL and one-gates; returns gate_one() if all tokens * are trivial, or a single token if only one remains. */ CREATE OR REPLACE FUNCTION provenance_times(VARIADIC tokens uuid[]) RETURNS UUID AS $$ DECLARE times_token uuid; filtered_tokens uuid[]; BEGIN SELECT array_agg(t) FROM unnest(tokens) t WHERE t IS NOT NULL AND t <> gate_one() INTO filtered_tokens; -- Dispatch on the FILTERED count: a single survivor short-circuits -- to that token directly (no useless single-child times gate); zero -- survivors collapse to the identity. Using array_length(tokens, 1) -- here would miss the [one, cmp] → [cmp] case, leaving the cmp wrapped -- in a one-child times when its only sibling was gate_one(). CASE coalesce(array_length(filtered_tokens, 1), 0) WHEN 0 THEN times_token:=gate_one(); WHEN 1 THEN times_token:=filtered_tokens[1]; ELSE times_token := uuid_generate_v5(uuid_ns_provsql(),concat('times',filtered_tokens)); PERFORM create_gate(times_token, 'times', ARRAY_AGG(t)) FROM UNNEST(filtered_tokens) AS t WHERE t IS NOT NULL; END CASE; RETURN times_token; END $$ LANGUAGE plpgsql SET search_path=provsql,pg_temp,public SECURITY DEFINER PARALLEL SAFE; /** * @brief Create a monus (difference) gate from two provenance tokens * * Implements m-semiring monus. Returns token1 if token2 is NULL * (used for LEFT OUTER JOIN semantics in the EXCEPT rewriting). */ CREATE OR REPLACE FUNCTION provenance_monus(token1 UUID, token2 UUID) RETURNS UUID AS $$ DECLARE monus_token uuid; BEGIN IF token1 IS NULL THEN RAISE EXCEPTION USING MESSAGE='provenance_monus is called with first argument NULL'; END IF; IF token2 IS NULL THEN -- Special semantics, because of a LEFT OUTER JOIN used by the -- difference operator: token2 NULL means there is no second argument RETURN token1; END IF; IF token1 = token2 THEN -- X-X=0 monus_token:=gate_zero(); ELSIF token1 = gate_zero() THEN -- 0-X=0 monus_token:=gate_zero(); ELSIF token2 = gate_zero() THEN -- X-0=X monus_token:=token1; ELSE monus_token:=uuid_generate_v5(uuid_ns_provsql(),concat('monus',token1,token2)); PERFORM create_gate(monus_token, 'monus', ARRAY[token1::uuid, token2::uuid]); END IF; RETURN monus_token; END $$ LANGUAGE plpgsql SET search_path=provsql,pg_temp,public SECURITY DEFINER PARALLEL SAFE IMMUTABLE; /** * @brief Create a project gate for where-provenance tracking * * Records the mapping between input and output attribute positions. * * @param token child provenance token * @param positions array encoding attribute position mappings */ CREATE OR REPLACE FUNCTION provenance_project(token UUID, VARIADIC positions int[]) RETURNS UUID AS $$ DECLARE project_token uuid; rec record; BEGIN project_token:=uuid_generate_v5(uuid_ns_provsql(),concat('project', token, positions)); PERFORM create_gate(project_token, 'project', ARRAY[token]); PERFORM set_extra(project_token, ARRAY_AGG(pair)::text) FROM ( SELECT ARRAY[(CASE WHEN info=0 THEN NULL ELSE info END), idx] AS pair FROM unnest(positions) WITH ORDINALITY AS a(info, idx) ORDER BY idx ) t; RETURN project_token; END $$ LANGUAGE plpgsql SET search_path=provsql,pg_temp,public SECURITY DEFINER PARALLEL SAFE IMMUTABLE; /** * @brief Create an equijoin gate for where-provenance tracking * * @param token child provenance token * @param pos1 attribute index in the first relation * @param pos2 attribute index in the second relation */ CREATE OR REPLACE FUNCTION provenance_eq(token UUID, pos1 int, pos2 int) RETURNS UUID AS $$ DECLARE eq_token uuid; rec record; BEGIN eq_token:=uuid_generate_v5(uuid_ns_provsql(),concat('eq',token,pos1,',',pos2)); PERFORM create_gate(eq_token, 'eq', ARRAY[token::uuid]); PERFORM set_infos(eq_token, pos1, pos2); RETURN eq_token; END $$ LANGUAGE plpgsql SET search_path=provsql,pg_temp,public SECURITY DEFINER PARALLEL SAFE IMMUTABLE; /** * @brief Create a plus (sum) gate from an array of provenance tokens * * Filters out NULL and zero-gates; returns gate_zero() if all tokens * are trivial, or a single token if only one remains. */ CREATE OR REPLACE FUNCTION provenance_plus(tokens uuid[]) RETURNS UUID AS $$ DECLARE c INTEGER; plus_token uuid; filtered_tokens uuid[]; BEGIN SELECT array_agg(t) FROM unnest(tokens) t WHERE t IS NOT NULL AND t <> gate_zero() INTO filtered_tokens; c:=array_length(filtered_tokens, 1); IF c = 0 THEN plus_token := gate_zero(); ELSIF c = 1 THEN plus_token := filtered_tokens[1]; ELSE plus_token := uuid_generate_v5( uuid_ns_provsql(), concat('plus', filtered_tokens)); PERFORM create_gate(plus_token, 'plus', filtered_tokens); END IF; RETURN plus_token; END $$ LANGUAGE plpgsql STRICT SET search_path=provsql,pg_temp,public SECURITY DEFINER PARALLEL SAFE IMMUTABLE; /** * @brief Create a comparison gate for HAVING clause provenance * * @param left_token provenance token for the left operand * @param comparison_op OID of the comparison operator * @param right_token provenance token for the right operand */ CREATE OR REPLACE FUNCTION provenance_cmp( left_token UUID, comparison_op OID, right_token UUID ) RETURNS UUID AS $$ DECLARE cmp_token UUID; BEGIN -- deterministic v5 namespace id cmp_token := public.uuid_generate_v5( uuid_ns_provsql(), concat('cmp', left_token::text, comparison_op::text, right_token::text) ); -- wire it up in the circuit PERFORM create_gate(cmp_token, 'cmp', ARRAY[left_token, right_token]); PERFORM set_infos(cmp_token, comparison_op::integer); RETURN cmp_token; END $$ LANGUAGE plpgsql SET search_path=provsql,pg_temp,public SECURITY DEFINER IMMUTABLE PARALLEL SAFE STRICT; /** * @brief Create an arithmetic gate over scalar-valued provenance children * * Builds a deterministic @c gate_arith from an operator tag and an * ordered list of children. The tag is one of the @c provsql_arith_op * enum values declared in @c src/provsql_utils.h * (@c PLUS=0, @c TIMES=1, @c MINUS=2, @c DIV=3, @c NEG=4) and is * stored in the gate's @c info1 field. Children must be UUIDs of * scalar-producing gates (@c gate_rv, @c gate_value, or another * @c gate_arith). The token UUID is derived deterministically from * @p op and @p children so identical sub-expressions share their gate. * * @param op Operator tag (@c provsql_arith_op). * @param children Ordered list of child gate UUIDs. * @return UUID of the (possibly pre-existing) @c gate_arith. */ CREATE OR REPLACE FUNCTION provenance_arith( op INTEGER, children UUID[] ) RETURNS UUID AS $$ DECLARE arith_token UUID; BEGIN arith_token := public.uuid_generate_v5( uuid_ns_provsql(), concat('arith', op::text, children::text) ); PERFORM create_gate(arith_token, 'arith', children); PERFORM set_infos(arith_token, op); RETURN arith_token; END $$ LANGUAGE plpgsql SET search_path=provsql,pg_temp,public SECURITY DEFINER IMMUTABLE PARALLEL SAFE STRICT; /** @} */ /** @defgroup semiring_evaluation Semiring evaluation * Functions for evaluating provenance circuits over semirings, * both user-defined (via function references) and compiled (built-in). * @{ */ /** * @brief Evaluate provenance using a compiled (built-in) semiring * * This C function handles semiring evaluation entirely in C++ for * better performance. The semiring is specified by name. * * @param token provenance token to evaluate * @param token2value mapping table from tokens to semiring values * @param semiring name of the compiled semiring (e.g., "formula", "counting") * @param element_one identity element of the semiring */ CREATE OR REPLACE FUNCTION provenance_evaluate_compiled( token UUID, token2value regclass, semiring TEXT, element_one anyelement) RETURNS anyelement AS 'provsql', 'provenance_evaluate_compiled' LANGUAGE C PARALLEL SAFE STABLE; /** * @brief Evaluate provenance over a user-defined semiring (PL/pgSQL version) * * Recursively walks the provenance circuit and evaluates each gate * using the provided semiring operations. This is the generic version * that accepts semiring operations as function references. * * @param token provenance token to evaluate * @param token2value mapping table from tokens to semiring values * @param element_one identity element of the semiring * @param value_type OID of the semiring value type * @param plus_function semiring addition (aggregate) * @param times_function semiring multiplication (aggregate) * @param monus_function semiring monus (binary), or NULL * @param delta_function δ-semiring operator, or NULL */ CREATE OR REPLACE FUNCTION provenance_evaluate( token UUID, token2value regclass, element_one anyelement, value_type regtype, plus_function regproc, times_function regproc, monus_function regproc, delta_function regproc) RETURNS anyelement AS $$ DECLARE gate_type provenance_gate; result ALIAS FOR $0; children UUID[]; -- cmp_value anyelement; -- temp_result anyelement; value_text TEXT; BEGIN SELECT get_gate_type(token) INTO gate_type; IF gate_type IS NULL THEN RETURN NULL; ELSIF gate_type = 'input' THEN EXECUTE format('SELECT value FROM %s WHERE provenance=%L', token2value, token) INTO result; IF result IS NULL THEN result := element_one; END IF; ELSIF gate_type = 'mulinput' THEN SELECT concat('{',(get_children(token))[1]::text,'=',(get_infos(token)).info1,'}') INTO result; ELSIF gate_type='update' THEN EXECUTE format('SELECT value FROM %s WHERE provenance=%L',token2value,token) INTO result; IF result IS NULL THEN result:=element_one; END IF; ELSIF gate_type = 'plus' THEN EXECUTE format('SELECT %s(provsql.provenance_evaluate(t,%L,%L::%s,%L,%L,%L,%L,%L)) FROM unnest(get_children(%L)) AS t', plus_function, token2value, element_one, value_type, value_type, plus_function, times_function, monus_function, delta_function, token) INTO result; ELSIF gate_type = 'times' THEN EXECUTE format('SELECT %s(provsql.provenance_evaluate(t,%L,%L::%s,%L,%L,%L,%L,%L)) FROM unnest(get_children(%L)) AS t', times_function, token2value, element_one, value_type, value_type, plus_function, times_function, monus_function, delta_function, token) INTO result; ELSIF gate_type = 'monus' THEN IF monus_function IS NULL THEN RAISE EXCEPTION USING MESSAGE='Provenance with negation evaluated over a semiring without monus function'; ELSE EXECUTE format('SELECT %s(a1,a2) FROM (SELECT provsql.provenance_evaluate(c[1],%L,%L::%s,%L,%L,%L,%L,%L) AS a1, ' || 'provsql.provenance_evaluate(c[2],%L,%L::%s,%L,%L,%L,%L,%L) AS a2 FROM get_children(%L) c) tmp', monus_function, token2value, element_one, value_type, value_type, plus_function, times_function, monus_function, delta_function, token2value, element_one, value_type, value_type, plus_function, times_function, monus_function, delta_function, token) INTO result; END IF; ELSIF gate_type = 'eq' THEN EXECUTE format('SELECT provsql.provenance_evaluate((get_children(%L))[1],%L,%L::%s,%L,%L,%L,%L,%L)', token, token2value, element_one, value_type, value_type, plus_function, times_function, monus_function, delta_function) INTO result; /* elsif gate_type = 'cmp' then EXECUTE format('SELECT provsql.provenance_evaluate((get_children(%L))[1],%L,%L::%s,%L,%L,%L,%L,%L)', token, token2value, element_one, value_type, value_type, plus_function, times_function, monus_function, delta_function) INTO temp_result; EXECUTE format('SELECT get_extra((get_children(%L))[2])', token) INTO cmp_value; IF temp_result::text = cmp_value::text THEN SELECT concat('{',temp_result::text,'=',cmp_value::text,'}') INTO result; ELSE RETURN gate_zero() */ ELSIF gate_type = 'delta' THEN IF delta_function IS NULL THEN RAISE EXCEPTION USING MESSAGE='Provenance with aggregation evaluated over a semiring without delta function'; ELSE EXECUTE format('SELECT %I(a) FROM (SELECT provsql.provenance_evaluate((get_children(%L))[1],%L,%L::%s,%L,%L,%L,%L,%L) AS a) tmp', delta_function, token, token2value, element_one, value_type, value_type, plus_function, times_function, monus_function, delta_function) INTO result; END IF; ELSIF gate_type = 'zero' THEN EXECUTE format('SELECT %I(a) FROM (SELECT %L::%I AS a WHERE FALSE) temp', plus_function, element_one, value_type) INTO result; ELSIF gate_type = 'one' THEN EXECUTE format('SELECT %L::%I', element_one, value_type) INTO result; ELSIF gate_type = 'project' THEN EXECUTE format('SELECT provsql.provenance_evaluate((get_children(%L))[1],%L,%L::%s,%L,%L,%L,%L,%L)', token, token2value, element_one, value_type, value_type, plus_function, times_function, monus_function, delta_function) INTO result; ELSE RAISE EXCEPTION USING MESSAGE='provenance_evaluate cannot be called on formulas using ' || gate_type || ' gates; use compiled semirings instead'; END IF; RETURN result; END $$ LANGUAGE plpgsql PARALLEL SAFE STABLE; /** * @brief Evaluate aggregate provenance over a user-defined semiring (PL/pgSQL version) * * Handles agg and semimod gates produced by GROUP BY queries. * * @param token provenance token to evaluate * @param token2value mapping table from tokens to semiring values * @param agg_function_final finalization function for the aggregate * @param agg_function aggregate combination function * @param semimod_function semimodule scalar multiplication function * @param element_one identity element of the semiring * @param value_type OID of the semiring value type * @param plus_function semiring addition * @param times_function semiring multiplication * @param monus_function semiring monus, or NULL * @param delta_function δ-semiring operator, or NULL */ CREATE OR REPLACE FUNCTION aggregation_evaluate( token UUID, token2value regclass, agg_function_final regproc, agg_function regproc, semimod_function regproc, element_one anyelement, value_type regtype, plus_function regproc, times_function regproc, monus_function regproc, delta_function regproc) RETURNS anyelement AS $$ DECLARE gt provenance_gate; result ALIAS FOR $0; BEGIN SELECT get_gate_type(token) INTO gt; IF gt IS NULL THEN RETURN NULL; ELSIF gt='agg' THEN EXECUTE format('SELECT %I(%I(provsql.aggregation_evaluate(t,%L,%L,%L,%L,%L::%s,%L,%L,%L,%L,%L)),pp.proname::varchar) FROM unnest(get_children(%L)) AS t, pg_proc pp WHERE pp.oid=(get_infos(%L)).info1 GROUP BY pp.proname', agg_function_final, agg_function,token2value,agg_function_final,agg_function,semimod_function,element_one,value_type,value_type,plus_function,times_function, monus_function,delta_function,token,token) INTO result; ELSE -- gt='semimod' EXECUTE format('SELECT %I(get_extra((get_children(%L))[2]),provsql.provenance_evaluate((get_children(%L))[1],%L,%L::%s,%L,%L,%L,%L,%L))', semimod_function,token,token,token2value,element_one,value_type,value_type,plus_function,times_function,monus_function,delta_function) INTO result; END IF; RETURN result; END $$ LANGUAGE plpgsql PARALLEL SAFE STABLE; /** * @brief Evaluate provenance over a user-defined semiring (C version) * * Optimized C implementation of provenance_evaluate. Infers the * value type from element_one. Monus and delta functions are optional. * * @param token provenance token to evaluate * @param token2value mapping table from tokens to semiring values * @param element_one identity element of the semiring * @param plus_function semiring addition (aggregate) * @param times_function semiring multiplication (aggregate) * @param monus_function semiring monus, or NULL if not needed * @param delta_function δ-semiring operator, or NULL if not needed */ CREATE OR REPLACE FUNCTION provenance_evaluate( token UUID, token2value regclass, element_one anyelement, plus_function regproc, times_function regproc, monus_function regproc = NULL, delta_function regproc = NULL) RETURNS anyelement AS 'provsql','provenance_evaluate' LANGUAGE C STABLE; /** @brief Evaluate aggregate provenance over a user-defined semiring (C version) */ CREATE OR REPLACE FUNCTION aggregation_evaluate( token UUID, token2value regclass, agg_function_final regproc, agg_function regproc, semimod_function regproc, element_one anyelement, plus_function regproc, times_function regproc, monus_function regproc = NULL, delta_function regproc = NULL) RETURNS anyelement AS 'provsql','aggregation_evaluate' LANGUAGE C STABLE; /** @} */ /** @defgroup circuit_introspection Circuit introspection * Functions for examining the structure of provenance circuits, * used by visualization and where-provenance features. * @{ */ /** @brief Row type for sub_circuit_with_desc results */ CREATE TYPE gate_with_desc AS (f UUID, t UUID, gate_type provenance_gate, desc_str CHARACTER VARYING, infos INTEGER[], extra TEXT); /** * @brief Return the sub-circuit reachable from a token, with descriptions * * Recursively traverses the provenance circuit from the given token and * returns all edges together with input gate descriptions from the * mapping table. * * @param token root provenance token * @param token2desc mapping table providing descriptions for input gates */ CREATE OR REPLACE FUNCTION sub_circuit_with_desc( token UUID, token2desc regclass) RETURNS SETOF gate_with_desc AS $$ BEGIN RETURN QUERY EXECUTE 'WITH RECURSIVE transitive_closure(f,t,gate_type) AS ( SELECT $1,t,provsql.get_gate_type($1) FROM unnest(provsql.get_children($1)) AS t UNION ALL SELECT p1.t,u,provsql.get_gate_type(p1.t) FROM transitive_closure p1, unnest(provsql.get_children(p1.t)) AS u) SELECT *, ARRAY[(get_infos(f)).info1, (get_infos(f)).info2], get_extra(f) FROM ( SELECT f::uuid,t::uuid,gate_type,NULL FROM transitive_closure UNION ALL SELECT p2.provenance::uuid as f, NULL::uuid, ''input'', CAST (p2.value AS varchar) FROM transitive_closure p1 JOIN ' || token2desc || ' AS p2 ON p2.provenance=t UNION ALL SELECT provenance::uuid as f, NULL::uuid, ''input'', CAST (value AS varchar) FROM ' || token2desc || ' WHERE provenance=$1 ) t' USING token LOOP; RETURN; END $$ LANGUAGE plpgsql PARALLEL SAFE; /** * @brief Identify which table and how many columns a provenance token belongs to * * Searches all provenance-tracked tables for a row matching the given * token and returns the table name and column count. * * @param token provenance token to look up * @param table_name (OUT) the table containing this token * @param nb_columns (OUT) number of non-provenance columns in that table */ CREATE OR REPLACE FUNCTION identify_token( token UUID, OUT table_name regclass, OUT nb_columns integer) AS $$ DECLARE t RECORD; result RECORD; BEGIN table_name:=NULL; nb_columns:=-1; FOR t IN SELECT relname, (SELECT count(*) FROM pg_attribute a2 WHERE a2.attrelid=a1.attrelid AND attnum>0 AND atttypid<>0)-1 c FROM pg_attribute a1 JOIN pg_type ON atttypid=pg_type.oid JOIN pg_class ON attrelid=pg_class.oid JOIN pg_namespace ON relnamespace=pg_namespace.oid WHERE typname='uuid' AND relkind='r' AND nspname<>'provsql' AND attname='provsql' LOOP EXECUTE format('SELECT * FROM %I WHERE provsql=%L',t.relname,token) INTO result; IF result IS NOT NULL THEN table_name:=t.relname; nb_columns:=t.c; EXIT; END IF; END LOOP; END $$ LANGUAGE plpgsql STRICT; /** * @brief Return the sub-circuit for where-provenance computation * * Similar to sub_circuit_with_desc but resolves input gates to their * source table and column count for where-provenance evaluation. */ CREATE OR REPLACE FUNCTION sub_circuit_for_where(token UUID) RETURNS TABLE(f UUID, t UUID, gate_type provenance_gate, table_name REGCLASS, nb_columns INTEGER, infos INTEGER[], extra TEXT) AS $$ WITH RECURSIVE transitive_closure(f,t,idx,gate_type) AS ( SELECT $1,t,id,provsql.get_gate_type($1) FROM unnest(provsql.get_children($1)) WITH ORDINALITY AS a(t,id) UNION ALL SELECT p1.t,u,id,provsql.get_gate_type(p1.t) FROM transitive_closure p1, unnest(provsql.get_children(p1.t)) WITH ORDINALITY AS a(u, id) ) SELECT f, t, gate_type, table_name, nb_columns, ARRAY[(get_infos(f)).info1, (get_infos(f)).info2], get_extra(f) FROM ( SELECT f, t::uuid, idx, gate_type, NULL AS table_name, NULL AS nb_columns FROM transitive_closure UNION ALL SELECT DISTINCT t, NULL::uuid, NULL::int, 'input'::provenance_gate, (id).table_name, (id).nb_columns FROM transitive_closure JOIN (SELECT t AS prov, provsql.identify_token(t) as id FROM transitive_closure WHERE t NOT IN (SELECT f FROM transitive_closure)) temp ON t=prov UNION ALL SELECT DISTINCT $1, NULL::uuid, NULL::int, 'input'::provenance_gate, (id).table_name, (id).nb_columns FROM (SELECT provsql.identify_token($1) AS id WHERE $1 NOT IN (SELECT f FROM transitive_closure)) temp ) t $$ LANGUAGE sql; /** * @brief BFS expansion of a provenance circuit, capped at @p max_depth * * Returns one row per (parent, child) edge in the BFS-bounded subgraph * rooted at @p root, plus one row for the root with parent and * child_pos NULL. Provenance circuits are DAGs, so a child gate * may have several parents within the bound; each such edge is reported * as a separate row, so callers must deduplicate on node if they * need a one-row-per-node view. * * depth is the node's BFS depth (its shortest distance from * @p root), so for an edge (parent, child) it is always the case that * parent.depth + 1 >= child.depth; equality holds only on * shortest-path edges. A node at depth = max_depth is not * expanded; callers can detect a partial expansion by comparing * provsql.get_children length against the number of outgoing * edges reported. * * info1 and info2 are the integer values stored on * the gate by provsql.set_infos, formatted as text; their * meaning is gate-type-specific (see provsql.set_infos). * * @param root root provenance token * @param max_depth maximum BFS depth (default 8) */ CREATE OR REPLACE FUNCTION circuit_subgraph(root UUID, max_depth INT DEFAULT 8) RETURNS TABLE(node UUID, parent UUID, child_pos INT, gate_type TEXT, info1 TEXT, info2 TEXT, depth INT) AS $$ WITH RECURSIVE bfs(node, parent, child_pos, depth) AS ( SELECT root, NULL::UUID, NULL::INT, 0 UNION ALL SELECT c.t, b.node, c.idx::INT, b.depth + 1 FROM bfs b CROSS JOIN LATERAL unnest(provsql.get_children(b.node)) WITH ORDINALITY AS c(t, idx) WHERE b.depth < max_depth ), -- Each node's canonical depth is its shortest-path distance from the root. -- Tie-breaking on child_pos is irrelevant for the depth value but kept so -- the (now informational) row order is stable. node_depth AS ( SELECT node, MIN(depth) AS depth FROM bfs GROUP BY node ), -- All distinct (parent, child, child_pos) triples seen during the BFS. -- A child reached from k parents within the bound contributes k rows. -- Self-joins (times(x, x)) contribute one row per child position. edges AS ( SELECT DISTINCT parent, node AS child, child_pos FROM bfs WHERE parent IS NOT NULL ) SELECT d.node, e.parent, e.child_pos, provsql.get_gate_type(d.node)::TEXT, i.info1::TEXT, i.info2::TEXT, d.depth FROM node_depth d LEFT JOIN edges e ON e.child = d.node LEFT JOIN LATERAL provsql.get_infos(d.node) i ON TRUE ORDER BY d.depth, d.node, e.parent; $$ LANGUAGE sql STABLE PARALLEL SAFE; /** * @brief BFS subgraph of the IN-MEMORY simplified circuit rooted at @p root. * * Same row shape as @ref circuit_subgraph plus an inline @c extra * column, but built from the @c GenericCircuit returned by * @c getGenericCircuit -- i.e. AFTER @c provsql.simplify_on_load * passes (RangeCheck, ...) have rewritten any decidable @c gate_cmp * into Bernoulli @c gate_input / @c gate_zero / @c gate_one leaves. * Lets a renderer show the user what the evaluator actually sees, * without mutating the persisted DAG. * * Returns @c jsonb (an array of objects) rather than @c SETOF record * to keep the C++ implementation free of SRF / @c FuncCallContext * boilerplate; callers either consume the array directly or expand * it via @c jsonb_array_elements. * * @param root Root provenance token. * @param max_depth Maximum BFS depth (default 8). */ CREATE OR REPLACE FUNCTION simplified_circuit_subgraph( root UUID, max_depth INT DEFAULT 8) RETURNS jsonb AS 'provsql','simplified_circuit_subgraph' LANGUAGE C STABLE PARALLEL SAFE; /** * @brief Empirical histogram of a scalar sub-circuit * * Returns a jsonb array of @c {bin_lo, bin_hi, count} objects covering * the observed @c [min, max] range of @p bins equal-width samples from * the sub-circuit rooted at @p token. Sample count is taken from * @c provsql.rv_mc_samples; pinning @c provsql.monte_carlo_seed makes * the result reproducible. * * Accepted root gate types are the scalar ones: @c gate_value (Dirac * at the constant, single bin), @c gate_rv (sampled from the leaf's * distribution), and @c gate_arith (sampled by recursing through the * arithmetic DAG, with shared @c gate_rv leaves correctly correlated * within an iteration). Any other gate type raises. * * @param token Root provenance token of a scalar sub-circuit. * @param bins Number of equal-width histogram bins (default 30). * @param prov Conditioning event (defaults to @c gate_one() = no * conditioning). When non-trivial, the histogram is * over the conditional distribution recovered by * rejection sampling on the joint circuit with @p token. */ CREATE OR REPLACE FUNCTION rv_histogram( token UUID, bins INT DEFAULT 30, prov UUID DEFAULT gate_one()) RETURNS jsonb AS 'provsql','rv_histogram' LANGUAGE C VOLATILE PARALLEL SAFE; /** * @brief Sample the closed-form PDF and CDF of a (possibly truncated) * scalar distribution. * * Returns @c {"pdf": [{x, p}, ...], "cdf": [{x, p}, ...]} with @p samples * evenly-spaced points spanning the distribution's natural display * range (intersected with the conditioning event's interval when * @c prov is non-trivial). Used by ProvSQL Studio's Distribution * profile panel to overlay the analytical curve on the empirical * histogram from :sqlfunc:`rv_histogram` -- the simplifier's * analytical wins (e.g. @c c·Exp(λ) folding to @c Exp(λ/c)) become * visible as a smooth curve riding over the MC-sampled bars. * * Returns @c NULL when the root sub-circuit is not a closed-form * shape (V1: only bare @c gate_rv of Normal / Uniform / Exponential * / integer-Erlang). The frontend reads @c NULL as "skip overlay" * without erroring, so the caller can dispatch this in parallel with * @c rv_histogram regardless of the underlying shape. * * @param token Scalar gate token (random_variable's UUID). * @param samples Number of (x, p) points; must be >= 2. * @param prov Conditioning event (defaults to @c gate_one() = no * conditioning). When non-trivial, the curves are * over the truncated distribution. */ CREATE OR REPLACE FUNCTION rv_analytical_curves( token UUID, samples INT DEFAULT 100, prov UUID DEFAULT gate_one()) RETURNS jsonb AS 'provsql','rv_analytical_curves' LANGUAGE C IMMUTABLE STRICT PARALLEL SAFE; /** * @brief Draw conditional Monte Carlo samples from a scalar gate. * * Returns up to @c n samples of the scalar value at @c token; when * @c prov is not the trivial @c gate_one() event, draws are accepted * only on iterations where @c prov evaluates true (rejection * sampling). Shared @c gate_rv leaves between @c token and @c prov * are loaded into a single joint circuit so the indicator's draw * and the value's draw share their per-iteration state. * * @param token Scalar sub-circuit root. * @param n Number of accepted samples to attempt. * @param prov Conditioning event (defaults to @c gate_one() = no * conditioning). * * Emits a @c NOTICE when the conditional acceptance rate yields fewer * than @c n samples within the @c provsql.rv_mc_samples budget so the * caller can choose to widen the budget. */ CREATE OR REPLACE FUNCTION rv_sample( token UUID, n integer, prov UUID DEFAULT gate_one()) RETURNS SETOF float8 AS 'provsql','rv_sample' LANGUAGE C VOLATILE PARALLEL SAFE; /** * @brief Resolve an input gate UUID back to its source row * * Searches every provenance-tracked relation for a row whose * provsql column equals @p uuid and returns the relation's * regclass together with the row encoded as JSONB. Returns zero * rows when @p uuid is not the provenance token of any tracked row, * including when it identifies an internal gate (plus, * times, ...) rather than an input. * * Ordinarily exactly one row is returned, but if the same UUID * happens to appear as a provsql value in several tracked * tables, all matches are returned. * * @param uuid token to resolve */ CREATE OR REPLACE FUNCTION resolve_input(uuid UUID) RETURNS TABLE(relation regclass, row_data JSONB) AS $$ DECLARE t RECORD; rel regclass; rd JSONB; -- ProvSQL's rewriter unconditionally appends a provsql column to the -- targetlist of any SELECT reading from a tracked relation; capture and -- discard it here rather than disabling the rewriter for the whole call. ign UUID; BEGIN FOR t IN SELECT c.oid::regclass AS regc FROM pg_attribute a JOIN pg_class c ON a.attrelid = c.oid JOIN pg_namespace ns ON c.relnamespace = ns.oid JOIN pg_type ty ON a.atttypid = ty.oid WHERE a.attname = 'provsql' AND ty.typname = 'uuid' AND c.relkind = 'r' AND ns.nspname <> 'provsql' AND a.attnum > 0 LOOP FOR rel, rd, ign IN EXECUTE format( 'SELECT %L::regclass, to_jsonb(t) - ''provsql'', t.provsql FROM %s AS t WHERE provsql = $1', t.regc, t.regc) USING uuid LOOP relation := rel; row_data := rd; RETURN NEXT; END LOOP; END LOOP; END $$ LANGUAGE plpgsql STABLE; /** @} */ /** @defgroup agg_token_type Type for the result of aggregate queries * * Custom type agg_token for a provenance semimodule value, to * be used in attributes that are computed as a result of aggregation. * As for provenance tokens, this is simply a UUID, but this UUID is * displayed in a specific way (as the result of the aggregation * followed by a "(*)") to help with readability. * * The text output is controlled by the * provsql.aggtoken_text_as_uuid GUC. By default it is off and * the cell renders as "value (*)". When set to on (typical * for UI layers such as ProvSQL Studio), the cell renders as the * underlying UUID instead, so the caller can click through to the * provenance circuit; the value side is then recovered via * provsql.agg_token_value_text(uuid). * * @{ */ CREATE TYPE agg_token; /** @brief Input function for the agg_token type (parses text representation) */ CREATE OR REPLACE FUNCTION agg_token_in(cstring) RETURNS agg_token AS 'provsql','agg_token_in' LANGUAGE C IMMUTABLE STRICT PARALLEL SAFE; /** * @brief Output function for the agg_token type * * Default: produces the human-friendly @c "value (*)" form, where * @c value is the running aggregate state. * * When the @c provsql.aggtoken_text_as_uuid GUC is on, returns the * underlying provenance UUID instead. UI layers (notably ProvSQL * Studio) flip this on per session so aggregate cells expose the * circuit root UUID for click-through; the @c "value (*)" display * string is recovered via @c provsql.agg_token_value_text(uuid). * * Marked STABLE rather than IMMUTABLE because the chosen output * shape now depends on a GUC that the same session can flip at * runtime. */ CREATE OR REPLACE FUNCTION agg_token_out(agg_token) RETURNS cstring AS 'provsql','agg_token_out' LANGUAGE C STABLE STRICT PARALLEL SAFE; /** @brief Cast an agg_token to its text representation */ CREATE OR REPLACE FUNCTION agg_token_cast(agg_token) RETURNS text AS 'provsql','agg_token_cast' LANGUAGE C IMMUTABLE STRICT PARALLEL SAFE; CREATE TYPE agg_token ( internallength = 117, input = agg_token_in, output = agg_token_out, alignment = char ); /** @brief Extract the UUID from an agg_token (implicit cast to UUID) */ CREATE OR REPLACE FUNCTION agg_token_uuid(aggtok agg_token) RETURNS uuid AS $$ BEGIN RETURN agg_token_cast(aggtok)::uuid; END $$ LANGUAGE plpgsql STRICT SET search_path=provsql,pg_temp,public SECURITY DEFINER IMMUTABLE PARALLEL SAFE; /** @brief Implicit PostgreSQL cast from agg_token to UUID (delegates to agg_token_uuid()) */ CREATE CAST (agg_token AS UUID) WITH FUNCTION agg_token_uuid(agg_token) AS IMPLICIT; /** * @brief Recover the @c "value (*)" display string for an aggregation gate * * Companion helper to the @c provsql.aggtoken_text_as_uuid GUC. With * the GUC on, an @c agg_token cell prints as the underlying provenance * UUID, which is convenient for tooling that wants to click through to * the circuit but loses the human-readable aggregate value. This * function takes such a UUID and returns the original @c "value (*)" * string by reading the gate's @c extra (set by aggregate evaluation * to the computed scalar). Returns @c NULL if @p token does not * resolve to an @c agg gate. * * @param token UUID of an @c agg gate (typically obtained from an * @c agg_token cell when @c aggtoken_text_as_uuid is on, * or via a manual UUID cast otherwise). */ CREATE OR REPLACE FUNCTION agg_token_value_text(token UUID) RETURNS text AS $$ SELECT CASE WHEN provsql.get_gate_type(token) = 'agg' THEN provsql.get_extra(token) || ' (*)' ELSE NULL END; $$ LANGUAGE sql STABLE STRICT PARALLEL SAFE; /** @brief Cast an agg_token to numeric (extracts the aggregate value, loses provenance) */ CREATE OR REPLACE FUNCTION agg_token_to_numeric(agg_token) RETURNS numeric AS 'provsql','agg_token_to_numeric' LANGUAGE C IMMUTABLE STRICT PARALLEL SAFE; /** @brief Cast an agg_token to double precision (extracts the aggregate value, loses provenance) */ CREATE OR REPLACE FUNCTION agg_token_to_float8(agg_token) RETURNS double precision AS 'provsql','agg_token_to_float8' LANGUAGE C IMMUTABLE STRICT PARALLEL SAFE; /** @brief Cast an agg_token to integer (extracts the aggregate value, loses provenance) */ CREATE OR REPLACE FUNCTION agg_token_to_int4(agg_token) RETURNS integer AS 'provsql','agg_token_to_int4' LANGUAGE C IMMUTABLE STRICT PARALLEL SAFE; /** @brief Cast an agg_token to bigint (extracts the aggregate value, loses provenance) */ CREATE OR REPLACE FUNCTION agg_token_to_int8(agg_token) RETURNS bigint AS 'provsql','agg_token_to_int8' LANGUAGE C IMMUTABLE STRICT PARALLEL SAFE; /** @brief Cast an agg_token to text (extracts the aggregate value, loses provenance) */ CREATE OR REPLACE FUNCTION agg_token_to_text(agg_token) RETURNS text AS 'provsql','agg_token_to_text' LANGUAGE C IMMUTABLE STRICT PARALLEL SAFE; /** @brief Implicit PostgreSQL cast from agg_token to numeric (enables arithmetic on aggregates) */ CREATE CAST (agg_token AS numeric) WITH FUNCTION agg_token_to_numeric(agg_token) AS IMPLICIT; /** @brief Assignment cast from agg_token to double precision */ CREATE CAST (agg_token AS double precision) WITH FUNCTION agg_token_to_float8(agg_token) AS ASSIGNMENT; /** @brief Assignment cast from agg_token to integer */ CREATE CAST (agg_token AS integer) WITH FUNCTION agg_token_to_int4(agg_token) AS ASSIGNMENT; /** @brief Assignment cast from agg_token to bigint */ CREATE CAST (agg_token AS bigint) WITH FUNCTION agg_token_to_int8(agg_token) AS ASSIGNMENT; /** @brief Assignment cast from agg_token to text (extracts value, not UUID) */ CREATE CAST (agg_token AS text) WITH FUNCTION agg_token_to_text(agg_token) AS ASSIGNMENT; /** * @brief Placeholder comparison of agg_token with numeric * * This function is never actually called; it exists so the SQL parser * accepts comparison operators between agg_token and numeric values. * The ProvSQL query rewriter replaces these comparisons at plan time. */ CREATE OR REPLACE FUNCTION agg_token_comp_numeric(a agg_token, b numeric) RETURNS boolean LANGUAGE plpgsql IMMUTABLE STRICT PARALLEL SAFE AS $$ BEGIN RAISE EXCEPTION 'Comparison agg_token-numeric not implemented, should be replaced by ProvSQL behavior'; END; $$; /** * @brief Placeholder comparison of numeric with agg_token * * Symmetric to agg_token_comp_numeric; never actually called. * The ProvSQL query rewriter replaces these comparisons at plan time. */ CREATE OR REPLACE FUNCTION numeric_comp_agg_token(a numeric, b agg_token) RETURNS boolean LANGUAGE plpgsql IMMUTABLE STRICT PARALLEL SAFE AS $$ BEGIN RAISE EXCEPTION 'Comparison numeric-agg_token not implemented, should be replaced by ProvSQL behavior'; END; $$; /** @brief SQL operator agg_token < numeric (placeholder rewritten by ProvSQL at plan time) */ CREATE OPERATOR < ( LEFTARG = agg_token, RIGHTARG = numeric, PROCEDURE = agg_token_comp_numeric, COMMUTATOR = >, NEGATOR = >= ); /** @brief SQL operator numeric < agg_token (placeholder rewritten by ProvSQL at plan time) */ CREATE OPERATOR < ( LEFTARG = numeric, RIGHTARG = agg_token, PROCEDURE = numeric_comp_agg_token, COMMUTATOR = >, NEGATOR = >= ); /** @brief SQL operator agg_token <= numeric (placeholder rewritten by ProvSQL at plan time) */ CREATE OPERATOR <= ( LEFTARG = agg_token, RIGHTARG = numeric, PROCEDURE = agg_token_comp_numeric, COMMUTATOR = >=, NEGATOR = > ); /** @brief SQL operator numeric <= agg_token (placeholder rewritten by ProvSQL at plan time) */ CREATE OPERATOR <= ( LEFTARG = numeric, RIGHTARG = agg_token, PROCEDURE = numeric_comp_agg_token, COMMUTATOR = >=, NEGATOR = > ); /** @brief SQL operator agg_token = numeric (placeholder rewritten by ProvSQL at plan time) */ CREATE OPERATOR = ( LEFTARG = agg_token, RIGHTARG = numeric, PROCEDURE = agg_token_comp_numeric, COMMUTATOR = =, NEGATOR = <> ); /** @brief SQL operator numeric = agg_token (placeholder rewritten by ProvSQL at plan time) */ CREATE OPERATOR = ( LEFTARG = numeric, RIGHTARG = agg_token, PROCEDURE = numeric_comp_agg_token, COMMUTATOR = =, NEGATOR = <> ); /** @brief SQL operator agg_token <> numeric (placeholder rewritten by ProvSQL at plan time) */ CREATE OPERATOR <> ( LEFTARG = agg_token, RIGHTARG = numeric, PROCEDURE = agg_token_comp_numeric, COMMUTATOR = <>, NEGATOR = = ); /** @brief SQL operator numeric <> agg_token (placeholder rewritten by ProvSQL at plan time) */ CREATE OPERATOR <> ( LEFTARG = numeric, RIGHTARG = agg_token, PROCEDURE = numeric_comp_agg_token, COMMUTATOR = <>, NEGATOR = = ); /** @brief SQL operator agg_token >= numeric (placeholder rewritten by ProvSQL at plan time) */ CREATE OPERATOR >= ( LEFTARG = agg_token, RIGHTARG = numeric, PROCEDURE = agg_token_comp_numeric, COMMUTATOR = <=, NEGATOR = < ); /** @brief SQL operator numeric >= agg_token (placeholder rewritten by ProvSQL at plan time) */ CREATE OPERATOR >= ( LEFTARG = numeric, RIGHTARG = agg_token, PROCEDURE = numeric_comp_agg_token, COMMUTATOR = <=, NEGATOR = < ); /** @brief SQL operator agg_token > numeric (placeholder rewritten by ProvSQL at plan time) */ CREATE OPERATOR > ( LEFTARG = agg_token, RIGHTARG = numeric, PROCEDURE = agg_token_comp_numeric, COMMUTATOR = <, NEGATOR = <= ); /** @brief SQL operator numeric > agg_token (placeholder rewritten by ProvSQL at plan time) */ CREATE OPERATOR > ( LEFTARG = numeric, RIGHTARG = agg_token, PROCEDURE = numeric_comp_agg_token, COMMUTATOR = <, NEGATOR = <= ); /** @} */ /** @defgroup random_variable_type Type for continuous random variables * * Custom type random_variable: a thin wrapper around a * provenance gate UUID, used to expose continuous probabilistic * c-tables in SQL. The UUID indexes either a gate_rv * (an actual distribution) or a gate_value (a * zero-variance constant produced by provsql.as_random). * Binary-coercible with uuid (same 16-byte layout), so an * rv-typed expression flows directly into any function * expecting a uuid at zero runtime cost. * * Constructors live in this group: provsql.normal(μ, σ), * provsql.uniform(a, b), provsql.exponential(λ), * provsql.erlang(k, λ), and provsql.as_random(c). * Operator overloads * (+ - * / and the six comparators) are defined further * below, alongside direct rv_cmp_* UUID constructors for * callers that want a gate_cmp token without going through * the planner hook. * @{ */ CREATE TYPE random_variable; /** @brief Input function for the random_variable type */ CREATE OR REPLACE FUNCTION random_variable_in(cstring) RETURNS random_variable AS 'provsql','random_variable_in' LANGUAGE C IMMUTABLE STRICT PARALLEL SAFE; /** @brief Output function for the random_variable type */ CREATE OR REPLACE FUNCTION random_variable_out(random_variable) RETURNS cstring AS 'provsql','random_variable_out' LANGUAGE C IMMUTABLE STRICT PARALLEL SAFE; CREATE TYPE random_variable ( internallength = 16, input = random_variable_in, output = random_variable_out, alignment = char ); /** @brief Build a random_variable from a UUID (internal). */ CREATE OR REPLACE FUNCTION random_variable_make(tok uuid) RETURNS random_variable AS 'provsql','random_variable_make' LANGUAGE C IMMUTABLE STRICT PARALLEL SAFE; /** @brief Binary-coercible cast random_variable -> uuid. * A random_variable is byte-for-byte a pg_uuid_t (alignment char, * length 16), so WITHOUT FUNCTION lets PostgreSQL reinterpret the * bytes at zero runtime cost. The cast is IMPLICIT so a `provsql` * column or any random_variable expression flows directly into any * function expecting a uuid. */ CREATE CAST (random_variable AS uuid) WITHOUT FUNCTION AS IMPLICIT; CREATE CAST (uuid AS random_variable) WITHOUT FUNCTION; /** * @brief Internal: true iff @p x is a finite (non-NaN, non-±∞) float8. * * PostgreSQL's isnan is defined for numeric only, * not for double precision; we use the inequality form, * which works because PG defines NaN = NaN as TRUE * for floats (so NaN <> 'NaN'::float8 is FALSE). */ CREATE OR REPLACE FUNCTION is_finite_float8(x double precision) RETURNS bool AS $$ SELECT $1 <> 'NaN'::float8 AND $1 <> 'Infinity'::float8 AND $1 <> '-Infinity'::float8; $$ LANGUAGE sql IMMUTABLE STRICT PARALLEL SAFE; /** * @brief Construct a normal-distribution random variable * * Creates a fresh gate_rv with @c "normal:μ,σ" stored in * the gate's @c extra field, and returns a random_variable * pointing at it. * * Validation: * - @p mu and @p sigma must be finite (no @c NaN, no @c ±Infinity). * - @p sigma must be non-negative. * - When @p sigma is zero the distribution degenerates to the Dirac * at @p mu; the call is silently routed through @c as_random(mu), * producing a @c gate_value rather than a zero-variance @c gate_rv. * This keeps the sampler / moment / boundcheck paths free of σ=0 * special cases and lets normal(x, 0) share its gate with * as_random(x). * * @warning The VOLATILE marking is load-bearing and must * not be weakened. Each call mints a fresh uuid_generate_v4 * token because two calls to normal(0, 1) are *independent* * random variables; if PostgreSQL were allowed to fold the function * (which it would under STABLE / IMMUTABLE), two * calls in the same query would share a UUID and collapse into a * single dependent RV, silently breaking the c-table semantics. * Same warning applies to @c uniform and @c exponential below. * * @sa Wikipedia: Normal distribution */ CREATE OR REPLACE FUNCTION normal(mu double precision, sigma double precision) RETURNS random_variable AS $$ DECLARE token uuid; BEGIN IF NOT provsql.is_finite_float8(mu) OR NOT provsql.is_finite_float8(sigma) THEN RAISE EXCEPTION 'provsql.normal: parameters must be finite (got mu=%, sigma=%)', mu, sigma; END IF; IF sigma < 0 THEN RAISE EXCEPTION 'provsql.normal: sigma must be non-negative (got %)', sigma; END IF; IF sigma = 0 THEN RETURN provsql.as_random(mu); END IF; token := public.uuid_generate_v4(); PERFORM provsql.create_gate(token, 'rv'); PERFORM provsql.set_extra(token, 'normal:' || mu || ',' || sigma); RETURN provsql.random_variable_make(token); END $$ LANGUAGE plpgsql STRICT VOLATILE PARALLEL SAFE; /** * @brief Construct a uniform-distribution random variable on [a, b] * * Validation: * - @p a and @p b must be finite. * - @p a must be ≤ @p b (reversed bounds are rejected). * - When a = b the distribution is the Dirac at @p a; the * call is silently routed through @c as_random(a) for the same * reason as @c normal with @p sigma = 0. * * @warning VOLATILE is load-bearing; see the warning on * @ref normal. * * @sa Wikipedia: Continuous uniform distribution */ CREATE OR REPLACE FUNCTION uniform(a double precision, b double precision) RETURNS random_variable AS $$ DECLARE token uuid; BEGIN IF NOT provsql.is_finite_float8(a) OR NOT provsql.is_finite_float8(b) THEN RAISE EXCEPTION 'provsql.uniform: bounds must be finite (got a=%, b=%)', a, b; END IF; IF a > b THEN RAISE EXCEPTION 'provsql.uniform: a must be <= b (got a=%, b=%)', a, b; END IF; IF a = b THEN RETURN provsql.as_random(a); END IF; token := public.uuid_generate_v4(); PERFORM provsql.create_gate(token, 'rv'); PERFORM provsql.set_extra(token, 'uniform:' || a || ',' || b); RETURN provsql.random_variable_make(token); END $$ LANGUAGE plpgsql STRICT VOLATILE PARALLEL SAFE; /** * @brief Construct an exponential-distribution random variable with rate λ * * Validation: * - @p lambda must be finite and strictly positive. No degenerate * form exists for the exponential distribution, so there is no * silent route through @c as_random. * * @warning VOLATILE is load-bearing; see the warning on * @ref normal. * * @sa Wikipedia: Exponential distribution */ CREATE OR REPLACE FUNCTION exponential(lambda double precision) RETURNS random_variable AS $$ DECLARE token uuid; BEGIN IF NOT provsql.is_finite_float8(lambda) THEN RAISE EXCEPTION 'provsql.exponential: lambda must be finite (got %)', lambda; END IF; IF lambda <= 0 THEN RAISE EXCEPTION 'provsql.exponential: lambda must be strictly positive (got %)', lambda; END IF; token := public.uuid_generate_v4(); PERFORM provsql.create_gate(token, 'rv'); PERFORM provsql.set_extra(token, 'exponential:' || lambda); RETURN provsql.random_variable_make(token); END $$ LANGUAGE plpgsql STRICT VOLATILE PARALLEL SAFE; /** * @brief Construct an Erlang-distribution random variable, sum of * @p k i.i.d. exponentials with shared rate @p lambda * * The Erlang distribution is the sum of @p k independent * Exp(λ) random variables (equivalently the gamma with * integer shape). It is the natural closure of i.i.d. * exponentials under addition, and is materialised here as a single * gate_rv so the analytic CDF and closed-form moments fire * directly (rather than the sampler having to draw and sum @p k * exponential leaves per Monte-Carlo iteration). * * Validation: * - @p k must be ≥ 1. The degenerate @c k=1 case is silently routed * through @c exponential so erlang(1, λ) shares its gate * with exponential(λ). * - @p lambda must be finite and strictly positive. * * @warning VOLATILE is load-bearing; see the warning on * @ref normal. * * @sa Wikipedia: Erlang distribution */ CREATE OR REPLACE FUNCTION erlang(k integer, lambda double precision) RETURNS random_variable AS $$ DECLARE token uuid; BEGIN IF k < 1 THEN RAISE EXCEPTION 'provsql.erlang: k must be >= 1 (got %)', k; END IF; IF NOT provsql.is_finite_float8(lambda) THEN RAISE EXCEPTION 'provsql.erlang: lambda must be finite (got %)', lambda; END IF; IF lambda <= 0 THEN RAISE EXCEPTION 'provsql.erlang: lambda must be strictly positive (got %)', lambda; END IF; IF k = 1 THEN RETURN provsql.exponential(lambda); END IF; token := public.uuid_generate_v4(); PERFORM provsql.create_gate(token, 'rv'); PERFORM provsql.set_extra(token, 'erlang:' || k || ',' || lambda); RETURN provsql.random_variable_make(token); END $$ LANGUAGE plpgsql STRICT VOLATILE PARALLEL SAFE; /** * @brief Construct a probabilistic-mixture random variable. * * Returns a @c random_variable whose distribution is a Bernoulli * mixture of two scalar RV roots: with probability P(p = true) * the mixture samples @p x, with the complementary probability it * samples @p y. The mixing token @p p is a @c gate_input Bernoulli * whose probability has been pinned with @c set_prob, and the same * @p p can be shared with other branches of the circuit -- the * Monte-Carlo sampler's per-iteration cache couples every reference * to the same draw, so users can build joint conditional structures * (e.g. mixture(p, X1, Y1) + mixture(p, X2, Y2) samples * X1 + X2 with prob π and Y1 + Y2 with prob 1-π). * * @p x and @p y may be any scalar RV root: a base @c gate_rv * (@c normal / @c uniform / @c exponential / @c erlang), a * @c gate_value Dirac (@c as_random), a @c gate_arith expression, or * another @c mixture. N-ary mixtures are built by composition -- * mixture(p1, A, mixture(p2, B, C)) realises a 3-component * mixture with effective weights π1, (1-π1)·π2, (1-π1)·(1-π2). * * Validation: * - @p p must point to a Boolean gate (@c input, @c mulinput, * @c update, @c plus, @c times, @c monus, @c project, @c eq, * @c cmp, @c zero, @c one). Compound Boolean gates derive their * probability from their atoms via the active probability-evaluation * method; a bare @c gate_input's probability is whatever @c set_prob * pinned (@c set_prob is responsible for keeping it in [0, 1]). * - @p x and @p y must be scalar RV roots; aggregate / Boolean roots * are rejected at construction. * * Two calls to @c mixture with the same @c (p, x, y) operands collapse * to the same @c gate_mixture node by v5-hash, exactly like * @c arith(PLUS, X, Y). Draw independence is controlled by @p p: * sharing @p p couples branch selection across consumers via the * sampler's @c bool_cache_; minting independent Bernoullis (e.g. via * the @c mixture(p_value, …) overload) decouples them. * * @sa Wikipedia: Mixture distribution */ CREATE OR REPLACE FUNCTION mixture( p uuid, x random_variable, y random_variable) RETURNS random_variable AS $$ DECLARE token uuid; p_kind provsql.provenance_gate; x_uuid uuid; y_uuid uuid; x_kind provsql.provenance_gate; y_kind provsql.provenance_gate; BEGIN p_kind := provsql.get_gate_type(p); IF p_kind NOT IN ('input','mulinput','update', 'plus','times','monus', 'project','eq','cmp', 'zero','one') THEN RAISE EXCEPTION 'provsql.mixture: p must be a Boolean gate ' '(input/mulinput/update/plus/times/monus/project/eq/cmp/zero/one), got %', p_kind; END IF; x_uuid := (x)::uuid; y_uuid := (y)::uuid; x_kind := provsql.get_gate_type(x_uuid); y_kind := provsql.get_gate_type(y_uuid); IF x_kind NOT IN ('rv','value','arith','mixture') THEN RAISE EXCEPTION 'provsql.mixture: x must be a scalar RV root (rv / value / arith / mixture), got %', x_kind; END IF; IF y_kind NOT IN ('rv','value','arith','mixture') THEN RAISE EXCEPTION 'provsql.mixture: y must be a scalar RV root (rv / value / arith / mixture), got %', y_kind; END IF; token := public.uuid_generate_v5( provsql.uuid_ns_provsql(), concat('mixture', p, x_uuid, y_uuid)); PERFORM provsql.create_gate(token, 'mixture', ARRAY[p, x_uuid, y_uuid]); RETURN provsql.random_variable_make(token); END $$ LANGUAGE plpgsql STRICT IMMUTABLE PARALLEL SAFE; /** * @brief Ad-hoc mixture constructor that mints a fresh anonymous * @c gate_input Bernoulli with probability @p p_value. * * Sugar over the @c mixture(uuid, x, y) form: when the caller doesn't * care about reusing the Bernoulli token elsewhere in the circuit * (which is the common case — "give me a 0.3 / 0.7 weighted GMM, * I don't need to share the coin"), this overload creates the * underlying @c gate_input on the fly with a fresh * @c uuid_generate_v4() token, pins @p p_value via @c set_prob, and * threads everything into the uuid-keyed constructor. * * Each call mints a NEW Bernoulli, so two calls to * mixture(0.5, X, Y) are *independent* mixtures whose branch * selections are uncorrelated. When coupling is desired (e.g. two * mixtures sharing a coin), use the @c mixture(uuid, x, y) form with a * user-managed @c gate_input token. * * @warning VOLATILE is load-bearing for the same reason as * @ref normal and the other RV constructors -- folding under * @c STABLE / @c IMMUTABLE would collapse two independent draws into * one shared gate. * * @sa Wikipedia: Mixture distribution */ CREATE OR REPLACE FUNCTION mixture( p_value double precision, x random_variable, y random_variable) RETURNS random_variable AS $$ DECLARE p_token uuid; BEGIN IF p_value IS NULL OR p_value <> p_value OR p_value < 0 OR p_value > 1 THEN RAISE EXCEPTION 'provsql.mixture: probability must be in [0,1] (got %)', p_value; END IF; p_token := public.uuid_generate_v4(); PERFORM provsql.create_gate(p_token, 'input'); PERFORM provsql.set_prob(p_token, p_value); RETURN provsql.mixture(p_token, x, y); END $$ LANGUAGE plpgsql STRICT VOLATILE PARALLEL SAFE; /** * @brief Categorical-RV constructor over explicit (probabilities, * values) arrays. * * Builds a categorical-form @c gate_mixture directly: a fresh * @c gate_input "key" anchor and one @c gate_mulinput per outcome with * positive mass, all sharing the key. The wires * [key, mul_1, ..., mul_n] are what downstream evaluators * (@c Expectation, @c MonteCarloSampler, @c AnalyticEvaluator, * @c RangeCheck) recognise via @c isCategoricalMixture and treat as a * scalar RV with the categorical distribution @p probs over * @p outcomes. * * Validation: * - @p probs and @p outcomes must be non-null, same length, length ≥ 1. * - Each @c probs[i] must be finite, in [0, 1], and the array * must sum to 1 within @c 1e-9. * - Each @c outcomes[i] must be finite. * * Each call mints a fresh key gate and a fresh set of mulinputs, so * two calls to @c categorical with the same arrays are *independent* * categorical RVs. The marking is @c VOLATILE accordingly. * * Degenerate case: a categorical with exactly one positive-mass * outcome reduces to @c as_random(v) at construction (the block would * just be a single mulinput, which is operationally a Dirac point * mass). Two such calls share the @c gate_value UUID via the v5 * convention @c as_random already uses. * * @sa @c mixture for the Bernoulli-weighted choice constructor. * @sa Wikipedia: Categorical distribution */ CREATE OR REPLACE FUNCTION categorical( probs double precision[], outcomes double precision[]) RETURNS random_variable AS $$ DECLARE n integer; p_sum double precision := 0.0; i integer; key_token uuid; mix_token uuid; mul_token uuid; mul_tokens uuid[] := ARRAY[]::uuid[]; mix_wires uuid[]; pi_i double precision; vi_i double precision; BEGIN IF probs IS NULL OR outcomes IS NULL THEN RAISE EXCEPTION 'provsql.categorical: probs and outcomes must be non-null'; END IF; n := array_length(probs, 1); IF n IS NULL OR n < 1 THEN RAISE EXCEPTION 'provsql.categorical: probs must be non-empty'; END IF; IF array_length(outcomes, 1) <> n THEN RAISE EXCEPTION 'provsql.categorical: probs and outcomes must have the same length (got % and %)', n, array_length(outcomes, 1); END IF; FOR i IN 1..n LOOP pi_i := probs[i]; vi_i := outcomes[i]; -- PostgreSQL diverges from IEEE 754: NaN = NaN is TRUE there, so -- the canonical x <> x NaN test doesn't fire. Compare against the -- literal 'NaN'::float8 instead, and reject ±Infinity for outcomes -- explicitly. IF pi_i IS NULL OR pi_i = 'NaN'::float8 OR pi_i < 0 OR pi_i > 1 THEN RAISE EXCEPTION 'provsql.categorical: probs[%] must be in [0,1] (got %)', i, pi_i; END IF; IF vi_i IS NULL OR vi_i = 'NaN'::float8 OR vi_i = 'Infinity'::float8 OR vi_i = '-Infinity'::float8 THEN RAISE EXCEPTION 'provsql.categorical: outcomes[%] must be finite (got %)', i, vi_i; END IF; p_sum := p_sum + pi_i; END LOOP; IF abs(p_sum - 1.0) > 1e-9 THEN RAISE EXCEPTION 'provsql.categorical: probs must sum to 1 within 1e-9 (got %)', p_sum; END IF; -- Degenerate case: exactly one positive-mass outcome (the rest are -- zero). The "categorical" is then a Dirac point mass; skip the -- block-allocation entirely and return @c as_random(v), which yields -- a shared, v5-keyed gate_value -- exactly what downstream -- evaluators (rv_moment, AnalyticEvaluator, rv_support) treat -- specially. Saves a key gate and a mulinput per call, and lets -- two calls to @c categorical({1.0}, {v}) collide on the same -- gate_value UUID instead of producing distinct anonymous blocks. DECLARE nb_positive integer := 0; only_idx integer := 0; BEGIN FOR i IN 1..n LOOP IF probs[i] > 0.0 THEN nb_positive := nb_positive + 1; only_idx := i; END IF; END LOOP; IF nb_positive = 1 THEN RETURN provsql.as_random(outcomes[only_idx]); END IF; END; -- Mint the block's key anchor. Probability 1.0 matches the -- joint-table convention: the categorical mass lives on the -- mulinputs, the key just identifies the block. key_token := public.uuid_generate_v4(); PERFORM provsql.create_gate(key_token, 'input'); PERFORM provsql.set_prob(key_token, 1.0); -- One mulinput per positive-probability outcome. Zero-probability -- entries contribute no mass and are skipped: the gate_mixture's -- wire vector is otherwise polluted with no-op leaves. FOR i IN 1..n LOOP pi_i := probs[i]; IF pi_i <= 0.0 THEN CONTINUE; END IF; mul_token := public.uuid_generate_v4(); PERFORM provsql.create_gate(mul_token, 'mulinput', ARRAY[key_token]); PERFORM provsql.set_prob(mul_token, pi_i); PERFORM provsql.set_infos(mul_token, (i - 1)); PERFORM provsql.set_extra(mul_token, outcomes[i]::text); mul_tokens := mul_tokens || mul_token; END LOOP; mix_wires := ARRAY[key_token] || mul_tokens; mix_token := public.uuid_generate_v4(); PERFORM provsql.create_gate(mix_token, 'mixture', mix_wires); RETURN provsql.random_variable_make(mix_token); END $$ LANGUAGE plpgsql STRICT VOLATILE PARALLEL SAFE; /** * @brief Lift a deterministic constant into a random_variable * * Creates a gate_value carrying the constant's text form so * that comparisons against a random_variable column produce * the same circuit shape regardless of whether the operand is an * actual RV or a literal constant. * * Marked IMMUTABLE: the gate UUID is derived deterministically * from the constant via the same v5 convention as provenance_semimod's * inline value gate (concat('value', CAST(c AS VARCHAR))), so * as_random(2) always resolves to the same gate, and any other * code path that already creates a value gate for the same constant * (e.g. provenance_semimod) shares the UUID. * create_gate is idempotent on already-mapped tokens, so * repeat invocations are harmless. * * @sa Wikipedia: Degenerate distribution (Dirac point mass) */ CREATE OR REPLACE FUNCTION as_random(c double precision) RETURNS random_variable AS $$ DECLARE -- Canonicalise -0.0 to +0.0: IEEE 754 defines x + 0.0 = +0.0 for -- both signed zeros, and is identity for finite, NaN, and ±Infinity. -- Without this, as_random(-0.0) and as_random(+0.0) would produce -- different gate UUIDs (their CAST AS VARCHAR text representations -- differ: '-0' vs '0') even though they denote the same constant. c_canon double precision := c + 0.0; c_text varchar := CAST(c_canon AS VARCHAR); token uuid := public.uuid_generate_v5( provsql.uuid_ns_provsql(), concat('value', c_text)); BEGIN PERFORM provsql.create_gate(token, 'value'); PERFORM provsql.set_extra(token, c_text); RETURN provsql.random_variable_make(token); END $$ LANGUAGE plpgsql STRICT IMMUTABLE PARALLEL SAFE; /** * @brief Implicit cast double precision -> random_variable (lifts a * scalar literal to a constant RV). * * Lets users write WHERE reading > 2.5::float8 instead of * WHERE reading > provsql.as_random(2.5); the planner-hook * rewriter then sees a uniform random_variable on both sides. * Sibling casts below cover @c integer and @c numeric literals so * plain WHERE reading > 2 and WHERE reading > 2.5 * also work — PostgreSQL's operator resolution does not chain casts * across more than one step, so each numeric-source type needs its * own direct cast. */ CREATE CAST (double precision AS random_variable) WITH FUNCTION as_random(double precision) AS IMPLICIT; /** @brief @c as_random for @c integer (delegates to the @c float8 form). */ CREATE OR REPLACE FUNCTION as_random(c integer) RETURNS random_variable AS $$ SELECT provsql.as_random(c::double precision); $$ LANGUAGE sql STRICT IMMUTABLE PARALLEL SAFE; /** @brief @c as_random for @c numeric (delegates to the @c float8 form). */ CREATE OR REPLACE FUNCTION as_random(c numeric) RETURNS random_variable AS $$ SELECT provsql.as_random(c::double precision); $$ LANGUAGE sql STRICT IMMUTABLE PARALLEL SAFE; /** @brief Implicit cast integer -> random_variable. */ CREATE CAST (integer AS random_variable) WITH FUNCTION as_random(integer) AS IMPLICIT; /** @brief Implicit cast numeric -> random_variable. */ CREATE CAST (numeric AS random_variable) WITH FUNCTION as_random(numeric) AS IMPLICIT; /** * @name Arithmetic and comparison on random_variable * * Each binary operator below is declared on @c (random_variable, * random_variable) only; mixed shapes such as rv + 2 or * 2.5 > rv resolve through the implicit casts from * @c integer / @c numeric / @c double @c precision to * @c random_variable declared above. This avoids the resolution * ambiguity that would arise if both (rv, numeric) and * (rv, rv) overloads were declared while implicit casts also * existed. * * Arithmetic operators build a @c gate_arith via @c provenance_arith * and return a new @c random_variable wrapping its UUID. * * Comparison operators are placeholders that return @c boolean and * raise if executed -- the @c boolean return type is required so that * PostgreSQL accepts WHERE rv > 2 at parse-analyze. The * planner hook intercepts every such @c OpExpr (matched by * @c opfuncid against @c constants_t::OID_FUNCTION_RV_CMP) and rewrites * it into a @c provenance_cmp call whose UUID is conjoined into the * tuple's @c provsql column via @c provenance_times. Code that needs * a @c gate_cmp UUID directly (without going through the planner hook) * uses the @c rv_cmp_* family below, which call @c provenance_cmp * with the matching float8-comparator OID. * * @{ */ /** @brief @c random_variable + @c random_variable (gate_arith PLUS). */ CREATE OR REPLACE FUNCTION random_variable_plus( a random_variable, b random_variable) RETURNS random_variable AS $$ SELECT provsql.random_variable_make( provsql.provenance_arith( 0, -- PROVSQL_ARITH_PLUS ARRAY[(a)::uuid, (b)::uuid])); $$ LANGUAGE sql IMMUTABLE STRICT PARALLEL SAFE; /** @brief @c random_variable - @c random_variable (gate_arith MINUS). */ CREATE OR REPLACE FUNCTION random_variable_minus( a random_variable, b random_variable) RETURNS random_variable AS $$ SELECT provsql.random_variable_make( provsql.provenance_arith( 2, -- PROVSQL_ARITH_MINUS ARRAY[(a)::uuid, (b)::uuid])); $$ LANGUAGE sql IMMUTABLE STRICT PARALLEL SAFE; /** @brief @c random_variable * @c random_variable (gate_arith TIMES). */ CREATE OR REPLACE FUNCTION random_variable_times( a random_variable, b random_variable) RETURNS random_variable AS $$ SELECT provsql.random_variable_make( provsql.provenance_arith( 1, -- PROVSQL_ARITH_TIMES ARRAY[(a)::uuid, (b)::uuid])); $$ LANGUAGE sql IMMUTABLE STRICT PARALLEL SAFE; /** @brief @c random_variable / @c random_variable (gate_arith DIV). */ CREATE OR REPLACE FUNCTION random_variable_div( a random_variable, b random_variable) RETURNS random_variable AS $$ SELECT provsql.random_variable_make( provsql.provenance_arith( 3, -- PROVSQL_ARITH_DIV ARRAY[(a)::uuid, (b)::uuid])); $$ LANGUAGE sql IMMUTABLE STRICT PARALLEL SAFE; /** @brief Unary @c -random_variable (gate_arith NEG). */ CREATE OR REPLACE FUNCTION random_variable_neg(a random_variable) RETURNS random_variable AS $$ SELECT provsql.random_variable_make( provsql.provenance_arith( 4, -- PROVSQL_ARITH_NEG ARRAY[(a)::uuid])); $$ LANGUAGE sql IMMUTABLE STRICT PARALLEL SAFE; /** * @brief Internal helper: float8-comparator OID for a given symbol. * * Wraps the @c '<sym>(double precision,double precision)'::regoperator * lookup so the per-comparator functions read uniformly. Marked * @c IMMUTABLE because the resolved OID is fixed at catalog level * (the float8 comparators are core PG and never re-installed). */ CREATE OR REPLACE FUNCTION random_variable_cmp_oid(sym text) RETURNS oid AS $$ SELECT (sym || '(double precision,double precision)')::regoperator::oid; $$ LANGUAGE sql IMMUTABLE STRICT PARALLEL SAFE; /* The six @c random_variable_{lt,le,eq,ne,ge,gt} functions below are * boolean placeholders -- they exist only so the @c (rv, rv) operators * can be declared at all (PostgreSQL needs a procedure to bind to the * operator definition, and a procedure returning anything but @c boolean * would be rejected by parse-analyze in a WHERE position). They MUST * NOT be invoked directly: the planner hook in @c src/provsql.c * intercepts every @c OpExpr whose @c opfuncid matches one of these and * rewrites it into a @c provenance_cmp() call against the row's * provenance. If the executor ever reaches one of these, it means the * planner hook was bypassed (e.g. @c provsql.active was off), in which * case raising is the right behaviour. */ /** @brief Placeholder body shared by every random_variable_* * comparison procedure. Raises with a uniform message. */ CREATE OR REPLACE FUNCTION random_variable_cmp_placeholder( a random_variable, b random_variable) RETURNS boolean AS $$ BEGIN RAISE EXCEPTION 'random_variable comparison must be rewritten by the ' 'ProvSQL planner hook (is provsql.active off?)'; END $$ LANGUAGE plpgsql IMMUTABLE STRICT PARALLEL SAFE; CREATE OR REPLACE FUNCTION random_variable_lt( a random_variable, b random_variable) RETURNS boolean AS $$ SELECT provsql.random_variable_cmp_placeholder(a, b); $$ LANGUAGE sql IMMUTABLE STRICT PARALLEL SAFE; CREATE OR REPLACE FUNCTION random_variable_le( a random_variable, b random_variable) RETURNS boolean AS $$ SELECT provsql.random_variable_cmp_placeholder(a, b); $$ LANGUAGE sql IMMUTABLE STRICT PARALLEL SAFE; CREATE OR REPLACE FUNCTION random_variable_eq( a random_variable, b random_variable) RETURNS boolean AS $$ SELECT provsql.random_variable_cmp_placeholder(a, b); $$ LANGUAGE sql IMMUTABLE STRICT PARALLEL SAFE; CREATE OR REPLACE FUNCTION random_variable_ne( a random_variable, b random_variable) RETURNS boolean AS $$ SELECT provsql.random_variable_cmp_placeholder(a, b); $$ LANGUAGE sql IMMUTABLE STRICT PARALLEL SAFE; CREATE OR REPLACE FUNCTION random_variable_ge( a random_variable, b random_variable) RETURNS boolean AS $$ SELECT provsql.random_variable_cmp_placeholder(a, b); $$ LANGUAGE sql IMMUTABLE STRICT PARALLEL SAFE; CREATE OR REPLACE FUNCTION random_variable_gt( a random_variable, b random_variable) RETURNS boolean AS $$ SELECT provsql.random_variable_cmp_placeholder(a, b); $$ LANGUAGE sql IMMUTABLE STRICT PARALLEL SAFE; /* Direct UUID constructors -- used by tests and any caller that wants * a @c gate_cmp without going through the planner hook (e.g. building * a circuit fragment in a SELECT list). Each delegates to * @c provenance_cmp with the matching float8-comparator OID. */ /** @brief Build a @c gate_cmp for a < b and return its UUID. */ CREATE OR REPLACE FUNCTION rv_cmp_lt( a random_variable, b random_variable) RETURNS uuid AS $$ SELECT provsql.provenance_cmp( (a)::uuid, provsql.random_variable_cmp_oid('<'), (b)::uuid); $$ LANGUAGE sql IMMUTABLE STRICT PARALLEL SAFE; /** @brief Build a @c gate_cmp for a ≤ b and return its UUID. */ CREATE OR REPLACE FUNCTION rv_cmp_le( a random_variable, b random_variable) RETURNS uuid AS $$ SELECT provsql.provenance_cmp( (a)::uuid, provsql.random_variable_cmp_oid('<='), (b)::uuid); $$ LANGUAGE sql IMMUTABLE STRICT PARALLEL SAFE; /** @brief Build a @c gate_cmp for a = b and return its UUID. */ CREATE OR REPLACE FUNCTION rv_cmp_eq( a random_variable, b random_variable) RETURNS uuid AS $$ SELECT provsql.provenance_cmp( (a)::uuid, provsql.random_variable_cmp_oid('='), (b)::uuid); $$ LANGUAGE sql IMMUTABLE STRICT PARALLEL SAFE; /** @brief Build a @c gate_cmp for a <> b and return its UUID. */ CREATE OR REPLACE FUNCTION rv_cmp_ne( a random_variable, b random_variable) RETURNS uuid AS $$ SELECT provsql.provenance_cmp( (a)::uuid, provsql.random_variable_cmp_oid('<>'), (b)::uuid); $$ LANGUAGE sql IMMUTABLE STRICT PARALLEL SAFE; /** @brief Build a @c gate_cmp for a ≥ b and return its UUID. */ CREATE OR REPLACE FUNCTION rv_cmp_ge( a random_variable, b random_variable) RETURNS uuid AS $$ SELECT provsql.provenance_cmp( (a)::uuid, provsql.random_variable_cmp_oid('>='), (b)::uuid); $$ LANGUAGE sql IMMUTABLE STRICT PARALLEL SAFE; /** @brief Build a @c gate_cmp for a > b and return its UUID. */ CREATE OR REPLACE FUNCTION rv_cmp_gt( a random_variable, b random_variable) RETURNS uuid AS $$ SELECT provsql.provenance_cmp( (a)::uuid, provsql.random_variable_cmp_oid('>'), (b)::uuid); $$ LANGUAGE sql IMMUTABLE STRICT PARALLEL SAFE; CREATE OPERATOR + ( LEFTARG = random_variable, RIGHTARG = random_variable, PROCEDURE = random_variable_plus, COMMUTATOR = + ); CREATE OPERATOR - ( LEFTARG = random_variable, RIGHTARG = random_variable, PROCEDURE = random_variable_minus ); CREATE OPERATOR * ( LEFTARG = random_variable, RIGHTARG = random_variable, PROCEDURE = random_variable_times, COMMUTATOR = * ); CREATE OPERATOR / ( LEFTARG = random_variable, RIGHTARG = random_variable, PROCEDURE = random_variable_div ); /** @brief Prefix unary minus on @c random_variable. */ CREATE OPERATOR - ( RIGHTARG = random_variable, PROCEDURE = random_variable_neg ); CREATE OPERATOR < ( LEFTARG = random_variable, RIGHTARG = random_variable, PROCEDURE = random_variable_lt, COMMUTATOR = >, NEGATOR = >= ); CREATE OPERATOR <= ( LEFTARG = random_variable, RIGHTARG = random_variable, PROCEDURE = random_variable_le, COMMUTATOR = >=, NEGATOR = > ); CREATE OPERATOR = ( LEFTARG = random_variable, RIGHTARG = random_variable, PROCEDURE = random_variable_eq, COMMUTATOR = =, NEGATOR = <> ); CREATE OPERATOR <> ( LEFTARG = random_variable, RIGHTARG = random_variable, PROCEDURE = random_variable_ne, COMMUTATOR = <>, NEGATOR = = ); CREATE OPERATOR >= ( LEFTARG = random_variable, RIGHTARG = random_variable, PROCEDURE = random_variable_ge, COMMUTATOR = <=, NEGATOR = < ); CREATE OPERATOR > ( LEFTARG = random_variable, RIGHTARG = random_variable, PROCEDURE = random_variable_gt, COMMUTATOR = <, NEGATOR = <= ); /** @} */ /** * @name Aggregates over random_variable * * Phase 1 of the SUM-over-RV story: an overload of the standard * @c sum aggregate that takes a @c random_variable per row and returns * the @c random_variable representing the (provenance-weighted) sum. * Lives in the @c provsql schema so a @c sum(random_variable) call * resolves to it without colliding with the built-in numeric @c sum * overloads in @c pg_catalog. * * Direct calls outside a provenance-tracked query treat each row's * contribution unconditionally (no per-row Boolean selector). When * the planner hook sees a @c provsql.sum @c Aggref over a * provenance-tracked query, it wraps the per-row argument @c x in * provsql.mixture(prov_token, x, provsql.as_random(0)) so the * aggregate's effective semantics become * @f$\mathrm{SUM}(x) = \sum_i \mathbf{1}\{\varphi_i\} \cdot X_i@f$, * the natural extension of semimodule-provenance to RV-valued M. * * The internal state is the array of UUIDs of the per-row mixtures. * The final function builds a single @c gate_arith @c PLUS over them * (or returns @c as_random(0) for an empty group, the additive * identity). Sharing on @c provenance_arith's v5 hash means two * @c sum invocations over the same set of rows collide on the same * gate. * * @{ */ /** * @brief Per-row helper: wrap an RV in @c mixture(prov, rv, as_random(0)). * * Internal helper used by the planner-hook rewriter to lift a * @c sum(random_variable) argument into its provenance-aware form. * Encodes one row's contribution to the SUM as a Bernoulli mixture * over the row's provenance: with probability @c P(prov) the mixture * samples @c rv, otherwise it samples the additive identity * @c as_random(0). Exposed as a regular SQL function so the planner * can construct a @c FuncExpr by name without needing to disambiguate * @c mixture / @c as_random overloads at OID-lookup time. */ CREATE OR REPLACE FUNCTION rv_aggregate_semimod( prov uuid, rv random_variable) RETURNS random_variable AS $$ SELECT provsql.mixture(prov, rv, provsql.as_random(0::double precision)); $$ LANGUAGE sql IMMUTABLE STRICT PARALLEL SAFE; /** * @brief State-transition function for @c sum(random_variable). * * Appends the input RV's UUID to the running array. NULL inputs are * skipped (matching standard SUM semantics). The aggregate's INITCOND * is @c '{}' so the FINALFUNC always runs even on an empty group, which * is what lets us return @c as_random(0) (the additive identity) for * an empty SUM rather than NULL. */ CREATE OR REPLACE FUNCTION sum_rv_sfunc( state uuid[], rv random_variable) RETURNS uuid[] AS $$ SELECT CASE WHEN rv IS NULL THEN state ELSE array_append(state, (rv)::uuid) END; $$ LANGUAGE sql IMMUTABLE PARALLEL SAFE; /** * @brief Final function for @c sum(random_variable): build a * @c gate_arith PLUS root. * * Empty group (@c state = @c '{}'): return @c as_random(0), the * additive identity, so SUM over zero rows is the deterministic * scalar 0 -- matches the agg_token convention in @c agg_raw_moment. * * Singleton group: return the single child directly without minting a * useless single-child @c gate_arith. * * Otherwise: build @c gate_arith(PLUS, state) via @c provenance_arith. */ CREATE OR REPLACE FUNCTION sum_rv_ffunc(state uuid[]) RETURNS random_variable AS $$ DECLARE arith_token uuid; BEGIN IF state IS NULL OR array_length(state, 1) IS NULL THEN RETURN provsql.as_random(0::double precision); END IF; IF array_length(state, 1) = 1 THEN RETURN provsql.random_variable_make(state[1]); END IF; arith_token := provsql.provenance_arith(0, state); -- 0 = PROVSQL_ARITH_PLUS RETURN provsql.random_variable_make(arith_token); END $$ LANGUAGE plpgsql IMMUTABLE PARALLEL SAFE; CREATE AGGREGATE sum(random_variable) ( SFUNC = sum_rv_sfunc, STYPE = uuid[], INITCOND = '{}', FINALFUNC = sum_rv_ffunc ); /** * @brief Final function for @c avg(random_variable). * * Builds the natural lift of @c "AVG = SUM / COUNT" into the * @c random_variable algebra: * @f[ * \mathrm{AVG}(x) \;=\; \frac{\sum_i \mathbf{1}\{\varphi_i\} \cdot X_i} * {\sum_i \mathbf{1}\{\varphi_i\}} * @f] * realised as @c gate_arith(DIV, num, denom) where @c num is the * @c sum(random_variable) gate over the per-row mixtures and @c denom * is the @c sum(random_variable) gate over the same provenance gates * weighted by a per-row @c as_random(1) -- exactly the SQL pattern * "@c sum(x) @c / @c sum(as_random(1))" emitted as a single * @c random_variable token. * * Reuses @c sum_rv_sfunc as the state-transition function so the * array of per-row UUIDs is collected identically to * @c sum(random_variable). In a provenance-tracked query the * planner-hook rewriter routes RV-returning aggregates through * @c make_rv_aggregate_expression, which wraps each per-row argument * in @c mixture(prov_i, x_i, as_random(0)); the FFUNC then recovers * @c prov_i from each mixture's first child to construct the matching * @c mixture(prov_i, as_random(1), as_random(0)) for the denominator. * Outside a tracked query the per-row UUIDs are plain RV roots, in * which case each row contributes an unconditional @c as_random(1) * to the denominator -- the natural extension of "no provenance = * every row counts" used elsewhere in the extension. * * Empty group: returns @c NULL, matching the standard SQL @c AVG * convention. This differs from @c sum(random_variable), which * returns the additive identity @c as_random(0) for an empty group; * for AVG the multiplicative identity is not the right answer and * the caller has no way to disambiguate "0 rows" from "rows that * sum to 0". */ CREATE OR REPLACE FUNCTION avg_rv_ffunc(state uuid[]) RETURNS random_variable AS $$ DECLARE n integer; i integer; num_token uuid; denom_token uuid; denom_state uuid[] := '{}'; one_uuid uuid; gtype provsql.provenance_gate; children uuid[]; prov_i uuid; BEGIN IF state IS NULL THEN RETURN NULL; END IF; n := array_length(state, 1); IF n IS NULL THEN RETURN NULL; END IF; one_uuid := ( provsql.as_random(1::double precision))::uuid; FOR i IN 1..n LOOP gtype := provsql.get_gate_type(state[i]); IF gtype = 'mixture'::provsql.provenance_gate THEN children := provsql.get_children(state[i]); prov_i := children[1]; denom_state := array_append( denom_state, ( provsql.rv_aggregate_semimod( prov_i, provsql.as_random(1::double precision)))::uuid); ELSE denom_state := array_append(denom_state, one_uuid); END IF; END LOOP; IF n = 1 THEN num_token := state[1]; denom_token := denom_state[1]; ELSE num_token := provsql.provenance_arith(0, state); -- 0 = PLUS denom_token := provsql.provenance_arith(0, denom_state); -- 0 = PLUS END IF; RETURN provsql.random_variable_make( provsql.provenance_arith( 3, -- 3 = PROVSQL_ARITH_DIV ARRAY[num_token, denom_token])); END $$ LANGUAGE plpgsql IMMUTABLE PARALLEL SAFE; CREATE AGGREGATE avg(random_variable) ( SFUNC = sum_rv_sfunc, STYPE = uuid[], INITCOND = '{}', FINALFUNC = avg_rv_ffunc ); /** * @brief Final function for @c product(random_variable). * * Multiplicative analogue of @c sum(random_variable): * @f[ * \mathrm{PRODUCT}(x) \;=\; \prod_i \big(\mathbf{1}\{\varphi_i\} \cdot X_i * + \mathbf{1}\{\neg\varphi_i\} \cdot 1\big) * \;=\; \prod_{i : \varphi_i} X_i * @f] * realised as @c gate_arith(TIMES, mixtures) over per-row contributions * whose @em else-branch is @c as_random(1) (the multiplicative * identity), so rows whose provenance is false contribute @c 1 to the * product instead of @c 0. * * The C-side wrap shared with @c sum / @c avg always builds * @c mixture(prov_i, X_i, as_random(0)); the PRODUCT FFUNC patches each * mixture's else-branch to @c as_random(1) by reconstructing the * mixture with the corrected else-arg. Going through * @c provsql.mixture (rather than @c create_gate directly) keeps the * gate v5-hash consistent with any other mixture sharing the same * @c (prov_i, X_i, as_random(1)) triple. * * Reuses @c sum_rv_sfunc as the state-transition function. Empty * group: returns the multiplicative identity @c as_random(1) -- the * natural counterpart to @c sum(random_variable)'s empty-group * @c as_random(0). * * Singleton group: returns the single patched child directly without * minting a useless single-child @c gate_arith TIMES root. * * Direct (untracked) call: state entries are raw RV uuids rather than * mixtures; pass them through unchanged so PRODUCT degenerates to the * straight RV product over all rows, the natural "no provenance = * every row counts" behaviour. */ CREATE OR REPLACE FUNCTION product_rv_ffunc(state uuid[]) RETURNS random_variable AS $$ DECLARE n integer; i integer; prod_state uuid[] := '{}'; one_rv provsql.random_variable; gtype provsql.provenance_gate; children uuid[]; prov_i uuid; x_uuid uuid; BEGIN one_rv := provsql.as_random(1::double precision); IF state IS NULL THEN RETURN one_rv; END IF; n := array_length(state, 1); IF n IS NULL THEN RETURN one_rv; END IF; FOR i IN 1..n LOOP gtype := provsql.get_gate_type(state[i]); IF gtype = 'mixture'::provsql.provenance_gate THEN children := provsql.get_children(state[i]); prov_i := children[1]; x_uuid := children[2]; prod_state := array_append( prod_state, ( provsql.mixture( prov_i, provsql.random_variable_make(x_uuid), one_rv))::uuid); ELSE prod_state := array_append(prod_state, state[i]); END IF; END LOOP; IF n = 1 THEN RETURN provsql.random_variable_make(prod_state[1]); END IF; RETURN provsql.random_variable_make( provsql.provenance_arith(1, prod_state)); -- 1 = PROVSQL_ARITH_TIMES END $$ LANGUAGE plpgsql IMMUTABLE PARALLEL SAFE; CREATE AGGREGATE product(random_variable) ( SFUNC = sum_rv_sfunc, STYPE = uuid[], INITCOND = '{}', FINALFUNC = product_rv_ffunc ); /** @} */ /** @} */ /** @defgroup aggregate_provenance Aggregate provenance * Functions for building and evaluating aggregate (GROUP BY) provenance, * including the δ-semiring operator and semimodule multiplication. * @{ */ /** * @brief Create a δ-semiring gate wrapping a provenance token * * Used internally for aggregate provenance. Returns the token unchanged * if it is gate_zero() or gate_one(), and gate_one() if the token is NULL. */ CREATE OR REPLACE FUNCTION provenance_delta (token UUID) RETURNS UUID AS $$ DECLARE delta_token uuid; BEGIN IF token = gate_zero() OR token = gate_one() THEN return token; END IF; IF token IS NULL THEN return gate_one(); END IF; delta_token:=uuid_generate_v5(uuid_ns_provsql(),concat('delta',token)); PERFORM create_gate(delta_token,'delta',ARRAY[token::uuid]); RETURN delta_token; END $$ LANGUAGE plpgsql SET search_path=provsql,pg_temp,public SECURITY DEFINER PARALLEL SAFE; /** * @brief Build an aggregate provenance gate from grouped tokens * * Called internally by the query rewriter for GROUP BY queries. * Creates an agg gate linking all contributing tokens and records * the aggregate function OID and the computed scalar value. * * @param aggfnoid OID of the SQL aggregate function * @param aggtype OID of the aggregate result type * @param val computed aggregate value * @param tokens array of provenance tokens being aggregated */ CREATE OR REPLACE FUNCTION provenance_aggregate( aggfnoid integer, aggtype integer, val anyelement, tokens uuid[]) RETURNS agg_token AS $$ DECLARE c INTEGER; agg_tok uuid; agg_val varchar; BEGIN c:=COALESCE(array_length(tokens, 1), 0); agg_val = CAST(val as VARCHAR); IF c = 0 THEN agg_tok := gate_zero(); ELSE -- aggfnoid must be part of the UUID: SUM(id) and AVG(id) over the -- same children would otherwise collapse to a single gate, and -- their concurrent set_infos calls would overwrite each other's -- aggregation operator (resulting in the wrong agg_kind being -- read by provsql_having under cross-backend contention). agg_tok := uuid_generate_v5( uuid_ns_provsql(), concat('agg',aggfnoid,tokens)); PERFORM create_gate(agg_tok, 'agg', tokens); PERFORM set_infos(agg_tok, aggfnoid, aggtype); PERFORM set_extra(agg_tok, agg_val); END IF; RETURN '( '||agg_tok||' , '||agg_val||' )'; END $$ LANGUAGE plpgsql PARALLEL SAFE SET search_path=provsql,pg_temp,public SECURITY DEFINER; /** * @brief Create a semimodule scalar multiplication gate * * Pairs a scalar value with a provenance token, used internally by * the query rewriter for aggregate provenance. * * @param val the scalar value * @param token the provenance token to multiply */ CREATE OR REPLACE FUNCTION provenance_semimod(val anyelement, token UUID) RETURNS UUID AS $$ DECLARE semimod_token uuid; value_token uuid; BEGIN SELECT uuid_generate_v5(uuid_ns_provsql(),concat('value',CAST(val AS VARCHAR))) INTO value_token; SELECT uuid_generate_v5(uuid_ns_provsql(),concat('semimod',value_token,token)) INTO semimod_token; --create value gates PERFORM create_gate(value_token,'value'); PERFORM set_extra(value_token, CAST(val AS VARCHAR)); --create semimod gate PERFORM create_gate(semimod_token,'semimod',ARRAY[token::uuid,value_token]); RETURN semimod_token; END $$ LANGUAGE plpgsql PARALLEL SAFE SET search_path=provsql,pg_temp,public SECURITY DEFINER; /** @} */ /** @defgroup probability Probability and Shapley values * Functions for computing probabilities, expected values, and * game-theoretic contribution measures (Shapley/Banzhaf values) * from provenance circuits. * @{ */ /** * @brief Compute the probability of a provenance token * * Compiles the provenance circuit to d-DNNF and evaluates the * probability. The compilation method can be selected explicitly. * * @param token provenance token to evaluate * @param method knowledge compilation method (NULL for default) * @param arguments additional arguments for the method */ CREATE OR REPLACE FUNCTION probability_evaluate( token UUID, method text = NULL, arguments text = NULL) RETURNS DOUBLE PRECISION AS 'provsql','probability_evaluate' LANGUAGE C STABLE; /** * @brief Compute the expected value of a probabilistic scalar * * Computes E[input | prov] for either an @c agg_token (discrete * SUM/MIN/MAX aggregation over Boolean-input gate_agg circuits, with * @c prov as the Boolean conditioning event) or a @c random_variable * (continuous distribution, traversed by the analytical / MC * evaluator from @c Expectation.cpp). * * Implementation: thin wrapper over @c moment(input, 1, prov, method, * arguments). Both branches converge on the same machinery; the * agg_token side computes E[X] as the @f$k=1@f$ instance of the * @f$n^k@f$-tuple enumeration in @c agg_raw_moment, the * random_variable side calls @c compute_expectation through * @c rv_moment. * * @param input aggregate expression or random variable to compute E[·] of * @param prov provenance condition (defaults to gate_one(), i.e., unconditional) * @param method knowledge compilation method (agg_token path only) * @param arguments additional arguments for the method (agg_token path only) */ CREATE OR REPLACE FUNCTION expected( input ANYELEMENT, prov UUID = gate_one(), method text = NULL, arguments text = NULL) RETURNS DOUBLE PRECISION AS $$ SELECT moment(input, 1, prov, method, arguments); $$ LANGUAGE sql PARALLEL SAFE STABLE SET search_path=provsql SECURITY DEFINER; /** * @brief Internal: shared C entry point for variance / moment / central_moment. * * The @c expected() SQL function reaches the Expectation evaluator * through @c provenance_evaluate_compiled(..., 'expectation', ...). * The variance / raw-moment / central-moment SQL functions need an * extra @p k integer argument that does not fit that dispatcher's * signature, so they go through this dedicated entry point. Returns * E[X^k] when @p central is FALSE, or E[(X - E[X])^k] when TRUE. */ CREATE OR REPLACE FUNCTION rv_moment( token uuid, k integer, central boolean, prov uuid DEFAULT gate_one()) RETURNS double precision AS 'provsql','rv_moment' LANGUAGE C IMMUTABLE STRICT PARALLEL SAFE; /** * @brief Compute the raw moment E[X^k | prov] of an agg_token aggregate * * Sister of @c expected() for the agg_token side of the polymorphic * @c moment / @c variance / @c central_moment dispatch. Supports the * same aggregation functions as @c expected: SUM (which COUNT * normalises to at the gate level via @c Aggregation.cpp:322), MIN, * and MAX. * * Strategy: * - SUM: with X = Σᵢ Iᵢ·vᵢ (Iᵢ the per-row inclusion indicator, * vᵢ the row's value), expanding X^k and taking expectation gives * @f$E[X^k] = \sum_{(i_1,\ldots,i_k) \in \{1..n\}^k} v_{i_1}\cdots v_{i_k} * \cdot P(\bigwedge_{i \in \text{distinct}(i_1..i_k)} I_i)@f$. * We enumerate the @f$n^k@f$ tuples, conjoin the distinct inclusion * tokens (and @p prov when conditioning), and evaluate the * probability via @c probability_evaluate. * - MIN / MAX: replace @c v with @c v^k in the rank-based * enumeration that @c expected already uses; @c MAX is handled by * sign-flipping per the existing trick (negate vs. rerank), with * the outer multiplier becoming @f$(-1)^k@f$ instead of just @f$-1@f$. * * Cost: SUM is @f$O(n^k)@f$ probability evaluations -- tractable for * small @p k or small @p n; for larger sizes, prefer reaching for the * sampler. MIN / MAX stay linear in @p n. */ CREATE OR REPLACE FUNCTION agg_raw_moment( token agg_token, k integer, prov UUID = gate_one(), method text = NULL, arguments text = NULL) RETURNS DOUBLE PRECISION AS $$ DECLARE aggregation_function VARCHAR; child_pairs uuid[]; pair_children uuid[]; n integer; i integer; j integer; vals float8[]; toks uuid[]; total float8; total_probability float8; tup integer[]; d integer; prod_v float8; distinct_tok uuid[]; conj_token uuid; prob float8; sign_max float8; BEGIN IF token IS NULL OR k IS NULL THEN RETURN NULL; END IF; IF k < 0 THEN RAISE EXCEPTION 'agg_raw_moment(): k must be non-negative (got %)', k; END IF; IF get_gate_type(token) <> 'agg' THEN RAISE EXCEPTION USING MESSAGE='Wrong gate type for agg_raw_moment computation'; END IF; IF k = 0 THEN RETURN 1; END IF; SELECT pp.proname::varchar FROM pg_proc pp WHERE oid=(get_infos(token)).info1 INTO aggregation_function; child_pairs := get_children(token); n := COALESCE(array_length(child_pairs, 1), 0); IF aggregation_function = 'sum' THEN -- Trivial empty aggregation: SUM = 0, so SUM^k = 0 for k >= 1. -- Note: agg_token semantics treat the "no row included" world as -- SUM = 0, so this stays consistent with k = 1 (= expected()). IF n = 0 THEN RETURN 0; END IF; -- Extract per-child token + value arrays. vals := ARRAY[]::float8[]; toks := ARRAY[]::uuid[]; FOR i IN 1..n LOOP pair_children := get_children(child_pairs[i]); toks := toks || pair_children[1]; vals := vals || CAST(get_extra(pair_children[2]) AS float8); END LOOP; -- Enumerate all k-tuples (i_1, ..., i_k) in {1..n}^k. tup is the -- current tuple; we step through them in lexicographic order. total := 0; tup := array_fill(1, ARRAY[k]); LOOP prod_v := 1; FOR j IN 1..k LOOP prod_v := prod_v * vals[tup[j]]; END LOOP; SELECT array_agg(DISTINCT toks[idx]) INTO distinct_tok FROM unnest(tup) AS idx; IF prov <> gate_one() THEN distinct_tok := distinct_tok || prov; END IF; conj_token := provenance_times(VARIADIC distinct_tok); prob := probability_evaluate(conj_token, method, arguments); total := total + prod_v * prob; d := k; WHILE d >= 1 AND tup[d] = n LOOP tup[d] := 1; d := d - 1; END LOOP; EXIT WHEN d = 0; tup[d] := tup[d] + 1; END LOOP; ELSIF aggregation_function = 'min' OR aggregation_function = 'max' THEN -- Rank enumeration: per distinct value v, P(MIN = v) is the -- probability that some t_i with v_i=v is true and all t_j with -- smaller v are false. For MAX we negate values so the same -- "smaller-than" rank logic computes MIN-of-negated, then flip. -- The outer multiplier picks up the right sign for the k-th moment -- of MAX: E[MAX^k] = (-1)^k * E[MIN(-v)^k], so sign_max = (-1)^k. sign_max := CASE WHEN aggregation_function = 'max' THEN power(-1::float8, k) ELSE 1 END; -- ±Infinity sink: positive probability of "no row included" makes -- MIN = +Inf and MAX = -Inf. For MIN^k that's +Inf for any k>=1; -- for MAX^k it's (-1)^k * (+Inf) = ±Inf depending on k's parity. WITH tok_value AS ( SELECT (get_children(c))[1] AS tok, (CASE WHEN aggregation_function='max' THEN -1 ELSE 1 END) * CAST(get_extra((get_children(c))[2]) AS DOUBLE PRECISION) AS v FROM UNNEST(child_pairs) AS c ) SELECT probability_evaluate(provenance_monus(prov, provenance_plus(ARRAY_AGG(tok)))) FROM tok_value INTO total_probability; IF total_probability > epsilon() THEN total := sign_max * 'Infinity'::float8; ELSE WITH tok_value AS ( SELECT (get_children(c))[1] AS tok, (CASE WHEN aggregation_function='max' THEN -1 ELSE 1 END) * CAST(get_extra((get_children(c))[2]) AS DOUBLE PRECISION) AS v FROM UNNEST(child_pairs) AS c ) SELECT sign_max * SUM(p * power(v, k)) FROM ( SELECT t1.v AS v, probability_evaluate( provenance_monus(provenance_plus(ARRAY_AGG(t1.tok)), provenance_plus(ARRAY_AGG(t2.tok))), method, arguments) AS p FROM tok_value t1 LEFT OUTER JOIN tok_value t2 ON t1.v > t2.v GROUP BY t1.v) tmp INTO total; END IF; ELSE RAISE EXCEPTION USING MESSAGE= 'Cannot compute moment for aggregation function ' || aggregation_function; END IF; -- Conditional normalisation: E[X^k · 1_A] / P(A) = E[X^k | A]. IF prov <> gate_one() AND total <> 0 AND total <> 'Infinity'::float8 AND total <> '-Infinity'::float8 THEN total := total / probability_evaluate(prov, method, arguments); END IF; RETURN total; END $$ LANGUAGE plpgsql PARALLEL SAFE SET search_path=provsql SECURITY DEFINER; /** * @brief Compute the variance Var[X | prov] of a probabilistic scalar * * Polymorphic dispatcher that mirrors @c expected: @c random_variable * inputs go through the analytical / MC evaluator * (@c rv_moment(uuid, 2, true)); @c agg_token inputs go through the * @c agg_raw_moment helper, computing * @f$\mathrm{Var}[X|A] = E[X^2|A] - E[X|A]^2@f$. Conditioning on * @c prov is supported for @c agg_token (matching @c expected) but * not yet for @c random_variable. */ CREATE OR REPLACE FUNCTION variance( input ANYELEMENT, prov UUID = gate_one(), method text = NULL, arguments text = NULL) RETURNS DOUBLE PRECISION AS $$ DECLARE m1 float8; m2 float8; BEGIN IF pg_typeof(input) = 'random_variable'::regtype THEN IF input IS NULL THEN RETURN NULL; END IF; -- Conditioning on prov is handled inside rv_moment: when prov -- resolves to gate_one() (the default, or load-time -- simplification of any always-true sub-circuit) the -- unconditional analytical path runs unchanged; otherwise the -- joint-circuit loader unifies shared gate_rv leaves between -- input and prov, and the conditional path runs either -- truncated-distribution closed form or MC rejection. RETURN provsql.rv_moment( (input::random_variable)::uuid, 2, true, prov); END IF; IF pg_typeof(input) = 'agg_token'::regtype THEN IF input IS NULL THEN RETURN NULL; END IF; m1 := agg_raw_moment(input::agg_token, 1, prov, method, arguments); m2 := agg_raw_moment(input::agg_token, 2, prov, method, arguments); IF m1 IS NULL OR m2 IS NULL THEN RETURN NULL; END IF; RETURN m2 - m1 * m1; END IF; RAISE EXCEPTION 'variance() is not yet supported for input type %', pg_typeof(input); END $$ LANGUAGE plpgsql PARALLEL SAFE SET search_path=provsql SECURITY DEFINER; /** * @brief Compute the raw moment E[X^k | prov] of a probabilistic scalar * * @c k must be a non-negative integer. @c k = 0 returns 1; @c k = 1 * is equivalent to @c expected(input). Polymorphic dispatcher: routes * @c random_variable through @c rv_moment (analytical / MC) and * @c agg_token through @c agg_raw_moment (SUM via tuple enumeration, * MIN / MAX via rank enumeration). */ CREATE OR REPLACE FUNCTION moment( input ANYELEMENT, k integer, prov UUID = gate_one(), method text = NULL, arguments text = NULL) RETURNS DOUBLE PRECISION AS $$ BEGIN IF pg_typeof(input) = 'random_variable'::regtype THEN IF input IS NULL OR k IS NULL THEN RETURN NULL; END IF; -- See variance() above: rv_moment handles the conditional/unconditional -- dispatch internally based on the resolved prov gate type. RETURN provsql.rv_moment( (input::random_variable)::uuid, k, false, prov); END IF; IF pg_typeof(input) = 'agg_token'::regtype THEN RETURN agg_raw_moment(input::agg_token, k, prov, method, arguments); END IF; RAISE EXCEPTION 'moment() is not yet supported for input type %', pg_typeof(input); END $$ LANGUAGE plpgsql PARALLEL SAFE SET search_path=provsql SECURITY DEFINER; /** * @brief Internal: rv-side support computation * * Lifts @c provsql::compute_support out of @c RangeCheck.cpp -- the * same interval-arithmetic propagation @c runRangeCheck uses to * decide @c gate_cmps. Returns @c [-Infinity, +Infinity] when the * tightest bound is the conservative all-real interval (e.g. for a * normal RV, or any sub-circuit that mixes a normal in). */ CREATE OR REPLACE FUNCTION rv_support( token uuid, prov uuid DEFAULT gate_one(), OUT lo float8, OUT hi float8) AS 'provsql','rv_support' LANGUAGE C IMMUTABLE STRICT PARALLEL SAFE; /** * @brief Compute the support interval @c [lo, hi] of a probabilistic * (or deterministic) scalar * * Polymorphic dispatcher mirroring @c expected / @c variance / * @c moment / @c central_moment, with two extra "free" branches: * * - Plain numeric (@c smallint / @c integer / @c bigint / * @c numeric / @c real / @c double @c precision): degenerate * point support @f$[c, c]@f$. Lets callers ask for the support * of a literal without round-tripping through @c as_random. * - @c random_variable / bare @c uuid (any provenance gate * token, including the implicit @c random_variable @c -> @c uuid * cast): routes to @c rv_support, which propagates distribution * supports (uniform exact, exponential @c [0,+∞), normal * @c (-∞,+∞)) through @c gate_arith via interval arithmetic. * @c gate_value gives the same @f$[c, c]@f$ point support as the * numeric branch; any non-scalar gate (Boolean gates, aggregates, * ...) safely falls back to the conservative all-real interval * without raising. Conditioning on @c prov is not yet supported. * * - @c agg_token: closed-form per aggregation function: * - @c SUM : @f$[\sum_i \min(0,v_i), \sum_i \max(0,v_i)]@f$ * (every row is independently in or out of the included set; the * extreme SUMs are reached by including only positive or only * negative-valued rows). * - @c MIN : @f$[\min_i v_i, \max_i v_i]@f$ in the non-empty * subsets, plus @c +Infinity if the empty subset has positive * probability under @c prov. * - @c MAX : symmetric -- @c -Infinity if empty has positive * probability under @c prov, otherwise @c min_i v_i; @c hi is * always @c max_i v_i. * * Other aggregation functions raise. * * Returns the composite record @c (lo, hi) via the function's * @c OUT parameters, with @c -Infinity / @c +Infinity marking * unbounded ends. */ CREATE OR REPLACE FUNCTION support( input ANYELEMENT, prov UUID = gate_one(), method text = NULL, arguments text = NULL, OUT lo float8, OUT hi float8) AS $$ DECLARE aggregation_function VARCHAR; child_pairs uuid[]; values_arr float8[]; total_probability float8; BEGIN IF input IS NULL THEN lo := NULL; hi := NULL; RETURN; END IF; -- Plain numeric: degenerate point support. Lets `support(2.5)` / -- `support(42)` / etc. return (2.5, 2.5) without making the user -- wrap in `as_random`. IF pg_typeof(input) IN ( 'smallint'::regtype, 'integer'::regtype, 'bigint'::regtype, 'numeric'::regtype, 'real'::regtype, 'double precision'::regtype) THEN lo := input::double precision; hi := input::double precision; RETURN; END IF; -- random_variable has an IMPLICIT cast to uuid, so a single -- rv_support call covers both shapes. rv_support handles -- gate_value (point), gate_rv (distribution), gate_arith -- (propagated), and falls back to the conservative all-real -- interval for any other gate kind. Conditioning on prov is not -- supported (would require restricting the underlying joint -- distribution by the indicator of prov, which has no closed form -- for the basic distributions we ship). IF pg_typeof(input) IN ('random_variable'::regtype, 'uuid'::regtype) THEN -- Conditional support: rv_support folds the AND-conjunct interval -- constraints from prov into the unconditional support. When -- prov is gate_one() the unconditional support is returned -- unchanged. SELECT r.lo, r.hi INTO lo, hi FROM provsql.rv_support(input::uuid, prov) r; RETURN; END IF; IF pg_typeof(input) = 'agg_token'::regtype THEN IF get_gate_type(input::agg_token) <> 'agg' THEN RAISE EXCEPTION USING MESSAGE='Wrong gate type for support computation'; END IF; SELECT pp.proname::varchar FROM pg_proc pp WHERE oid=(get_infos(input::agg_token)).info1 INTO aggregation_function; child_pairs := get_children(input::agg_token); IF aggregation_function = 'sum' THEN -- Empty agg_token: SUM is identically 0. IF COALESCE(array_length(child_pairs, 1), 0) = 0 THEN lo := 0; hi := 0; RETURN; END IF; SELECT sum(LEAST(v, 0::float8)), sum(GREATEST(v, 0::float8)) INTO lo, hi FROM (SELECT CAST(get_extra((get_children(c))[2]) AS float8) AS v FROM unnest(child_pairs) AS c) sub; ELSIF aggregation_function = 'min' OR aggregation_function = 'max' THEN -- Empty agg_token: MIN = +Infinity, MAX = -Infinity (the -- empty-set conventions used by `expected`). IF COALESCE(array_length(child_pairs, 1), 0) = 0 THEN IF aggregation_function = 'min' THEN lo := 'Infinity'::float8; hi := 'Infinity'::float8; ELSE lo := '-Infinity'::float8; hi := '-Infinity'::float8; END IF; RETURN; END IF; WITH tok_value AS ( SELECT (get_children(c))[1] AS tok, CAST(get_extra((get_children(c))[2]) AS float8) AS v FROM UNNEST(child_pairs) AS c ) SELECT min(v), max(v), probability_evaluate( provenance_monus(prov, provenance_plus(ARRAY_AGG(tok))), method, arguments) INTO lo, hi, total_probability FROM tok_value; IF total_probability > epsilon() THEN IF aggregation_function = 'min' THEN hi := 'Infinity'::float8; ELSE lo := '-Infinity'::float8; END IF; END IF; ELSE RAISE EXCEPTION USING MESSAGE= 'Cannot compute support for aggregation function ' || aggregation_function; END IF; RETURN; END IF; RAISE EXCEPTION 'support() is not yet supported for input type %', pg_typeof(input); END $$ LANGUAGE plpgsql PARALLEL SAFE SET search_path=provsql SECURITY DEFINER; /** * @brief Compute the central moment E[(X - E[X|prov])^k | prov] * * @c k = 0 returns 1; @c k = 1 returns 0; @c k = 2 is equivalent to * @c variance(input, prov, ...). Polymorphic dispatcher: routes * @c random_variable through @c rv_moment, and @c agg_token through * the binomial expansion * @f$E[(X-\mu)^k|A] = \sum_{i=0}^{k} \binom{k}{i} (-\mu)^{k-i} E[X^i|A]@f$ * with @f$\mu = E[X|A]@f$, where each @f$E[X^i|A]@f$ comes from * @c agg_raw_moment. */ CREATE OR REPLACE FUNCTION central_moment( input ANYELEMENT, k integer, prov UUID = gate_one(), method text = NULL, arguments text = NULL) RETURNS DOUBLE PRECISION AS $$ DECLARE mu float8; total float8; i integer; raw_i float8; binom float8; -- iterative binomial coefficient C(k, i) k_double float8; BEGIN IF pg_typeof(input) = 'random_variable'::regtype THEN IF input IS NULL OR k IS NULL THEN RETURN NULL; END IF; -- See variance() above: rv_moment handles the conditional/unconditional -- dispatch internally based on the resolved prov gate type. RETURN provsql.rv_moment( (input::random_variable)::uuid, k, true, prov); END IF; IF pg_typeof(input) = 'agg_token'::regtype THEN IF input IS NULL OR k IS NULL THEN RETURN NULL; END IF; IF k < 0 THEN RAISE EXCEPTION 'central_moment(): k must be non-negative (got %)', k; END IF; IF k = 0 THEN RETURN 1; END IF; IF k = 1 THEN RETURN 0; END IF; mu := agg_raw_moment(input::agg_token, 1, prov, method, arguments); IF mu IS NULL THEN RETURN NULL; END IF; -- mu may be ±Infinity for empty MIN / MAX with positive empty -- probability; central_moment is undefined in that case. IF mu = 'Infinity'::float8 OR mu = '-Infinity'::float8 THEN RETURN mu; END IF; total := 0; binom := 1; -- C(k, 0) k_double := k; FOR i IN 0..k LOOP raw_i := agg_raw_moment(input::agg_token, i, prov, method, arguments); IF raw_i IS NULL THEN RETURN NULL; END IF; total := total + binom * power(-mu, k - i) * raw_i; -- C(k, i+1) = C(k, i) * (k - i) / (i + 1) IF i < k THEN binom := binom * (k_double - i) / (i + 1); END IF; END LOOP; RETURN total; END IF; RAISE EXCEPTION 'central_moment() is not yet supported for input type %', pg_typeof(input); END $$ LANGUAGE plpgsql PARALLEL SAFE SET search_path=provsql SECURITY DEFINER; /** * @brief Compute the Shapley value of an input variable * * Measures the contribution of a specific input variable to the * truth of a provenance expression, using game-theoretic Shapley values. * * @param token provenance token to evaluate * @param variable UUID of the input variable * @param method knowledge compilation method * @param arguments additional arguments for the method * @param banzhaf if true, compute the Banzhaf value instead */ CREATE OR REPLACE FUNCTION shapley( token UUID, variable UUID, method text = NULL, arguments text = NULL, banzhaf BOOLEAN = 'f') RETURNS DOUBLE PRECISION AS 'provsql','shapley' LANGUAGE C STABLE; /** @brief Compute Shapley values for all input variables at once */ CREATE OR REPLACE FUNCTION shapley_all_vars( IN token UUID, IN method text = NULL, IN arguments text = NULL, IN banzhaf BOOLEAN = 'f', OUT variable UUID, OUT value DOUBLE PRECISION) RETURNS SETOF record AS 'provsql', 'shapley_all_vars' LANGUAGE C STABLE; /** @brief Compute the Banzhaf power index of an input variable */ CREATE OR REPLACE FUNCTION banzhaf( token UUID, variable UUID, method text = NULL, arguments text = NULL) RETURNS DOUBLE PRECISION AS $$ SELECT provsql.shapley(token, variable, method, arguments, 't') $$ LANGUAGE SQL; /** @brief Compute Banzhaf power indices for all input variables at once */ CREATE OR REPLACE FUNCTION banzhaf_all_vars( IN token UUID, IN method text = NULL, IN arguments text = NULL, OUT variable UUID, OUT value DOUBLE PRECISION) RETURNS SETOF record AS $$ SELECT * FROM provsql.shapley_all_vars(token, method, arguments, 't') $$ LANGUAGE SQL; /** @} */ /** @defgroup provenance_output Provenance output * Functions for visualizing and exporting provenance circuits * in various formats. * @{ */ /** * @brief Return a DOT or text visualization of the provenance circuit * * @param token root provenance token * @param token2desc mapping table for gate descriptions * @param dbg debug level (0 = normal) */ CREATE OR REPLACE FUNCTION view_circuit( token UUID, token2desc regclass, dbg int = 0) RETURNS TEXT AS 'provsql','view_circuit' LANGUAGE C; /** * @brief Return an XML representation of the provenance circuit * * @param token root provenance token * @param token2desc optional mapping table for gate descriptions */ CREATE OR REPLACE FUNCTION to_provxml( token UUID, token2desc regclass = NULL) RETURNS TEXT AS 'provsql','to_provxml' LANGUAGE C; /** @brief Return the provenance token of the current query result tuple */ CREATE OR REPLACE FUNCTION provenance() RETURNS UUID AS 'provsql', 'provenance' LANGUAGE C; /** * @brief Compute where-provenance for a result tuple * * Returns a text representation showing which input columns * contributed to each output column. */ CREATE OR REPLACE FUNCTION where_provenance(token UUID) RETURNS text AS 'provsql','where_provenance' LANGUAGE C; /** @} */ /** @defgroup circuit_init Circuit initialization * Functions and statements executed at extension load time to * reset internal caches and create the constant zero/one gates. * @{ */ /** @brief Reset the internal cache of OID constants used by the query rewriter */ CREATE OR REPLACE FUNCTION reset_constants_cache() RETURNS void AS 'provsql', 'reset_constants_cache' LANGUAGE C; SELECT reset_constants_cache(); SELECT create_gate(gate_zero(), 'zero'); SELECT create_gate(gate_one(), 'one'); /** @} */ /** @brief Types of update operations tracked for temporal provenance */ CREATE TYPE query_type_enum AS ENUM ( 'INSERT', -- Row was inserted 'DELETE', -- Row was deleted 'UPDATE', -- Row was updated 'UNDO' -- Previous operation was undone ); /** @defgroup compiled_semirings Compiled semirings * Definitions of compiled semirings * @{ */ /** @brief Evaluate provenance as a symbolic formula (e.g., "a ⊗ b ⊕ c") */ CREATE FUNCTION sr_formula(token ANYELEMENT, token2value regclass) RETURNS VARCHAR AS $$ BEGIN RETURN provsql.provenance_evaluate_compiled( token, token2value, 'formula', '𝟙'::VARCHAR ); END $$ LANGUAGE plpgsql STRICT PARALLEL SAFE STABLE; /** @brief Evaluate provenance over the counting semiring (ℕ) */ CREATE FUNCTION sr_counting(token ANYELEMENT, token2value regclass) RETURNS INT AS $$ BEGIN RETURN provsql.provenance_evaluate_compiled( token, token2value, 'counting', 1 ); END $$ LANGUAGE plpgsql STRICT PARALLEL SAFE STABLE; /** @brief Evaluate provenance as why-provenance (set of witness sets) */ CREATE FUNCTION sr_why(token ANYELEMENT, token2value regclass) RETURNS VARCHAR AS $$ BEGIN RETURN provsql.provenance_evaluate_compiled( token, token2value, 'why', '{}'::VARCHAR ); END $$ LANGUAGE plpgsql STRICT PARALLEL SAFE STABLE; /** @brief Evaluate provenance as how-provenance (canonical polynomial provenance ℕ[X], universal commutative-semiring provenance) */ CREATE FUNCTION sr_how(token ANYELEMENT, token2value regclass) RETURNS VARCHAR AS $$ BEGIN RETURN provsql.provenance_evaluate_compiled( token, token2value, 'how', '{}'::VARCHAR ); END $$ LANGUAGE plpgsql STRICT PARALLEL SAFE STABLE; /** @brief Evaluate provenance as which-provenance (lineage: a single set of contributing labels) */ CREATE FUNCTION sr_which(token ANYELEMENT, token2value regclass) RETURNS VARCHAR AS $$ BEGIN RETURN provsql.provenance_evaluate_compiled( token, token2value, 'which', '{}'::VARCHAR ); END $$ LANGUAGE plpgsql STRICT PARALLEL SAFE STABLE; /** @brief Evaluate provenance as a Boolean expression * * The optional @p token2value mapping labels the leaves of the * formula: when omitted, leaves are rendered as bare @c x@ * placeholders. */ CREATE FUNCTION sr_boolexpr(token ANYELEMENT, token2value regclass = NULL) RETURNS VARCHAR AS $$ BEGIN IF token IS NULL THEN RETURN NULL; END IF; RETURN provsql.provenance_evaluate_compiled( token, token2value, 'boolexpr', '⊤'::VARCHAR ); END $$ LANGUAGE plpgsql PARALLEL SAFE STABLE; /** @brief Evaluate provenance over the Boolean semiring (true/false) */ CREATE FUNCTION sr_boolean(token ANYELEMENT, token2value regclass) RETURNS BOOLEAN AS $$ BEGIN RETURN provsql.provenance_evaluate_compiled( token, token2value, 'boolean', TRUE ); END $$ LANGUAGE plpgsql STRICT PARALLEL SAFE STABLE; /** @brief Evaluate provenance over the tropical (min-plus) m-semiring * * Inputs are read as %float8 cost values; the additive identity * is 'Infinity'::%float8 and the multiplicative identity is 0. * Returns the cost of the cheapest derivation. */ CREATE FUNCTION sr_tropical(token ANYELEMENT, token2value regclass) RETURNS FLOAT AS $$ BEGIN RETURN provsql.provenance_evaluate_compiled( token, token2value, 'tropical', 0::FLOAT ); END $$ LANGUAGE plpgsql STRICT PARALLEL SAFE STABLE; /** @brief Evaluate provenance over the Viterbi (max-times) m-semiring * * Inputs are read as %float8 probability values in @f$[0,1]@f$. * Returns the probability of the most likely derivation. */ CREATE FUNCTION sr_viterbi(token ANYELEMENT, token2value regclass) RETURNS FLOAT AS $$ BEGIN RETURN provsql.provenance_evaluate_compiled( token, token2value, 'viterbi', 1::FLOAT ); END $$ LANGUAGE plpgsql STRICT PARALLEL SAFE STABLE; /** @brief Evaluate provenance over the Łukasiewicz fuzzy m-semiring * * Inputs are read as %float8 graded-truth values in @f$[0,1]@f$. * Addition is @f$\max@f$; multiplication is the Łukasiewicz t-norm * @f$\max(a + b - 1, 0)@f$, which preserves crisp truth and avoids * the near-zero collapse of long product chains. */ CREATE FUNCTION sr_lukasiewicz(token ANYELEMENT, token2value regclass) RETURNS FLOAT AS $$ BEGIN RETURN provsql.provenance_evaluate_compiled( token, token2value, 'lukasiewicz', 1::FLOAT ); END $$ LANGUAGE plpgsql STRICT PARALLEL SAFE STABLE; /** @brief Evaluate provenance over the min-max m-semiring on a user enum * * Inputs are read as values of a user-defined enum carrier; addition * is enum-min, multiplication is enum-max. Bottom and top of the enum * are derived from @c pg_enum.enumsortorder. The third argument is a * sample value of the carrier enum, used only for type inference; its * value is ignored. * * The security shape: alternative derivations combine to the least * sensitive label, joins combine to the most sensitive label. * * @param token Provenance token to evaluate. * @param token2value Mapping from input gates to enum values. * @param element_one Sample value of the carrier enum (any value works). */ CREATE FUNCTION sr_minmax(token UUID, token2value regclass, element_one ANYENUM) RETURNS ANYENUM AS $$ BEGIN RETURN provsql.provenance_evaluate_compiled( token, token2value, 'minmax', element_one ); END $$ LANGUAGE plpgsql STRICT PARALLEL SAFE STABLE; /** @brief Evaluate provenance over the max-min m-semiring on a user enum * * Dual of :sqlfunc:`sr_minmax`: addition is enum-max, multiplication * is enum-min. The fuzzy / availability / trust shape: alternatives * combine to the most permissive label, joins combine to the strictest * label. The third argument is a sample value of the carrier enum, * used only for type inference; its value is ignored. * * @param token Provenance token to evaluate. * @param token2value Mapping from input gates to enum values. * @param element_one Sample value of the carrier enum (any value works). */ CREATE FUNCTION sr_maxmin(token UUID, token2value regclass, element_one ANYENUM) RETURNS ANYENUM AS $$ BEGIN RETURN provsql.provenance_evaluate_compiled( token, token2value, 'maxmin', element_one ); END $$ LANGUAGE plpgsql STRICT PARALLEL SAFE STABLE; /** @} */ /** @defgroup choose_aggregate choose aggregate * Choose one value among many, used in particular to code a mutually * exclusive choice as an aggregate. * @{ */ /** @brief Transition function for the choose aggregate (keeps first non-NULL value) */ CREATE FUNCTION choose_function(state ANYELEMENT, data ANYELEMENT) RETURNS ANYELEMENT AS $$ BEGIN IF state IS NULL THEN RETURN data; ELSE RETURN state; END IF; END $$ LANGUAGE plpgsql PARALLEL SAFE IMMUTABLE; /** @brief Aggregate that returns an arbitrary non-NULL value from a group */ CREATE AGGREGATE choose(ANYELEMENT) ( SFUNC = choose_function, STYPE = ANYELEMENT ); /** @} */ GRANT USAGE ON SCHEMA provsql TO PUBLIC; SET search_path TO public;