/*

These is the explore API.
It provides function exposing "bulk exploration" between tables (or relations in general).

Here's how the process works
     Given a regclass $1:
     Filter the column types we support / are interested in
     and store them in an array.

     Iterate over all possible pairs of columns and
     compute the statistics; possible cache them too

*/


-- Returns 2-pairs from array (with no repetition)

CREATE OR REPLACE FUNCTION vasco_array_pairs(anyarray, OUT p1 anyelement, OUT p2 anyelement)
    RETURNS setof record
    LANGUAGE sql AS
$$
WITH d AS (SELECT ROW_NUMBER() OVER () AS row, e AS v
           FROM UNNEST($1) e)
SELECT d1.v AS p1, d2.v AS p2
FROM d d1
         CROSS JOIN d d2
WHERE d1.row >= d2.row;
$$;


/*
 Given an input table, explores all the column pairs for their MINE statistics.
 Returns a table describing those results.
 Ideally the returned table should have fixed schema / columns but dunno it yet.
 */
CREATE OR REPLACE FUNCTION vasco_explore(regclass)
    RETURNS table
            (
                table_schema  text,
                table_name    text,
                column_name_1 text,
                column_name_2 text,
                mine_prob     mine_problem,
                mine_stats    mine_statistics,
                solved_in     interval
            )
    LANGUAGE plpgsql
    VOLATILE
    PARALLEL SAFE
AS
$$

DECLARE
    cols            text[];
    col_types       text[];
    tabname ALIAS FOR $1;
    var_pair        record;
    mine_prob_n     int;
    mine_prob_x     float8[];
    mine_prob_y     float8[];
    prob_size_limit int DEFAULT NULL; /* maybe at some point we'll want to limit prob_n or sample randomly */
    time_start      timestamp;
    relnamespace    text;
    relname         text;
    i               int;
    num_cols        int;
    num_pairs       int;
BEGIN

    --TODO: make this a param ?
    col_types := ARRAY ['int2', 'int4', 'int8', 'float4', 'float8', 'numeric'];

    --TODO: merge these two queries
    EXECUTE 'SELECT relnamespace::regnamespace::text, relname::text
        FROM pg_class
        WHERE oid = $1' INTO relnamespace, relname USING tabname;

    RAISE NOTICE 'relnamespace is % \t relname is %', relnamespace, relname;


    EXECUTE FORMAT('SELECT ARRAY_AGG(attname)
                 FROM pg_attribute a
                          JOIN pg_type t ON a.atttypid = t.oid
                 WHERE attrelid = $1
                   AND attnum > 0
                   AND NOT attisdropped
                   AND t.typname = ANY ($2) limit 2')
        INTO cols USING tabname, col_types;

    num_cols = ARRAY_LENGTH(cols, 1);
    num_pairs = (num_cols * (num_cols - 1)) / 2; /* n*(n-1)/2 */

    RAISE NOTICE 'vasco is about to explore % variable pairs', num_pairs;

    i = 0;
    FOR var_pair IN
        SELECT * FROM vasco_array_pairs(cols)
        LOOP
            i = i + 1;
            RAISE NOTICE 'exploring pair % out of %', i, num_pairs;

            /* Build a mine problem for this var pair,
               by aggregating their values in a float8[] array.
            */
            EXECUTE FORMAT('SELECT array_agg(%I), array_agg(%I) FROM %I.%I',
                           var_pair.p1,
                           var_pair.p2,
                           relnamespace,
                           relname
                ) INTO mine_prob_x, mine_prob_y;

            mine_prob_n := ARRAY_LENGTH(mine_prob_x, 1);
            mine_prob := ROW (mine_prob_n, mine_prob_x, mine_prob_y)::mine_problem;

            /* Solve the problem */
            time_start := CLOCK_TIMESTAMP();

            mine_stats = compute_mine_statistics(mine_prob);

            solved_in = CLOCK_TIMESTAMP() - time_start;

            table_schema = relnamespace::text;
            table_name = relname::text;
            column_name_1 = var_pair.p1;
            column_name_2 = var_pair.p2;

            RETURN NEXT;

        END LOOP;
    RETURN;
END;
$$;


CREATE FUNCTION vasco_corr_matrix(regclass, text) RETURNS void
    LANGUAGE plpgsql AS
$$
DECLARE
    r record;
    out_table_name ALIAS FOR $2;
BEGIN


    EXECUTE FORMAT('CREATE TABLE %I
        (
            col text primary key
        )', out_table_name);

    FOR r IN SELECT *
             FROM vasco_explore($1)
             ORDER BY column_name_1, column_name_2
        LOOP
            /* That's a lot of ALTER / INSERT / UPDATE to execute in separate queries.
               Will need to refactor by putting everything in a single query.
               Keep it cleaner for now though.
               */
            EXECUTE FORMAT('ALTER TABLE %I ADD COLUMN IF NOT EXISTS %I float8', out_table_name, r.column_name_1);
            EXECUTE FORMAT('ALTER TABLE %I ADD COLUMN IF NOT EXISTS %I float8', out_table_name, r.column_name_2);

            EXECUTE FORMAT('INSERT INTO %I(col) VALUES (%L) ON CONFLICT DO NOTHING;', out_table_name,
                           r.column_name_1);
            EXECUTE FORMAT('INSERT INTO %I(col) VALUES (%L) ON CONFLICT DO NOTHING;', out_table_name,
                           r.column_name_2);

            EXECUTE FORMAT('UPDATE %I SET %I = %L WHERE col=%L ;', out_table_name, r.column_name_1,
                           (r.mine_stats).mic, r.column_name_2);

            EXECUTE FORMAT('ALTER TABLE %I ADD COLUMN IF NOT EXISTS %I float8', out_table_name, r.column_name_2);

            EXECUTE FORMAT('UPDATE %I SET %I = %L WHERE col=%L ;', out_table_name, r.column_name_2,
                           (r.mine_stats).mic, r.column_name_1);

        END LOOP;
END

$$;