/* ----------------------------------------------------------------------- *//**
 *
 * @file crf_data_loader.sql_in
 *
 * @brief Create database tables and import POS/NER training/testing data to the database
 * @date Feb. 2012
 *
 *//* ----------------------------------------------------------------------- */

m4_include(`SQLCommon.m4')

/**

@input

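-# Prepare an input train data segment table, e.g.:
- CREATE TABLE train_segmenttbl (start_pos integer,doc_id integer,seg_text text, max_pos integer)
  (note: the table created by crf_train_data additionally has a label integer column between seg_text and max_pos)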
\verbatim
sql> select * from train_segmenttbl order by doc_id, start_pos;
start_pos | doc_id | seg_text      | max_pos
----------+---------+--------------+-------------
     0    |    1   |       madlib  |    9
     1    |    1   |           is  |    9
     2    |    1   |           an  |    9
     3    |    1   |  open-source  |    9
     4    |    1   |      library  |    9
     5    |    1   |          for  |    9
     6    |    1   |     scalable  |    9
     7    |    1   |  in-database  |    9
     8    |    1   |    analytics  |    9
     9    |    1   |            .  |    9
     0    |    2   |           it  |   16
     1    |    2   |     provides  |   16
     2    |    2   |data-parallel  |   16
     3    |    2   |implementations|   16
     ...
     14   |    2   |  unstructured |   16
     15   |    2   |          data |   16
     16   |    2   |             . |   16
\endverbatim

-# Prepare an input dictionary table, e.g.,:
- CREATE TABLE crf_dictionary (token text,token_id integer,label text,count integer,total integer)
\verbatim
sql> select * from crf_dictionary;
token       | label  | count | total
------------+--------+--------------
   freefall |   11   |   1   |  1
     policy |   11   |   2   |  2
   measures |   12   |   1   |  1
 commitment |   11   |   1   |  1
        new |    6   |   1   |  1
     speech |   11   |   1   |  1
         's |   16   |   2   |  2
     reckon |   30   |   1   |  1
 underlying |   28   |   1   |  1
 ...
\endverbatim

-# Prepare an input label table, e.g.,:
- CREATE TABLE labeltbl (id integer,label character varying)
\verbatim
sql> select * from labeltbl order by id;
id          | label
------------+--------
      0     |   CC
      1     |   CD
      2     |   DT
      3     |   EX
      4     |   FW
      5     |   IN
      6     |   JJ
...
     42     |    ,
     43     |    .
     44     |    :
\endverbatim

-# Prepare an input regex table, e.g.,:
- CREATE TABLE crf_regex (pattern text,name text)
\verbatim
sql> select * from crf_regex;
pattern       |   name
------------- +---------------
^[A-Z][a-z]+$ |  InitCapital%
     ^[A-Z]+$ |  isAllCapital%
 ^.*[0-9]+.*$ |  containsDigit%
      ^.+[.]$ |  endsWithDot%
      ^.+[,]$ |  endsWithComma%
       ^.+er$ |  endsWithER%
      ^.+est$ |  endsWithEst%
       ^.+ed$ |  endsWithED%
...
\endverbatim

-# Prepare an input feature table, e.g.,:
- CREATE TABLE featuretbl (id integer,name text,prev_label_id integer,label_id integer,weight float)
\verbatim
sql> select * from featuretbl order by id;
id   |     name     | prev_label_id | label_id | weight
-------------------------------------------------------
1    | W_chancellor |       -1      |    13     | 2.2322
2    |         E.13 |       13      |     5     | 2.3995
3    |            U |       -1      |     5     | 1.2164
4    |         W_of |       -1      |     5     | 2.8744
5    |          E.5 |        5      |     2     | 3.7716
6    |        W_the |       -1      |     2     | 4.1790
7    |          E.2 |        2      |    13     | 0.8957
...
\endverbatim

-# Prepare a CRF feature set table, e.g.:
- CREATE TABLE MADLIB_SCHEMA.crf_feature_dic(f_index integer, f_name text, feature integer[])
\verbatim
sql> select * from crf_feature_dic order by f_index;
f_index|     f_name   | feature
--------------------------------
0      | W_chancellor |       -1
1      |         E.13 |       13
2      |            U |       -1
3      |         W_of |       -1
4      |          E.5 |        5
5      |        W_the |       -1
...
\endverbatim


@usage
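- Create the training tables and import the training data into the database.
  The argument is a directory, readable by the database server, that contains
  crf_label.tab, crf_regex.tab and crf_traindata.tab:
  SELECT madlib.crf_train_data('/path/to/modeldata')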

*/
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.crf_train_data(datapath text) RETURNS void AS
$$
        # Recreates the CRF training tables and bulk-loads the label, regex and
        # training-segment data from tab files located under datapath.
        #
        # datapath: directory holding crf_label.tab, crf_regex.tab and
        #           crf_traindata.tab. COPY reads the named file on the database
        #           server, so the directory must be readable by the server.
        # Returns:  nothing. Every table named below is dropped and recreated,
        #           so previously loaded contents are discarded.

        # Render file paths as SQL string literals with the server-side
        # quote_literal() function, so that a quote or backslash inside
        # datapath cannot terminate the literal and change the COPY statement.
        quote_plan = plpy.prepare("SELECT quote_literal($1) AS lit", ["text"])

        def file_literal(path):
            return plpy.execute(quote_plan, [path])[0]["lit"]

        # import label data to the database
        query = "DROP TABLE IF EXISTS MADLIB_SCHEMA.crf_label CASCADE;" + \
                "CREATE TABLE MADLIB_SCHEMA.crf_label(id integer,label text);" + \
                "COPY MADLIB_SCHEMA.crf_label(id,label) FROM " + \
                file_literal(datapath + "/crf_label.tab")
        plpy.execute(query)

        # import regex to regex table
        query = "DROP TABLE IF EXISTS MADLIB_SCHEMA.crf_regex CASCADE;" + \
                "CREATE TABLE MADLIB_SCHEMA.crf_regex (pattern text,name text);" + \
                "COPY MADLIB_SCHEMA.crf_regex(pattern,name) FROM " + \
                file_literal(datapath + "/crf_regex.tab")
        plpy.execute(query)

        # import training data to the database
        query = "DROP TABLE IF EXISTS MADLIB_SCHEMA.train_segmenttbl CASCADE;" + \
                "CREATE TABLE MADLIB_SCHEMA.train_segmenttbl(start_pos integer,doc_id integer,seg_text text,label integer,max_pos integer);" + \
                "COPY MADLIB_SCHEMA.train_segmenttbl(start_pos,doc_id,seg_text,label,max_pos) FROM " + \
                file_literal(datapath + "/crf_traindata.tab")
        plpy.execute(query)

        # The remaining tables are only created empty here; nothing in this
        # function populates them.

        # feature table
        query = "DROP TABLE IF EXISTS MADLIB_SCHEMA.crf_feature;" + \
                "CREATE TABLE MADLIB_SCHEMA.crf_feature (id integer,name text,prev_label_id integer,label_id integer,weight float);"
        plpy.execute(query)

        # dictionary table
        query = "DROP TABLE IF EXISTS MADLIB_SCHEMA.crf_dictionary;" + \
                "CREATE TABLE MADLIB_SCHEMA.crf_dictionary(token text,total integer);"
        plpy.execute(query)

        # per-document feature table
        query = "DROP TABLE IF EXISTS MADLIB_SCHEMA.featuretbl;" + \
                "CREATE TABLE MADLIB_SCHEMA.featuretbl(doc_id integer,f_size FLOAT8,sparse_r FLOAT8[],dense_m FLOAT8[],sparse_m FLOAT8[]);"
        plpy.execute(query)

        # feature set table
        query = "DROP TABLE IF EXISTS MADLIB_SCHEMA.crf_feature_dic;" + \
                "CREATE TABLE MADLIB_SCHEMA.crf_feature_dic(f_index integer, f_name text, feature integer[]);"
        plpy.execute(query)

$$ LANGUAGE plpythonu STRICT
m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `');


/**

@input

-# Prepare an input test data segment table, e.g.:
- CREATE TABLE test_segmenttbl (start_pos integer,doc_id integer,seg_text text, max_pos integer)
\verbatim
sql> select * from test_segmenttbl order by doc_id, start_pos;
start_pos | doc_id |   seg_text    | max_pos
----------+---------+--------------+-------------
     0    |    1   |          the  |    26
     1    |    1   |       madlib  |    26
     2    |    1   |      mission  |    26
     3    |    1   |            :  |    26
     4    |    1   |           to  |    26
     5    |    1   |       foster  |    26
     6    |    1   |   widespread  |    26
     7    |    1   |  development  |    26
     8    |    1   |           of  |    26
     9    |    1   |     scalable  |    26
     10   |    1   |     analytic  |    26
     11   |    1   |       skills  |    26
     12   |    1   |            ,  |    26
     13   |    1   |           by  |    26
     ...
     24   |    1   |  open-source  |    26
     25   |    1   |   development |    26
     26   |    1   |             . |    26
\endverbatim

@usage
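- Create the testing tables and import the test data into the database.
  The argument is a directory, readable by the database server, that contains
  crf_testdata.tab:
  SELECT madlib.crf_test_data('/path/to/modeldata')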

*/

CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.crf_test_data(datapath text) RETURNS void AS
$$
        # Recreates the test segment table and the two Viterbi factor tables,
        # then bulk-loads the tokenized test document from a tab file located
        # under datapath.
        #
        # datapath: directory holding crf_testdata.tab. COPY reads the named
        #           file on the database server, so the directory must be
        #           readable by the server.
        # Returns:  nothing. The three tables named below are dropped and
        #           recreated, so previously loaded contents are discarded.
        #
        # The body is indented with spaces only; mixing tabs and spaces is
        # rejected by Python 3 and fragile under Python 2.

        # tokenized document
        query = "DROP TABLE IF EXISTS MADLIB_SCHEMA.test_segmenttbl CASCADE;" + \
                "CREATE TABLE MADLIB_SCHEMA.test_segmenttbl (start_pos integer,doc_id integer,seg_text text, max_pos integer)"
        plpy.execute(query)

        # R factor table
        query = "DROP TABLE IF EXISTS MADLIB_SCHEMA.viterbi_rtbl;" + \
                "CREATE TABLE MADLIB_SCHEMA.viterbi_rtbl (seg_text text, label integer, score integer)"
        plpy.execute(query)

        # M factor table
        query = "DROP TABLE IF EXISTS MADLIB_SCHEMA.viterbi_mtbl;" + \
                "CREATE TABLE MADLIB_SCHEMA.viterbi_mtbl (score integer[])"
        plpy.execute(query)

        # Render the file path as a SQL string literal with the server-side
        # quote_literal() function, so that a quote or backslash inside
        # datapath cannot terminate the literal and change the COPY statement.
        quote_plan = plpy.prepare("SELECT quote_literal($1) AS lit", ["text"])
        testdata_literal = plpy.execute(quote_plan, [datapath + "/crf_testdata.tab"])[0]["lit"]

        # import tokenized document to the segment table
        query = "COPY MADLIB_SCHEMA.test_segmenttbl (start_pos,doc_id,seg_text,max_pos) FROM " + \
                testdata_literal
        plpy.execute(query)

$$ LANGUAGE plpythonu STRICT
m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `');