/* ----------------------------------------------------------------------- *//** * * @file crf_data_loader.sql_in * * @brief Create database tables and import POS/NER training/testing data to the database * @date Feb. 2012 * *//* ----------------------------------------------------------------------- */ m4_include(`SQLCommon.m4') /** @input -# Prepare an input train data segment table, e.g.: - CREATE TABLE train_segmenttbl (start_pos integer,doc_id integer,seg_text text, max_pos integer) \verbatim sql> select * from train_segmenttbl order by doc_id, start_pos; start_pos | doc_id | seg_text | max_pos ----------+---------+--------------+------------- 0 | 1 | madlib | 9 1 | 1 | is | 9 2 | 1 | an | 9 3 | 1 | open-source | 9 4 | 1 | library | 9 5 | 1 | for | 9 6 | 1 | scalable | 9 7 | 1 | in-database | 9 8 | 1 | analytics | 9 9 | 1 | . | 9 0 | 2 | it | 16 1 | 2 | provides | 16 2 | 2 |data-parallel | 16 3 | 2 |implementations| 16 ... 14 | 2 | unstructured | 16 15 | 2 | data | 16 16 | 2 | . | 16 \endverbatim -# Prepare an input dictionary table, e.g.,: - CREATE TABLE crf_dictionary (token text,token_id integer,label text,count integer,total integer) \verbatim sql> select * from crf_dictionary; token | label | count | total ------------+--------+-------------- freefall | 11 | 1 | 1 policy | 11 | 2 | 2 measures | 12 | 1 | 1 commitment | 11 | 1 | 1 new | 6 | 1 | 1 speech | 11 | 1 | 1 's | 16 | 2 | 2 reckon | 30 | 1 | 1 underlying | 28 | 1 | 1 ... \endverbatim -# Prepare an input label table, e.g.,: - CREATE TABLE labeltbl (id integer,label character varying) \verbatim sql> select * from labeltbl order by id; id | label ------------+-------- 0 | CC 1 | CD 2 | DT 3 | EX 4 | FW 5 | IN 6 | JJ ... 42 | , 43 | . 44 | : \endverbatim -# Prepare an input regex table, e.g.,: - CREATE TABLE crf_regex (pattern text,name text) \verbatim sql> select * from crf_regex; pattern | name ------------- +--------------- ^[A-Z][a-z]+$ | InitCapital% ^[A-Z]+$ | isAllCapital% ^.*[0-9]+.*$ | containsDigit% ^.+[.]$ | endsWithDot% ^.+[,]$ | endsWithComma% ^.+er$ | endsWithER% ^.+est$ | endsWithEst% ^.+ed$ | endsWithED% ... \endverbatim -# Prepare an input feature table, e.g.,: - CREATE TABLE featuretbl (id integer,name text,prev_label_id integer,label_id integer,weight float) \verbatim sql> select * from featuretbl order by id; id | name | prev_label_id | label_id | weight ------------------------------------------------------- 1 | W_chancellor | -1 | 13 | 2.2322 2 | E.13 | 13 | 5 | 2.3995 3 | U | -1 | 5 | 1.2164 4 | W_of | -1 | 5 | 2.8744 5 | E.5 | 5 | 2 | 3.7716 6 | W_the | -1 | 2 | 4.1790 7 | E.2 | 2 | 13 | 0.8957 ... \endverbatim -# Prepare an crf feature set table, e.g.,: - CREATE TABLE MADLIB_SCHEMA.crf_feature_dic(f_index integer, f_name text, feature integer[]) \verbatim sql> select * from crf_feature_dic order by id; f_index| f_name | feature -------------------------------- 0 | W_chancellor | -1 1 | E.13 | 13 2 | U | -1 3 | W_of | -1 4 | E.5 | 5 5 | W_the | -1 ... \endverbatim @usage - create tables and import data to the database SELECT madlib.crf_train_data('/path/to/modeldata') */ CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.crf_train_data(datapath text) RETURNS void AS $$ # import label data to the database query = "DROP TABLE IF EXISTS MADLIB_SCHEMA.crf_label CASCADE;" + \ "CREATE TABLE MADLIB_SCHEMA.crf_label(id integer,label text);" + \ "COPY MADLIB_SCHEMA.crf_label(id,label) FROM '" + datapath + "/crf_label.tab'"; plpy.execute(query); # import regex to regex table query = "DROP TABLE IF EXISTS MADLIB_SCHEMA.crf_regex CASCADE;" + \ "CREATE TABLE MADLIB_SCHEMA.crf_regex (pattern text,name text);" + \ "COPY MADLIB_SCHEMA.crf_regex(pattern,name) FROM '" + datapath + "/crf_regex.tab'"; plpy.execute(query); # import training data to the database query = "DROP TABLE IF EXISTS MADLIB_SCHEMA.train_segmenttbl CASCADE;" + \ "CREATE TABLE MADLIB_SCHEMA.train_segmenttbl(start_pos integer,doc_id integer,seg_text text,label integer,max_pos integer);" + \ "COPY MADLIB_SCHEMA.train_segmenttbl(start_pos,doc_id,seg_text,label,max_pos) FROM '" + datapath + "/crf_traindata.tab'"; plpy.execute(query); query ="DROP TABLE IF EXISTS MADLIB_SCHEMA.crf_feature;" + \ "CREATE TABLE MADLIB_SCHEMA.crf_feature (id integer,name text,prev_label_id integer,label_id integer,weight float);" plpy.execute(query); # dictionary table query = "DROP TABLE IF EXISTS MADLIB_SCHEMA.crf_dictionary;" + \ "CREATE TABLE MADLIB_SCHEMA.crf_dictionary(token text,total integer);" plpy.execute(query); query = "DROP TABLE IF EXISTS MADLIB_SCHEMA.featuretbl;" + \ "CREATE TABLE MADLIB_SCHEMA.featuretbl(doc_id integer,f_size FLOAT8,sparse_r FLOAT8[],dense_m FLOAT8[],sparse_m FLOAT8[]);" plpy.execute(query); query = "DROP TABLE IF EXISTS MADLIB_SCHEMA.crf_feature_dic;" + \ "CREATE TABLE MADLIB_SCHEMA.crf_feature_dic(f_index integer, f_name text, feature integer[]);" plpy.execute(query); $$ LANGUAGE plpythonu STRICT m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `'); /** @input -# Prepare an input test data segment table, e.g.: - CREATE TABLE test_segmenttbl (start_pos integer,doc_id integer,seg_text text, max_pos integer) \verbatim sql> select * from test_segmenttbl order by doc_id, start_pos; start_pos | doc_id | seg_text | max_pos ----------+---------+--------------+------------- 0 | 1 | the | 26 1 | 1 | madlib | 26 2 | 1 | mission | 26 3 | 1 | : | 26 4 | 1 | to | 26 5 | 1 | foster | 26 6 | 1 | widespread | 26 7 | 1 | development | 26 8 | 1 | of | 26 9 | 1 | scalable | 26 10 | 1 | analytic | 26 11 | 1 | skills | 26 12 | 1 | , | 26 13 | 1 | by | 26 ... 24 | 1 | open-source | 26 25 | 1 | development | 26 26 | 1 | . | 26 \endverbatim @usage - create tables and import data to the database SELECT madlib.crf_test_data('/path/to/modeldata') */ CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.crf_test_data(datapath text) RETURNS void AS $$ # tokenized document query = "DROP TABLE IF EXISTS MADLIB_SCHEMA.test_segmenttbl CASCADE;" + \ "CREATE TABLE MADLIB_SCHEMA.test_segmenttbl (start_pos integer,doc_id integer,seg_text text, max_pos integer)"; plpy.execute(query); # R factor table query = "DROP TABLE IF EXISTS MADLIB_SCHEMA.viterbi_rtbl;" + \ "CREATE TABLE MADLIB_SCHEMA.viterbi_rtbl (seg_text text, label integer, score integer)"; plpy.execute(query); # M factor table query = "DROP TABLE IF EXISTS MADLIB_SCHEMA.viterbi_mtbl;" + \ "CREATE TABLE MADLIB_SCHEMA.viterbi_mtbl (score integer[])"; plpy.execute(query); # import tokenized document to the segment table query = "COPY MADLIB_SCHEMA.test_segmenttbl (start_pos,doc_id,seg_text,max_pos) FROM '" + datapath + "/crf_testdata.tab'"; plpy.execute(query); $$ language plpythonu STRICT m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `');