/* ----------------------------------------------------------------------- *//** * * @file text_utilities.sql_in * * @brief SQL functions for carrying out routine text operations * * @sa For a brief overview of utility functions, see the * module description \ref grp_utilities. * *//* ----------------------------------------------------------------------- */ m4_include(`SQLCommon.m4') /** @addtogroup grp_text_utilities
Contents
@brief Provides a collection of functions for performing common tasks related to text analytics. Term frequency computes the number of times that a word or term occurs in a document. Term frequency is often used as part of a larger text processing pipeline, which may include operations such as stemming, stop word removal and topic modelling. @anchor function_syntax @par Function Syntax
    term_frequency(input_table,
                   doc_id_col,
                   word_col,
                   output_table,
                   compute_vocab)
\b Arguments:
input_table
TEXT. The name of the table containing the documents, with one document per row. Each row is in the form <doc_id, word_vector> where \c doc_id is an id unique to each document, and \c word_vector is a text array containing the words in the document. The \c word_vector should contain multiple entries of a word if the document contains multiple occurrence of that word.
doc_id_col
TEXT. The name of the column containing the document id.
word_col
TEXT. The name of the column containing the vector of words/terms in the document. This column should be of type that can be cast to TEXT[].
output_table
TEXT. The name of the table to store the term frequency output. The output table contains the following columns: - \c doc_id_col: This the document id column (name will be same as the one provided as input). - \c word: Word/term present in a document. Depending on the value of \c compute_vocab below, this is either the original word as it appears in \c word_col, or an id representing the word. Note that word id's start from 0 not 1. - \c count: The number of times this word is found in the document.
compute_vocab
BOOLEAN. (Optional, Default=FALSE) Flag to indicate if a vocabulary table is to be created. If TRUE, an additional output table is created containing the vocabulary of all words, with an id assigned to each word in alphabetical order. The table is called output_table_vocabulary (i.e., suffix added to the output_table name) and contains the following columns: - \c wordid: An id for each word in alphabetical order. - \c word: The word/term corresponding to the id.
@anchor examples @par Examples -# First we create a document table with one document per row:
DROP TABLE IF EXISTS documents;
CREATE TABLE documents(docid INT4, contents TEXT);
INSERT INTO documents VALUES
(0, 'I like to eat broccoli and bananas. I ate a banana and spinach smoothie for breakfast.'),
(1, 'Chinchillas and kittens are cute.'),
(2, 'My sister adopted two kittens yesterday.'),
(3, 'Look at this cute hamster munching on a piece of broccoli.');
You can apply stemming, stop word removal and tokenization at this point in order to prepare the documents for text processing. Depending upon your database version, various tools are available. Databases based on more recent versions of PostgreSQL may do something like:
SELECT tsvector_to_array(to_tsvector('english',contents)) from documents;
                    tsvector_to_array                     
+----------------------------------------------------------
 {ate,banana,breakfast,broccoli,eat,like,smoothi,spinach}
 {chinchilla,cute,kitten}
 {adopt,kitten,sister,two,yesterday}
 {broccoli,cute,hamster,look,munch,piec}
(4 rows)
In this example, we assume a database based on an older version of PostgreSQL and just perform basic punctuation removal and tokenization. The array of words is added as a new column to the documents table:
ALTER TABLE documents ADD COLUMN words TEXT[];
UPDATE documents SET words = 
    regexp_split_to_array(lower(
    regexp_replace(contents, E'[,.;\\']','', 'g')
    ), E'[\\\\s+]');
\\x on   
SELECT * FROM documents ORDER BY docid;
-[ RECORD 1 ]------------------------------------------------------------------------------------
docid    | 0
contents | I like to eat broccoli and bananas. I ate a banana and spinach smoothie for breakfast.
words    | {i,like,to,eat,broccoli,and,bananas,i,ate,a,banana,and,spinach,smoothie,for,breakfast}
-[ RECORD 2 ]------------------------------------------------------------------------------------
docid    | 1
contents | Chinchillas and kittens are cute.
words    | {chinchillas,and,kittens,are,cute}
-[ RECORD 3 ]------------------------------------------------------------------------------------
docid    | 2
contents | My sister adopted two kittens yesterday.
words    | {my,sister,adopted,two,kittens,yesterday}
-[ RECORD 4 ]------------------------------------------------------------------------------------
docid    | 3
contents | Look at this cute hamster munching on a piece of broccoli.
words    | {look,at,this,cute,hamster,munching,on,a,piece,of,broccoli}
-# Compute the frequency of each word in each document:
DROP TABLE IF EXISTS documents_tf, documents_tf_vocabulary;
SELECT madlib.term_frequency('documents',    -- input table
                             'docid',        -- document id column
                             'words',        -- vector of words in document
                             'documents_tf'  -- output table
                            );
\\x off
SELECT * FROM documents_tf ORDER BY docid;
 docid |    word     | count 
-------+-------------+-------
     0 | a           |     1
     0 | breakfast   |     1
     0 | banana      |     1
     0 | and         |     2
     0 | eat         |     1
     0 | smoothie    |     1
     0 | to          |     1
     0 | like        |     1
     0 | broccoli    |     1
     0 | bananas     |     1
     0 | spinach     |     1
     0 | i           |     2
     0 | ate         |     1
     0 | for         |     1
     1 | are         |     1
     1 | cute        |     1
     1 | kittens     |     1
     1 | chinchillas |     1
     1 | and         |     1
     2 | two         |     1
     2 | yesterday   |     1
     2 | kittens     |     1
     2 | sister      |     1
     2 | my          |     1
     2 | adopted     |     1
     3 | this        |     1
     3 | at          |     1
     3 | a           |     1
     3 | broccoli    |     1
     3 | of          |     1
     3 | look        |     1
     3 | hamster     |     1
     3 | on          |     1
     3 | piece       |     1
     3 | cute        |     1
     3 | munching    |     1
(36 rows)
-# Next we create a vocabulary of the words and store a wordid in the output table instead of the actual word:
DROP TABLE IF EXISTS documents_tf, documents_tf_vocabulary;
SELECT madlib.term_frequency('documents',    -- input table
                             'docid',        -- document id column
                             'words',        -- vector of words in document
                             'documents_tf',-- output table
                             TRUE
                            );
SELECT * FROM documents_tf ORDER BY docid;
\nbsp
 docid | wordid | count 
-------+--------+-------
     0 |     17 |     1
     0 |      9 |     1
     0 |     25 |     1
     0 |     12 |     1
     0 |     13 |     1
     0 |     15 |     2
     0 |      0 |     1
     0 |      2 |     2
     0 |     28 |     1
     0 |      5 |     1
     0 |      6 |     1
     0 |      7 |     1
     0 |      8 |     1
     0 |     26 |     1
     1 |     16 |     1
     1 |     11 |     1
     1 |     10 |     1
     1 |      2 |     1
     1 |      3 |     1
     2 |     30 |     1
     2 |      1 |     1
     2 |     16 |     1
     2 |     20 |     1
     2 |     24 |     1
     2 |     29 |     1
     3 |      4 |     1
     3 |     21 |     1
     3 |     22 |     1
     3 |     23 |     1
     3 |      0 |     1
     3 |     11 |     1
     3 |      9 |     1
     3 |     27 |     1
     3 |     14 |     1
     3 |     18 |     1
     3 |     19 |     1
(36 rows)
\nbsp Note above that wordid's start at 0 not 1. The vocabulary table maps wordid to the actual word:
SELECT * FROM documents_tf_vocabulary ORDER BY wordid;
 wordid |    word     
--------+-------------
      0 | a
      1 | adopted
      2 | and
      3 | are
      4 | at
      5 | ate
      6 | banana
      7 | bananas
      8 | breakfast
      9 | broccoli
     10 | chinchillas
     11 | cute
     12 | eat
     13 | for
     14 | hamster
     15 | i
     16 | kittens
     17 | like
     18 | look
     19 | munching
     20 | my
     21 | of
     22 | on
     23 | piece
     24 | sister
     25 | smoothie
     26 | spinach
     27 | this
     28 | to
     29 | two
     30 | yesterday
(31 rows)
@anchor related @par Related Topics See text_utilities.sql_in for the term frequency SQL function definition and porter_stemmer.sql_in for the stemmer function. */ /** * @brief Compute the term frequency for each term in a collection of documents * * @returns Names of tables created */ CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.term_frequency( input_table TEXT, doc_id_col TEXT, word_vec_col TEXT, output_table TEXT, compute_vocab BOOLEAN) RETURNS TEXT AS $$ PythonFunctionBodyOnly(`utilities', `text_utilities') return text_utilities.term_frequency(input_table, doc_id_col, word_vec_col, output_table, compute_vocab=compute_vocab) $$ LANGUAGE plpythonu VOLATILE m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `'); CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.term_frequency( input_table TEXT, doc_id_col TEXT, word_vec_col TEXT, output_table TEXT) RETURNS TEXT AS $$ SELECT MADLIB_SCHEMA.term_frequency($1, $2, $3, $4, FALSE); $$ LANGUAGE SQL VOLATILE m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `');