/* ----------------------------------------------------------------------- *//** * * @file text_utilities.sql_in * * @brief SQL functions for carrying out routine text operations * * @sa For a brief overview of utility functions, see the * module description \ref grp_utilities. * *//* ----------------------------------------------------------------------- */ m4_include(`SQLCommon.m4') /** @addtogroup grp_text_utilities
term_frequency(input_table,
doc_id_col,
word_col,
output_table,
compute_vocab)
\b Arguments:
DROP TABLE IF EXISTS documents; CREATE TABLE documents(docid INT4, contents TEXT); INSERT INTO documents VALUES (0, 'I like to eat broccoli and bananas. I ate a banana and spinach smoothie for breakfast.'), (1, 'Chinchillas and kittens are cute.'), (2, 'My sister adopted two kittens yesterday.'), (3, 'Look at this cute hamster munching on a piece of broccoli.');You can apply stemming, stop word removal and tokenization at this point in order to prepare the documents for text processing. Depending upon your database version, various tools are available. Databases based on more recent versions of PostgreSQL may do something like:
SELECT tsvector_to_array(to_tsvector('english',contents)) from documents;
tsvector_to_array
+----------------------------------------------------------
{ate,banana,breakfast,broccoli,eat,like,smoothi,spinach}
{chinchilla,cute,kitten}
{adopt,kitten,sister,two,yesterday}
{broccoli,cute,hamster,look,munch,piec}
(4 rows)
In this example, we assume a database based on an older
version of PostgreSQL and just perform basic punctuation
removal and tokenization. The array of words is added as
a new column to the documents table:
ALTER TABLE documents ADD COLUMN words TEXT[];
UPDATE documents SET words =
regexp_split_to_array(lower(
regexp_replace(contents, E'[,.;\\']','', 'g')
), E'[\\\\s+]');
\\x on
SELECT * FROM documents ORDER BY docid;
-[ RECORD 1 ]------------------------------------------------------------------------------------
docid | 0
contents | I like to eat broccoli and bananas. I ate a banana and spinach smoothie for breakfast.
words | {i,like,to,eat,broccoli,and,bananas,i,ate,a,banana,and,spinach,smoothie,for,breakfast}
-[ RECORD 2 ]------------------------------------------------------------------------------------
docid | 1
contents | Chinchillas and kittens are cute.
words | {chinchillas,and,kittens,are,cute}
-[ RECORD 3 ]------------------------------------------------------------------------------------
docid | 2
contents | My sister adopted two kittens yesterday.
words | {my,sister,adopted,two,kittens,yesterday}
-[ RECORD 4 ]------------------------------------------------------------------------------------
docid | 3
contents | Look at this cute hamster munching on a piece of broccoli.
words | {look,at,this,cute,hamster,munching,on,a,piece,of,broccoli}
-# Compute the frequency of each word in each document:
DROP TABLE IF EXISTS documents_tf, documents_tf_vocabulary;
SELECT madlib.term_frequency('documents', -- input table
'docid', -- document id column
'words', -- vector of words in document
'documents_tf' -- output table
);
\\x off
SELECT * FROM documents_tf ORDER BY docid;
docid | word | count
-------+-------------+-------
0 | a | 1
0 | breakfast | 1
0 | banana | 1
0 | and | 2
0 | eat | 1
0 | smoothie | 1
0 | to | 1
0 | like | 1
0 | broccoli | 1
0 | bananas | 1
0 | spinach | 1
0 | i | 2
0 | ate | 1
0 | for | 1
1 | are | 1
1 | cute | 1
1 | kittens | 1
1 | chinchillas | 1
1 | and | 1
2 | two | 1
2 | yesterday | 1
2 | kittens | 1
2 | sister | 1
2 | my | 1
2 | adopted | 1
3 | this | 1
3 | at | 1
3 | a | 1
3 | broccoli | 1
3 | of | 1
3 | look | 1
3 | hamster | 1
3 | on | 1
3 | piece | 1
3 | cute | 1
3 | munching | 1
(36 rows)
-# Next we create a vocabulary of the words
and store a wordid in the output table instead of the
actual word:
DROP TABLE IF EXISTS documents_tf, documents_tf_vocabulary;
SELECT madlib.term_frequency('documents', -- input table
'docid', -- document id column
'words', -- vector of words in document
'documents_tf',-- output table
TRUE
);
SELECT * FROM documents_tf ORDER BY docid;
\nbsp
docid | wordid | count
-------+--------+-------
0 | 17 | 1
0 | 9 | 1
0 | 25 | 1
0 | 12 | 1
0 | 13 | 1
0 | 15 | 2
0 | 0 | 1
0 | 2 | 2
0 | 28 | 1
0 | 5 | 1
0 | 6 | 1
0 | 7 | 1
0 | 8 | 1
0 | 26 | 1
1 | 16 | 1
1 | 11 | 1
1 | 10 | 1
1 | 2 | 1
1 | 3 | 1
2 | 30 | 1
2 | 1 | 1
2 | 16 | 1
2 | 20 | 1
2 | 24 | 1
2 | 29 | 1
3 | 4 | 1
3 | 21 | 1
3 | 22 | 1
3 | 23 | 1
3 | 0 | 1
3 | 11 | 1
3 | 9 | 1
3 | 27 | 1
3 | 14 | 1
3 | 18 | 1
3 | 19 | 1
(36 rows)
\nbsp
Note above that wordid's start
at 0 not 1. The vocabulary table maps wordid to the actual word:
SELECT * FROM documents_tf_vocabulary ORDER BY wordid;
wordid | word
--------+-------------
0 | a
1 | adopted
2 | and
3 | are
4 | at
5 | ate
6 | banana
7 | bananas
8 | breakfast
9 | broccoli
10 | chinchillas
11 | cute
12 | eat
13 | for
14 | hamster
15 | i
16 | kittens
17 | like
18 | look
19 | munching
20 | my
21 | of
22 | on
23 | piece
24 | sister
25 | smoothie
26 | spinach
27 | this
28 | to
29 | two
30 | yesterday
(31 rows)
@anchor related
@par Related Topics
See text_utilities.sql_in for the term frequency SQL function definition
and porter_stemmer.sql_in for the stemmer function.
*/
/**
* @brief Compute the term frequency for each term in a collection of documents
*
* @returns Names of tables created
*/
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.term_frequency(
input_table TEXT,
doc_id_col TEXT,
word_vec_col TEXT,
output_table TEXT,
compute_vocab BOOLEAN)
RETURNS TEXT
AS $$
PythonFunctionBodyOnly(`utilities', `text_utilities')
return text_utilities.term_frequency(input_table, doc_id_col, word_vec_col,
output_table, compute_vocab=compute_vocab)
$$
LANGUAGE plpythonu
VOLATILE
m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `');
CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.term_frequency(
input_table TEXT,
doc_id_col TEXT,
word_vec_col TEXT,
output_table TEXT)
RETURNS TEXT
AS $$
SELECT MADLIB_SCHEMA.term_frequency($1, $2, $3, $4, FALSE);
$$
LANGUAGE SQL
VOLATILE
m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `');