InputTable: test_table data
-- Sample input table: one text document per row, identified by id.
CREATE TABLE test_table (
    id        INTEGER,
    paragraph VARCHAR(100)
);

-- Document 1: repeated word forms plus punctuation embedded in the text.
INSERT INTO test_table (id, paragraph)
VALUES (1, 'Programmers program with program, as.as programming languages a program');

-- Document 2: a plain sentence that contains the word "the" twice.
INSERT INTO test_table (id, paragraph)
VALUES (2, 'The quick brown fox jumps over the lazy dog');
SELECT Statement
-- Show every row of the sample input table (both of its columns).
SELECT
    id,
    paragraph
FROM test_table;
Result:
id | paragraph |
---|---|
1 | Programmers program with program, as.as programming languages a program |
2 | The quick brown fox jumps over the lazy dog |
StopWords table: Custom set of words to be removed when parsing
-- Custom stop-word list: one word per row.
CREATE TABLE stopwords (
    word VARCHAR(10)
);

-- Each insert names its target column and uses the VALUES keyword, so the
-- statements are standard SQL and match the inserts into test_table.
INSERT INTO stopwords (word) VALUES ('a');
INSERT INTO stopwords (word) VALUES ('an');
INSERT INTO stopwords (word) VALUES ('and');
INSERT INTO stopwords (word) VALUES ('the');
SELECT Statement
-- List the configured stop words (the table has the single column "word").
SELECT word
FROM stopwords;
Result:
word |
---|
the |
and |
an |
a |
Query 1 (Tokenizing with default delimiter)
-- Tokenize the paragraph column of test_table. No Delimiter argument is
-- given, so the function's default delimiter applies. No StopWordsTable is
-- supplied even though RemoveStopWords is 'true'.
SELECT *
FROM TD_TextParser (
    ON test_table AS InputTable
    USING
    TextColumn ('paragraph')
    RemoveStopWords ('true')
) AS dt
-- Positional sort keys: in the result shown for this query, column 1 is id
-- and column 4 is locations.
ORDER BY 1, 4;
Result:
id | paragraph | token | locations |
---|---|---|---|
1 | Programmers program with program, as.as programming languages a program | programmers | 1 |
1 | Programmers program with program, as.as programming languages a program | program | 2 |
1 | Programmers program with program, as.as programming languages a program | programming | 5 |
1 | Programmers program with program, as.as programming languages a program | language | 6 |
1 | Programmers program with program, as.as programming languages a program | program | 8 |
2 | The quick brown fox jumps over the lazy dog | quick | 1 |
2 | The quick brown fox jumps over the lazy dog | brown | 2 |
2 | The quick brown fox jumps over the lazy dog | fox | 3 |
2 | The quick brown fox jumps over the lazy dog | jumps | 4 |
2 | The quick brown fox jumps over the lazy dog | over | 5 |
2 | The quick brown fox jumps over the lazy dog | lazy | 7 |
2 | The quick brown fox jumps over the lazy dog | dog | 8 |
Query 2 (Using StopWordsTable, ListPositions, and TokenFrequency)
-- Tokenize test_table.paragraph with the custom stop-word list supplied as a
-- DIMENSION input, requesting token positions and per-document frequencies.
SELECT *
FROM TD_TextParser (
    ON test_table AS InputTable
    ON stopwords AS StopWordsTable DIMENSION
    USING
    TextColumn ('paragraph')
    RemoveStopWords ('true')
    DocIDColumn ('id')
    -- Boolean arguments are written as 'true' throughout, matching
    -- RemoveStopWords above, rather than the 't' shorthand.
    ListPositions ('true')
    TokenFrequency ('true')
) AS dt
-- Positional sort keys: in the result shown for this query, columns 1 and 2
-- are id and paragraph.
ORDER BY 1, 2;
Result:
id | paragraph | token | frequency | locations |
---|---|---|---|---|
1 | Programmers program with program, as.as programming languages a program | programmers | 1 | 1 |
1 | Programmers program with program, as.as programming languages a program | program | 2 | 2,8 |
1 | Programmers program with program, as.as programming languages a program | with | 1 | 3 |
1 | Programmers program with program, as.as programming languages a program | the | 1 | 4 |
1 | Programmers program with program, as.as programming languages a program | programming | 1 | 5 |
1 | Programmers program with program, as.as programming languages a program | language | 1 | 6 |
2 | The quick brown fox jumps over the lazy dog | the | 2 | 0,6 |
2 | The quick brown fox jumps over the lazy dog | quick | 1 | 1 |
2 | The quick brown fox jumps over the lazy dog | brown | 1 | 2 |
2 | The quick brown fox jumps over the lazy dog | fox | 1 | 3 |
2 | The quick brown fox jumps over the lazy dog | jumps | 1 | 4 |
2 | The quick brown fox jumps over the lazy dog | over | 1 | 5 |
2 | The quick brown fox jumps over the lazy dog | lazy | 1 | 7 |
2 | The quick brown fox jumps over the lazy dog | dog | 1 | 8 |
Query 3 (Using OutputByWord set to false and a blank space as the Delimiter)
-- Tokenize test_table.paragraph on a single blank space, using the custom
-- stop-word list, with OutputByWord turned off. The result shown for this
-- query has one row per document with all tokens in a single column.
SELECT *
FROM TD_TextParser (
    ON test_table AS InputTable
    ON stopwords AS StopWordsTable DIMENSION
    USING
    TextColumn ('paragraph')
    RemoveStopWords ('true')
    Delimiter (' ')
    OutputByWord ('false')
) AS dt
-- Positional sort keys: in the result shown for this query, columns 1 and 2
-- are id and paragraph.
ORDER BY 1, 2;
Result:
id | paragraph | tokens |
---|---|---|
1 | Programmers program with program, as.as programming languages a program | programmers program program programming languages program |
2 | The quick brown fox jumps over the lazy dog | quick brown fox jumps over lazy dog |
Query 4 (Using DelimiterRegex)
-- Tokenize test_table.paragraph, splitting on runs of whitespace given as a
-- regular expression, and list the positions of each token per document.
SELECT *
FROM TD_TextParser (
    ON test_table AS InputTable
    USING
    TextColumn ('paragraph')
    RemoveStopWords ('true')
    DocIDColumn ('id')
    -- One or more of: space, tab, form feed, carriage return, newline.
    DelimiterRegex ('[ \t\f\r\n]+')
    ListPositions ('true')
) AS dt
-- Positional sort keys: in the result shown for this query, column 1 is id
-- and column 4 is locations.
ORDER BY 1, 4;
Result:
id | paragraph | tokens | locations |
---|---|---|---|
1 | Programmers program with program, as.as programming languages a program | programmers | 1 |
1 | Programmers program with program, as.as programming languages a program | program | 2,8 |
1 | Programmers program with program, as.as programming languages a program | programming | 5 |
1 | Programmers program with program, as.as programming languages a program | language | 6 |
2 | The quick brown fox jumps over the lazy dog | quick | 1 |
2 | The quick brown fox jumps over the lazy dog | brown | 2 |
2 | The quick brown fox jumps over the lazy dog | fox | 3 |
2 | The quick brown fox jumps over the lazy dog | jumps | 4 |
2 | The quick brown fox jumps over the lazy dog | over | 5 |
2 | The quick brown fox jumps over the lazy dog | lazy | 7 |
2 | The quick brown fox jumps over the lazy dog | dog | 8 |