InputTable: test_table data
-- Sample input for the parser examples: a numeric id plus the text to tokenize.
CREATE TABLE test_table (
    id        INTEGER,
    paragraph VARCHAR(100)
);

-- Document 1: contains repeated words and embedded punctuation (',' and '.').
INSERT INTO test_table (id, paragraph)
VALUES (1, 'Programmers program with program, as.as programming languages a program');

-- Document 2: contains the word 'the' twice, in different letter case.
INSERT INTO test_table (id, paragraph)
VALUES (2, 'The quick brown fox jumps over the lazy dog');
SELECT Statement
-- Show the sample documents. Columns are listed explicitly and rows are sorted
-- by id so the displayed output is reproducible from run to run.
SELECT
    id,
    paragraph
FROM test_table
ORDER BY id;
Result:
| id | paragraph |
|---|---|
| 1 | Programmers program with program, as.as programming languages a program |
| 2 | The quick brown fox jumps over the lazy dog |
StopWords table: Custom set of words to be removed when parsing
-- Custom stop-word list: one word per row.
CREATE TABLE stopwords (
    word VARCHAR(10)
);

-- Name the target column and use the VALUES keyword, matching the test_table
-- inserts, instead of relying on the positional short form of INSERT.
INSERT INTO stopwords (word) VALUES ('a');
INSERT INTO stopwords (word) VALUES ('an');
INSERT INTO stopwords (word) VALUES ('and');
INSERT INTO stopwords (word) VALUES ('the');
SELECT Statement
-- List the custom stop words (row order is unspecified without an ORDER BY).
SELECT word
FROM stopwords;
Result:
| word |
|---|
| the |
| and |
| an |
| a |
Query 1 (Tokenizing with default delimiter)
-- Query 1: tokenize test_table.paragraph without specifying a delimiter,
-- with stop-word removal switched on.
SELECT *
FROM TD_TextParser (
    ON test_table AS InputTable
    USING
    TextColumn ('paragraph')
    RemoveStopWords ('true')
) AS dt
-- Positional sort on output columns 1 and 4 (shown as id and locations
-- in the result for this query).
ORDER BY 1, 4;
Result:
| id | paragraph | token | locations |
|---|---|---|---|
| 1 | Programmers program with program, as.as programming languages a program | programmers | 1 |
| 1 | Programmers program with program, as.as programming languages a program | program | 2 |
| 1 | Programmers program with program, as.as programming languages a program | programming | 5 |
| 1 | Programmers program with program, as.as programming languages a program | language | 6 |
| 1 | Programmers program with program, as.as programming languages a program | program | 8 |
| 2 | The quick brown fox jumps over the lazy dog | quick | 1 |
| 2 | The quick brown fox jumps over the lazy dog | brown | 2 |
| 2 | The quick brown fox jumps over the lazy dog | fox | 3 |
| 2 | The quick brown fox jumps over the lazy dog | jumps | 4 |
| 2 | The quick brown fox jumps over the lazy dog | over | 5 |
| 2 | The quick brown fox jumps over the lazy dog | lazy | 7 |
| 2 | The quick brown fox jumps over the lazy dog | dog | 8 |
Query 2 (Using StopWordsTable, ListPositions, and TokenFrequency)
-- Query 2: tokenize test_table.paragraph, supplying the custom stopwords table
-- as the StopWordsTable dimension input and requesting token positions and
-- token frequencies.
SELECT *
FROM TD_TextParser (
    ON test_table AS InputTable
    ON stopwords AS StopWordsTable DIMENSION
    USING
    TextColumn ('paragraph')
    RemoveStopWords ('true')
    DocIDColumn ('id')
    -- Boolean arguments are spelled out as 'true', matching the other queries.
    ListPositions ('true')
    TokenFrequency ('true')
) AS dt
-- Positional sort on output columns 1 and 2.
ORDER BY 1, 2;
Result:
| id | paragraph | token | frequency | locations |
|---|---|---|---|---|
| 1 | Programmers program with program, as.as programming languages a program | programmers | 1 | 1 |
| 1 | Programmers program with program, as.as programming languages a program | program | 2 | 2,8 |
| 1 | Programmers program with program, as.as programming languages a program | with | 1 | 3 |
| 1 | Programmers program with program, as.as programming languages a program | the | 1 | 4 |
| 1 | Programmers program with program, as.as programming languages a program | programming | 1 | 5 |
| 1 | Programmers program with program, as.as programming languages a program | language | 1 | 6 |
| 2 | The quick brown fox jumps over the lazy dog | the | 2 | 0,6 |
| 2 | The quick brown fox jumps over the lazy dog | quick | 1 | 1 |
| 2 | The quick brown fox jumps over the lazy dog | brown | 1 | 2 |
| 2 | The quick brown fox jumps over the lazy dog | fox | 1 | 3 |
| 2 | The quick brown fox jumps over the lazy dog | jumps | 1 | 4 |
| 2 | The quick brown fox jumps over the lazy dog | over | 1 | 5 |
| 2 | The quick brown fox jumps over the lazy dog | lazy | 1 | 7 |
| 2 | The quick brown fox jumps over the lazy dog | dog | 1 | 8 |
Query 3 (Using OutputByWord set to false, with a blank space as the delimiter)
-- Query 3: split test_table.paragraph on a single blank space and, with
-- OutputByWord set to 'false', return one row per document rather than one
-- row per token (see the tokens column in the result for this query).
SELECT *
FROM TD_TextParser (
    ON test_table AS InputTable
    ON stopwords AS StopWordsTable DIMENSION
    USING
    TextColumn ('paragraph')
    RemoveStopWords ('true')
    Delimiter (' ')
    OutputByWord ('false')
) AS dt
-- Positional sort on output columns 1 and 2.
ORDER BY 1, 2;
Result:
| id | paragraph | tokens |
|---|---|---|
| 1 | Programmers program with program, as.as programming languages a program | programmers program program programming languages program |
| 2 | The quick brown fox jumps over the lazy dog | quick brown fox jumps over lazy dog |
Query 4 (Using DelimiterRegex)
-- Query 4: split test_table.paragraph with a regular expression instead of a
-- literal delimiter, and list the positions of each token.
SELECT *
FROM TD_TextParser (
    ON test_table AS InputTable
    USING
    TextColumn ('paragraph')
    RemoveStopWords ('true')
    DocIDColumn ('id')
    -- One or more spaces, tabs, form feeds, carriage returns or newlines.
    DelimiterRegex ('[ \t\f\r\n]+')
    ListPositions ('true')
) AS dt
-- Positional sort on output columns 1 and 4 (shown as id and locations
-- in the result for this query).
ORDER BY 1, 4;
Result:
| id | paragraph | tokens | locations |
|---|---|---|---|
| 1 | Programmers program with program, as.as programming languages a program | programmers | 1 |
| 1 | Programmers program with program, as.as programming languages a program | program | 2,8 |
| 1 | Programmers program with program, as.as programming languages a program | programming | 5 |
| 1 | Programmers program with program, as.as programming languages a program | language | 6 |
| 2 | The quick brown fox jumps over the lazy dog | quick | 1 |
| 2 | The quick brown fox jumps over the lazy dog | brown | 2 |
| 2 | The quick brown fox jumps over the lazy dog | fox | 3 |
| 2 | The quick brown fox jumps over the lazy dog | jumps | 4 |
| 2 | The quick brown fox jumps over the lazy dog | over | 5 |
| 2 | The quick brown fox jumps over the lazy dog | lazy | 7 |
| 2 | The quick brown fox jumps over the lazy dog | dog | 8 |