Every complete example in this document is available in a zip file that you can download. The zip file includes a SQL script file that creates the input tables for the examples. If you are reading this document on https://docs.teradata.com/, you can download the zip file from the attachment in the left sidebar.
TD_TFIDF Input
Creation of the input table, before tokenization:
CREATE TABLE tfidf_input (docid integer, content varchar(100), category varchar(10));

INSERT INTO tfidf_input (1,'The quick brown fox jumps over the lazy fox.','Animals');
INSERT INTO tfidf_input (2,'Scientists conducted experiments in the lab to analyze the chemical reactions.','Science');
INSERT INTO tfidf_input (3,'Using advanced equipments in the lab, scientists observed unexpected reactions in the lab.','Science');
Tokenization call:
CREATE MULTISET TABLE tfidf_input_tokenized AS (
  SELECT docid, cast(token as varchar(15)) as token, category
  FROM TD_TextParser (
    ON tfidf_input AS InputTable
    USING
      TextColumn ('content')
      ConvertToLowerCase ('true')
      OutputByWord ('true')
      Punctuation ('\[.,?\!\]')
      RemoveStopWords ('true')
      StemTokens ('true')
      Accumulate ('docid','category')
  ) AS dt
) WITH DATA;
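To inspect the tokenized rows before running TD_TFIDF, you can display the new table with a simple query; the ORDER BY is added here only to make the listing that follows reproducible:

SELECT * FROM tfidf_input_tokenized ORDER BY docid, token;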
Input table after tokenization:
docid  token      category
-----  ---------  --------
    1  brown      Animals
    1  fox        Animals
    1  fox        Animals
    1  jump       Animals
    1  lazi       Animals
    1  over       Animals
    1  quick      Animals
    2  analyz     Science
    2  chemic     Science
    2  conduct    Science
    2  experi     Science
    2  lab        Science
    2  reaction   Science
    2  scientist  Science
    3  advanc     Science
    3  equip      Science
    3  lab        Science
    3  lab        Science
    3  observ     Science
    3  reaction   Science
    3  scientist  Science
    3  unexpect   Science
    3  use        Science
TD_TFIDF SQL Call
SELECT * FROM TD_TFIDF (
  ON tfidf_input_tokenized AS InputTable
  USING
    DocIdColumn ('docid')
    TokenColumn ('token')
    TFNormalization ('LOG')
    IDFNormalization ('SMOOTH')
    Regularization ('L2')
    Accumulate ('category')
) AS dt ORDER BY docid, token;
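Taken together, these option values are consistent with the following normalization formulas, sketched here to help read the output below; see the TD_TFIDF syntax elements for the authoritative definitions. For N documents, with f_{t,d} occurrences of token t in document d and df_t documents containing t:

\mathrm{TF}(t,d) = 1 + \ln f_{t,d}
\mathrm{IDF}(t) = \ln\frac{1+N}{1+\mathrm{df}_t} + 1
\mathrm{TFIDF}(t,d) = \frac{\mathrm{TF}(t,d)\,\mathrm{IDF}(t)}{\lVert \mathrm{TF}\cdot\mathrm{IDF} \rVert_2}

where the L2 norm runs over all tokens of document d. For example, fox occurs twice in document 1, so TF = 1 + ln 2 ≈ 1.6931, and it occurs in 1 of the 3 documents, so IDF = ln(4/2) + 1 ≈ 1.6931, matching the fox row in the output.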
TD_TFIDF Output
docid  token      TD_TF                 TD_IDF                TD_TF_IDF             category
-----  ---------  --------------------  --------------------  --------------------  --------
    1  brown      1.00000000000000E+00  1.69314718055995E+00  3.56535187467553E-01  Animals
    1  fox        1.69314718055995E+00  1.69314718055995E+00  6.03666547431099E-01  Animals
    1  jump       1.00000000000000E+00  1.69314718055995E+00  3.56535187467553E-01  Animals
    1  lazi       1.00000000000000E+00  1.69314718055995E+00  3.56535187467553E-01  Animals
    1  over       1.00000000000000E+00  1.69314718055995E+00  3.56535187467553E-01  Animals
    1  quick      1.00000000000000E+00  1.69314718055995E+00  3.56535187467553E-01  Animals
    2  analyz     1.00000000000000E+00  1.69314718055995E+00  4.17566623878192E-01  Science
    2  chemic     1.00000000000000E+00  1.69314718055995E+00  4.17566623878192E-01  Science
    2  conduct    1.00000000000000E+00  1.69314718055995E+00  4.17566623878192E-01  Science
    2  experi     1.00000000000000E+00  1.69314718055995E+00  4.17566623878192E-01  Science
    2  lab        1.00000000000000E+00  1.28768207245178E+00  3.17570180428344E-01  Science
    2  reaction   1.00000000000000E+00  1.28768207245178E+00  3.17570180428344E-01  Science
    2  scientist  1.00000000000000E+00  1.28768207245178E+00  3.17570180428344E-01  Science
    3  advanc     1.00000000000000E+00  1.69314718055995E+00  3.57715385483810E-01  Science
    3  equip      1.00000000000000E+00  1.69314718055995E+00  3.57715385483810E-01  Science
    3  lab        1.69314718055995E+00  1.28768207245178E+00  4.60623688927680E-01  Science
    3  observ     1.00000000000000E+00  1.69314718055995E+00  3.57715385483810E-01  Science
    3  reaction   1.00000000000000E+00  1.28768207245178E+00  2.72051770936621E-01  Science
    3  scientist  1.00000000000000E+00  1.28768207245178E+00  2.72051770936621E-01  Science
    3  unexpect   1.00000000000000E+00  1.69314718055995E+00  3.57715385483810E-01  Science
    3  use        1.00000000000000E+00  1.69314718055995E+00  3.57715385483810E-01  Science
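Because Regularization ('L2') scales each document's TF-IDF values to a unit vector, you can sanity-check the output by aggregating over documents. The following query is illustrative and not part of the documented example; it repeats the call above and sums the squared scores per document:

SELECT docid, SQRT(SUM(td_tf_idf * td_tf_idf)) AS l2_norm  -- expect 1.0 per document
FROM TD_TFIDF (
  ON tfidf_input_tokenized AS InputTable
  USING
    DocIdColumn ('docid')
    TokenColumn ('token')
    TFNormalization ('LOG')
    IDFNormalization ('SMOOTH')
    Regularization ('L2')
    Accumulate ('category')
) AS dt
GROUP BY docid
ORDER BY docid;

Each docid should return an l2_norm of 1.0, up to floating-point rounding.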