TD_TargetEncodingTransform Example
InputTable: The following table contains a subset of titantic dataset.
CREATE TABLE titanic_train ( passenger INTEGER, survived INTEGER, pclass INTEGER, name VARCHAR(90) CHARACTER SET LATIN NOT CASESPECIFIC, gender VARCHAR(10) CHARACTER SET LATIN NOT CASESPECIFIC, age INTEGER, sibsp INTEGER, parch INTEGER, ticket VARCHAR(20) CHARACTER SET LATIN NOT CASESPECIFIC, fare FLOAT, cabin VARCHAR(20) CHARACTER SET LATIN NOT CASESPECIFIC, embarked VARCHAR(10) CHARACTER SET LATIN NOT CASESPECIFIC) PRIMARY INDEX ( passenger ); INSERT INTO titanic_train (2, 1, 1, 'Cumings; Mrs. John Bradley (Florence Briggs Thayer)', 'female', 38, 1, 0, 'PC 17599', 71.2833, 'C85', 'C'); INSERT INTO titanic_train (4, 1, 1, 'Futrelle; Mrs. Jacques Heath (Lily May Peel)', 'female', 35, 1, 0, '113803', 53.1, 'C123', 'S'); INSERT INTO titanic_train (7, 0, 1, 'McCarthy; Mr. Timothy J', 'male', 54, 0, 0, '17463', 51.8625, 'E46', 'S'); INSERT INTO titanic_train (10, 1, 2, 'Nasser; Mrs. Nicholas (Adele Achem)', 'female', 14, 1, 0, '237736', 30.0708, '', 'C'); INSERT INTO titanic_train (16, 1, 2, 'Hewlett; Mrs. (Mary D Kingcome) ', 'female', 55, 0, 0, '248706', 16, '', 'S'); INSERT INTO titanic_train (21, 0, 2, 'Fynney; Mr. Joseph J', 'male', 35, 0, 0, '239865', 26, '', 'S'); INSERT INTO titanic_train (40, 1, 3, 'Nicola-Yarred; Miss. Jamila', 'female', 14, 1, 0, '2651', 11.2417, '', 'C'); INSERT INTO titanic_train (61, 0, 3, 'Sirayanian; Mr. Orsen', 'male', 22, 0, 0, '2669', 7.2292, '', 'C'); INSERT INTO titanic_train (80, 1, 3, 'Dowdell; Miss. Elizabeth', 'female', 30, 0, 0, '364516', 12.475, '', 'S
SELECT * FROM titanic_train ORDER BY 1;
Input table:
passenger | survived | pclass | name | gender | age | sibsp | parch | ticket | fare | cabin | embarked |
---|---|---|---|---|---|---|---|---|---|---|---|
2 | 1 | 1 | Cumings; Mrs John Bradley (Florence Briggs Thayer) | female | 38 | 1 | 0 | PC 17599 | 71.2833 | CBS | C |
4 | 1 | 1 | Futrell; Mrs. Jacques Heath (Lily May Peel) | female | 35 | 1 | 0 | 113803 | 53.1 | C123 | S |
7 | 0 | 1 | McCarthy; Mr. Timothy J | male | 54 | 0 | 0 | 117463 | 51.8625 | E46 | S |
10 | 1 | 2 | Nasser; Mrs. Nicholas (Adele Achem) | female | 14 | 1 | 0 | 237736 | 30.0707 | C | |
16 | 1 | 2 | Hewlett; Mrs. (Mary D Kingcome) | female | 55 | 0 | 0 | 248706 | 16 | S | |
21 | 0 | 2 | Fynney; Mr. Joseph J | male | 35 | 0 | 0 | 239865 | 26 | S | |
40 | 1 | 3 | Nicola-Yarred; Miss. Jamila | female | 14 | 1 | 0 | 2651 | 11.2417 | C | |
61 | 0 | 3 | Sirayanian; Mr. Orsen | male | 22 | 0 | 0 | 2669 | 7.2292 | C | |
80 | 1 | 3 | Dowdell; Miss. Elizabeth | female | 30 | 0 | 0 | 364516 | 12.475 | S |
TD_TargetEncodingTransform Query with EncoderMethod CBM_BETA
SELECT * FROM TD_TargetEncodingFit ( ON titanic_train AS InputTable ON categoryTable AS CategoryTable DIMENSION OUT PERMANENT TABLE OutputTable(betaEncFitTbl) USING EncoderMethod('CBM_BETA') TargetColumns('gender','embarked') ResponseColumn ('survived') DefaultValues(-1, -2) ) AS dt;
CBM_BETA FitTable:
TD_ColumnName_TEFIT | TD_Category_TEFIT | TD_Alpha_TEFIT | TD_Beta_TEFIT | TD_DefaultCategory_TEFIT | TD_CategoryCount_TEFIT | TD_ColumnIndex_TEFIT | TD_EncoderMethod_TEFIT_CBM_BETA | gender | embarked |
---|---|---|---|---|---|---|---|---|---|
embarked | C | 3.666667 | 1.333333 | -2 | 2 | 1 | ? | ? | ? |
embarked | S | 3.666667 | 2.333333 | -2 | 2 | 1 | ? | ? | ? |
gender | female | 6.666667 | 0.333333 | -1 | 2 | 0 | ? | ? | ? |
gender | male | 0.666667 | 3.333333 | -1 | 2 | 0 | ? | ? | ? |
Transform query:
SELECT * FROM TD_TargetEncodingTransform ( ON titanic_train AS InputTable ON betaEncFitTbl AS FitTable DIMENSION USING Accumulate('passenger') ) AS dt ORDER BY passenger;
Transform output:
gender | embarked | passenger |
---|---|---|
0.952381 | 0.733333 | 2 |
0.952381 | 0.611111 | 4 |
0.166667 | 0.611111 | 7 |
0.952381 | 0.733333 | 10 |
0.952381 | 0.611111 | 16 |
0.166667 | 0.611111 | 21 |
0.952381 | 0.733333 | 40 |
0.166667 | 0.733333 | 61 |
0.952381 | 0.611111 | 80 |
TD_TargetEncodingFit Query with EncoderMethod CBM_DIRICHLET
SELECT * FROM TD_TargetEncodingFit ( ON titanic_train AS InputTable ON categoryTable AS CategoryTable DIMENSION OUT PERMANENT TABLE OutputTable(dirichletEncFitTbl) USING EncoderMethod('CBM_DIRICHLET') TargetColumns('gender','embarked') ResponseColumn ('pclass') DefaultValues(-1, -2) NumDistinctResponses(3) ) AS dt;
CBM_DIRICHLET FitTable:
TD_ColumnName_TEFIT | TD_Category_TEFIT | TD_Alpha1_TEFIT_gender | TD_Alpha2_TEFIT_gender | TD_Alpha3_TEFIT_gender | TD_Alpha1_TEFIT_embarked | TD_Alpha2_TEFIT_embarked | TD_Alpha3_TEFIT_embarked | TD_DefaultCategory_TEFIT | TD_CategoryCount_TEFIT | TD_ColumnIndex_TEFIT | TD_EncoderMethod_TEFIT_CBM_DIRICHLET | TD_NumResponses_TEFIT_3 |
---|---|---|---|---|---|---|---|---|---|---|---|---|
embarked | C | ? | ? | ? | 1.333333 | 1.333333 | 2.333333 | -2 | 2 | 1 | ? | ? |
embarked | S | ? | ? | ? | 2.333333 | 2.333333 | 1.333333 | -2 | 2 | 1 | ? | ? |
gender | female | 2.333333 | 2.333333 | 2.333333 | ? | ? | ? | -1 | 2 | 0 | ? | ? |
gender | male | 1.333333 | 1.333333 | 1.333333 | ? | ? | ? | -1 | 2 | 0 | ? | ? |
Transform query:
SELECT * FROM TD_TargetEncodingTransform ( ON titanic_train AS InputTable ON dirichletEncFitTbl AS FitTable DIMENSION USING Accumulate('passenger') ) AS dt ORDER BY passenger;
Transform output:
gender_1 | gender_2 | gender_3 | embarked_1 | embarked_2 | embarked_3 | passenger |
---|---|---|---|---|---|---|
0.333333 | 0.333333 | 0.333333 | 0.266667 | 0.266667 | 0.466667 | 2 |
0.333333 | 0.333333 | 0.333333 | 0.388889 | 0.388889 | 0.222222 | 4 |
0.333333 | 0.333333 | 0.333333 | 0.388889 | 0.388889 | 0.222222 | 7 |
0.333333 | 0.333333 | 0.333333 | 0.266667 | 0.266667 | 0.466667 | 10 |
0.333333 | 0.333333 | 0.333333 | 0.388889 | 0.388889 | 0.222222 | 16 |
0.333333 | 0.333333 | 0.333333 | 0.388889 | 0.388889 | 0.222222 | 21 |
0.333333 | 0.333333 | 0.333333 | 0.266667 | 0.266667 | 0.466667 | 40 |
0.333333 | 0.333333 | 0.333333 | 0.266667 | 0.266667 | 0.466667 | 61 |
0.333333 | 0.333333 | 0.333333 | 0.388889 | 0.388889 | 0.222222 | 80 |
TD_TargetEncodingTransform Query with EncoderMethod CBM_GAUSSIAN_INVERSE_GAMMA
SELECT * FROM TD_TargetEncodingFit ( ON titanic_train AS InputTable ON categoryTable AS CategoryTable DIMENSION OUT PERMANENT TABLE OutputTable(GIGEncFitTbl) USING EncoderMethod('CBM_GAUSSIAN_INVERSE_GAMMA') TargetColumns('gender','embarked') ResponseColumn ('age') DefaultValues(-1, -2) ) AS dt;
Output:
TD_ColumnName_TEFIT | TD_Category_TEFIT | TD_U0_TEFIT | TD_V0_TEFIT | TD_Alpha0_TEFIT | TD_Beta0_Count | TD_DefaultCategory_TEFIT | TD_CategoryCount_TEFIT | TD_Columnindex_TEFIT | TD_EncoderMehod_TEFIT_CBM_GAUSSIAN_INVERSE_GAMMA | gender | embarked |
---|---|---|---|---|---|---|---|---|---|---|---|
embarked | C | 17.600000 | 5.000000 | 4.000000 | 258.600000 | -2 | 2 | 1 | ? | ? | ? |
embarked | S | 34.833333 | 6.000000 | 4.500000 | 798.366667 | -2 | 2 | 1 | ? | ? | ? |
gender | female | 26.571429 | 7.000000 | 5.000000 | 534.857143 | -1 | 2 | 0 | ? | ? | ? |
gender | male | 27.750000 | 4.000000 | 3.500000 | 643.875000 | -1 | 2 | 0 | ? | ? | ? |
Transform query:
SELECT * FROM TD_TargetEncodingTransform ( ON titanic_train AS InputTable ON GIGEncFitTbl AS FitTable DIMENSION USING Accumulate('passenger') ) AS dt ORDER BY passenger;
Transform output:
gender | embarked | passenger |
---|---|---|
26.57143 | 17.6 | 2 |
26.57143 | 34.83333 | 4 |
27.75 | 34.83333 | 7 |
26.57143 | 17.6 | 10 |
26.57143 | 34.83333 | 16 |
27.75 | 34.83333 | 21 |
26.57143 | 17.6 | 40 |
27.75 | 17.6 | 61 |
26.57143 | 34.83333 | 80 |