TD_TargetEncodingFit Example
The following input table contains a subset of titantic dataset.
CREATE TABLE titanic_train ( passenger INTEGER, survived INTEGER, pclass INTEGER, name VARCHAR(90) CHARACTER SET LATIN NOT CASESPECIFIC, gender VARCHAR(10) CHARACTER SET LATIN NOT CASESPECIFIC, age INTEGER, sibsp INTEGER, parch INTEGER, ticket VARCHAR(20) CHARACTER SET LATIN NOT CASESPECIFIC, fare FLOAT, cabin VARCHAR(20) CHARACTER SET LATIN NOT CASESPECIFIC, embarked VARCHAR(10) CHARACTER SET LATIN NOT CASESPECIFIC) PRIMARY INDEX ( passenger ); INSERT INTO titanic_train (2, 1, 1, 'Cumings; Mrs. John Bradley (Florence Briggs Thayer)', 'female', 38, 1, 0, 'PC 17599', 71.2833, 'C85', 'C'); INSERT INTO titanic_train (4, 1, 1, 'Futrelle; Mrs. Jacques Heath (Lily May Peel)', 'female', 35, 1, 0, '113803', 53.1, 'C123', 'S'); INSERT INTO titanic_train (7, 0, 1, 'McCarthy; Mr. Timothy J', 'male', 54, 0, 0, '17463', 51.8625, 'E46', 'S'); INSERT INTO titanic_train (10, 1, 2, 'Nasser; Mrs. Nicholas (Adele Achem)', 'female', 14, 1, 0, '237736', 30.0708, '', 'C'); INSERT INTO titanic_train (16, 1, 2, 'Hewlett; Mrs. (Mary D Kingcome) ', 'female', 55, 0, 0, '248706', 16, '', 'S'); INSERT INTO titanic_train (21, 0, 2, 'Fynney; Mr. Joseph J', 'male', 35, 0, 0, '239865', 26, '', 'S'); INSERT INTO titanic_train (40, 1, 3, 'Nicola-Yarred; Miss. Jamila', 'female', 14, 1, 0, '2651', 11.2417, '', 'C'); INSERT INTO titanic_train (61, 0, 3, 'Sirayanian; Mr. Orsen', 'male', 22, 0, 0, '2669', 7.2292, '', 'C'); INSERT INTO titanic_train (80, 1, 3, 'Dowdell; Miss. Elizabeth', 'female', 30, 0, 0, '364516', 12.475, '', 'S');
SELECT * FROM titanic_train ORDER BY 1;
passenger | survived | pclass | name | gender | age | sibsp | parch | ticket | fare | cabin | embarked |
---|---|---|---|---|---|---|---|---|---|---|---|
2 | 1 | 1 | Cumings; Mrs John Bradley (Florence Briggs Thayer) | female | 38 | 1 | 0 | PC 17599 | 71.2833 | CBS | C |
4 | 1 | 1 | Futrell; Mrs. Jacques Heath (Lily May Peel) | female | 35 | 1 | 0 | 113803 | 53.1 | C123 | S |
7 | 0 | 1 | McCarthy; Mr. Timothy J | male | 54 | 0 | 0 | 117463 | 51.8625 | E46 | S |
10 | 1 | 2 | Nasser; Mrs. Nicholas (Adele Achem) | female | 14 | 1 | 0 | 237736 | 30.0707 | C | |
16 | 1 | 2 | Hewlett; Mrs. (Mary D Kingcome) | female | 55 | 0 | 0 | 248706 | 16 | S | |
21 | 0 | 2 | Fynney; Mr. Joseph J | male | 35 | 0 | 0 | 239865 | 26 | S | |
40 | 1 | 3 | Nicola-Yarred; Miss. Jamila | female | 14 | 1 | 0 | 2651 | 11.2417 | C | |
61 | 0 | 3 | Sirayanian; Mr. Orsen | male | 22 | 0 | 0 | 2669 | 7.2292 | C | |
80 | 1 | 3 | Dowdell; Miss. Elizabeth | female | 30 | 0 | 0 | 364516 | 12.475 | S |
CategoryTable:
CREATE TABLE categoryTable AS ( SELECT ColumnName, count(*) AS CategoryCount FROM ( SELECT * FROM TD_CategoricalSummary( ON titanic_train AS InputTable USING TargetColumns('gender','embarked') ) AS dt WHERE DistinctValue IS NOT NULL) AS CatTable GROUP BY ColumnName) WITH data;
SELECT * FROM categoryTable;
ColumnName | CategoryCount |
---|---|
embarked | 2 |
gender | 2 |
TD_TargetEncodingFit Query with EncoderMethod CBM_BETA
TD_TargetEncodingFit ( ON titanic_train AS InputTable ON categoryTable AS CategoryTable DIMENSION USING EncoderMethod('CBM_BETA') TargetColumns('gender','embarked') ResponseColumn ('survived') ...DefaultValues(-1, -2) )
Output:
TD_ColumnName_TEFIT | TD_Category_TEFIT | TD_Alpha_TEFIT | TD_Beta_TEFIT | TD_DefaultCategory_TEFIT | TD_CategoryCount_TEFIT | TD_ColumnIndex_TEFIT | TD_EncoderMethod_TEFIT_CBM_BETA | gender | embarked |
---|---|---|---|---|---|---|---|---|---|
embarked | C | 3.666667 | 1.333333 | -2 | 2 | 1 | ? | ? | ? |
embarked | S | 3.666667 | 2.333333 | -2 | 2 | 1 | ? | ? | ? |
gender | female | 6.666667 | 0.333333 | -1 | 2 | 0 | ? | ? | ? |
gender | male | 0.666667 | 3.333333 | -1 | 2 | 0 | ? | ? | ? |
TD_TargetEncodingFit Query with EncoderMethod CBM_DIRICHLET
TD_TargetEncodingFit ( ON titanic_train AS InputTable ON categoryTable AS CategoryTable DIMENSION USING EncoderMethod('CBM_DIRICHLET') TargetColumns('gender','embarked') ResponseColumn ('pclass') DefaultValues(-1, -2) NumDistinctResponses(3) )
Output:
TD_ColumnName_TEFIT | TD_Category_TEFIT | TD_Alpha1_TEFIT_gender | TD_Alpha2_TEFIT_gender | TD_Alpha3_TEFIT_gender | TD_Alpha1_TEFIT_embarked | TD_Alpha2_TEFIT_embarked | TD_Alpha3_TEFIT_embarked | TD_DefaultCategory_TEFIT | TD_CategoryCount_TEFIT | TD_ColumnIndex_TEFIT | TD_EncoderMethod_TEFIT_CBM_DIRICHLET | TD_NumResponses_TEFIT_3 |
---|---|---|---|---|---|---|---|---|---|---|---|---|
embarked | C | ? | ? | ? | 1.333333 | 1.333333 | 2.333333 | -2 | 2 | 1 | ? | ? |
embarked | S | ? | ? | ? | 2.333333 | 2.333333 | 1.333333 | -2 | 2 | 1 | ? | ? |
gender | female | 2.333333 | 2.333333 | 2.333333 | ? | ? | ? | -1 | 2 | 0 | ? | ? |
gender | male | 1.333333 | 1.333333 | 1.333333 | ? | ? | ? | -1 | 2 | 0 | ? | ? |
TD_TargetEncodingFit Query with EncoderMethod CBM_GAUSSIAN_INVERSE_GAMMA
TD_TargetEncodingFit ( ON titanic_train AS InputTable ON categoryTable AS CategoryTable DIMENSION USING EncoderMethod('CBM_GAUSSIAN_INVERSE_GAMMA) TargetColumns('gender','embarked') ResponseColumn ('age') DefaultValues(-1, -2) )
Output:
TD_ColumnName_TEFIT | TD_Category_TEFIT | TD_U0_TEFIT | TD_V0_TEFIT | TD_Alpha0_TEFIT | TD_Beta0_TEFIT | TD_DefaultCategory_TEFIT | TD_CategoryCount_TEFIT | TD_Columnindex_TEFIT | TD_EncoderMehod_TEFIT_CBM_GAUSSIAN_INVERSE_GAMMA | gender | embarked |
---|---|---|---|---|---|---|---|---|---|---|---|
embarked | C | 17.600000 | 5.000000 | 4.000000 | 258.600000 | -2 | 2 | 1 | ? | ? | ? |
embarked | S | 34.833333 | 6.000000 | 4.500000 | 798.366667 | -2 | 2 | 1 | ? | ? | ? |
gender | female | 26.571429 | 7.000000 | 5.000000 | 534.857143 | -1 | 2 | 0 | ? | ? | ? |
gender | male | 27.750000 | 4.000000 | 3.500000 | 643.875000 | -1 | 2 | 0 | ? | ? | ? |