TD_TargetEncodingTransform Example
InputTable: The following table contains a subset of titantic dataset.
CREATE TABLE titanic_train ( passenger INTEGER, survived INTEGER, pclass INTEGER, name VARCHAR(90) CHARACTER SET LATIN NOT CASESPECIFIC, gender VARCHAR(10) CHARACTER SET LATIN NOT CASESPECIFIC, age INTEGER, sibsp INTEGER, parch INTEGER, ticket VARCHAR(20) CHARACTER SET LATIN NOT CASESPECIFIC, fare FLOAT, cabin VARCHAR(20) CHARACTER SET LATIN NOT CASESPECIFIC, embarked VARCHAR(10) CHARACTER SET LATIN NOT CASESPECIFIC) PRIMARY INDEX ( passenger ); INSERT INTO titanic_train (2, 1, 1, 'Cumings; Mrs. John Bradley (Florence Briggs Thayer)', 'female', 38, 1, 0, 'PC 17599', 71.2833, 'C85', 'C'); INSERT INTO titanic_train (4, 1, 1, 'Futrelle; Mrs. Jacques Heath (Lily May Peel)', 'female', 35, 1, 0, '113803', 53.1, 'C123', 'S'); INSERT INTO titanic_train (7, 0, 1, 'McCarthy; Mr. Timothy J', 'male', 54, 0, 0, '17463', 51.8625, 'E46', 'S'); INSERT INTO titanic_train (10, 1, 2, 'Nasser; Mrs. Nicholas (Adele Achem)', 'female', 14, 1, 0, '237736', 30.0708, '', 'C'); INSERT INTO titanic_train (16, 1, 2, 'Hewlett; Mrs. (Mary D Kingcome) ', 'female', 55, 0, 0, '248706', 16, '', 'S'); INSERT INTO titanic_train (21, 0, 2, 'Fynney; Mr. Joseph J', 'male', 35, 0, 0, '239865', 26, '', 'S'); INSERT INTO titanic_train (40, 1, 3, 'Nicola-Yarred; Miss. Jamila', 'female', 14, 1, 0, '2651', 11.2417, '', 'C'); INSERT INTO titanic_train (61, 0, 3, 'Sirayanian; Mr. Orsen', 'male', 22, 0, 0, '2669', 7.2292, '', 'C'); INSERT INTO titanic_train (80, 1, 3, 'Dowdell; Miss. Elizabeth', 'female', 30, 0, 0, '364516', 12.475, '', 'S
SELECT * FROM titanic_train ORDER BY 1;
Input table:
| passenger | survived | pclass | name | gender | age | sibsp | parch | ticket | fare | cabin | embarked |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 2 | 1 | 1 | Cumings; Mrs John Bradley (Florence Briggs Thayer) | female | 38 | 1 | 0 | PC 17599 | 71.2833 | CBS | C |
| 4 | 1 | 1 | Futrell; Mrs. Jacques Heath (Lily May Peel) | female | 35 | 1 | 0 | 113803 | 53.1 | C123 | S |
| 7 | 0 | 1 | McCarthy; Mr. Timothy J | male | 54 | 0 | 0 | 117463 | 51.8625 | E46 | S |
| 10 | 1 | 2 | Nasser; Mrs. Nicholas (Adele Achem) | female | 14 | 1 | 0 | 237736 | 30.0707 | C | |
| 16 | 1 | 2 | Hewlett; Mrs. (Mary D Kingcome) | female | 55 | 0 | 0 | 248706 | 16 | S | |
| 21 | 0 | 2 | Fynney; Mr. Joseph J | male | 35 | 0 | 0 | 239865 | 26 | S | |
| 40 | 1 | 3 | Nicola-Yarred; Miss. Jamila | female | 14 | 1 | 0 | 2651 | 11.2417 | C | |
| 61 | 0 | 3 | Sirayanian; Mr. Orsen | male | 22 | 0 | 0 | 2669 | 7.2292 | C | |
| 80 | 1 | 3 | Dowdell; Miss. Elizabeth | female | 30 | 0 | 0 | 364516 | 12.475 | S |
TD_TargetEncodingTransform Query with EncoderMethod CBM_BETA
SELECT * FROM TD_TargetEncodingFit (
ON titanic_train AS InputTable
ON categoryTable AS CategoryTable DIMENSION
OUT PERMANENT TABLE OutputTable(betaEncFitTbl)
USING
EncoderMethod('CBM_BETA')
TargetColumns('gender','embarked')
ResponseColumn ('survived')
DefaultValues(-1, -2)
) AS dt;
CBM_BETA FitTable:
| TD_ColumnName_TEFIT | TD_Category_TEFIT | TD_Alpha_TEFIT | TD_Beta_TEFIT | TD_DefaultCategory_TEFIT | TD_CategoryCount_TEFIT | TD_ColumnIndex_TEFIT | TD_EncoderMethod_TEFIT_CBM_BETA | gender | embarked |
|---|---|---|---|---|---|---|---|---|---|
| embarked | C | 3.666667 | 1.333333 | -2 | 2 | 1 | ? | ? | ? |
| embarked | S | 3.666667 | 2.333333 | -2 | 2 | 1 | ? | ? | ? |
| gender | female | 6.666667 | 0.333333 | -1 | 2 | 0 | ? | ? | ? |
| gender | male | 0.666667 | 3.333333 | -1 | 2 | 0 | ? | ? | ? |
Transform query:
SELECT * FROM TD_TargetEncodingTransform (
ON titanic_train AS InputTable
ON betaEncFitTbl AS FitTable DIMENSION
USING
Accumulate('passenger')
) AS dt ORDER BY passenger;
Transform output:
| gender | embarked | passenger |
|---|---|---|
| 0.952381 | 0.733333 | 2 |
| 0.952381 | 0.611111 | 4 |
| 0.166667 | 0.611111 | 7 |
| 0.952381 | 0.733333 | 10 |
| 0.952381 | 0.611111 | 16 |
| 0.166667 | 0.611111 | 21 |
| 0.952381 | 0.733333 | 40 |
| 0.166667 | 0.733333 | 61 |
| 0.952381 | 0.611111 | 80 |
TD_TargetEncodingFit Query with EncoderMethod CBM_DIRICHLET
SELECT * FROM TD_TargetEncodingFit (
ON titanic_train AS InputTable
ON categoryTable AS CategoryTable DIMENSION
OUT PERMANENT TABLE OutputTable(dirichletEncFitTbl)
USING
EncoderMethod('CBM_DIRICHLET')
TargetColumns('gender','embarked')
ResponseColumn ('pclass')
DefaultValues(-1, -2)
NumDistinctResponses(3)
) AS dt;
CBM_DIRICHLET FitTable:
| TD_ColumnName_TEFIT | TD_Category_TEFIT | TD_Alpha1_TEFIT_gender | TD_Alpha2_TEFIT_gender | TD_Alpha3_TEFIT_gender | TD_Alpha1_TEFIT_embarked | TD_Alpha2_TEFIT_embarked | TD_Alpha3_TEFIT_embarked | TD_DefaultCategory_TEFIT | TD_CategoryCount_TEFIT | TD_ColumnIndex_TEFIT | TD_EncoderMethod_TEFIT_CBM_DIRICHLET | TD_NumResponses_TEFIT_3 |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| embarked | C | ? | ? | ? | 1.333333 | 1.333333 | 2.333333 | -2 | 2 | 1 | ? | ? |
| embarked | S | ? | ? | ? | 2.333333 | 2.333333 | 1.333333 | -2 | 2 | 1 | ? | ? |
| gender | female | 2.333333 | 2.333333 | 2.333333 | ? | ? | ? | -1 | 2 | 0 | ? | ? |
| gender | male | 1.333333 | 1.333333 | 1.333333 | ? | ? | ? | -1 | 2 | 0 | ? | ? |
Transform query:
SELECT * FROM TD_TargetEncodingTransform (
ON titanic_train AS InputTable
ON dirichletEncFitTbl AS FitTable DIMENSION
USING
Accumulate('passenger')
) AS dt ORDER BY passenger;
Transform output:
| gender_1 | gender_2 | gender_3 | embarked_1 | embarked_2 | embarked_3 | passenger |
|---|---|---|---|---|---|---|
| 0.333333 | 0.333333 | 0.333333 | 0.266667 | 0.266667 | 0.466667 | 2 |
| 0.333333 | 0.333333 | 0.333333 | 0.388889 | 0.388889 | 0.222222 | 4 |
| 0.333333 | 0.333333 | 0.333333 | 0.388889 | 0.388889 | 0.222222 | 7 |
| 0.333333 | 0.333333 | 0.333333 | 0.266667 | 0.266667 | 0.466667 | 10 |
| 0.333333 | 0.333333 | 0.333333 | 0.388889 | 0.388889 | 0.222222 | 16 |
| 0.333333 | 0.333333 | 0.333333 | 0.388889 | 0.388889 | 0.222222 | 21 |
| 0.333333 | 0.333333 | 0.333333 | 0.266667 | 0.266667 | 0.466667 | 40 |
| 0.333333 | 0.333333 | 0.333333 | 0.266667 | 0.266667 | 0.466667 | 61 |
| 0.333333 | 0.333333 | 0.333333 | 0.388889 | 0.388889 | 0.222222 | 80 |
TD_TargetEncodingTransform Query with EncoderMethod CBM_GAUSSIAN_INVERSE_GAMMA
SELECT * FROM TD_TargetEncodingFit (
ON titanic_train AS InputTable
ON categoryTable AS CategoryTable DIMENSION
OUT PERMANENT TABLE OutputTable(GIGEncFitTbl)
USING
EncoderMethod('CBM_GAUSSIAN_INVERSE_GAMMA')
TargetColumns('gender','embarked')
ResponseColumn ('age')
DefaultValues(-1, -2)
) AS dt;
Output:
| TD_ColumnName_TEFIT | TD_Category_TEFIT | TD_U0_TEFIT | TD_V0_TEFIT | TD_Alpha0_TEFIT | TD_Beta0_Count | TD_DefaultCategory_TEFIT | TD_CategoryCount_TEFIT | TD_Columnindex_TEFIT | TD_EncoderMehod_TEFIT_CBM_GAUSSIAN_INVERSE_GAMMA | gender | embarked |
|---|---|---|---|---|---|---|---|---|---|---|---|
| embarked | C | 17.600000 | 5.000000 | 4.000000 | 258.600000 | -2 | 2 | 1 | ? | ? | ? |
| embarked | S | 34.833333 | 6.000000 | 4.500000 | 798.366667 | -2 | 2 | 1 | ? | ? | ? |
| gender | female | 26.571429 | 7.000000 | 5.000000 | 534.857143 | -1 | 2 | 0 | ? | ? | ? |
| gender | male | 27.750000 | 4.000000 | 3.500000 | 643.875000 | -1 | 2 | 0 | ? | ? | ? |
Transform query:
SELECT * FROM TD_TargetEncodingTransform (
ON titanic_train AS InputTable
ON GIGEncFitTbl AS FitTable DIMENSION
USING
Accumulate('passenger')
) AS dt ORDER BY passenger;
Transform output:
| gender | embarked | passenger |
|---|---|---|
| 26.57143 | 17.6 | 2 |
| 26.57143 | 34.83333 | 4 |
| 27.75 | 34.83333 | 7 |
| 26.57143 | 17.6 | 10 |
| 26.57143 | 34.83333 | 16 |
| 27.75 | 34.83333 | 21 |
| 26.57143 | 17.6 | 40 |
| 27.75 | 17.6 | 61 |
| 26.57143 | 34.83333 | 80 |