TD_TargetEncodingFit Examples | TargetEncodingFit - TD_TargetEncodingFit Examples - Teradata Vantage

Teradata® VantageCloud Lake

Deployment
VantageCloud
Edition
Lake
Product
Teradata Vantage
Published
January 2023
ft:locale
en-US
ft:lastEdition
2024-12-11
dita:mapPath
phg1621910019905.ditamap
dita:ditavalPath
pny1626732985837.ditaval
dita:id
phg1621910019905

TD_TargetEncodingFit Example

The following input table contains a subset of titantic dataset.

CREATE TABLE titanic_train (
   passenger INTEGER,
   survived INTEGER,
   pclass INTEGER,
   name VARCHAR(90) CHARACTER SET LATIN NOT CASESPECIFIC,
   gender VARCHAR(10) CHARACTER SET LATIN NOT CASESPECIFIC,
   age INTEGER,
   sibsp INTEGER,
   parch INTEGER,
   ticket VARCHAR(20) CHARACTER SET LATIN NOT CASESPECIFIC,
   fare FLOAT,
   cabin VARCHAR(20) CHARACTER SET LATIN NOT CASESPECIFIC,
   embarked VARCHAR(10) CHARACTER SET LATIN NOT CASESPECIFIC)
PRIMARY INDEX ( passenger );
 
INSERT INTO titanic_train (2, 1, 1, 'Cumings; Mrs. John Bradley (Florence Briggs Thayer)', 'female', 38, 1, 0, 'PC 17599', 71.2833, 'C85', 'C');
INSERT INTO titanic_train (4, 1, 1, 'Futrelle; Mrs. Jacques Heath (Lily May Peel)', 'female', 35, 1, 0, '113803', 53.1, 'C123', 'S');
INSERT INTO titanic_train (7, 0, 1, 'McCarthy; Mr. Timothy J', 'male', 54, 0, 0, '17463', 51.8625, 'E46', 'S');
INSERT INTO titanic_train (10, 1, 2, 'Nasser; Mrs. Nicholas (Adele Achem)', 'female', 14, 1, 0, '237736', 30.0708, '', 'C');
INSERT INTO titanic_train (16, 1, 2, 'Hewlett; Mrs. (Mary D Kingcome) ', 'female', 55, 0, 0, '248706', 16, '', 'S');
INSERT INTO titanic_train (21, 0, 2, 'Fynney; Mr. Joseph J', 'male', 35, 0, 0, '239865', 26, '', 'S');
INSERT INTO titanic_train (40, 1, 3, 'Nicola-Yarred; Miss. Jamila', 'female', 14, 1, 0, '2651', 11.2417, '', 'C');
INSERT INTO titanic_train (61, 0, 3, 'Sirayanian; Mr. Orsen', 'male', 22, 0, 0, '2669', 7.2292, '', 'C');
INSERT INTO titanic_train (80, 1, 3, 'Dowdell; Miss. Elizabeth', 'female', 30, 0, 0, '364516', 12.475, '', 'S');
SELECT * FROM titanic_train ORDER BY 1;
passenger survived pclass name gender age sibsp parch ticket fare cabin embarked
2 1 1 Cumings; Mrs John Bradley (Florence Briggs Thayer) female 38 1 0 PC 17599 71.2833 CBS C
4 1 1 Futrell; Mrs. Jacques Heath (Lily May Peel) female 35 1 0 113803 53.1 C123 S
7 0 1 McCarthy; Mr. Timothy J male 54 0 0 117463 51.8625 E46 S
10 1 2 Nasser; Mrs. Nicholas (Adele Achem) female 14 1 0 237736 30.0707   C
16 1 2 Hewlett; Mrs. (Mary D Kingcome) female 55 0 0 248706 16   S
21 0 2 Fynney; Mr. Joseph J male 35 0 0 239865 26   S
40 1 3 Nicola-Yarred; Miss. Jamila female 14 1 0 2651 11.2417   C
61 0 3 Sirayanian; Mr. Orsen male 22 0 0 2669 7.2292   C
80 1 3 Dowdell; Miss. Elizabeth female 30 0 0 364516 12.475   S

CategoryTable:

CREATE TABLE categoryTable AS (
SELECT ColumnName, count(*) AS CategoryCount FROM (
SELECT * FROM TD_CategoricalSummary(
ON titanic_train AS InputTable
USING
TargetColumns('gender','embarked')
) AS dt WHERE DistinctValue IS NOT NULL) AS CatTable GROUP BY ColumnName) WITH data;
SELECT * FROM categoryTable;
ColumnName CategoryCount
embarked 2
gender 2

TD_TargetEncodingFit Query with EncoderMethod CBM_BETA

TD_TargetEncodingFit (
   ON titanic_train AS InputTable
   ON categoryTable AS CategoryTable DIMENSION
   USING
   EncoderMethod('CBM_BETA')
   TargetColumns('gender','embarked')
   ResponseColumn ('survived')
...DefaultValues(-1, -2)
)

Output:

TD_ColumnName_TEFIT TD_Category_TEFIT TD_Alpha_TEFIT TD_Beta_TEFIT TD_DefaultCategory_TEFIT TD_CategoryCount_TEFIT TD_ColumnIndex_TEFIT TD_EncoderMethod_TEFIT_CBM_BETA gender embarked
embarked C 3.666667 1.333333 -2 2 1 ? ? ?
embarked S 3.666667 2.333333 -2 2 1 ? ? ?
gender female 6.666667 0.333333 -1 2 0 ? ? ?
gender male 0.666667 3.333333 -1 2 0 ? ? ?

TD_TargetEncodingFit Query with EncoderMethod CBM_DIRICHLET

TD_TargetEncodingFit (
   ON titanic_train AS InputTable
   ON categoryTable AS CategoryTable DIMENSION
   USING
   EncoderMethod('CBM_DIRICHLET')
   TargetColumns('gender','embarked')
   ResponseColumn ('pclass')
   DefaultValues(-1, -2)
   NumDistinctResponses(3)
)

Output:

TD_ColumnName_TEFIT TD_Category_TEFIT TD_Alpha1_TEFIT_gender TD_Alpha2_TEFIT_gender TD_Alpha3_TEFIT_gender TD_Alpha1_TEFIT_embarked TD_Alpha2_TEFIT_embarked TD_Alpha3_TEFIT_embarked TD_DefaultCategory_TEFIT TD_CategoryCount_TEFIT TD_ColumnIndex_TEFIT TD_EncoderMethod_TEFIT_CBM_DIRICHLET TD_NumResponses_TEFIT_3
embarked C ? ? ? 1.333333 1.333333 2.333333 -2 2 1 ? ?
embarked S ? ? ? 2.333333 2.333333 1.333333 -2 2 1 ? ?
gender female 2.333333 2.333333 2.333333 ? ? ? -1 2 0 ? ?
gender male 1.333333 1.333333 1.333333 ? ? ? -1 2 0 ? ?

TD_TargetEncodingFit Query with EncoderMethod CBM_GAUSSIAN_INVERSE_GAMMA

TD_TargetEncodingFit (
   ON titanic_train AS InputTable
   ON categoryTable AS CategoryTable DIMENSION
   USING
   EncoderMethod('CBM_GAUSSIAN_INVERSE_GAMMA)
   TargetColumns('gender','embarked')
   ResponseColumn ('age')
   DefaultValues(-1, -2)
)

Output:

TD_ColumnName_TEFIT TD_Category_TEFIT TD_U0_TEFIT TD_V0_TEFIT TD_Alpha0_TEFIT TD_Beta0_TEFIT TD_DefaultCategory_TEFIT TD_CategoryCount_TEFIT TD_Columnindex_TEFIT TD_EncoderMehod_TEFIT_CBM_GAUSSIAN_INVERSE_GAMMA gender embarked
embarked C 17.600000 5.000000 4.000000 258.600000 -2 2 1 ? ? ?
embarked S 34.833333 6.000000 4.500000 798.366667 -2 2 1 ? ? ?
gender female 26.571429 7.000000 5.000000 534.857143 -1 2 0 ? ? ?
gender male 27.750000 4.000000 3.500000 643.875000 -1 2 0 ? ? ?