Using SamplingStrategy with smotenc Values - Teradata Vantage

Teradata® VantageCloud Lake

Deployment
VantageCloud
Edition
Lake
Product
Teradata Vantage
Published
January 2023
ft:locale
en-US
ft:lastEdition
2024-12-11
dita:mapPath
phg1621910019905.ditamap
dita:ditavalPath
pny1626732985837.ditaval
dita:id
phg1621910019905
The following is an example of how to use TD_SMOTE using SamplingStrategy with smotenc values:
  1. Prepare the input table.
  2. Compute the MedianStandardDeviation argument from the numerical input columns.
  3. Generate EncodingsTable from categorical input columns.
  4. Using the results, run the smotenc query.

Used known titanic dataset.

Use the following code to create the table and insert the rows used in this example:

CREATE MULTISET TABLE titanic_sample ,FALLBACK ,
     NO BEFORE JOURNAL,
     NO AFTER JOURNAL,
     CHECKSUM = DEFAULT,
     DEFAULT MERGEBLOCKRATIO,
     MAP = TD_MAP1
     (
      passenger INTEGER,
      survived INTEGER,
      pclass INTEGER,
      name VARCHAR(90) CHARACTER SET LATIN NOT CASESPECIFIC,
      gender VARCHAR(10) CHARACTER SET LATIN NOT CASESPECIFIC,
      age INTEGER,
      sibsp INTEGER,
      parch INTEGER,
      ticket VARCHAR(20) CHARACTER SET LATIN NOT CASESPECIFIC,
      fare FLOAT,
      cabin VARCHAR(20) CHARACTER SET LATIN NOT CASESPECIFIC,
      embarked VARCHAR(10) CHARACTER SET LATIN NOT CASESPECIFIC)
PRIMARY INDEX ( passenger );
INSERT INTO titanic_sample (passenger, survived, pclass, name, gender, age, sibsp, parch, ticket, fare, cabin, embarked) VALUES(7, 0, 1, 'McCarthy; Mr. Timothy J', 'male', 54, 0, 0, '17463', 51.8625, 'E46', 'S');
INSERT INTO titanic_sample (passenger, survived, pclass, name, gender, age, sibsp, parch, ticket, fare, cabin, embarked) VALUES(121, 0, 2, 'Hickman; Mr. Stanley George', 'male', 21, 2, 0, 'S.O.C. 14879', 73.5, '', 'S');
INSERT INTO titanic_sample (passenger, survived, pclass, name, gender, age, sibsp, parch, ticket, fare, cabin, embarked) VALUES(219, 1, 1, 'Bazzani; Miss. Albina', 'female', 32, 0, 0, '11813', 76.2917, 'D15', 'C');
INSERT INTO titanic_sample (passenger, survived, pclass, name, gender, age, sibsp, parch, ticket, fare, cabin, embarked) VALUES(282, 0, 3, 'Olsson; Mr. Nils Johan Goransson', 'male', 28, 0, 0, '347464', 7.8542, '', 'S');
INSERT INTO titanic_sample (passenger, survived, pclass, name, gender, age, sibsp, parch, ticket, fare, cabin, embarked) VALUES(387, 0, 3, 'Goodwin; Master. Sidney Leonard', 'male', 1, 5, 2, 'CA 2144', 46.9, '', 'S');
INSERT INTO titanic_sample (passenger, survived, pclass, name, gender, age, sibsp, parch, ticket, fare, cabin, embarked) VALUES(403, 0, 3, 'Jussila; Miss. Mari Aina', 'female', 21, 1, 0, '4137', 9.825, '', 'S');
INSERT INTO titanic_sample (passenger, survived, pclass, name, gender, age, sibsp, parch, ticket, fare, cabin, embarked) VALUES(542, 0, 3, 'Andersson; Miss. Ingeborg Constanzia', 'female', 9, 4, 2, '347082', 31.275, '', 'S');
INSERT INTO titanic_sample (passenger, survived, pclass, name, gender, age, sibsp, parch, ticket, fare, cabin, embarked) VALUES(645, 1, 3, 'Baclini; Miss. Eugenie', 'female', 0, 2, 1, '2666', 19.2583, '', 'C');
INSERT INTO titanic_sample (passenger, survived, pclass, name, gender, age, sibsp, parch, ticket, fare, cabin, embarked) VALUES(660, 0, 1, 'Newell; Mr. Arthur Webster', 'male', 58, 0, 2, '35273', 113.275, 'D48', 'C');
INSERT INTO titanic_sample (passenger, survived, pclass, name, gender, age, sibsp, parch, ticket, fare, cabin, embarked) VALUES(669, 0, 3, 'Cook; Mr. Jacob', 'male', 43, 0, 0, 'A/5 3536', 8.05, '', 'S');
INSERT INTO titanic_sample (passenger, survived, pclass, name, gender, age, sibsp, parch, ticket, fare, cabin, embarked) VALUES(687, 0, 3, 'Panula; Mr. Jaako Arnold', 'male', 14, 4, 1, '3101295', 39.6875, '', 'S');
INSERT INTO titanic_sample (passenger, survived, pclass, name, gender, age, sibsp, parch, ticket, fare, cabin, embarked) VALUES(740, 0, 3, 'Nankoff; Mr. Minko', 'male', NULL, 0, 0, '349218', 7.8958, '', 'S');
INSERT INTO titanic_sample (passenger, survived, pclass, name, gender, age, sibsp, parch, ticket, fare, cabin, embarked) VALUES(834, 0, 3, 'Augustsson; Mr. Albert', 'male', 23, 0, 0, '347468', 7.8542, '', 'S');
INSERT INTO titanic_sample (passenger, survived, pclass, name, gender, age, sibsp, parch, ticket, fare, cabin, embarked) VALUES(835, 0, 3, 'Allum; Mr. Owen George', 'male', 18, 0, 0, '2223', 8.3, '', 'S');
INSERT INTO titanic_sample (passenger, survived, pclass, name, gender, age, sibsp, parch, ticket, fare, cabin, embarked) VALUES(877, 0, 3, 'Gustafsson; Mr. Alfred Ossian', 'male', 20, 0, 0, '7534', 9.8458, '', 'S');

TD_UnivariateStatistics SQL Call to Compute MedianStandardDeviation Argument

SELECT StatValue AS MedianValue FROM TD_UnivariateStatistics (
ON (
SELECT * FROM TD_UnivariateStatistics (
  ON (SELECT * FROM titanic_sample WHERE survived=1) AS InputTable
  USING
  TargetColumns ('age', 'fare')
  Stats ('STANDARD DEVIATION')
) AS dt)
AS InputTable
  USING
  TargetColumns ('StatValue')
  Stats ('MED')
) AS dt;
The InputColumns are age and fare.
TD_UnivariateStatistics Output MedianStandardDeviation
MedianValue
31.478060446

TD_OrdinalEncodingFit SQL Call to Generate EncodingsTable

CREATE MULTISET TABLE titanic_sample_oef AS (
    SELECT * FROM TD_OrdinalEncodingFit (
    ON (SELECT * FROM titanic_sample WHERE survived=1) AS InputTable
    USING
    TargetColumn('gender','embarked')
    Approach ('AUTO')
    ) AS dt
) with data;
The CategoricalInputColumns are gender and embarked.
TD_OrdinalEncodingFit Output
TD_ColumnName_ORDFIT TD_Category_ORDFIT TD_Value_ORDFIT TD_IndexORDFIT gender embarked
gender female 0 0    
gender TD_CATEGORY_COUNT 1 -1    

TD_SMOTE SQL Call Using SamplingStrategy with smotenc

SELECT * FROM TD_SMOTE (
    ON (SELECT * FROM titanic_sample WHERE survived=1) AS InputTable
    ON titanic_sample_oef AS EncodingsTable DIMENSION
    USING
      IDColumn('passenger')
      MinorityClass('1')
      ResponseColumn('survived')
      InputColumns('age', 'fare')
      CategoricalInputColumns('gender','embarked')
      SamplingStrategy('smotenc')
      MedianStandardDeviation(31.47806044604718)
      OversamplingFactor(5)
      NumberOfNeighbors(5)
      ValueForNonInputColumns('NULL')
) AS dt;
TD_SMOTE Output Using SamplingStrategy with smotenc
passenger survived pclass name gender age sibsp parch ticket fare cabin embarked
219 1 1 Bazzani; Miss. Albina female 31 0 0 11813 21.1698829323 D15 C
219 1 1 Bazzani; Miss. Albina female 25 0 0 11813 40.0855024492 D15 C
219 1 1 Bazzani; Miss. Albina female 24 0 0 11813 75.5805896718 D15 C
219 1 1 Bazzani; Miss. Albina female 27 0 0 11813 49.832469372 D15 C
219 1 1 Bazzani; Miss. Albina female 1 0 0 11813 49.6465024484 D15 C
645 1 3 Baclini; Miss. Eugenie female 15 2 1 2666 56.935308086   C
645 1 3 Baclini; Miss. Eugenie female 17 2 1 2666 32.117011223   C
645 1 3 Baclini; Miss. Eugenie female 15 2 1 2666 69.0491634488   C
645 1 3 Baclini; Miss. Eugenie female 3 2 1 2666 39.0241483539   C
645 1 3 Baclini; Miss. Eugenie female 14 2 1 2666 70.2674765417   C