Using SamplingStrategy with smote, adasyn or borderline Values - Teradata Vantage

Teradata® VantageCloud Lake

Deployment
VantageCloud
Edition
Lake
Product
Teradata Vantage
Published
January 2023
ft:locale
en-US
ft:lastEdition
2024-12-11
dita:mapPath
phg1621910019905.ditamap
dita:ditavalPath
pny1626732985837.ditaval
dita:id
phg1621910019905

Used known iris dataset, where species values are changed to 1, 2, and 3.

Use the following code to create the table and insert the rows used in this example:

CREATE MULTISET TABLE iris_enc_sample ,FALLBACK ,
     NO BEFORE JOURNAL,
     NO AFTER JOURNAL,
     CHECKSUM = DEFAULT,
     DEFAULT MERGEBLOCKRATIO,
     MAP = TD_MAP1
     (
      id INTEGER,
      sepal_length FLOAT,
      sepal_width FLOAT,
      petal_length FLOAT,
      petal_width FLOAT,
      species INTEGER)
PRIMARY INDEX ( id );
INSERT INTO iris_enc_sample (id,sepal_length,sepal_width,petal_length,petal_width,species) VALUES (78,6.0,2.9,4.5,1.5,2);
INSERT INTO iris_enc_sample (id,sepal_length,sepal_width,petal_length,petal_width,species) VALUES (141,6.9,3.1,5.1,2.3,3);
INSERT INTO iris_enc_sample (id,sepal_length,sepal_width,petal_length,petal_width,species) VALUES (97,6.2,2.9,4.3,1.3,2);
INSERT INTO iris_enc_sample (id,sepal_length,sepal_width,petal_length,petal_width,species) VALUES (13,4.3,3.0,1.1,0.1,1);
INSERT INTO iris_enc_sample (id,sepal_length,sepal_width,petal_length,petal_width,species) VALUES (112,6.8,3.0,5.5,2.1,3);
INSERT INTO iris_enc_sample (id,sepal_length,sepal_width,petal_length,petal_width,species) VALUES (89,5.5,2.5,4.0,1.3,2);
INSERT INTO iris_enc_sample (id,sepal_length,sepal_width,petal_length,petal_width,species) VALUES (7,5.0,3.4,1.5,0.2,1);
INSERT INTO iris_enc_sample (id,sepal_length,sepal_width,petal_length,petal_width,species) VALUES (85,6.0,3.4,4.5,1.6,2);
INSERT INTO iris_enc_sample (id,sepal_length,sepal_width,petal_length,petal_width,species) VALUES (5,5.4,3.9,1.7,0.4,1);
INSERT INTO iris_enc_sample (id,sepal_length,sepal_width,petal_length,petal_width,species) VALUES (83,6.0,2.7,5.1,1.6,2);
INSERT INTO iris_enc_sample (id,sepal_length,sepal_width,petal_length,petal_width,species) VALUES (39,5.1,3.4,1.5,0.2,1);
INSERT INTO iris_enc_sample (id,sepal_length,sepal_width,petal_length,petal_width,species) VALUES (35,5.0,3.2,1.2,0.2,1);
INSERT INTO iris_enc_sample (id,sepal_length,sepal_width,petal_length,petal_width,species) VALUES (94,5.6,2.7,4.2,1.3,2);
INSERT INTO iris_enc_sample (id,sepal_length,sepal_width,petal_length,petal_width,species) VALUES (27,5.2,3.5,1.5,0.2,1);
INSERT INTO iris_enc_sample (id,sepal_length,sepal_width,petal_length,petal_width,species) VALUES (111,6.4,2.7,5.3,1.9,3);

TD_SMOTE SQL Call Using SamplingStrategy with smote

SELECT * FROM TD_SMOTE (
      ON iris_enc_sample AS InputTable PARTITION BY ANY
      USING
      NumberOfNeighbors(5)
      IDColumn('id')
      MinorityClass('3')
      ResponseColumn('species')
      InputColumns('sepal_length', 'sepal_width', 'petal_length', 'petal_width')
      OversamplingFactor(2)
      SamplingStrategy('smote')
) AS dt;
TD_SMOTE Output Using SamplingStrategy with smote
id sepal_length sepal_width petal_length petal_width species
111 6.4169546624 2.9899449291 5.3430063214 2.0269648927 3
111 6.4995803363 2.7037404941 5.329428712 1.9927850369 3
112 6.4149847879 2.8598442445 5.4052837772 1.9678773908 3
112 6.5928787566 2.9323622059 5.4090139685 1.9253975269 3
141 6.8490544092 2.9613735225 5.1891754861 1.942250495 3
141 6.4820585 2.8475600946 5.2059669699 1.9552546123 3

TD_SMOTE SQL Call Using SamplingStrategy with adasyn

SELECT * FROM TD_SMOTE (
      ON iris_enc_sample AS InputTable PARTITION BY ANY
      USING
      NumberOfNeighbors(5)
      IDColumn('id')
      MinorityClass('3')
      ResponseColumn('species')
      InputColumns('sepal_length', 'sepal_width', 'petal_length', 'petal_width')
      OversamplingFactor(2)
      SamplingStrategy('adasyn')
) AS dt;
TD_SMOTE Output Using SamplingStrategy with adasyn
id sepal_length sepal_width petal_length petal_width species
111 6.3830453376 2.7 5.2569936786 1.709552661 3
111 6.3004196637 2.7024936628 5.182285152 1.7144299261 3
112 6.896253803 3.0467185852 5.3105675544 2.2321226092 3
112 6.3857575132 2.9323622059 5.3180279369 1.6634938172 3
141 6.8898108818 3.0653433806 5.2783509723 2.1211252475 3
141 6.1477053001 3.2893299291 4.7820990903 1.6966955715 3

TD_SMOTE SQL Call Using SamplingStrategy with borderline

SELECT * FROM TD_SMOTE (
      ON iris_enc_sample AS InputTable PARTITION BY ANY
      USING
      NumberOfNeighbors(10)
      IDColumn('id')
      MinorityClass('3')
      ResponseColumn('species')
      InputColumns('sepal_length', 'sepal_width', 'petal_length', 'petal_width')
      OversamplingFactor(2)
      SamplingStrategy('borderline')
) AS dt;
TD_SMOTE Output Using SamplingStrategy with borderline
id sepal_length sepal_width petal_length petal_width species
111 6.3830453376 2.7 5.2569936786 1.709552661 3
111 6.1510491593 2.7149619766 4.770283184 1.204112223 3
112 6.4149847879 2.8598442445 5.4052837772 1.9678773908 3
112 6.4893181349 2.9774540686 4.9540838108 1.4015901076 3
141 6.8898108818 3.0653433806 5.2783509723 2.1211252475 3
141 5.6461755001 3.6048798108 3.2985615116 0.6624594084 3