The following is an example of how to use TD_SMOTE using SamplingStrategy with smotenc values:
- Prepare the input table.
- Compute the MedianStandardDeviation argument from the numerical input columns.
- Generate EncodingsTable from categorical input columns.
- Using the results, run the smotenc query.
Used known titanic dataset.
Use the following code to create the table and insert the rows used in this example:
CREATE MULTISET TABLE titanic_sample ,FALLBACK , NO BEFORE JOURNAL, NO AFTER JOURNAL, CHECKSUM = DEFAULT, DEFAULT MERGEBLOCKRATIO, MAP = TD_MAP1 ( passenger INTEGER, survived INTEGER, pclass INTEGER, name VARCHAR(90) CHARACTER SET LATIN NOT CASESPECIFIC, gender VARCHAR(10) CHARACTER SET LATIN NOT CASESPECIFIC, age INTEGER, sibsp INTEGER, parch INTEGER, ticket VARCHAR(20) CHARACTER SET LATIN NOT CASESPECIFIC, fare FLOAT, cabin VARCHAR(20) CHARACTER SET LATIN NOT CASESPECIFIC, embarked VARCHAR(10) CHARACTER SET LATIN NOT CASESPECIFIC) PRIMARY INDEX ( passenger ); INSERT INTO titanic_sample (passenger, survived, pclass, name, gender, age, sibsp, parch, ticket, fare, cabin, embarked) VALUES(7, 0, 1, 'McCarthy; Mr. Timothy J', 'male', 54, 0, 0, '17463', 51.8625, 'E46', 'S'); INSERT INTO titanic_sample (passenger, survived, pclass, name, gender, age, sibsp, parch, ticket, fare, cabin, embarked) VALUES(121, 0, 2, 'Hickman; Mr. Stanley George', 'male', 21, 2, 0, 'S.O.C. 14879', 73.5, '', 'S'); INSERT INTO titanic_sample (passenger, survived, pclass, name, gender, age, sibsp, parch, ticket, fare, cabin, embarked) VALUES(219, 1, 1, 'Bazzani; Miss. Albina', 'female', 32, 0, 0, '11813', 76.2917, 'D15', 'C'); INSERT INTO titanic_sample (passenger, survived, pclass, name, gender, age, sibsp, parch, ticket, fare, cabin, embarked) VALUES(282, 0, 3, 'Olsson; Mr. Nils Johan Goransson', 'male', 28, 0, 0, '347464', 7.8542, '', 'S'); INSERT INTO titanic_sample (passenger, survived, pclass, name, gender, age, sibsp, parch, ticket, fare, cabin, embarked) VALUES(387, 0, 3, 'Goodwin; Master. Sidney Leonard', 'male', 1, 5, 2, 'CA 2144', 46.9, '', 'S'); INSERT INTO titanic_sample (passenger, survived, pclass, name, gender, age, sibsp, parch, ticket, fare, cabin, embarked) VALUES(403, 0, 3, 'Jussila; Miss. Mari Aina', 'female', 21, 1, 0, '4137', 9.825, '', 'S'); INSERT INTO titanic_sample (passenger, survived, pclass, name, gender, age, sibsp, parch, ticket, fare, cabin, embarked) VALUES(542, 0, 3, 'Andersson; Miss. Ingeborg Constanzia', 'female', 9, 4, 2, '347082', 31.275, '', 'S'); INSERT INTO titanic_sample (passenger, survived, pclass, name, gender, age, sibsp, parch, ticket, fare, cabin, embarked) VALUES(645, 1, 3, 'Baclini; Miss. Eugenie', 'female', 0, 2, 1, '2666', 19.2583, '', 'C'); INSERT INTO titanic_sample (passenger, survived, pclass, name, gender, age, sibsp, parch, ticket, fare, cabin, embarked) VALUES(660, 0, 1, 'Newell; Mr. Arthur Webster', 'male', 58, 0, 2, '35273', 113.275, 'D48', 'C'); INSERT INTO titanic_sample (passenger, survived, pclass, name, gender, age, sibsp, parch, ticket, fare, cabin, embarked) VALUES(669, 0, 3, 'Cook; Mr. Jacob', 'male', 43, 0, 0, 'A/5 3536', 8.05, '', 'S'); INSERT INTO titanic_sample (passenger, survived, pclass, name, gender, age, sibsp, parch, ticket, fare, cabin, embarked) VALUES(687, 0, 3, 'Panula; Mr. Jaako Arnold', 'male', 14, 4, 1, '3101295', 39.6875, '', 'S'); INSERT INTO titanic_sample (passenger, survived, pclass, name, gender, age, sibsp, parch, ticket, fare, cabin, embarked) VALUES(740, 0, 3, 'Nankoff; Mr. Minko', 'male', NULL, 0, 0, '349218', 7.8958, '', 'S'); INSERT INTO titanic_sample (passenger, survived, pclass, name, gender, age, sibsp, parch, ticket, fare, cabin, embarked) VALUES(834, 0, 3, 'Augustsson; Mr. Albert', 'male', 23, 0, 0, '347468', 7.8542, '', 'S'); INSERT INTO titanic_sample (passenger, survived, pclass, name, gender, age, sibsp, parch, ticket, fare, cabin, embarked) VALUES(835, 0, 3, 'Allum; Mr. Owen George', 'male', 18, 0, 0, '2223', 8.3, '', 'S'); INSERT INTO titanic_sample (passenger, survived, pclass, name, gender, age, sibsp, parch, ticket, fare, cabin, embarked) VALUES(877, 0, 3, 'Gustafsson; Mr. Alfred Ossian', 'male', 20, 0, 0, '7534', 9.8458, '', 'S');
TD_UnivariateStatistics SQL Call to Compute MedianStandardDeviation Argument
SELECT StatValue AS MedianValue FROM TD_UnivariateStatistics ( ON ( SELECT * FROM TD_UnivariateStatistics ( ON (SELECT * FROM titanic_sample WHERE survived=1) AS InputTable USING TargetColumns ('age', 'fare') Stats ('STANDARD DEVIATION') ) AS dt) AS InputTable USING TargetColumns ('StatValue') Stats ('MED') ) AS dt;
The InputColumns are age and fare.
MedianValue |
---|
31.478060446 |
TD_OrdinalEncodingFit SQL Call to Generate EncodingsTable
CREATE MULTISET TABLE titanic_sample_oef AS ( SELECT * FROM TD_OrdinalEncodingFit ( ON (SELECT * FROM titanic_sample WHERE survived=1) AS InputTable USING TargetColumn('gender','embarked') Approach ('AUTO') ) AS dt ) with data;
The CategoricalInputColumns are gender and embarked.
TD_ColumnName_ORDFIT | TD_Category_ORDFIT | TD_Value_ORDFIT | TD_IndexORDFIT | gender | embarked |
---|---|---|---|---|---|
gender | female | 0 | 0 | ||
gender | TD_CATEGORY_COUNT | 1 | -1 |
TD_SMOTE SQL Call Using SamplingStrategy with smotenc
SELECT * FROM TD_SMOTE ( ON (SELECT * FROM titanic_sample WHERE survived=1) AS InputTable ON titanic_sample_oef AS EncodingsTable DIMENSION USING IDColumn('passenger') MinorityClass('1') ResponseColumn('survived') InputColumns('age', 'fare') CategoricalInputColumns('gender','embarked') SamplingStrategy('smotenc') MedianStandardDeviation(31.47806044604718) OversamplingFactor(5) NumberOfNeighbors(5) ValueForNonInputColumns('NULL') ) AS dt;
passenger | survived | pclass | name | gender | age | sibsp | parch | ticket | fare | cabin | embarked |
---|---|---|---|---|---|---|---|---|---|---|---|
219 | 1 | 1 | Bazzani; Miss. Albina | female | 31 | 0 | 0 | 11813 | 21.1698829323 | D15 | C |
219 | 1 | 1 | Bazzani; Miss. Albina | female | 25 | 0 | 0 | 11813 | 40.0855024492 | D15 | C |
219 | 1 | 1 | Bazzani; Miss. Albina | female | 24 | 0 | 0 | 11813 | 75.5805896718 | D15 | C |
219 | 1 | 1 | Bazzani; Miss. Albina | female | 27 | 0 | 0 | 11813 | 49.832469372 | D15 | C |
219 | 1 | 1 | Bazzani; Miss. Albina | female | 1 | 0 | 0 | 11813 | 49.6465024484 | D15 | C |
645 | 1 | 3 | Baclini; Miss. Eugenie | female | 15 | 2 | 1 | 2666 | 56.935308086 | C | |
645 | 1 | 3 | Baclini; Miss. Eugenie | female | 17 | 2 | 1 | 2666 | 32.117011223 | C | |
645 | 1 | 3 | Baclini; Miss. Eugenie | female | 15 | 2 | 1 | 2666 | 69.0491634488 | C | |
645 | 1 | 3 | Baclini; Miss. Eugenie | female | 3 | 2 | 1 | 2666 | 39.0241483539 | C | |
645 | 1 | 3 | Baclini; Miss. Eugenie | female | 14 | 2 | 1 | 2666 | 70.2674765417 | C |