DecisionTree Example 1 - Teradata Vantage

Machine Learning Engine Analytic Function Reference

Product

Teradata Vantage

Release Number

8.00

1.0

Published

May 2019

Language

English (United States)

Last Update

2019-11-22

dita:mapPath

blj1506016597986.ditamap

dita:ditavalPath

blj1506016597986.ditaval

dita:id

B700-4003

lifecycle

Product Category

Teradata Vantage™

The function DecisionTree acts on the train set to create the model. The Single_Tree_Predict function uses that model and a test set to predict the output. The prediction accuracy is determined based on the original and prediction results.

Input

This example uses the iris data set (iris_input). The data has values for four attributes (sepal_length, sepal_width, petal_length and petal_width), which are grouped into three categories (setosa (1), versicolor (2), virginica (3)). From the raw data, the example creates training and test data sets.

Raw Data: iris_input
id	sepal_length	sepal_width	petal_length	petal_width	species
1	5.1	3.5	1.4	0.2	1
2	4.9	3	1.4	0.2	1
3	4.7	3.2	1.3	0.2	1
4	4.6	3.1	1.5	0.2	1
5	5	3.6	1.4	0.2	1
6	5.4	3.9	1.7	0.4	1
7	4.6	3.4	1.4	0.3	1
8	5	3.4	1.5	0.2	1
9	4.4	2.9	1.4	0.2	1
10	4.9	3.1	1.5	0.1	1
...	...	...	...	...	...

This code divides the 150 data rows into a training data set (80%) and a test data set (20%):

DROP TABLE iris_train;
DROP TABLE iris_test;

CREATE MULTISET TABLE iris_train AS (
  SELECT * FROM iris_input WHERE id%5!=0
) WITH DATA;

CREATE MULTISET TABLE iris_test AS (
  SELECT * FROM iris_input WHERE id%5=0
) WITH DATA;

Alternatively, you can do the preceding task with the Sampling or RandomSample function.

This query returns the following table:

SELECT * FROM iris_train ORDER BY id;

Training Data Set: iris_train
id	sepal_length	sepal_width	petal_length	petal_width	species
1	5.1	3.5	1.4	0.2	1
2	4.9	3	1.4	0.2	1
3	4.7	3.2	1.3	0.2	1
4	4.6	3.1	1.5	0.2	1
6	5.4	3.9	1.7	0.4	1
7	4.6	3.4	1.4	0.3	1
8	5	3.4	1.5	0.2	1
9	4.4	2.9	1.4	0.2	1
11	5.4	3.7	1.5	0.2	1
12	4.8	3.4	1.6	0.2	1
13	4.8	3	1.4	0.1	1
14	4.3	3	1.1	0.1	1
16	5.7	4.4	1.5	0.4	1
...	...	...	...	...	...

This query returns the following table:

SELECT * FROM iris_test ORDER BY id;

Test Data Set: iris_test
id	sepal_length	sepal_width	petal_length	petal_width	species
5	5	3.6	1.4	0.2	1
10	4.9	3.1	1.5	0.1	1
15	5.8	4	1.2	0.2	1
20	5.1	3.8	1.5	0.3	1
25	4.8	3.4	1.9	0.2	1
30	4.7	3.2	1.6	0.2	1
35	4.9	3.1	1.5	0.2	1
40	5.1	3.4	1.5	0.2	1
45	5.1	3.8	1.9	0.4	1
50	5	3.3	1.4	0.2	1
55	6.5	2.8	4.6	1.5	2
60	5.2	2.7	3.9	1.4	2
65	5.6	2.9	3.6	1.3	2
70	5.6	2.5	3.9	1.1	2
75	6.4	2.9	4.3	1.3	2
80	5.7	2.6	3.5	1	2
85	5.4	3	4.5	1.5	2
90	5.5	2.5	4	1.3	2
95	5.6	2.7	4.2	1.3	2
100	5.7	2.8	4.1	1.3	2
105	6.5	3	5.8	2.2	3
110	7.2	3.6	6.1	2.5	3
115	5.8	2.8	5.1	2.4	3
120	6	2.2	5	1.5	3
125	6.7	3.3	5.7	2.1	3
130	7.2	3	5.8	1.6	3
135	6.1	2.6	5.6	1.4	3
140	6.9	3.1	5.4	2.1	3
145	6.7	3.3	5.7	2.5	3
150	5.9	3	5.1	1.8	3

The attribute tables, iris_attribute_train and iris_attribute_test, are created from the raw train and test data, iris_train and iris_test. The recommended way to create the attribute tables is to use the Unpivoting function:

CREATE MULTISET TABLE singletree1 AS ( 
  SELECT * FROM Unpivoting (
    ON iris_train
    USING
    TargetColumns ('sepal_length', 'sepal_width', 'petal_length', 'petal_width')
    AttributeColumn ('attribute')
    ValueColumn ('value_col')
    InputTypes ('false')
    Accumulate ('id')
  ) AS dt
) WITH DATA;

CREATE MULTISET TABLE iris_attribute_train AS (
  SELECT id AS pid, attribute, value_col AS attrvalue FROM singletree1
) WITH DATA;

CREATE MULTISET TABLE singletree2 AS (
  SELECT * FROM Unpivoting (
    ON iris_test
    USING
    TargetColumns ('sepal_length', 'sepal_width', 'petal_length', 'petal_width')
    AttributeColumn ('attribute')
    ValueColumn ('value_col')
    InputTypes ('false')
    Accumulate ('id')
  ) AS dt
) WITH DATA;

CREATE MULTISET TABLE iris_attribute_test AS (
  SELECT id AS pid, attribute, value_col AS attrvalue FROM singletree2
) WITH DATA;

This query returns the following table:

SELECT * FROM iris_attribute_train ORDER BY pid, attribute;

Attribute Table: iris_attribute_train
pid	attribute	attrvalue
1	petal_length	1.4
1	petal_width	0.2
1	sepal_length	5.1
1	sepal_width	3.5
2	petal_length	1.4
2	petal_width	0.2
2	sepal_length	4.9
2	sepal_width	3
3	petal_length	1.3
3	petal_width	0.2
3	sepal_length	4.7
3	sepal_width	3.2
...	...	...

This query returns the following table:

SELECT * FROM iris_attribute_test ORDER BY pid, attribute;

Attribute Table: iris_attribute_test
pid	attribute	attrvalue
5	petal_length	1.4
5	petal_width	0.2
5	sepal_length	5
5	sepal_width	3.6
10	petal_length	1.5
10	petal_width	0.1
10	sepal_length	4.9
10	sepal_width	3.1
15	petal_length	1.2
15	petal_width	0.2
15	sepal_length	5.8
15	sepal_width	4
...	...	...

Response tables, created from the raw train and test data, are used as inputs.

DROP TABLE iris_response_train;
CREATE MULTISET TABLE iris_response_train (pid INTEGER, response VARCHAR);
INSERT INTO iris_response_train SELECT id, species FROM iris_train;

This query returns the following table:

SELECT * FROM iris_response_train ORDER BY pid;

Response Table: iris_response_train
pid	response
1	1
2	1
3	1
4	1
6	1
7	1
8	1
9	1
11	1
12	1
13	1
14	1
16	1
...	...

DROP TABLE iris_response_test;
CREATE MULTISET TABLE iris_response_test (pid INTEGER, response VARCHAR);
INSERT INTO iris_response_test SELECT id, species FROM iris_test;

This query returns the following table:

SELECT * FROM iris_response_test ORDER BY pid;

Response Table: iris_response_test
pid	response
5	1
10	1
15	1
20	1
25	1
30	1
35	1
40	1
45	1
50	1
55	2
60	2
65	2
70	2
75	2
80	2
85	2
90	2
95	2
100	2
105	3
110	3
115	3
120	3
125	3
130	3
135	3
140	3
145	3
150	3

SQL Call

DROP TABLE iris_attribute_output;
DROP TABLE splits_small;

SELECT * FROM DecisionTree (
  ON iris_attribute_train AS AttributeTableName
  ON iris_response_train AS ResponseTableName
  OUT TABLE OutputTable (iris_attribute_output)
  OUT TABLE IntermediateSplitsTable (splits_small)
  USING
  NumSplits (3)
  SplitMeasure ('gini')
  MaxDepth (10)
  IDColumns ('pid')
  AttributeNameColumns ('attribute')
  AttributeValueColumn ('attrvalue')
  ResponseColumn ('response')
  MinNodeSize (10)
  ApproxSplits ('false')
) AS dt;

Output

message
Output model table is successfully stored in the table specified in OutputTable argument. Depth of the tree is:6

This query returns the following table:

SELECT * FROM iris_attribute_output ORDER BY 1;

iris_attribute_output
node_id	node_size	node_gini_p	node_entropy	node_chisq_pv	node_label	node_majorvotes	split_value	split_gini_p	split_entropy	split_chisq_pv	left_id	left_size	left_label	left_majorvotes	right_id	right_size	right_label	right_majorvotes	attribute
0	120	0.666666666666667	1.58496250072116	1	1	40	3	0.333333333333333	0.666666666666666	0	1	40	1	40	2	80	2	40	petal_length
2	80	0.5	1	1	2	40	1.70000004768372	0.0719199499687304	0.227979833481065	1.11022302462516e-16	5	39	2	38	6	41	3	39	petal_width
5	39	0.0499671268902038	0.172036949353113	1	2	38	4.90000009536743	0.0384615384615385	0.0832080127650393	0.00272911340977045	11	35	2	35	12	4	2	3	petal_length
6	41	0.0928019036287924	0.281193796432043	1	3	39	4.90000009536743	0.0840474620962426	0.240916755467913	0.0492232443463754	13	4	3	3	14	37	3	36	petal_length
14	37	0.0525931336742148	0.179256066928321	1	3	36	2.90000009536743	0.0518018018018018	0.162085811567472	0.455587679837851	29	13	3	13	30	24	3	23	sepal_width
30	24	0.0798611111111112	0.249882292833186	1	3	23	3.20000004768372	0.0773809523809524	0.216552190540511	0.387955122826146	61	14	3	13	62	10	3	10	sepal_width
61	14	0.13265306122449	0.371232326640875	1	3	13	6.30000019073486	0.131868131868132	0.363297594798595	0.773484680980946	123	1	3	1	124	13	3	12	sepal_length

id	sepal_length	sepal_width	petal_length	petal_width	species
1	5.1	3.5	1.4	0.2	1
2	4.9	3	1.4	0.2	1
3	4.7	3.2	1.3	0.2	1
4	4.6	3.1	1.5	0.2	1
5	5	3.6	1.4	0.2	1
6	5.4	3.9	1.7	0.4	1
7	4.6	3.4	1.4	0.3	1
8	5	3.4	1.5	0.2	1
9	4.4	2.9	1.4	0.2	1
10	4.9	3.1	1.5	0.1	1
...	...	...	...	...	...

id	sepal_length	sepal_width	petal_length	petal_width	species
1	5.1	3.5	1.4	0.2	1
2	4.9	3	1.4	0.2	1
3	4.7	3.2	1.3	0.2	1
4	4.6	3.1	1.5	0.2	1
6	5.4	3.9	1.7	0.4	1
7	4.6	3.4	1.4	0.3	1
8	5	3.4	1.5	0.2	1
9	4.4	2.9	1.4	0.2	1
11	5.4	3.7	1.5	0.2	1
12	4.8	3.4	1.6	0.2	1
13	4.8	3	1.4	0.1	1
14	4.3	3	1.1	0.1	1
16	5.7	4.4	1.5	0.4	1
...	...	...	...	...	...

id	sepal_length	sepal_width	petal_length	petal_width	species
5	5	3.6	1.4	0.2	1
10	4.9	3.1	1.5	0.1	1
15	5.8	4	1.2	0.2	1
20	5.1	3.8	1.5	0.3	1
25	4.8	3.4	1.9	0.2	1
30	4.7	3.2	1.6	0.2	1
35	4.9	3.1	1.5	0.2	1
40	5.1	3.4	1.5	0.2	1
45	5.1	3.8	1.9	0.4	1
50	5	3.3	1.4	0.2	1
55	6.5	2.8	4.6	1.5	2
60	5.2	2.7	3.9	1.4	2
65	5.6	2.9	3.6	1.3	2
70	5.6	2.5	3.9	1.1	2
75	6.4	2.9	4.3	1.3	2
80	5.7	2.6	3.5	1	2
85	5.4	3	4.5	1.5	2
90	5.5	2.5	4	1.3	2
95	5.6	2.7	4.2	1.3	2
100	5.7	2.8	4.1	1.3	2
105	6.5	3	5.8	2.2	3
110	7.2	3.6	6.1	2.5	3
115	5.8	2.8	5.1	2.4	3
120	6	2.2	5	1.5	3
125	6.7	3.3	5.7	2.1	3
130	7.2	3	5.8	1.6	3
135	6.1	2.6	5.6	1.4	3
140	6.9	3.1	5.4	2.1	3
145	6.7	3.3	5.7	2.5	3
150	5.9	3	5.1	1.8	3

pid	response
5	1
10	1
15	1
20	1
25	1
30	1
35	1
40	1
45	1
50	1
55	2
60	2
65	2
70	2
75	2
80	2
85	2
90	2
95	2
100	2
105	3
110	3
115	3
120	3
125	3
130	3
135	3
140	3
145	3
150	3

id	sepal_length	sepal_width	petal_length	petal_width	species
1	5.1	3.5	1.4	0.2	1
2	4.9	3	1.4	0.2	1
3	4.7	3.2	1.3	0.2	1
4	4.6	3.1	1.5	0.2	1
5	5	3.6	1.4	0.2	1
6	5.4	3.9	1.7	0.4	1
7	4.6	3.4	1.4	0.3	1
8	5	3.4	1.5	0.2	1
9	4.4	2.9	1.4	0.2	1
10	4.9	3.1	1.5	0.1	1
...	...	...	...	...	...

id	sepal_length	sepal_width	petal_length	petal_width	species
1	5.1	3.5	1.4	0.2	1
2	4.9	3	1.4	0.2	1
3	4.7	3.2	1.3	0.2	1
4	4.6	3.1	1.5	0.2	1
6	5.4	3.9	1.7	0.4	1
7	4.6	3.4	1.4	0.3	1
8	5	3.4	1.5	0.2	1
9	4.4	2.9	1.4	0.2	1
11	5.4	3.7	1.5	0.2	1
12	4.8	3.4	1.6	0.2	1
13	4.8	3	1.4	0.1	1
14	4.3	3	1.1	0.1	1
16	5.7	4.4	1.5	0.4	1
...	...	...	...	...	...

id	sepal_length	sepal_width	petal_length	petal_width	species
5	5	3.6	1.4	0.2	1
10	4.9	3.1	1.5	0.1	1
15	5.8	4	1.2	0.2	1
20	5.1	3.8	1.5	0.3	1
25	4.8	3.4	1.9	0.2	1
30	4.7	3.2	1.6	0.2	1
35	4.9	3.1	1.5	0.2	1
40	5.1	3.4	1.5	0.2	1
45	5.1	3.8	1.9	0.4	1
50	5	3.3	1.4	0.2	1
55	6.5	2.8	4.6	1.5	2
60	5.2	2.7	3.9	1.4	2
65	5.6	2.9	3.6	1.3	2
70	5.6	2.5	3.9	1.1	2
75	6.4	2.9	4.3	1.3	2
80	5.7	2.6	3.5	1	2
85	5.4	3	4.5	1.5	2
90	5.5	2.5	4	1.3	2
95	5.6	2.7	4.2	1.3	2
100	5.7	2.8	4.1	1.3	2
105	6.5	3	5.8	2.2	3
110	7.2	3.6	6.1	2.5	3
115	5.8	2.8	5.1	2.4	3
120	6	2.2	5	1.5	3
125	6.7	3.3	5.7	2.1	3
130	7.2	3	5.8	1.6	3
135	6.1	2.6	5.6	1.4	3
140	6.9	3.1	5.4	2.1	3
145	6.7	3.3	5.7	2.5	3
150	5.9	3	5.1	1.8	3

pid	response
5	1
10	1
15	1
20	1
25	1
30	1
35	1
40	1
45	1
50	1
55	2
60	2
65	2
70	2
75	2
80	2
85	2
90	2
95	2
100	2
105	3
110	3
115	3
120	3
125	3
130	3
135	3
140	3
145	3
150	3

id	sepal_length	sepal_width	petal_length	petal_width	species
1	5.1	3.5	1.4	0.2	1
2	4.9	3	1.4	0.2	1
3	4.7	3.2	1.3	0.2	1
4	4.6	3.1	1.5	0.2	1
5	5	3.6	1.4	0.2	1
6	5.4	3.9	1.7	0.4	1
7	4.6	3.4	1.4	0.3	1
8	5	3.4	1.5	0.2	1
9	4.4	2.9	1.4	0.2	1
10	4.9	3.1	1.5	0.1	1
...	...	...	...	...	...

id	sepal_length	sepal_width	petal_length	petal_width	species
1	5.1	3.5	1.4	0.2	1
2	4.9	3	1.4	0.2	1
3	4.7	3.2	1.3	0.2	1
4	4.6	3.1	1.5	0.2	1
6	5.4	3.9	1.7	0.4	1
7	4.6	3.4	1.4	0.3	1
8	5	3.4	1.5	0.2	1
9	4.4	2.9	1.4	0.2	1
11	5.4	3.7	1.5	0.2	1
12	4.8	3.4	1.6	0.2	1
13	4.8	3	1.4	0.1	1
14	4.3	3	1.1	0.1	1
16	5.7	4.4	1.5	0.4	1
...	...	...	...	...	...

id	sepal_length	sepal_width	petal_length	petal_width	species
5	5	3.6	1.4	0.2	1
10	4.9	3.1	1.5	0.1	1
15	5.8	4	1.2	0.2	1
20	5.1	3.8	1.5	0.3	1
25	4.8	3.4	1.9	0.2	1
30	4.7	3.2	1.6	0.2	1
35	4.9	3.1	1.5	0.2	1
40	5.1	3.4	1.5	0.2	1
45	5.1	3.8	1.9	0.4	1
50	5	3.3	1.4	0.2	1
55	6.5	2.8	4.6	1.5	2
60	5.2	2.7	3.9	1.4	2
65	5.6	2.9	3.6	1.3	2
70	5.6	2.5	3.9	1.1	2
75	6.4	2.9	4.3	1.3	2
80	5.7	2.6	3.5	1	2
85	5.4	3	4.5	1.5	2
90	5.5	2.5	4	1.3	2
95	5.6	2.7	4.2	1.3	2
100	5.7	2.8	4.1	1.3	2
105	6.5	3	5.8	2.2	3
110	7.2	3.6	6.1	2.5	3
115	5.8	2.8	5.1	2.4	3
120	6	2.2	5	1.5	3
125	6.7	3.3	5.7	2.1	3
130	7.2	3	5.8	1.6	3
135	6.1	2.6	5.6	1.4	3
140	6.9	3.1	5.4	2.1	3
145	6.7	3.3	5.7	2.5	3
150	5.9	3	5.1	1.8	3

pid	response
5	1
10	1
15	1
20	1
25	1
30	1
35	1
40	1
45	1
50	1
55	2
60	2
65	2
70	2
75	2
80	2
85	2
90	2
95	2
100	2
105	3
110	3
115	3
120	3
125	3
130	3
135	3
140	3
145	3
150	3