Teradata Package for R Function Reference | 17.00 - 17.00 - BYOM - Scoring Using PMML File - Teradata Package for R

Teradata® Package for R Function Reference

Product
Teradata Package for R
Release Number
17.00
Release Date
July 2021
Content Type
Programming Reference
Publication ID
B700-4007-090K
Language
English (United States)

Introduction to td_pmml_predict

The goal of this vignette is to get the user familiar with the usage of td_pmml_predict() and provide a few introductory examples. This function is used to score data in Vantage with a model that has been created outside Vantage and exported to Vantage using PMML format.

Once the connection is established using td_create_context, you can load the example tables boston, iris_train, and iris_test, and create the respective tibbles using the following statements:

# Load the example data used by this vignette into Vantage.
loadExampleData("pmmlpredict_example", "boston")
loadExampleData("pmmlpredict_example", "iris_train", "iris_test")

# Create tbl_teradata objects.
# 'con' is the connection established earlier with td_create_context().
boston <- tbl(con, "boston")
# Subset of boston rows (id < 200) to be used as training data.
boston_train <- boston %>% filter(id < 200)
iris_train <- tbl(con, "iris_train")
iris_test <- tbl(con, "iris_test")

RandomForest.

Load required libraries. Prepare dataset for creating a random forest model.

library(pmml)
#> Loading required package: XML
library(MASS)
#> 
#> Attaching package: 'MASS'
#> The following object is masked from 'package:dplyr':
#> 
#>     select
library(randomForest)
#> randomForest 4.6-14
#> Type rfNews() to see new features/changes/bug fixes.
#> 
#> Attaching package: 'randomForest'
#> The following object is masked from 'package:dplyr':
#> 
#>     combine
# Wrap the data so the pmml transformation functions can record
# derived (transformed) fields alongside the originals.
dataBox <- xform_wrap(boston)
# Normalize all numeric variables of the loaded boston dataset to lie
# between 0 and 1.
dataBox <- xform_min_max(dataBox)
# Include only derived fields for modeling.
# Last element derived_medv index.
last <- length(names(dataBox$data))
# Create dataset without original predictors 1-13 and last derived_medv
# to train the model.
# NOTE(review): eval(last) is redundant; plain 'last' behaves identically.
dataset <- dataBox$data[-c(1:13, eval(last))]
# Inspect the columns that remain for modeling.
names(dataset)
#>  [1] "lstat"           "medv"            "derived_id"      "derived_crim"   
#>  [5] "derived_zn"      "derived_indus"   "derived_chas"    "derived_nox"    
#>  [9] "derived_rm"      "derived_age"     "derived_dis"     "derived_rad"    
#> [13] "derived_tax"     "derived_ptratio" "derived_black"   "derived_lstat"

Generate random forest model and save it to a pmml file.

# Train a 50-tree random forest predicting 'medv' from all other columns;
# na.omit() drops rows with missing values before fitting.
fit <- randomForest(medv ~ ., data=na.omit(dataset), ntree=50)
# Convert the model to PMML, embedding the min-max transformations from
# dataBox so the same normalization is applied at scoring time.
tpmml <- pmml(fit, transforms = dataBox)
#> Now converting tree  1  to PMML
#> Now converting tree  2  to PMML
#> Now converting tree  3  to PMML
#> Now converting tree  4  to PMML
#> Now converting tree  5  to PMML
#> Now converting tree  6  to PMML
#> Now converting tree  7  to PMML
#> Now converting tree  8  to PMML
#> Now converting tree  9  to PMML
#> Now converting tree  10  to PMML
#> Now converting tree  11  to PMML
#> Now converting tree  12  to PMML
#> Now converting tree  13  to PMML
#> Now converting tree  14  to PMML
#> Now converting tree  15  to PMML
#> Now converting tree  16  to PMML
#> Now converting tree  17  to PMML
#> Now converting tree  18  to PMML
#> Now converting tree  19  to PMML
#> Now converting tree  20  to PMML
#> Now converting tree  21  to PMML
#> Now converting tree  22  to PMML
#> Now converting tree  23  to PMML
#> Now converting tree  24  to PMML
#> Now converting tree  25  to PMML
#> Now converting tree  26  to PMML
#> Now converting tree  27  to PMML
#> Now converting tree  28  to PMML
#> Now converting tree  29  to PMML
#> Now converting tree  30  to PMML
#> Now converting tree  31  to PMML
#> Now converting tree  32  to PMML
#> Now converting tree  33  to PMML
#> Now converting tree  34  to PMML
#> Now converting tree  35  to PMML
#> Now converting tree  36  to PMML
#> Now converting tree  37  to PMML
#> Now converting tree  38  to PMML
#> Now converting tree  39  to PMML
#> Now converting tree  40  to PMML
#> Now converting tree  41  to PMML
#> Now converting tree  42  to PMML
#> Now converting tree  43  to PMML
#> Now converting tree  44  to PMML
#> Now converting tree  45  to PMML
#> Now converting tree  46  to PMML
#> Now converting tree  47  to PMML
#> Now converting tree  48  to PMML
#> Now converting tree  49  to PMML
#> Now converting tree  50  to PMML
# Write the PMML model to disk so it can be loaded into Vantage.
# pmml file will be saved in the tdplyr installation directory.
save_pmml(tpmml,"boston_trans_normcont_model.pmml")

User should create a ‘pmml_models’ table on Vantage and insert pmml file in the BLOB column of the table.

# Create following table on vantage. 
# crt_tbl <- "CREATE SET TABLE pmml_models(model_id VARCHAR(40), model BLOB) 
#             PRIMARY INDEX (model_id);"
# DBI::dbExecute(con, sql(crt_tbl))

# Create a file load_pmml_model.txt that has a model_id and a model file name 
# entry such as:
# boston_rf_model|boston_trans_normcont_model.pmml
# This file and the pmml models to be loaded should be in the same directory. 
#  
# Loading model with BTEQ. 
# .import vartext file load_pmml_model.txt
# .repeat *
# USING (c1 VARCHAR(40), c2 BLOB AS DEFERRED BY NAME) INSERT INTO pmml_models(:c1, :c2);

# Loading model with TDStudio. The '?' will open a GUI to browse the PMML file 
# that needs to be uploaded.

# insert into pmml_models values ('boston_rf_model', ?);

Create test dataset and select the model to use for scoring. Run td_pmml_predict().

# Create test data to score. 
# td_sample() draws a random sample of ~30% of the boston rows.
boston_test <- td_sample(df=boston, n=0.3)           

# Select the previously loaded model row from the 'pmml_models' table.
modeldata <- tbl(con, "pmml_models") %>% filter(model_id=='boston_rf_model')

# Score the test data in Vantage; 'accumulate' copies the listed input
# columns (here "id") into the result.
result <- td_pmml_predict(modeldata = modeldata, 
                          newdata = boston_test, 
                          accumulate = c("id"))
result
#> ############ result ############
#> 
#> # Source:   SQL [?? x 1]
#> # Database: Teradata
#> # Groups:  
#>      id prediction         json_report                              
#>   <int> <chr>              <chr>                                    
#> 1   326 24.961100000000005 "{\"Predicted_medv\":24.961100000000005}"
#> 2   122 20.631299999999996 "{\"Predicted_medv\":20.631299999999996}"
#> 3   387 8.832233333333335  "{\"Predicted_medv\":8.832233333333335}" 
#> 4   345 30.5882            "{\"Predicted_medv\":30.5882}"           
#> 5   446 11.7003            "{\"Predicted_medv\":11.7003}"           
#> 6   467 17.1689            "{\"Predicted_medv\":17.1689}"           
#> # ... with more rows

GLM.

Load required libraries. Generate GLM model and save it to a pmml file.

library(pmml)
library(stats)

# Fit a GLM (default gaussian family) predicting sepal_length from all
# other columns of the training data.
iris_glm <- glm(sepal_length ~ ., data=iris_train)
# Convert the fitted model to PMML.
iris_glm_pmml <- pmml(iris_glm)
# pmml file will be saved in the tdplyr installation directory.
save_pmml(iris_glm_pmml,"iris_glm.pmml")

User should create a ‘pmml_models’ table on Vantage and insert pmml file in the BLOB column of the table.

# Create following table on vantage. 
#crt_tbl <- "CREATE SET TABLE pmml_models(model_id VARCHAR(40), model BLOB) 
#            PRIMARY INDEX (model_id);"
#DBI::dbExecute(con, sql(crt_tbl))

# Create a file load_pmml_model.txt that has a model_id and a model file name 
# entry such as:
# iris_glm|iris_glm.pmml
# This file and the pmml models to be loaded should be in the same directory.  

# Loading model with BTEQ. 
# .import vartext file load_pmml_model.txt
# .repeat *
# USING (c1 VARCHAR(40), c2 BLOB AS DEFERRED BY NAME) INSERT INTO pmml_models(:c1, :c2);

# Loading model with TDStudio. The '?' will open a GUI to browse the PMML file 
# that needs to be uploaded.

# insert into pmml_models values ('iris_glm', ?);

Select the model to use for scoring. Run td_pmml_predict().

# Select the GLM model row from the 'pmml_models' table.
modeldata <- tbl(con, "pmml_models") %>% filter(model_id=='iris_glm')
# Score iris_test in Vantage, accumulating the "id" column in the output.
result <- td_pmml_predict(modeldata = modeldata, 
                          newdata = iris_test, 
                          accumulate = c("id"))
result
#> ############ result ############
#> 
#> # Source:   SQL [?? x 1]
#> # Database: Teradata
#> # Groups:  
#>      id prediction        json_report                                     
#>   <int> <chr>             <chr>                                           
#> 1     5 5.046237108253978 "{\"Predicted_sepal_length\":5.046237108253978}"
#> 2    15 5.144527283275113 "{\"Predicted_sepal_length\":5.144527283275113}"
#> 3    30 4.978340744147729 "{\"Predicted_sepal_length\":4.978340744147729}"
#> 4    40 5.031827804646137 "{\"Predicted_sepal_length\":5.031827804646137}"
#> 5    80 5.472652135619913 "{\"Predicted_sepal_length\":5.472652135619913}"
#> 6    20 5.215592270022347 "{\"Predicted_sepal_length\":5.215592270022347}"
#> # ... with more rows

NaiveBayes.

Load required libraries. Generate Naive Bayes model and save it to a pmml file.

library(pmml)
library(e1071)
# naiveBayes function works with "data.frame".
# Convert tbl_teradata to "data.frame".
iris_train_df <- as.data.frame(iris_train)
# Fit a Naive Bayes classifier predicting 'species' from all other columns.
nb_iris <- naiveBayes(species ~ ., data=iris_train_df)
# Print the model summary (a-priori and conditional probabilities).
nb_iris
#> 
#> Naive Bayes Classifier for Discrete Predictors
#> 
#> Call:
#> naiveBayes.default(x = X, y = Y, laplace = laplace)
#> 
#> A-priori probabilities:
#> Y
#>     1     2     3 
#> 0.333 0.333 0.333 
#> 
#> Conditional probabilities:
#>    id
#> Y   [,1] [,2]
#>   1   25 14.6
#>   2   75 14.6
#>   3  125 14.6
#> 
#>    sepal_length
#> Y   [,1]  [,2]
#>   1 5.00 0.368
#>   2 5.99 0.530
#>   3 6.61 0.665
#> 
#>    sepal_width
#> Y   [,1]  [,2]
#>   1 3.42 0.396
#>   2 2.78 0.342
#>   3 2.97 0.308
#> 
#>    petal_length
#> Y   [,1]  [,2]
#>   1 1.44 0.158
#>   2 4.31 0.485
#>   3 5.56 0.593
#> 
#>    petal_width
#> Y    [,1]  [,2]
#>   1 0.252 0.111
#>   2 1.333 0.208
#>   3 2.030 0.236

# Convert the generated model to "PMML" format.
# 'dataset' supplies the training-data schema; 'predicted_field' names
# the target column.
nb_iris_pmml <- pmml(nb_iris, 
                     dataset=iris_train_df, 
                     predicted_field = "species")
# pmml file will be saved in the tdplyr installation directory.
save_pmml(nb_iris_pmml, "nb_iris_tdplyr.pmml")

User should create a ‘pmml_models’ table on Vantage and insert pmml file in the BLOB column of the table.

# Create following table on vantage. 
# crt_tbl <- "CREATE SET TABLE pmml_models(model_id VARCHAR(40), model BLOB) 
#             PRIMARY INDEX (model_id);"
# DBI::dbExecute(con, sql(crt_tbl))
#
# Create a file load_pmml_model.txt that has a model_id and a model file name 
# entry such as:
# nb_iris_tdplyr|nb_iris_tdplyr.pmml
#
# This file and the pmml models to be loaded should be in the same directory.  
# Loading model with BTEQ. 
# .import vartext file load_pmml_model.txt
# .repeat *
# USING (c1 VARCHAR(40), c2 BLOB AS DEFERRED BY NAME) INSERT INTO pmml_models(:c1, :c2);
#
# Loading model with TDStudio. The '?' will open a GUI to browse the PMML file 
# that needs to be uploaded.
# insert into pmml_models values ('nb_iris_tdplyr', ?);

Select the model to use for scoring. Run td_pmml_predict().

# Select the Naive Bayes model row from the 'pmml_models' table.
modeldata <- tbl(con, "pmml_models") %>% filter(model_id=='nb_iris_tdplyr')
# Score iris_test in Vantage, accumulating the "id" column in the output.
nb_result <- td_pmml_predict(modeldata = modeldata, 
                             newdata = iris_test, 
                             accumulate = c("id"))
nb_result
#> ############ result ############
#> 
#> # Source:   SQL [?? x 1]
#> # Database: Teradata
#> # Groups:  
#>      id prediction json_report                                                  
#>   <int> <chr>      <chr>                                                        
#> 1     5 1          "{\"Predicted_species\":\"1\",\"Probability_1\":0.9999999998~
#> 2    15 1          "{\"Predicted_species\":\"1\",\"Probability_1\":0.9999999985~
#> 3    30 1          "{\"Predicted_species\":\"1\",\"Probability_1\":0.9999999996~
#> 4    40 1          "{\"Predicted_species\":\"1\",\"Probability_1\":0.9999999993~
#> 5    80 2          "{\"Predicted_species\":\"2\",\"Probability_1\":1.1239874514~
#> 6    20 1          "{\"Predicted_species\":\"1\",\"Probability_1\":0.9999999999~
#> # ... with more rows

XGBoost.

Load required libraries. Generate XGBoost model and save it to a pmml file.

library(pmml)
library(xgboost)
#> 
#> Attaching package: 'xgboost'
#> The following object is masked from 'package:dplyr':
#> 
#>     slice
# xgboost() function works with "matrix" only.
# Convert tbl_teradata to "data.frame".
iris_train_df <- as.data.frame(iris_train)

# Multinomial model using iris data.
# Columns 2:5 are the four feature columns; column 6 is the species label,
# shifted to 0-based integers as xgboost requires.
xgb_model <- xgboost(data = as.matrix(iris_train_df[, 2:5]),
                     label = as.numeric(iris_train_df[, 6])-1,
                     max_depth = 2, 
                     eta = 1, 
                     nthread = 2, 
                     nrounds = 2,
                     objective = "multi:softprob", 
                     num_class = 3
                     )
#> [10:23:18] WARNING: amalgamation/../src/learner.cc:1095: Starting in XGBoost 1.3.0, the default evaluation metric used with the objective 'multi:softprob' was changed from 'merror' to 'mlogloss'. Explicitly set eval_metric if you'd like to restore the old behavior.
#> [1]  train-mlogloss:0.266147 
#> [2]  train-mlogloss:0.128672
# Print the fitted booster's summary.
xgb_model 
#> ##### xgb.Booster
#> raw: 6.6 Kb 
#> call:
#>   xgb.train(params = params, data = dtrain, nrounds = nrounds, 
#>     watchlist = watchlist, verbose = verbose, print_every_n = print_every_n, 
#>     early_stopping_rounds = early_stopping_rounds, maximize = maximize, 
#>     save_period = save_period, save_name = save_name, xgb_model = xgb_model, 
#>     callbacks = callbacks, max_depth = 2, eta = 1, nthread = 2, 
#>     objective = "multi:softprob", num_class = 3)
#> params (as set within xgb.train):
#>   max_depth = "2", eta = "1", nthread = "2", objective = "multi:softprob", num_class = "3", validate_parameters = "TRUE"
#> xgb.attributes:
#>   niter
#> callbacks:
#>   cb.print.evaluation(period = print_every_n)
#>   cb.evaluation.log()
#> # of features: 4 
#> niter: 2
#> nfeatures : 4 
#> evaluation_log:
#>  iter train_mlogloss
#>     1          0.266
#>     2          0.129

# Convert the generated model to "PMML" format.
# Save the tree information in an external file.
xgb.dump(xgb_model, "xgb_model.dumped.trees")
#> [1] TRUE

# Convert to PMML.
# Feature names must match the column names of the training matrix.
train_data_colnames <- colnames(as.matrix(iris_train_df[, 2:5]))
xgb_model_pmml <- pmml(xgb_model,
                       input_feature_names = train_data_colnames,
                       output_label_name = "species",
                       output_categories = c(1, 2, 3), 
                       xgb_dump_file = "xgb_model.dumped.trees"
                       )
#> Now converting tree  0  to PMML
#> Now converting tree  1  to PMML
#> Now converting tree  2  to PMML
#> Now converting tree  3  to PMML
#> Now converting tree  4  to PMML
#> Now converting tree  5  to PMML

# Write the PMML model to disk so it can be loaded into Vantage.
# pmml file will be saved in the tdplyr installation directory.
save_pmml(xgb_model_pmml, "xgb_model_tdplyr.pmml")

User should create a ‘pmml_models’ table on Vantage and insert pmml file in the BLOB column of the table.

# Create following table on vantage. 
# crt_tbl <- "CREATE SET TABLE pmml_models(model_id VARCHAR(40), model BLOB) 
#             PRIMARY INDEX (model_id);"
# DBI::dbExecute(con, sql(crt_tbl))
#
# Create a file load_pmml_model.txt that has a model_id and a model file name 
# entry such as:
# xgb_model_tdplyr|xgb_model_tdplyr.pmml
#
# This file and the pmml models to be loaded should be in the same directory.  
# Loading model with BTEQ. 
# .import vartext file load_pmml_model.txt
# .repeat *
# USING (c1 VARCHAR(40), c2 BLOB AS DEFERRED BY NAME) INSERT INTO pmml_models(:c1, :c2);
#
# Loading model with TDStudio. The '?' will open a GUI to browse the PMML file 
# that needs to be uploaded.
# insert into pmml_models values ('xgb_model_tdplyr', ?);

Select the model to use for scoring. Run td_pmml_predict().

# Select the XGBoost model row from the 'pmml_models' table.
modeldata <- tbl(con, "pmml_models") %>% filter(model_id=='xgb_model_tdplyr')
# Score iris_test in Vantage, accumulating the "id" column in the output.
xgb_result <- td_pmml_predict(modeldata = modeldata, 
                              newdata = iris_test, 
                              accumulate = c("id"))
xgb_result
#> ############ result ############
#> 
#> # Source:   SQL [?? x 1]
#> # Database: Teradata
#> # Groups:  
#>      id prediction json_report                                                  
#>   <int> <chr>      <chr>                                                        
#> 1     5 1          "{\"Predicted_species\":\"1\",\"Probability_1\":0.9253211031~
#> 2    15 1          "{\"Predicted_species\":\"1\",\"Probability_1\":0.9253211031~
#> 3    30 1          "{\"Predicted_species\":\"1\",\"Probability_1\":0.9253211031~
#> 4    40 1          "{\"Predicted_species\":\"1\",\"Probability_1\":0.9253211031~
#> 5    80 2          "{\"Predicted_species\":\"2\",\"Probability_1\":0.0424986393~
#> 6    20 1          "{\"Predicted_species\":\"1\",\"Probability_1\":0.9253211031~
#> # ... with more rows

Decision Tree.

Load required libraries. Generate Decision Tree model and save it to a pmml file.

library(pmml)
library(rpart)

# Use 'rpart()' function to generate decision tree model.
dt_iris <- rpart(species ~ ., data=iris_train)
# Print the fitted tree (splits, node counts, and predicted values).
dt_iris
#> n= 120 
#> 
#> node), split, n, deviance, yval
#>       * denotes terminal node
#> 
#> 1) root 120 80 2.0  
#>   2) id< 50 40  0 1.0 *
#>   3) id>=50 80 20 2.5  
#>     6) id< 100 40  0 2.0 *
#>     7) id>=100 40  0 3.0 *

# Convert the generated model to "PMML" format.
# 'predicted_field' names the target column of the tree.
dt_iris_tdplyr <- pmml(dt_iris, 
                       predicted_field = "species")

# pmml file will be saved in the tdplyr installation directory.
save_pmml(dt_iris_tdplyr, "dt_iris_tdplyr.pmml")

User should create a ‘pmml_models’ table on Vantage and insert pmml file in the BLOB column of the table.

# Create following table on vantage. 
# crt_tbl <- "CREATE SET TABLE pmml_models(model_id VARCHAR(40), model BLOB) 
#             PRIMARY INDEX (model_id);"
# DBI::dbExecute(con, sql(crt_tbl))
#
# Create a file load_pmml_model.txt that has a model_id and a model file name 
# entry such as:
# dt_iris_tdplyr|dt_iris_tdplyr.pmml
#
# This file and the pmml models to be loaded should be in the same directory.  
# Loading model with BTEQ. 
# .import vartext file load_pmml_model.txt
# .repeat *
# USING (c1 VARCHAR(40), c2 BLOB AS DEFERRED BY NAME) INSERT INTO pmml_models(:c1, :c2);
#
# Loading model with TDStudio. The '?' will open a GUI to browse the PMML file 
# that needs to be uploaded.
# insert into pmml_models values ('dt_iris_tdplyr', ?);

Select the model to use for scoring. Run td_pmml_predict().

# Select the decision tree model row from the 'pmml_models' table.
modeldata <- tbl(con, "pmml_models") %>% filter(model_id=='dt_iris_tdplyr')
# Score iris_test in Vantage, accumulating the "id" column in the output.
dt_result <- td_pmml_predict(modeldata = modeldata, 
                             newdata = iris_test, 
                             accumulate = c("id"))
dt_result
#> ############ result ############
#> 
#> # Source:   SQL [?? x 1]
#> # Database: Teradata
#> # Groups:  
#>      id prediction json_report                  
#>   <int> <chr>      <chr>                        
#> 1    15 1.0        "{\"Predicted_species\":1.0}"
#> 2     5 1.0        "{\"Predicted_species\":1.0}"
#> 3    20 1.0        "{\"Predicted_species\":1.0}"
#> 4    10 1.0        "{\"Predicted_species\":1.0}"
#> 5    65 2.0        "{\"Predicted_species\":2.0}"
#> 6    30 1.0        "{\"Predicted_species\":1.0}"
#> # ... with more rows