Scoring using PMML file
Introduction to td_pmml_predict
The goal of this vignette is to familiarize the user with the usage of td_pmml_predict() and to provide a few introductory examples. This function is used to score data in Vantage with a model that has been created outside Vantage and exported to Vantage in PMML format.
Once the connection is established using td_create_context, you can load the example tables boston, iris_train, and iris_test,
and create the corresponding tibbles using the following statements:
# Load the example data used throughout this vignette.
loadExampleData("pmmlpredict_example", "boston")
loadExampleData("pmmlpredict_example", "iris_train", "iris_test")
# Create tbl_teradata objects for each example table.
iris_train <- tbl(con, "iris_train")
iris_test <- tbl(con, "iris_test")
boston <- tbl(con, "boston")
# Restrict the boston data to ids below 200 for training.
boston_train <- filter(boston, id < 200)
RandomForest.
Load required libraries. Prepare dataset for creating a random forest model.
library(pmml)
#> Loading required package: XML
library(MASS)
#>
#> Attaching package: 'MASS'
#> The following object is masked from 'package:dplyr':
#>
#> select
library(randomForest)
#> randomForest 4.6-14
#> Type rfNews() to see new features/changes/bug fixes.
#>
#> Attaching package: 'randomForest'
#> The following object is masked from 'package:dplyr':
#>
#> combine
# Wrap the boston data so the applied transformations are recorded and
# can later be exported together with the model in the PMML document.
dataBox <- xform_wrap(boston)
# Normalize all numeric variables of the loaded boston dataset to lie
# between 0 and 1.
dataBox <- xform_min_max(dataBox)
# Include only derived fields for modeling.
# Index of the last element (derived_medv).
last <- length(names(dataBox$data))
# Create dataset without original predictors 1-13 and last derived_medv
# to train the model.
# 'last' is already an evaluated integer, so index with it directly;
# the previous eval(last) wrapper was a no-op.
dataset <- dataBox$data[-c(1:13, last)]
names(dataset)
#> [1] "lstat" "medv" "derived_id" "derived_crim"
#> [5] "derived_zn" "derived_indus" "derived_chas" "derived_nox"
#> [9] "derived_rm" "derived_age" "derived_dis" "derived_rad"
#> [13] "derived_tax" "derived_ptratio" "derived_black" "derived_lstat"
Generate random forest model and save it to a pmml file.
# Fit a 50-tree random forest regressing medv on all other columns of
# 'dataset'; rows containing missing values are dropped before fitting.
fit <- randomForest(medv ~ ., data=na.omit(dataset), ntree=50)
# Convert the fitted model to PMML, embedding the transformations
# recorded in dataBox so scoring applies the same preprocessing.
tpmml <- pmml(fit, transforms = dataBox)
#> Now converting tree 1 to PMML
#> Now converting tree 2 to PMML
#> Now converting tree 3 to PMML
#> Now converting tree 4 to PMML
#> Now converting tree 5 to PMML
#> Now converting tree 6 to PMML
#> Now converting tree 7 to PMML
#> Now converting tree 8 to PMML
#> Now converting tree 9 to PMML
#> Now converting tree 10 to PMML
#> Now converting tree 11 to PMML
#> Now converting tree 12 to PMML
#> Now converting tree 13 to PMML
#> Now converting tree 14 to PMML
#> Now converting tree 15 to PMML
#> Now converting tree 16 to PMML
#> Now converting tree 17 to PMML
#> Now converting tree 18 to PMML
#> Now converting tree 19 to PMML
#> Now converting tree 20 to PMML
#> Now converting tree 21 to PMML
#> Now converting tree 22 to PMML
#> Now converting tree 23 to PMML
#> Now converting tree 24 to PMML
#> Now converting tree 25 to PMML
#> Now converting tree 26 to PMML
#> Now converting tree 27 to PMML
#> Now converting tree 28 to PMML
#> Now converting tree 29 to PMML
#> Now converting tree 30 to PMML
#> Now converting tree 31 to PMML
#> Now converting tree 32 to PMML
#> Now converting tree 33 to PMML
#> Now converting tree 34 to PMML
#> Now converting tree 35 to PMML
#> Now converting tree 36 to PMML
#> Now converting tree 37 to PMML
#> Now converting tree 38 to PMML
#> Now converting tree 39 to PMML
#> Now converting tree 40 to PMML
#> Now converting tree 41 to PMML
#> Now converting tree 42 to PMML
#> Now converting tree 43 to PMML
#> Now converting tree 44 to PMML
#> Now converting tree 45 to PMML
#> Now converting tree 46 to PMML
#> Now converting tree 47 to PMML
#> Now converting tree 48 to PMML
#> Now converting tree 49 to PMML
#> Now converting tree 50 to PMML
# pmml file will be saved in the tdplyr installation directory.
# Persist the PMML document to disk so it can be loaded into Vantage.
save_pmml(tpmml,"boston_trans_normcont_model.pmml")
The user should create a ‘pmml_models’ table on Vantage and insert the PMML file into the BLOB column of the table.
# Create following table on vantage.
# crt_tbl <- "CREATE SET TABLE pmml_models(model_id VARCHAR(40), model BLOB)
# PRIMARY INDEX (model_id);"
# DBI::dbExecute(con, sql(crt_tbl))
# Create a file load_pmml_model.txt that has a model_id and a model file name
# entry such as:
# boston_rf_model|boston_trans_normcont_model.pmml
# This file and the pmml models to be loaded should be in the same directory.
#
# Loading model with BTEQ.
# .import vartext file load_pmml_model.txt
# .repeat *
# USING (c1 VARCHAR(40), c2 BLOB AS DEFERRED BY NAME) INSERT INTO pmml_models(:c1, :c2);
# Loading model with TDStudio. The '?' will open a GUI to browse the PMML file
# that needs to be updated.
# insert into pmml_models values ('boston_rf_model', ?);
Create test dataset and select the model to use for scoring. Run td_pmml_predict().
# Create test data to score.
# td_sample() draws a random sample; n=0.3 presumably selects ~30% of
# the rows — verify against the tdplyr td_sample documentation.
boston_test <- td_sample(df=boston, n=0.3)
# Select the random forest model previously loaded into the BLOB table.
modeldata <- tbl(con, "pmml_models") %>% filter(model_id=='boston_rf_model')
# Score the test data in-database with the PMML model; the 'id' column
# is carried through to the output via 'accumulate'.
result <- td_pmml_predict(modeldata = modeldata,
newdata = boston_test,
accumulate = c("id"))
result
#> ############ result ############
#>
#> # Source: SQL [?? x 1]
#> # Database: Teradata
#> # Groups:
#> id prediction json_report
#> <int> <chr> <chr>
#> 1 326 24.961100000000005 "{\"Predicted_medv\":24.961100000000005}"
#> 2 122 20.631299999999996 "{\"Predicted_medv\":20.631299999999996}"
#> 3 387 8.832233333333335 "{\"Predicted_medv\":8.832233333333335}"
#> 4 345 30.5882 "{\"Predicted_medv\":30.5882}"
#> 5 446 11.7003 "{\"Predicted_medv\":11.7003}"
#> 6 467 17.1689 "{\"Predicted_medv\":17.1689}"
#> # ... with more rows
GLM.
Load required libraries. Generate GLM model and save it to a pmml file.
library(pmml)
# NOTE(review): 'stats' is attached by default in R, so this call is
# redundant but harmless.
library(stats)
# Fit a gaussian GLM (linear model) predicting sepal_length from the
# remaining columns of the training data.
iris_glm <- glm(sepal_length ~ ., data=iris_train)
# Convert the fitted GLM to a PMML document.
iris_glm_pmml <- pmml(iris_glm)
# pmml file will be saved in the tdplyr installation directory.
save_pmml(iris_glm_pmml,"iris_glm.pmml")
The user should create a ‘pmml_models’ table on Vantage and insert the PMML file into the BLOB column of the table.
# Create following table on vantage.
#crt_tbl <- "CREATE SET TABLE pmml_models(model_id VARCHAR(40), model BLOB)
# PRIMARY INDEX (model_id);"
#DBI::dbExecute(con, sql(crt_tbl))
# Create a file load_pmml_model.txt that has a model_id and a model file name
# entry such as:
# iris_glm|iris_glm.pmml
# This file and the pmml models to be loaded should be in the same directory.
# Loading model with BTEQ.
# .import vartext file load_pmml_model.txt
# .repeat *
# USING (c1 VARCHAR(40), c2 BLOB AS DEFERRED BY NAME) INSERT INTO pmml_models(:c1, :c2);
# Loading model with TDStudio. The '?' will open a GUI to browse the PMML file
# that needs to be updated.
# insert into pmml_models values ('iris_glm', ?);
Select the model to use for scoring. Run td_pmml_predict().
# Select the GLM model from the BLOB table by its model_id.
modeldata <- tbl(con, "pmml_models") %>% filter(model_id=='iris_glm')
# Score the iris test data in-database; accumulate 'id' in the output.
result <- td_pmml_predict(modeldata = modeldata,
newdata = iris_test,
accumulate = c("id"))
result
#> ############ result ############
#>
#> # Source: SQL [?? x 1]
#> # Database: Teradata
#> # Groups:
#> id prediction json_report
#> <int> <chr> <chr>
#> 1 5 5.046237108253978 "{\"Predicted_sepal_length\":5.046237108253978}"
#> 2 15 5.144527283275113 "{\"Predicted_sepal_length\":5.144527283275113}"
#> 3 30 4.978340744147729 "{\"Predicted_sepal_length\":4.978340744147729}"
#> 4 40 5.031827804646137 "{\"Predicted_sepal_length\":5.031827804646137}"
#> 5 80 5.472652135619913 "{\"Predicted_sepal_length\":5.472652135619913}"
#> 6 20 5.215592270022347 "{\"Predicted_sepal_length\":5.215592270022347}"
#> # ... with more rows
NaiveBayes.
Load required libraries. Generate Naive Bayes model and save it to a pmml file.
library(pmml)
library(e1071)
# naiveBayes function works with "data.frame".
# Convert tbl_teradata to "data.frame".
iris_train_df <- as.data.frame(iris_train)
# Train a Naive Bayes classifier predicting species from all other
# columns of the training data frame.
nb_iris <- naiveBayes(species ~ ., data=iris_train_df)
# Print the fitted model (a-priori and conditional probabilities).
nb_iris
#>
#> Naive Bayes Classifier for Discrete Predictors
#>
#> Call:
#> naiveBayes.default(x = X, y = Y, laplace = laplace)
#>
#> A-priori probabilities:
#> Y
#> 1 2 3
#> 0.333 0.333 0.333
#>
#> Conditional probabilities:
#> id
#> Y [,1] [,2]
#> 1 25 14.6
#> 2 75 14.6
#> 3 125 14.6
#>
#> sepal_length
#> Y [,1] [,2]
#> 1 5.00 0.368
#> 2 5.99 0.530
#> 3 6.61 0.665
#>
#> sepal_width
#> Y [,1] [,2]
#> 1 3.42 0.396
#> 2 2.78 0.342
#> 3 2.97 0.308
#>
#> petal_length
#> Y [,1] [,2]
#> 1 1.44 0.158
#> 2 4.31 0.485
#> 3 5.56 0.593
#>
#> petal_width
#> Y [,1] [,2]
#> 1 0.252 0.111
#> 2 1.333 0.208
#> 3 2.030 0.236
# Convert the generated model to "PMML" format.
# The training data frame is supplied so field types can be derived;
# 'predicted_field' names the target column in the PMML document.
nb_iris_pmml <- pmml(nb_iris,
dataset=iris_train_df,
predicted_field = "species")
# pmml file will be saved in the tdplyr installation directory.
save_pmml(nb_iris_pmml, "nb_iris_tdplyr.pmml")
The user should create a ‘pmml_models’ table on Vantage and insert the PMML file into the BLOB column of the table.
# Create following table on vantage.
# crt_tbl <- "CREATE SET TABLE pmml_models(model_id VARCHAR(40), model BLOB)
# PRIMARY INDEX (model_id);"
# DBI::dbExecute(con, sql(crt_tbl))
#
# Create a file load_pmml_model.txt that has a model_id and a model file name
# entry such as:
# nb_iris_tdplyr|nb_iris_tdplyr.pmml
#
# This file and the pmml models to be loaded should be in the same directory.
# Loading model with BTEQ.
# .import vartext file load_pmml_model.txt
# .repeat *
# USING (c1 VARCHAR(40), c2 BLOB AS DEFERRED BY NAME) INSERT INTO pmml_models(:c1, :c2);
#
# Loading model with TDStudio. The '?' will open a GUI to browse the PMML file
# that needs to be updated.
# insert into pmml_models values ('nb_iris_tdplyr', ?);
Select the model to use for scoring. Run td_pmml_predict().
# Select the Naive Bayes model from the BLOB table by its model_id.
modeldata <- tbl(con, "pmml_models") %>% filter(model_id=='nb_iris_tdplyr')
# Score the iris test data in-database; accumulate 'id' in the output.
nb_result <- td_pmml_predict(modeldata = modeldata,
newdata = iris_test,
accumulate = c("id"))
nb_result
#> ############ result ############
#>
#> # Source: SQL [?? x 1]
#> # Database: Teradata
#> # Groups:
#> id prediction json_report
#> <int> <chr> <chr>
#> 1 5 1 "{\"Predicted_species\":\"1\",\"Probability_1\":0.9999999998~
#> 2 15 1 "{\"Predicted_species\":\"1\",\"Probability_1\":0.9999999985~
#> 3 30 1 "{\"Predicted_species\":\"1\",\"Probability_1\":0.9999999996~
#> 4 40 1 "{\"Predicted_species\":\"1\",\"Probability_1\":0.9999999993~
#> 5 80 2 "{\"Predicted_species\":\"2\",\"Probability_1\":1.1239874514~
#> 6 20 1 "{\"Predicted_species\":\"1\",\"Probability_1\":0.9999999999~
#> # ... with more rows
XGBoost.
Load required libraries. Generate XGBoost model and save it to a pmml file.
library(pmml)
library(xgboost)
#>
#> Attaching package: 'xgboost'
#> The following object is masked from 'package:dplyr':
#>
#> slice
# xgboost() function works with "matrix" only.
# Convert tbl_teradata to "data.frame".
iris_train_df <- as.data.frame(iris_train)
# Multinomial model using iris data.
# Columns 2:5 hold the four measurements; column 6 is the species label,
# shifted to 0-based class indices as xgboost requires.
xgb_model <- xgboost(data = as.matrix(iris_train_df[, 2:5]),
label = as.numeric(iris_train_df[, 6])-1,
max_depth = 2,
eta = 1,
nthread = 2,
nrounds = 2,
objective = "multi:softprob",
num_class = 3
)
#> [10:23:18] WARNING: amalgamation/../src/learner.cc:1095: Starting in XGBoost 1.3.0, the default evaluation metric used with the objective 'multi:softprob' was changed from 'merror' to 'mlogloss'. Explicitly set eval_metric if you'd like to restore the old behavior.
#> [1] train-mlogloss:0.266147
#> [2] train-mlogloss:0.128672
# Print the fitted booster.
xgb_model
#> ##### xgb.Booster
#> raw: 6.6 Kb
#> call:
#> xgb.train(params = params, data = dtrain, nrounds = nrounds,
#> watchlist = watchlist, verbose = verbose, print_every_n = print_every_n,
#> early_stopping_rounds = early_stopping_rounds, maximize = maximize,
#> save_period = save_period, save_name = save_name, xgb_model = xgb_model,
#> callbacks = callbacks, max_depth = 2, eta = 1, nthread = 2,
#> objective = "multi:softprob", num_class = 3)
#> params (as set within xgb.train):
#> max_depth = "2", eta = "1", nthread = "2", objective = "multi:softprob", num_class = "3", validate_parameters = "TRUE"
#> xgb.attributes:
#> niter
#> callbacks:
#> cb.print.evaluation(period = print_every_n)
#> cb.evaluation.log()
#> # of features: 4
#> niter: 2
#> nfeatures : 4
#> evaluation_log:
#> iter train_mlogloss
#> 1 0.266
#> 2 0.129
# Convert the generated model to "PMML" format.
# Save the tree information in an external file.
xgb.dump(xgb_model, "xgb_model.dumped.trees")
#> [1] TRUE
# Convert to PMML.
# Feature names must match the column names of the training matrix.
train_data_colnames <- colnames(as.matrix(iris_train_df[, 2:5]))
xgb_model_pmml <- pmml(xgb_model,
input_feature_names = train_data_colnames,
output_label_name = "species",
output_categories = c(1, 2, 3),
xgb_dump_file = "xgb_model.dumped.trees"
)
#> Now converting tree 0 to PMML
#> Now converting tree 1 to PMML
#> Now converting tree 2 to PMML
#> Now converting tree 3 to PMML
#> Now converting tree 4 to PMML
#> Now converting tree 5 to PMML
# pmml file will be saved in the tdplyr installation directory.
# Persist the XGBoost PMML document so it can be loaded into Vantage.
save_pmml(xgb_model_pmml, "xgb_model_tdplyr.pmml")
The user should create a ‘pmml_models’ table on Vantage and insert the PMML file into the BLOB column of the table.
# Create following table on vantage.
# crt_tbl <- "CREATE SET TABLE pmml_models(model_id VARCHAR(40), model BLOB)
# PRIMARY INDEX (model_id);"
# DBI::dbExecute(con, sql(crt_tbl))
#
# Create a file load_pmml_model.txt that has a model_id and a model file name
# entry such as:
# xgb_model_tdplyr|xgb_model_tdplyr.pmml
#
# This file and the pmml models to be loaded should be in the same directory.
# Loading model with BTEQ.
# .import vartext file load_pmml_model.txt
# .repeat *
# USING (c1 VARCHAR(40), c2 BLOB AS DEFERRED BY NAME) INSERT INTO pmml_models(:c1, :c2);
#
# Loading model with TDStudio. The '?' will open a GUI to browse the PMML file
# that needs to be updated.
# insert into pmml_models values ('xgb_model_tdplyr', ?);
Select the model to use for scoring. Run td_pmml_predict().
# Select the XGBoost model from the BLOB table by its model_id.
modeldata <- tbl(con, "pmml_models") %>% filter(model_id=='xgb_model_tdplyr')
# Score the iris test data in-database; accumulate 'id' in the output.
xgb_result <- td_pmml_predict(modeldata = modeldata,
newdata = iris_test,
accumulate = c("id"))
xgb_result
#> ############ result ############
#>
#> # Source: SQL [?? x 1]
#> # Database: Teradata
#> # Groups:
#> id prediction json_report
#> <int> <chr> <chr>
#> 1 5 1 "{\"Predicted_species\":\"1\",\"Probability_1\":0.9253211031~
#> 2 15 1 "{\"Predicted_species\":\"1\",\"Probability_1\":0.9253211031~
#> 3 30 1 "{\"Predicted_species\":\"1\",\"Probability_1\":0.9253211031~
#> 4 40 1 "{\"Predicted_species\":\"1\",\"Probability_1\":0.9253211031~
#> 5 80 2 "{\"Predicted_species\":\"2\",\"Probability_1\":0.0424986393~
#> 6 20 1 "{\"Predicted_species\":\"1\",\"Probability_1\":0.9253211031~
#> # ... with more rows
Decision Tree.
Load required libraries. Generate Decision Tree model and save it to a pmml file.
library(pmml)
library(rpart)
# Use 'rpart()' function to generate decision tree model.
# NOTE(review): the printed output below shows deviance/yval columns,
# i.e. rpart fit a regression tree here because 'species' is numeric —
# confirm this is the intended model type.
dt_iris <- rpart(species ~ ., data=iris_train)
# Print the fitted tree.
dt_iris
#> n= 120
#>
#> node), split, n, deviance, yval
#> * denotes terminal node
#>
#> 1) root 120 80 2.0
#> 2) id< 50 40 0 1.0 *
#> 3) id>=50 80 20 2.5
#> 6) id< 100 40 0 2.0 *
#> 7) id>=100 40 0 3.0 *
# Convert the generated model to "PMML" format.
# 'predicted_field' names the target column in the PMML document.
dt_iris_tdplyr <- pmml(dt_iris,
predicted_field = "species")
# pmml file will be saved in the tdplyr installation directory.
save_pmml(dt_iris_tdplyr, "dt_iris_tdplyr.pmml")
The user should create a ‘pmml_models’ table on Vantage and insert the PMML file into the BLOB column of the table.
# Create following tables on vantage.
# crt_tbl <- "CREATE SET TABLE pmml_models(model_id VARCHAR(40), model BLOB)
# PRIMARY INDEX (model_id);"
# DBI::dbExecute(con, sql(crt_tbl))
#
# Create a file load_pmml_model.txt that has a model_id and a model file name
# entry such as:
# dt_iris_tdplyr|dt_iris_tdplyr.pmml
#
# This file and the pmml models to be loaded should be in the same directory.
# Loading model with BTEQ.
# .import vartext file load_pmml_model.txt
# .repeat *
# USING (c1 VARCHAR(40), c2 BLOB AS DEFERRED BY NAME) INSERT INTO pmml_models(:c1, :c2);
#
# Loading model with TDStudio. The '?' will open a GUI to browse the PMML file
# that needs to be updated.
# insert into pmml_models values ('dt_iris_tdplyr', ?);
Select the model to use for scoring. Run td_pmml_predict().
# Select the decision tree model from the BLOB table by its model_id.
modeldata <- tbl(con, "pmml_models") %>% filter(model_id=='dt_iris_tdplyr')
# Score the iris test data in-database; accumulate 'id' in the output.
dt_result <- td_pmml_predict(modeldata = modeldata,
newdata = iris_test,
accumulate = c("id"))
dt_result
#> ############ result ############
#>
#> # Source: SQL [?? x 1]
#> # Database: Teradata
#> # Groups:
#> id prediction json_report
#> <int> <chr> <chr>
#> 1 15 1.0 "{\"Predicted_species\":1.0}"
#> 2 5 1.0 "{\"Predicted_species\":1.0}"
#> 3 20 1.0 "{\"Predicted_species\":1.0}"
#> 4 10 1.0 "{\"Predicted_species\":1.0}"
#> 5 65 2.0 "{\"Predicted_species\":2.0}"
#> 6 30 1.0 "{\"Predicted_species\":1.0}"
#> # ... with more rows