PMMLPredict() using an XGBoost model.
Setup
In [1]:
# Import required libraries
import tempfile
import getpass
from teradataml.dataframe.sql_functions import case
from teradataml import PMMLPredict, DataFrame, load_example_data, create_context, \
db_drop_table, remove_context, save_byom, delete_byom, retrieve_byom, list_byom
from teradataml.options.configure import configure
In [2]:
# Create the connection.
# Each value is prompted for interactively (host, then user name, then password) and is
# handed straight to create_context() as a keyword argument.
con = create_context(**{
    keyword: getpass.getpass(prompt)
    for keyword, prompt in (("host", "Hostname: "),
                            ("username", "Username: "),
                            ("password", "Password: "))
})
Load example data.
In [3]:
# Load the example data.
load_example_data("byom", "iris_input")
iris_input = DataFrame("iris_input")

# Re-code the "species" column from 1, 2, 3 to 0, 1, 2; any other value falls back to 0.
species_recode = [(iris_input.species == label, label - 1) for label in (1, 2, 3)]
iris_input_df = iris_input.assign(species=case(species_recode, else_=0))

# Split the input into 2 samples - sample 1 gets 80% of the rows and sample 2 gets 20%.
iris_sample = iris_input_df.sample(frac=[0.8, 0.2])
iris_sample
Out[3]:
In [4]:
# Build the train dataset: keep the rows of sample 1 by filtering on "sampleid", then
# drop the "sampleid" column because it is not required for training the model.
train_rows = iris_sample[iris_sample.sampleid == "1"]
iris_train = train_rows.drop(["sampleid"], axis=1)
iris_train
Out[4]:
In [5]:
# Build the test dataset: keep the rows of sample 2 by filtering on "sampleid", then
# drop the "sampleid" column because it is not required for scoring.
test_rows = iris_sample[iris_sample.sampleid == "2"]
iris_test = test_rows.drop(["sampleid"], axis=1)
iris_test
Out[5]:
Train XGBoost Model.
In [6]:
# Import required libraries.
import numpy as np
from sklearn import tree
from nyoka import xgboost_to_pmml
from sklearn.pipeline import Pipeline
from sklearn_pandas import DataFrameMapper
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from xgboost import XGBClassifier
In [7]:
# Convert the teradataml train DataFrame to a pandas DataFrame.
# target   : name of the column holding the training targets.
# features : names of the remaining columns, used as training data.
traid_pd = iris_train.to_pandas(coerce_float=True)
target = 'species'
features = traid_pd.columns.drop(target)
In [8]:
# Generate the XGBoost model pipeline.
# Step "mapping": standardize the sepal columns and mean-impute NaNs in the petal columns.
imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
column_mapper = DataFrameMapper([
    (['sepal_length', 'sepal_width'], StandardScaler()),
    (['petal_length', 'petal_width'], imputer),
])
# Step "xgb": the classifier that is trained on the mapped columns.
xgb_classifier = XGBClassifier(use_label_encoder=False, eval_metric='mlogloss')
rf_pipe_obj = Pipeline([("mapping", column_mapper), ("xgb", xgb_classifier)])
In [9]:
# Train the pipeline on the feature columns against the target column.
train_inputs = traid_pd[features]
train_labels = traid_pd[target]
rf_pipe_obj.fit(train_inputs, train_labels)
Out[9]:
Save the model in PMML format.
In [10]:
# Create a temporary directory and build the path of the PMML file inside it.
temp_dir = tempfile.TemporaryDirectory()
model_file_path = "/".join([temp_dir.name, "iris_db_xgb_model.pmml"])
In [11]:
# Export the pipeline to PMML, passing the feature names, the target name and the output file path.
xgboost_to_pmml(rf_pipe_obj, features, target, model_file_path)
Save the model in Vantage.
In [12]:
# Save the PMML model file in Vantage table "byom_models" under the model id "pmml_xgboost_iris".
save_byom(model_id="pmml_xgboost_iris",
          model_file=model_file_path,
          table_name="byom_models")
List the models from Vantage.
In [13]:
# List the models stored in Vantage table "byom_models".
list_byom(table_name="byom_models")
Retrieve the model from Vantage.
In [14]:
# Retrieve the model from table "byom_models", using the model id 'pmml_xgboost_iris'.
modeldata = retrieve_byom("pmml_xgboost_iris", "byom_models")
Set "configure.byom_install_location" to the database where BYOM functions are installed.
In [15]:
# Prompt for the database name at run time and store it in the teradataml configuration.
byom_database = getpass.getpass("byom_install_location: ")
configure.byom_install_location = byom_database
Score the model.
In [16]:
# Score the test dataset with PMMLPredict(), using the PMML model retrieved from Vantage.
accumulate_columns = ['id', 'sepal_length', 'petal_length']
result = PMMLPredict(newdata=iris_test,
                     modeldata=modeldata,
                     accumulate=accumulate_columns,
                     overwrite_cached_models='*')
In [17]:
# Print the query reported by the PMMLPredict result object.
predict_query = result.show_query()
print(predict_query)
In [18]:
# Display the prediction result.
prediction_output = result.result
prediction_output
Out[18]:
Cleanup.
In [19]:
# Delete the model from table "byom_models", using the model id 'pmml_xgboost_iris'.
delete_byom("pmml_xgboost_iris", "byom_models")
In [20]:
# Drop the table that held the saved models.
db_drop_table(table_name="byom_models")
Out[20]:
In [21]:
# Drop the table that held the example input data.
db_drop_table(table_name="iris_input")
Out[21]:
In [22]:
# Delete the temporary directory that holds the exported PMML file; no other cell
# removes it explicitly.
temp_dir.cleanup()
# One must run remove_context() to close the connection and garbage collect internally generated objects.
# It stays the last expression so its return value is still shown as the cell output.
remove_context()
Out[22]: