Teradata Package for Python Function Reference on VantageCloud Lake - XGBoost - Teradata Package for Python - Look here for syntax, methods and examples for the functions included in the Teradata Package for Python.

Teradata® Package for Python Function Reference on VantageCloud Lake

Deployment
VantageCloud
Edition
Lake
Product
Teradata Package for Python
Release Number
20.00.00.03
Published
December 2024
ft:locale
en-US
ft:lastEdition
2024-12-19
dita:id
TeradataPython_FxRef_Lake_2000
Product Category
Teradata Vantage

PMMLPredict() using an XGBoost model.

Setup

In [1]:
# Import required libraries
import tempfile
import getpass
from teradataml.dataframe.sql_functions import case
from teradataml import PMMLPredict, DataFrame, load_example_data, create_context, \
db_drop_table, remove_context, save_byom, delete_byom, retrieve_byom, list_byom
from teradataml.options.configure import configure
In [2]:
# Create the connection.
# Host, username and password are each read interactively through getpass, so no
# credential is hardcoded in the notebook and the typed values are masked in the
# cell output.
con = create_context(host=getpass.getpass("Hostname: "), 
                     username=getpass.getpass("Username: "),
                     password=getpass.getpass("Password: "))
Hostname: ········
Username: ········
Password: ········

Load example data.

In [3]:
# Load the "iris_input" example table and wrap it in a teradataml DataFrame.
load_example_data("byom", "iris_input")
iris_input = DataFrame("iris_input")

# Recode the "species" column from the labels 1, 2, 3 to 0, 1, 2.
# Any other value falls through to the else_ branch and becomes 0.
species_whens = [(iris_input.species == old_label, new_label)
                 for old_label, new_label in ((1, 0), (2, 1), (3, 2))]
iris_input_df = iris_input.assign(species=case(species_whens, else_=0))

# Split the rows into 2 samples - sample 1 gets 80% of the rows, sample 2 gets 20%.
# The sample number is exposed in the "sampleid" column of the result.
iris_sample = iris_input_df.sample(frac=[0.8, 0.2])
iris_sample
Out[3]:
id sepal_length sepal_width petal_length petal_width species sampleid
120 6.0 2.2 5.0 1.5 2 1
19 5.7 3.8 1.7 0.3 0 1
59 6.6 2.9 4.6 1.3 1 1
61 5.0 2.0 3.5 1.0 1 1
78 6.7 3.0 5.0 1.7 1 1
101 6.3 3.3 6.0 2.5 2 1
141 6.7 3.1 5.6 2.4 2 1
17 5.4 3.9 1.3 0.4 0 1
38 4.9 3.6 1.4 0.1 0 1
122 5.6 2.8 4.9 2.0 2 1
In [4]:
# Build the train dataset: keep only the rows that landed in sample 1, then drop
# the "sampleid" column because it is not required for training the model.
in_train_sample = iris_sample.sampleid == "1"
iris_train = iris_sample[in_train_sample].drop(["sampleid"], axis=1)
iris_train
Out[4]:
id sepal_length sepal_width petal_length petal_width species
118 7.7 3.8 6.7 2.2 2
19 5.7 3.8 1.7 0.3 0
59 6.6 2.9 4.6 1.3 1
61 5.0 2.0 3.5 1.0 1
78 6.7 3.0 5.0 1.7 1
101 6.3 3.3 6.0 2.5 2
141 6.7 3.1 5.6 2.4 2
17 5.4 3.9 1.3 0.4 0
38 4.9 3.6 1.4 0.1 0
122 5.6 2.8 4.9 2.0 2
In [5]:
# Build the test dataset: keep only the rows that landed in sample 2, then drop
# the "sampleid" column because it is not required for scoring.
in_test_sample = iris_sample.sampleid == "2"
iris_test = iris_sample[in_test_sample].drop(["sampleid"], axis=1)
iris_test
Out[5]:
id sepal_length sepal_width petal_length petal_width species
7 4.6 3.4 1.4 0.3 0
99 5.1 2.5 3.0 1.1 1
15 5.8 4.0 1.2 0.2 0
78 6.7 3.0 5.0 1.7 1
60 5.2 2.7 3.9 1.4 1
34 5.5 4.2 1.4 0.2 0
13 4.8 3.0 1.4 0.1 0
11 5.4 3.7 1.5 0.2 0
131 7.4 2.8 6.1 1.9 2
122 5.6 2.8 4.9 2.0 2

Train XGBoost Model.

In [6]:
# Import required libraries.
import numpy as np
from sklearn import tree
from nyoka import xgboost_to_pmml
from sklearn.pipeline import Pipeline
from sklearn_pandas import DataFrameMapper
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from xgboost import XGBClassifier
In [7]:
# Convert the teradataml training DataFrame to a local pandas DataFrame.
# target   : name of the label column.
# features : names of all remaining columns; used later to select the
#            training inputs from the pandas DataFrame.
target = 'species'
traid_pd = iris_train.to_pandas(coerce_float=True)
features = traid_pd.columns.drop(target)
In [8]:
# Assemble the (not yet fitted) preprocessing + XGBoost pipeline.
imputer = SimpleImputer(missing_values=np.nan, strategy='mean')

# Sepal measurements are standardised; petal measurements are mean-imputed.
feature_mapper = DataFrameMapper([
    (['sepal_length', 'sepal_width'], StandardScaler()),
    (['petal_length', 'petal_width'], imputer),
])
xgb_classifier = XGBClassifier(use_label_encoder=False, eval_metric='mlogloss')

# The variable name is kept as-is because later cells reference it.
rf_pipe_obj = Pipeline([
    ("mapping", feature_mapper),
    ("xgb", xgb_classifier),
])
In [9]:
# Fit the pipeline on the training feature columns and the label column.
# The call is the cell's last expression, so the fitted pipeline is displayed.
rf_pipe_obj.fit(traid_pd[features], traid_pd[target])
Out[9]:
Pipeline(steps=[('mapping',
                 DataFrameMapper(drop_cols=[],
                                 features=[(['sepal_length', 'sepal_width'],
                                            StandardScaler()),
                                           (['petal_length', 'petal_width'],
                                            SimpleImputer())])),
                ('xgb',
                 XGBClassifier(base_score=0.5, booster='gbtree',
                               colsample_bylevel=1, colsample_bynode=1,
                               colsample_bytree=1, eval_metric='mlogloss',
                               gamma=0, gpu_id=-1, importance_type='gain',
                               interac...ts='',
                               learning_rate=0.300000012, max_delta_step=0,
                               max_depth=6, min_child_weight=1, missing=nan,
                               monotone_constraints='()', n_estimators=100,
                               n_jobs=16, num_parallel_tree=1,
                               objective='multi:softprob', random_state=0,
                               reg_alpha=0, reg_lambda=1, scale_pos_weight=None,
                               subsample=1, tree_method='exact',
                               use_label_encoder=False, validate_parameters=1,
                               verbosity=None))])

Save the model in PMML format.

In [10]:
# Create a temporary directory and build the path of the PMML file inside it.
# NOTE(review): no explicit temp_dir.cleanup() appears in the cells shown here -
# confirm whether the directory should be removed in the cleanup section.
temp_dir = tempfile.TemporaryDirectory()
model_file_path = f"{temp_dir.name}/iris_db_xgb_model.pmml"
In [11]:
# Export the fitted pipeline as a PMML file at model_file_path, passing the
# feature column names and the target column name alongside it.
xgboost_to_pmml(rf_pipe_obj, features, target, model_file_path)

Save the model in Vantage.

In [12]:
# Save the PMML model file in Vantage under the model id "pmml_xgboost_iris".
# The target table "byom_models" is created by this call when it does not exist
# yet (see the message printed below the cell).
save_byom("pmml_xgboost_iris", model_file_path, "byom_models")
Created the model table 'byom_models' as it does not exist.
Model is saved.

List the models from Vantage.

In [13]:
# List the models stored in the "byom_models" table in Vantage.
# The output shows one row per model id with the stored model bytes.
list_byom("byom_models")
                                      model
model_id                                   
pmml_xgboost_iris  b'3C3F786D6C20766572...'

Retrieve the model from Vantage.

In [14]:
# Retrieve the model from table "byom_models", using the model id 'pmml_xgboost_iris'.
# The result is passed to PMMLPredict() as "modeldata" when scoring.
modeldata = retrieve_byom("pmml_xgboost_iris", "byom_models")

Set "configure.byom_install_location" to the database where BYOM functions are installed.

In [15]:
# Prompt for the name of the database where the BYOM functions are installed,
# rather than hardcoding an environment-specific value in the notebook.
configure.byom_install_location = getpass.getpass("byom_install_location: ")
byom_install_location: ········

Score the model.

In [16]:
# Perform prediction using PMMLPredict() and the PMML model stored in Vantage.
result = PMMLPredict(
                    modeldata = modeldata,  # model retrieved from "byom_models"
                    newdata = iris_test,  # held-out rows (sample 2) to score
                    # Input columns carried through to the result next to the prediction.
                    accumulate = ['id', 'sepal_length', 'petal_length'],
                    # Emitted as OverwriteCachedModel('*') in the generated query.
                    overwrite_cached_models = '*',
                    )
In [17]:
# Print the SQL query that teradataml generated for the PMMLPredict() call above.
print(result.show_query())
SELECT * FROM "mldb".PMMLPredict(
	ON "MLDB"."ml__select__163422737162745" AS InputTable
	PARTITION BY ANY 
	ON (select model_id,model from "MLDB"."ml__filter__163429691779201") AS ModelTable
	DIMENSION
	USING
	Accumulate('id','sepal_length','petal_length')
	OverwriteCachedModel('*')
) as sqlmr
In [18]:
# Display the scoring result: the accumulated input columns plus the
# "prediction" and "json_report" columns.
result.result
Out[18]:
id sepal_length petal_length prediction json_report
91 5.5 4.4 1 {"probability_0":0.0031631026405870067,"probability_1":0.9927252740591257,"predicted_species":1,"probability_2":0.004111623300287216}
120 6.0 5.0 2 {"probability_0":0.006574205810514272,"probability_1":0.14864402724284423,"predicted_species":2,"probability_2":0.8447817669466414}
95 5.6 4.2 1 {"probability_0":0.001571336511149009,"probability_1":0.9975133706102051,"predicted_species":1,"probability_2":9.15292878645976E-4}
34 5.5 1.4 0 {"probability_0":0.9909726169633473,"probability_1":0.008225234519697451,"predicted_species":0,"probability_2":8.021485169552667E-4}
81 5.5 3.8 1 {"probability_0":0.0035613376851459104,"probability_1":0.9910389530219135,"predicted_species":1,"probability_2":0.005399709292940567}
93 5.8 4.0 1 {"probability_0":0.0016650949448973374,"probability_1":0.9961704975253616,"predicted_species":1,"probability_2":0.0021644075297410615}
26 5.0 1.6 0 {"probability_0":0.9962812439487461,"probability_1":0.0025958448582197586,"predicted_species":0,"probability_2":0.0011229111930341426}
5 5.0 1.4 0 {"probability_0":0.9964661313735003,"probability_1":0.0027272733525285243,"predicted_species":0,"probability_2":8.065952739710961E-4}
13 4.8 1.4 0 {"probability_0":0.9962812439487461,"probability_1":0.0025958448582197586,"predicted_species":0,"probability_2":0.0011229111930341426}
80 5.7 3.5 1 {"probability_0":0.0019740235797788904,"probability_1":0.9957818191767968,"predicted_species":1,"probability_2":0.0022441572434242912}

Cleanup.

In [19]:
# Delete the model from table "byom_models", using the model id 'pmml_xgboost_iris'.
delete_byom("pmml_xgboost_iris", "byom_models")
Model is deleted.
In [20]:
# Drop the models table "byom_models" that save_byom() created earlier in this notebook.
db_drop_table("byom_models")
Out[20]:
True
In [21]:
# Drop the input data table "iris_input" that load_example_data() created.
db_drop_table("iris_input")
Out[21]:
True
In [22]:
# One must run remove_context() to close the connection and garbage collect internally generated objects.
# Run this last: the cells above rely on the connection opened by create_context().
remove_context()
Out[22]:
True