PMMLPredict() using KMeans Clustering¶
Setup¶
In [1]:
# Import required libraries
import getpass
import tempfile
from teradataml import PMMLPredict, DataFrame, load_example_data, create_context, \
db_drop_table, remove_context, save_byom, retrieve_byom, delete_byom, list_byom
from teradataml.options.configure import configure
In [2]:
# Create the connection.
host = getpass.getpass("Host: ")
username = getpass.getpass("Username: ")
password = getpass.getpass("Password: ")
con = create_context(host=host, username=username, password=password)
Host: ········
Username: ········
Password: ········
Load the example data and use DataFrame.sample() to split the input data into training and testing datasets.¶
In [3]:
# Load the example data.
load_example_data("byom", "iris_input")
iris_input = DataFrame("iris_input")
WARNING: Skipped loading table iris_input since it already exists in the database.
In [4]:
# Create 2 samples of input data - sample 1 will have 80% of total rows and sample 2 will have 20% of total rows.
iris_sample = iris_input.sample(frac=[0.8, 0.2])
In [5]:
# Create the train dataset from sample 1 by filtering on "sampleid" and dropping the "sampleid" column, as it is not required for training the model.
iris_train = iris_sample[iris_sample.sampleid == "1"].drop("sampleid", axis = 1)
iris_train
Out[5]:
id | sepal_length | sepal_width | petal_length | petal_width | species |
---|---|---|---|---|---|
99 | 5.1 | 2.5 | 3.0 | 1.1 | 2 |
120 | 6.0 | 2.2 | 5.0 | 1.5 | 3 |
57 | 6.3 | 3.3 | 4.7 | 1.6 | 2 |
101 | 6.3 | 3.3 | 6.0 | 2.5 | 3 |
17 | 5.4 | 3.9 | 1.3 | 0.4 | 1 |
61 | 5.0 | 2.0 | 3.5 | 1.0 | 2 |
38 | 4.9 | 3.6 | 1.4 | 0.1 | 1 |
78 | 6.7 | 3.0 | 5.0 | 1.7 | 2 |
141 | 6.7 | 3.1 | 5.6 | 2.4 | 3 |
40 | 5.1 | 3.4 | 1.5 | 0.2 | 1 |
In [6]:
# Create the test dataset from sample 2 by filtering on "sampleid" and dropping the "sampleid" column, as it is not required for scoring.
iris_test = iris_sample[iris_sample.sampleid == "2"].drop("sampleid", axis = 1)
iris_test
Out[6]:
id | sepal_length | sepal_width | petal_length | petal_width | species |
---|---|---|---|---|---|
74 | 6.1 | 2.8 | 4.7 | 1.2 | 2 |
135 | 6.1 | 2.6 | 5.6 | 1.4 | 3 |
110 | 7.2 | 3.6 | 6.1 | 2.5 | 3 |
141 | 6.7 | 3.1 | 5.6 | 2.4 | 3 |
13 | 4.8 | 3.0 | 1.4 | 0.1 | 1 |
133 | 6.4 | 2.8 | 5.6 | 2.2 | 3 |
108 | 7.3 | 2.9 | 6.3 | 1.8 | 3 |
1 | 5.1 | 3.5 | 1.4 | 0.2 | 1 |
139 | 6.0 | 3.0 | 4.8 | 1.8 | 3 |
55 | 6.5 | 2.8 | 4.6 | 1.5 | 2 |
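As an optional sanity check (not part of the original flow), the sizes of the two samples can be compared before training; this sketch assumes the DataFrame.shape property is available in your teradataml version.
# Optional: compare the sizes of the training and testing samples.
print("Training rows/columns:", iris_train.shape)
print("Testing rows/columns:", iris_test.shape)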
Train KMeans Clustering model¶
In [7]:
# Import required libraries.
from nyoka import skl_to_pmml
from sklearn.pipeline import Pipeline
In [8]:
# Convert teradataml dataframe to pandas dataframe.
# features : Training data.
# target : Training targets.
train_pd = iris_train.to_pandas()
features = train_pd.columns.drop('species')
target = 'species'
In [9]:
# Generate KMeans Clustering pipeline.
from sklearn.cluster import KMeans
km_pipe_obj = Pipeline([
    ("km", KMeans(n_clusters=8, random_state=0))
])
In [10]:
km_pipe_obj.fit(train_pd[features], train_pd[target])
Out[10]:
Pipeline(steps=[('km', KMeans(random_state=0))])
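Optionally, the fitted KMeans step can be inspected before exporting it. The sketch below is an illustrative addition that uses standard scikit-learn attributes.
# Optional: inspect the fitted KMeans estimator inside the pipeline.
km = km_pipe_obj.named_steps["km"]
print("Cluster centers shape:", km.cluster_centers_.shape)   # (8 clusters, 4 features)
print("Within-cluster sum of squares (inertia):", km.inertia_)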
Save the model in PMML format.¶
In [11]:
temp_dir = tempfile.TemporaryDirectory()
model_file_path = f"{temp_dir.name}/iris_kmeans_model.pmml"
In [12]:
skl_to_pmml(km_pipe_obj, features, target, model_file_path)
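Optionally, peek at the generated file to confirm nyoka wrote a PMML document (a KMeans pipeline should appear as a ClusteringModel element). This check is a sketch and not part of the original flow.
# Optional: preview the first few hundred characters of the exported PMML file.
with open(model_file_path, "r") as pmml_file:
    print(pmml_file.read(500))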
Save the model in Vantage.¶
In [13]:
# Save the PMML Model in Vantage.
save_byom("pmml_kmeans_iris", model_file_path, "byom_models")
Created the model table 'byom_models' as it does not exist. Model is saved.
List the model from Vantage.¶
In [14]:
# List the PMML Models in Vantage.
list_byom("byom_models")
model_id            model
pmml_kmeans_iris    b'3C3F786D6C20766572...'
Retrieve the model from Vantage.¶
In [15]:
# Retrieve the model from table "byom_models", using the model id 'pmml_kmeans_iris'.
modeldata = retrieve_byom("pmml_kmeans_iris", "byom_models")
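The retrieved modeldata is a one-row teradataml DataFrame holding the model_id and the serialized model; it can be previewed before scoring if desired.
# Optional: preview the retrieved model metadata.
modeldata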
Set "configure.byom_install_location" to the database where BYOM functions are installed.¶
In [16]:
configure.byom_install_location = getpass.getpass("byom_install_location: ")
byom_install_location: ········
Score the model.¶
In [17]:
# Perform prediction using PMMLPredict() and the PMML model stored in Vantage.
result = PMMLPredict(
modeldata = modeldata,
newdata = iris_test,
accumulate = ['id', 'sepal_length', 'petal_length'],
overwrite_cached_models = '*',
)
In [18]:
# Print the query.
print(result.show_query())
SELECT * FROM "mldb".PMMLPredict( ON "MLDB"."ml__select__1646324588886704" AS InputTable PARTITION BY ANY ON (select model_id,model from "MLDB"."ml__filter__1646324705248551") AS ModelTable DIMENSION USING Accumulate('id','sepal_length','petal_length') OverwriteCachedModel('*') ) as sqlmr
In [19]:
# Print the result.
result.result
Out[19]:
id | sepal_length | petal_length | prediction | json_report |
---|---|---|---|---|
26 | 5.0 | 1.6 | {"cluster":"5","affinity(0)":0.751672360718738,"affinity(1)":3.544763216406395,"affinity(2)":3.9376134904279265,"affinity(3)":5.638328103405557,"affinity(4)":1.9011692308798933,"affinity(5)":0.4307515235028199,"affinity(6)":2.7851621187431186,"affinity(7)":4.633992758830002} | |
9 | 4.4 | 1.4 | {"cluster":"5","affinity(0)":1.1611824452693957,"affinity(1)":3.9874248314800367,"affinity(2)":4.300248054860711,"affinity(3)":6.091336484326675,"affinity(4)":2.1317702607092643,"affinity(5)":0.3815060615507961,"affinity(6)":3.1261814375724954,"affinity(7)":5.033940360746793} | |
49 | 5.3 | 1.5 | {"cluster":"0","affinity(0)":0.1012394839308061,"affinity(1)":3.5942320675110357,"affinity(2)":4.070560976900686,"affinity(3)":5.631148613803742,"affinity(4)":2.3161270354720283,"affinity(5)":0.8347435983582026,"affinity(6)":2.992944413612751,"affinity(7)":4.667678461743291} | |
118 | 7.7 | 6.7 | {"cluster":"3","affinity(0)":6.076673738059406,"affinity(1)":2.6979678409710757,"affinity(2)":2.6210939192126124,"affinity(3)":0.8590788839947237,"affinity(4)":4.78132942926035,"affinity(5)":6.48107220103279,"affinity(6)":3.6764047052920366,"affinity(7)":1.800154314372953} | |
62 | 5.9 | 4.2 | {"cluster":"6","affinity(0)":3.126120660737127,"affinity(1)":0.6744033990115543,"affinity(2)":0.9434687770845057,"affinity(3)":2.7041749173215615,"affinity(4)":1.5452435981999015,"affinity(5)":3.3539971489254436,"affinity(6)":0.527545730974952,"affinity(7)":1.6012148165967262} | |
70 | 5.6 | 3.9 | {"cluster":"6","affinity(0)":2.8388062388401933,"affinity(1)":1.2373906958079262,"affinity(2)":1.4147320123142286,"affinity(3)":3.2592272064817873,"affinity(4)":0.9297550453987564,"affinity(5)":2.9194257782995616,"affinity(6)":0.31447215069702733,"affinity(7)":2.2219860985663757} | |
129 | 6.4 | 5.6 | {"cluster":"7","affinity(0)":4.731129121725585,"affinity(1)":1.221120687758594,"affinity(2)":0.7412601882380211,"affinity(3)":1.3466518014617457,"affinity(4)":3.0029615012147217,"affinity(5)":4.9643022545973166,"affinity(6)":1.9195767988036296,"affinity(7)":0.3472111109333275} | |
140 | 6.9 | 5.4 | {"cluster":"7","affinity(0)":4.663716268503754,"affinity(1)":1.148719629284536,"affinity(2)":1.053312236075641,"affinity(3)":1.1007510583969395,"affinity(4)":3.174551586882434,"affinity(5)":4.983903778665877,"affinity(6)":2.089767457800428,"affinity(7)":0.3519785346990484} | |
110 | 7.2 | 6.1 | {"cluster":"3","affinity(0)":5.483417591313434,"affinity(1)":2.0898448563340066,"affinity(2)":1.942026433050453,"affinity(3)":0.8044298838410525,"affinity(4)":4.103792284108823,"affinity(5)":5.854105130162936,"affinity(6)":3.0134129221609123,"affinity(7)":1.0726395273136058} | |
34 | 5.5 | 1.4 | {"cluster":"0","affinity(0)":0.5771538581381218,"affinity(1)":3.755217976344582,"affinity(2)":4.274903507682951,"affinity(3)":5.719410187472305,"affinity(4)":2.7247833756914415,"affinity(5)":1.3259136001263432,"affinity(6)":3.2698369852386526,"affinity(7)":4.795541911215271} |
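To work with the cluster assignments client-side, the JSON output can be parsed into a regular column. The sketch below pulls the scores into pandas and is an illustrative addition; the column holding the JSON may be prediction or json_report depending on how the PMML output fields map, so both are checked.
# Optional: bring the scores client-side and extract the assigned cluster from the JSON output.
import json

scores_pd = result.result.to_pandas()

def extract_cluster(row):
    # The cluster assignment and per-cluster affinities are returned as a JSON string; check both output columns.
    for col in ("json_report", "prediction"):
        value = row[col]
        if isinstance(value, str) and value.startswith("{"):
            return json.loads(value)["cluster"]
    return None

scores_pd["cluster"] = scores_pd.apply(extract_cluster, axis=1)
print(scores_pd[["sepal_length", "petal_length", "cluster"]].head())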
Cleanup.¶
In [20]:
# Delete the saved Model.
delete_byom("pmml_kmeans_iris", table_name="byom_models")
Model is deleted.
In [21]:
# Drop model table.
db_drop_table("byom_models")
Out[21]:
True
In [22]:
# Drop input data table.
db_drop_table("iris_input")
Out[22]:
True
In [23]:
# Run remove_context() to close the connection and garbage collect internally generated objects.
remove_context()
Out[23]:
True