PMMLPredict using a Multi-layer Perceptron model.¶
Setup¶
In [1]:
# Import required libraries
import tempfile
import getpass
from teradataml import PMMLPredict, DataFrame, load_example_data, create_context, \
db_drop_table, remove_context, save_byom, retrieve_byom, delete_byom, list_byom
from teradataml.options.configure import configure
In [2]:
# Create the connection.
host = getpass.getpass("Host: ")
username = getpass.getpass("Username: ")
password = getpass.getpass("Password: ")
con = create_context(host=host, username=username, password=password)
Host: ········
Username: ········
Password: ········
Load example data¶
In [3]:
# Load the example data.
load_example_data("dataframe", "insurance")
WARNING: Skipped loading table insurance since it already exists in the database.
In [4]:
# Create teradataml DataFrames.
insurance_input = DataFrame("insurance")
In [5]:
insurance_input
Out[5]:
age | sex | bmi | children | smoker | region | charges |
---|---|---|---|---|---|---|
34 | female | 27.5 | 1 | no | southwest | 5003.853 |
34 | male | 25.3 | 2 | yes | southeast | 18972.495 |
34 | female | 26.73 | 1 | no | southeast | 5002.7827 |
34 | female | 33.7 | 1 | no | southwest | 5012.471 |
34 | male | 30.8 | 0 | yes | southwest | 35491.64 |
34 | female | 29.26 | 3 | no | southeast | 6184.2994 |
34 | male | 25.27 | 1 | no | northwest | 4894.7533 |
34 | male | 22.42 | 2 | no | northeast | 27375.90478 |
34 | female | 37.335 | 2 | no | northwest | 5989.52365 |
34 | female | 31.92 | 1 | yes | northeast | 37701.8768 |
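The goal in what follows is to predict charges from the remaining demographic and lifestyle features; the categorical columns (sex, smoker, region) are encoded numerically in the next step.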
Encode the data using One Hot Encoder and transform the data.¶
In [6]:
# Import required libraries.
from teradataml import OneHotEncoder, valib, Retain

# Point teradataml at the database where the Vantage Analytic Library (VAL) is installed.
configure.val_install_location = "MLDB"
In [7]:
# Perform one hot encoding on the categorical features:
# the default "dummy" style creates one indicator column per value, while the
# "contrast" style creates a single column encoded against a reference value.
dc1 = OneHotEncoder(values=["southwest", "southeast", "northwest", "northeast"], columns="region")
dc2 = OneHotEncoder(style="contrast", values="yes", reference_value=0, columns="smoker")
dc3 = OneHotEncoder(style="contrast", values="male", reference_value=0, columns="sex")
In [8]:
# Retain some columns unchanged in the transformed output.
retain = Retain(columns=["charges", "bmi", "children"])
In [9]:
# Execute the Transform() function.
obj = valib.Transform(data=insurance_input, one_hot_encode=[dc1, dc2, dc3], key_columns="ID", retain=retain)
insurance_input = obj.result
insurance_input
Out[9]:
age | charges | bmi | children | southwest_region | southeast_region | northwest_region | northeast_region | yes_smoker | male_sex |
---|---|---|---|---|---|---|---|---|---|
61 | 13616.3586 | 22.04 | 0 | 0 | 0 | 0 | 1 | 0 | 0 |
61 | 48517.56315 | 36.385 | 1 | 0 | 0 | 0 | 1 | 1 | 0 |
61 | 13429.0354 | 31.16 | 0 | 0 | 0 | 1 | 0 | 0 | 0 |
61 | 13415.0381 | 21.09 | 0 | 0 | 0 | 1 | 0 | 0 | 0 |
61 | 28868.6639 | 28.31 | 1 | 0 | 0 | 1 | 0 | 1 | 1 |
61 | 24513.09126 | 25.08 | 0 | 0 | 1 | 0 | 0 | 0 | 0 |
61 | 46599.1084 | 35.86 | 0 | 0 | 1 | 0 | 0 | 1 | 1 |
61 | 12557.6053 | 31.57 | 0 | 0 | 1 | 0 | 0 | 0 | 1 |
61 | 30942.1918 | 29.92 | 3 | 0 | 1 | 0 | 0 | 1 | 0 |
61 | 14235.072 | 39.1 | 2 | 1 | 0 | 0 | 0 | 0 | 0 |
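The dummy-style encoder expanded region into four indicator columns, and the contrast-style encoders produced the single yes_smoker and male_sex columns. As a quick sanity check (a sketch; columns is a standard teradataml DataFrame property):

# Confirm the transformed column list.
print(insurance_input.columns)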
Use DataFrame.sample() to split the input data into training and testing datasets.¶
In [10]:
# Create two samples of the input data: sample 1 has 80% of the rows and sample 2 has 20%.
insurance_sample = insurance_input.sample(frac=[0.8, 0.2])
insurance_sample
Out[10]:
age | charges | bmi | children | southwest_region | southeast_region | northwest_region | northeast_region | yes_smoker | male_sex | sampleid |
---|---|---|---|---|---|---|---|---|---|---|
34 | 5003.853 | 27.5 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 1 |
61 | 30942.1918 | 29.92 | 3 | 0 | 1 | 0 | 0 | 1 | 0 | 1 |
61 | 13616.3586 | 22.04 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1 |
19 | 16884.924 | 27.9 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 2 |
19 | 4687.797 | 28.6 | 5 | 1 | 0 | 0 | 0 | 0 | 0 | 1 |
40 | 8059.6791 | 28.69 | 3 | 0 | 0 | 1 | 0 | 0 | 0 | 1 |
40 | 6389.37785 | 26.315 | 1 | 0 | 0 | 1 | 0 | 0 | 1 | 1 |
40 | 5920.1041 | 36.19 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 1 |
19 | 1837.237 | 24.6 | 1 | 1 | 0 | 0 | 0 | 0 | 1 | 2 |
61 | 14235.072 | 39.1 | 2 | 1 | 0 | 0 | 0 | 0 | 0 | 2 |
In [11]:
# Create the training dataset from sample 1 by filtering on "sampleid", and drop the "sampleid" column since it is not needed for training.
insurance_train = insurance_sample[insurance_sample.sampleid == "1"].drop("sampleid", axis=1)
insurance_train
Out[11]:
age | charges | bmi | children | southwest_region | southeast_region | northwest_region | northeast_region | yes_smoker | male_sex |
---|---|---|---|---|---|---|---|---|---|
34 | 5003.853 | 27.5 | 1 | 1 | 0 | 0 | 0 | 0 | 0 |
61 | 30942.1918 | 29.92 | 3 | 0 | 1 | 0 | 0 | 1 | 0 |
61 | 13616.3586 | 22.04 | 0 | 0 | 0 | 0 | 1 | 0 | 0 |
19 | 16884.924 | 27.9 | 0 | 1 | 0 | 0 | 0 | 1 | 0 |
19 | 1743.214 | 28.9 | 0 | 1 | 0 | 0 | 0 | 0 | 0 |
40 | 8059.6791 | 28.69 | 3 | 0 | 0 | 1 | 0 | 0 | 0 |
40 | 6389.37785 | 26.315 | 1 | 0 | 0 | 1 | 0 | 0 | 1 |
40 | 5920.1041 | 36.19 | 0 | 0 | 1 | 0 | 0 | 0 | 0 |
19 | 4687.797 | 28.6 | 5 | 1 | 0 | 0 | 0 | 0 | 0 |
61 | 14235.072 | 39.1 | 2 | 1 | 0 | 0 | 0 | 0 | 0 |
In [12]:
# Create the testing dataset from sample 2 by filtering on "sampleid", and drop the "sampleid" column since it is not needed for scoring.
insurance_test = insurance_sample[insurance_sample.sampleid == "2"].drop("sampleid", axis=1)
insurance_test
Out[12]:
age | charges | bmi | children | southwest_region | southeast_region | northwest_region | northeast_region | yes_smoker | male_sex |
---|---|---|---|---|---|---|---|---|---|
34 | 5012.471 | 33.7 | 1 | 1 | 0 | 0 | 0 | 0 | 0 |
61 | 13415.0381 | 21.09 | 0 | 0 | 0 | 1 | 0 | 0 | 0 |
61 | 46599.1084 | 35.86 | 0 | 0 | 1 | 0 | 0 | 1 | 1 |
19 | 16884.924 | 27.9 | 0 | 1 | 0 | 0 | 0 | 1 | 0 |
19 | 1842.519 | 28.4 | 1 | 1 | 0 | 0 | 0 | 0 | 1 |
40 | 7682.67 | 33.0 | 3 | 0 | 1 | 0 | 0 | 0 | 0 |
40 | 6500.2359 | 29.81 | 1 | 0 | 1 | 0 | 0 | 0 | 0 |
40 | 15828.82173 | 29.3 | 4 | 1 | 0 | 0 | 0 | 0 | 0 |
19 | 4687.797 | 28.6 | 5 | 1 | 0 | 0 | 0 | 0 | 0 |
61 | 30942.1918 | 29.92 | 3 | 0 | 1 | 0 | 0 | 1 | 0 |
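As a quick check on the split (a sketch; shape is a standard teradataml DataFrame property), the row counts of the two DataFrames should be roughly in an 80/20 ratio:

# Compare the sizes of the training and testing splits.
print(insurance_train.shape, insurance_test.shape)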
Prepare the dataset for creating a Multi-layer Perceptron model.¶
In [13]:
# Import required libraries.
import numpy as np
from nyoka import skl_to_pmml
from sklearn.pipeline import Pipeline
from sklearn_pandas import DataFrameMapper
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
In [14]:
# Convert the teradataml DataFrame to a pandas DataFrame.
# features : names of the training feature columns.
# target   : name of the training target column.
train_pd = insurance_train.to_pandas()
features = train_pd.columns.drop('charges')
target = 'charges'
Train the Multi-layer Perceptron model.¶
In [15]:
# Import required libraries.
from sklearn.neural_network import MLPRegressor
In [16]:
# Build the Multi-layer Perceptron Regressor pipeline: the DataFrameMapper
# standardizes 'bmi' and mean-imputes 'male_sex' and 'children' before the MLP.
imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
MLP_pipe_obj = Pipeline([
    ("mapping", DataFrameMapper([
        (['bmi'], StandardScaler()),
        (['male_sex', 'children'], imputer)
    ])),
    ("mlp", MLPRegressor(random_state=7, max_iter=200))
])
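Note that DataFrameMapper drops any column it does not map, so only bmi, male_sex, and children reach the regressor here. If the intent were to train on all of the encoded features, one option (a sketch, not part of the original flow) is to pass the unmapped columns through untransformed:

# A sketch: default=None makes sklearn_pandas pass unmapped columns
# (age, the region flags, yes_smoker) through to the regressor unchanged.
mapper_all = DataFrameMapper(
    [(['bmi'], StandardScaler()),
     (['male_sex', 'children'], imputer)],
    default=None
)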
In [17]:
MLP_pipe_obj.fit(train_pd[features], train_pd[target])
C:\Users\pg255042\Anaconda3\envs\teraml\lib\site-packages\sklearn\neural_network\_multilayer_perceptron.py:617: ConvergenceWarning: Stochastic Optimizer: Maximum iterations (200) reached and the optimization hasn't converged yet.
Out[17]:
Pipeline(steps=[('mapping', DataFrameMapper(drop_cols=[], features=[(['bmi'], StandardScaler()), (['male_sex', 'children'], SimpleImputer())])), ('mlp', MLPRegressor(random_state=7))])
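The ConvergenceWarning indicates the stochastic optimizer hit the 200-iteration cap before converging. A common remedy is to raise the cap and refit; a sketch, where the iteration count is an assumption to tune rather than a value from the source:

# Give the optimizer more iterations and refit the pipeline.
MLP_pipe_obj.set_params(mlp__max_iter=2000)  # hypothetical value; tune as needed
MLP_pipe_obj.fit(train_pd[features], train_pd[target])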
Save the model in PMML format.¶
In [18]:
temp_dir = tempfile.TemporaryDirectory()
model_file_path = f"{temp_dir.name}/insurance_db_MLP_model.pmml"
In [19]:
skl_to_pmml(MLP_pipe_obj, features, target, model_file_path)
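skl_to_pmml serializes the entire fitted pipeline, the preprocessing steps as well as the MLP weights, into a single PMML file, so in-database scoring applies the same transformations as training.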
Save the model in Vantage.¶
In [20]:
# Save the PMML Model in Vantage.
save_byom("pmml_MLP_insurance", model_file_path, "byom_models")
Created the model table 'byom_models' as it does not exist. Model is saved.
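Model ids must be unique within a model table, so saving a retrained model requires either a new id or deleting the old entry first. A sketch with a hypothetical id:

# Hypothetical id: save a retrained model alongside the original entry.
save_byom("pmml_MLP_insurance_v2", model_file_path, "byom_models")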
List the models from Vantage.¶
In [21]:
# List the PMML Models in Vantage.
list_byom("byom_models")
                                        model
model_id
pmml_MLP_insurance  b'3C3F786D6C20766572...'
Retrieve the model from Vantage.¶
In [22]:
# Retrieve the model from table "byom_models", using the model id 'pmml_MLP_insurance'.
modeldata = retrieve_byom("pmml_MLP_insurance", "byom_models")
In [23]:
configure.byom_install_location = getpass.getpass("byom_install_location: ")
byom_install_location: ········
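configure.byom_install_location names the database where the BYOM functions, including PMMLPredict, are installed; in the generated query shown below it resolves to "mldb".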
Score the model.¶
In [24]:
# Perform prediction using PMMLPredict() and the PMML model stored in Vantage.
result = PMMLPredict(
modeldata = modeldata,
newdata = insurance_test,
accumulate = ['male_sex', 'bmi', 'children'],
overwrite_cached_models = '*',
)
In [25]:
# Print the query.
print(result.show_query())
SELECT * FROM "mldb".PMMLPredict( ON "MLDB"."ml__select__164656615949361" AS InputTable PARTITION BY ANY ON (select model_id,model from "MLDB"."ml__filter__164661377060078") AS ModelTable DIMENSION USING Accumulate('male_sex','bmi','children') OverwriteCachedModel('*') ) as sqlmr
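The generated query shows how the scoring runs in-database: the input rows are spread across AMPs with PARTITION BY ANY, while the model table is supplied as a DIMENSION input so each AMP receives a full copy of the model.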
In [26]:
# Print the result.
result.result
Out[26]:
male_sex | bmi | children | prediction | json_report |
---|---|---|---|---|
1 | 25.3 | 2 | 866.2542248842337 | {"predicted_charges":866.2542248842337} |
1 | 33.915 | 0 | 643.1867081097179 | {"predicted_charges":643.1867081097179} |
1 | 33.535 | 0 | 629.8514774182736 | {"predicted_charges":629.8514774182736} |
1 | 20.425 | 0 | 169.78601856344287 | {"predicted_charges":169.78601856344287} |
0 | 32.11 | 0 | 319.8599458312356 | {"predicted_charges":319.8599458312356} |
0 | 28.69 | 3 | 987.9297200599833 | {"predicted_charges":987.9297200599833} |
1 | 26.315 | 1 | 639.1777110980797 | {"predicted_charges":639.1777110980797} |
1 | 29.355 | 1 | 745.8595566296347 | {"predicted_charges":745.8595566296347} |
0 | 28.4 | 1 | 452.3615998451724 | {"predicted_charges":452.3615998451724} |
1 | 43.4 | 0 | 976.0410847106386 | {"predicted_charges":976.0410847106386} |
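Because the prediction column is returned as a character type, comparing predictions with actuals takes a little client-side work. A sketch, assuming charges is accumulated so the ground truth travels alongside each prediction:

# Score again, accumulating the actual charges for comparison.
result_eval = PMMLPredict(
    modeldata=modeldata,
    newdata=insurance_test,
    accumulate=['charges'],
    overwrite_cached_models='*',
)
scored = result_eval.result.to_pandas()
scored['prediction'] = scored['prediction'].astype(float)
print("MAE:", (scored['charges'] - scored['prediction']).abs().mean())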
Cleanup.¶
In [27]:
# Delete the model from table "byom_models", using the model id 'pmml_MLP_insurance'.
delete_byom("pmml_MLP_insurance", "byom_models")
Model is deleted.
In [28]:
# Drop models table.
db_drop_table("byom_models")
Out[28]:
True
In [29]:
# Drop input data tables.
db_drop_table("insurance")
Out[29]:
True
In [30]:
# Run remove_context() to close the connection and garbage collect internally generated objects.
remove_context()
Out[30]:
True