PMMLPredict using a Linear Regression model.¶
Setup¶
In [1]:
# Import required libraries
import tempfile
import getpass
from teradataml import PMMLPredict, DataFrame, load_example_data, create_context, \
db_drop_table, remove_context, save_byom, retrieve_byom, delete_byom, list_byom
from teradataml.options.configure import configure
In [2]:
# Create the connection.
# Credentials are read interactively via getpass so nothing sensitive is
# hardcoded in the notebook; create_context() opens the Vantage session
# used by all subsequent teradataml calls.
host = getpass.getpass("Host: ")
username = getpass.getpass("Username: ")
password = getpass.getpass("Password: ")
con = create_context(host=host, username=username, password=password)
Host: ········ Username: ········ Password: ········
Load example data¶
In [3]:
# Load the example data.
# Populates the "insurance" table in Vantage from the bundled teradataml
# example datasets (the table is dropped again in the Cleanup section).
load_example_data("dataframe", "insurance")
In [4]:
# Create teradataml DataFrames.
# DataFrame("insurance") is a reference to the table in Vantage, not a
# client-side copy of the data.
insurance_input = DataFrame("insurance")
In [5]:
# Display a sample of the input data.
insurance_input
Out[5]:
age | sex | bmi | children | smoker | region | charges |
---|---|---|---|---|---|---|
19 | female | 28.6 | 5 | no | southwest | 4687.797 |
40 | male | 26.315 | 1 | no | northwest | 6389.37785 |
40 | female | 36.19 | 0 | no | southeast | 5920.1041 |
34 | female | 31.92 | 1 | yes | northeast | 37701.8768 |
34 | female | 27.5 | 1 | no | southwest | 5003.853 |
61 | female | 39.1 | 2 | no | southwest | 14235.072 |
61 | female | 29.92 | 3 | yes | southeast | 30942.1918 |
61 | female | 22.04 | 0 | no | northeast | 13616.3586 |
34 | female | 37.335 | 2 | no | northwest | 5989.52365 |
40 | female | 28.69 | 3 | no | northwest | 8059.6791 |
One Hot Encoder¶
In [6]:
# Import required libraries for the VAL (Vantage Analytic Library) transforms.
from teradataml import OneHotEncoder, valib, Retain

# Point teradataml at the database where VAL is installed; this setting is
# used by valib.Transform() below. The prompt previously said
# "byom_install_location", which is a different setting configured
# separately just before scoring — prompt for the correct one here.
configure.val_install_location = getpass.getpass("val_install_location: ")
byom_install_location: ········
In [7]:
# Perform One Hot Encoding on the categorical features.
# "region" expands into one indicator column per listed value
# (southwest_region, ..., northeast_region). "smoker" and "sex" use
# contrast coding, each producing a single 0/1 column ("yes_smoker",
# "male_sex") with 0 as the reference value for the other category.
dc1 = OneHotEncoder(values=["southwest", "southeast", "northwest", "northeast" ], columns="region")
dc2 = OneHotEncoder(style="contrast", values="yes", reference_value=0, columns="smoker")
dc3 = OneHotEncoder(style="contrast", values="male", reference_value=0, columns="sex")
In [8]:
# Pass these columns through to the transformed output unchanged.
retain = Retain(columns=["charges","bmi","children"])
In [9]:
# Execute Transform() function.
# Applies the one-hot-encode and retain specs in-database.
# NOTE(review): key_columns="ID" assumes the table has an "ID" key column,
# which is not visible in the displayed sample — verify against the schema.
# NOTE(review): insurance_input is rebound to the transformed result, so
# the raw frame is no longer reachable under this name after this cell.
obj = valib.Transform(data=insurance_input, one_hot_encode=[dc1,dc2,dc3], key_columns="ID", retain=retain)
insurance_input = obj.result
insurance_input
Out[9]:
age | charges | bmi | children | southwest_region | southeast_region | northwest_region | northeast_region | yes_smoker | male_sex |
---|---|---|---|---|---|---|---|---|---|
19 | 4687.797 | 28.6 | 5 | 1 | 0 | 0 | 0 | 0 | 0 |
19 | 1743.214 | 28.9 | 0 | 1 | 0 | 0 | 0 | 0 | 0 |
19 | 2331.519 | 28.4 | 1 | 1 | 0 | 0 | 0 | 0 | 0 |
19 | 17081.08 | 28.3 | 0 | 1 | 0 | 0 | 0 | 1 | 0 |
19 | 1261.442 | 34.1 | 0 | 1 | 0 | 0 | 0 | 0 | 1 |
19 | 1842.519 | 28.4 | 1 | 1 | 0 | 0 | 0 | 0 | 1 |
19 | 1632.56445 | 25.555 | 0 | 0 | 0 | 1 | 0 | 0 | 1 |
19 | 1625.43375 | 20.425 | 0 | 0 | 0 | 1 | 0 | 0 | 1 |
19 | 1837.237 | 24.6 | 1 | 1 | 0 | 0 | 0 | 0 | 1 |
19 | 16884.924 | 27.9 | 0 | 1 | 0 | 0 | 0 | 1 | 0 |
Use DataFrame.sample() to split the input data into training and testing datasets.¶
In [10]:
# Create 2 samples of input data - sample 1 will have 80% of total rows and sample 2 will have 20% of total rows.
# sample() adds a "sampleid" column (1 or 2) identifying which sample each
# row belongs to; it is used below to split into train and test sets.
insurance_sample = insurance_input.sample(frac=[0.8, 0.2])
insurance_sample
Out[10]:
age | charges | bmi | children | southwest_region | southeast_region | northwest_region | northeast_region | yes_smoker | male_sex | sampleid |
---|---|---|---|---|---|---|---|---|---|---|
34 | 5003.853 | 27.5 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 2 |
61 | 30942.1918 | 29.92 | 3 | 0 | 1 | 0 | 0 | 1 | 0 | 1 |
61 | 13616.3586 | 22.04 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1 |
19 | 16884.924 | 27.9 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 1 |
19 | 4687.797 | 28.6 | 5 | 1 | 0 | 0 | 0 | 0 | 0 | 2 |
40 | 8059.6791 | 28.69 | 3 | 0 | 0 | 1 | 0 | 0 | 0 | 2 |
40 | 6389.37785 | 26.315 | 1 | 0 | 0 | 1 | 0 | 0 | 1 | 1 |
40 | 5920.1041 | 36.19 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 2 |
19 | 1837.237 | 24.6 | 1 | 1 | 0 | 0 | 0 | 0 | 1 | 1 |
61 | 14235.072 | 39.1 | 2 | 1 | 0 | 0 | 0 | 0 | 0 | 1 |
In [11]:
# Create train dataset from sample 1 by filtering on "sampleid" and drop "sampleid" column as it is not required for training model.
# NOTE(review): sampleid is compared against the string "1"; this appears
# to work with teradataml's column comparison — confirm the column's type.
insurance_train = insurance_sample[insurance_sample.sampleid == "1"].drop("sampleid", axis = 1)
insurance_train
Out[11]:
age | charges | bmi | children | southwest_region | southeast_region | northwest_region | northeast_region | yes_smoker | male_sex |
---|---|---|---|---|---|---|---|---|---|
34 | 27375.90478 | 22.42 | 2 | 0 | 0 | 0 | 1 | 0 | 1 |
61 | 30942.1918 | 29.92 | 3 | 0 | 1 | 0 | 0 | 1 | 0 |
61 | 13616.3586 | 22.04 | 0 | 0 | 0 | 0 | 1 | 0 | 0 |
19 | 16884.924 | 27.9 | 0 | 1 | 0 | 0 | 0 | 1 | 0 |
19 | 4687.797 | 28.6 | 5 | 1 | 0 | 0 | 0 | 0 | 0 |
40 | 8059.6791 | 28.69 | 3 | 0 | 0 | 1 | 0 | 0 | 0 |
40 | 6389.37785 | 26.315 | 1 | 0 | 0 | 1 | 0 | 0 | 1 |
40 | 5920.1041 | 36.19 | 0 | 0 | 1 | 0 | 0 | 0 | 0 |
19 | 1837.237 | 24.6 | 1 | 1 | 0 | 0 | 0 | 0 | 1 |
61 | 14235.072 | 39.1 | 2 | 1 | 0 | 0 | 0 | 0 | 0 |
In [12]:
# Create test dataset from sample 2 by filtering on "sampleid" and drop "sampleid" column as it is not required for scoring.
# This is the 20% hold-out used for PMMLPredict scoring below.
insurance_test = insurance_sample[insurance_sample.sampleid == "2"].drop("sampleid", axis = 1)
insurance_test
Out[12]:
age | charges | bmi | children | southwest_region | southeast_region | northwest_region | northeast_region | yes_smoker | male_sex |
---|---|---|---|---|---|---|---|---|---|
34 | 3935.1799 | 34.21 | 0 | 0 | 1 | 0 | 0 | 0 | 1 |
61 | 48517.56315 | 36.385 | 1 | 0 | 0 | 0 | 1 | 1 | 0 |
61 | 24513.09126 | 25.08 | 0 | 0 | 1 | 0 | 0 | 0 | 0 |
19 | 1837.237 | 24.6 | 1 | 1 | 0 | 0 | 0 | 0 | 1 |
19 | 1261.442 | 34.1 | 0 | 1 | 0 | 0 | 0 | 0 | 1 |
40 | 7077.1894 | 25.46 | 1 | 0 | 0 | 0 | 1 | 0 | 0 |
40 | 6600.20595 | 34.105 | 1 | 0 | 0 | 0 | 1 | 0 | 1 |
40 | 7173.35995 | 22.705 | 2 | 0 | 0 | 0 | 1 | 0 | 1 |
19 | 2331.519 | 28.4 | 1 | 1 | 0 | 0 | 0 | 0 | 0 |
61 | 14235.072 | 39.1 | 2 | 1 | 0 | 0 | 0 | 0 | 0 |
Prepare dataset for creating a Linear Regression model.¶
In [13]:
# Import required libraries.
import numpy as np
from nyoka import skl_to_pmml
from sklearn.pipeline import Pipeline
from sklearn_pandas import DataFrameMapper
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
In [14]:
# Convert teradataml dataframe to pandas dataframe.
# features : names of the training feature columns.
# target : name of the training target column.
# NOTE(review): "traid_pd" is a typo for "train_pd"; it is kept as-is
# because the model-fitting cell below references this exact name.
traid_pd = insurance_train.to_pandas()
features = traid_pd.columns.drop('charges')  # every column except the target
target = 'charges'
Train Linear Regression Model.¶
In [15]:
# Import required libraries.
from sklearn import linear_model
In [16]:
# Build the linear-regression modelling pipeline.
# Step "mapping": standard-scale bmi, and mean-impute male_sex / children.
# Step "LRM": ordinary least-squares linear regression.
mean_imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
feature_mapper = DataFrameMapper([
    (['bmi'], StandardScaler()),
    (['male_sex', 'children'], mean_imputer),
])
LRM_pipe_obj = Pipeline(steps=[
    ("mapping", feature_mapper),
    ("LRM", linear_model.LinearRegression()),
])
In [17]:
# Fit the pipeline on the training features and target (the "traid_pd"
# frame was produced by the conversion cell above).
LRM_pipe_obj.fit(traid_pd[features], traid_pd[target])
Out[17]:
Pipeline(steps=[('mapping', DataFrameMapper(drop_cols=[], features=[(['bmi'], StandardScaler()), (['male_sex', 'children'], SimpleImputer())])), ('LRM', LinearRegression())])
Save the model in PMML format.¶
In [18]:
# Create a temporary directory to hold the exported PMML file.
# NOTE(review): temp_dir.cleanup() is never called; per the tempfile docs
# the directory is removed on garbage collection / interpreter exit.
temp_dir = tempfile.TemporaryDirectory()
model_file_path = f"{temp_dir.name}/insurance_db_LRM_model.pmml"
In [19]:
# Export the fitted sklearn pipeline to a PMML file via nyoka.
skl_to_pmml(LRM_pipe_obj, features, target, model_file_path)
Save the model in Vantage.¶
In [20]:
# Save the PMML Model in Vantage.
# Stores the model file under model id 'pmml_LRM_insurance' in the
# "byom_models" table (created automatically if it does not exist).
save_byom("pmml_LRM_insurance", model_file_path, "byom_models")
Created the model table 'byom_models' as it does not exist. Model is saved.
List the models from Vantage.¶
In [21]:
# List the PMML Models in Vantage.
# Shows each model id with its stored (binary PMML) content.
list_byom("byom_models")
model model_id pmml_LRM_insurance b'3C3F786D6C20766572...'
Retrieve the model from Vantage.¶
In [22]:
# Retrieve the model from table "byom_models", using the model id 'pmml_LRM_insurance'.
# The returned teradataml DataFrame is passed to PMMLPredict() below.
modeldata = retrieve_byom("pmml_LRM_insurance", "byom_models")
In [23]:
# Set the database where the BYOM functions (e.g. PMMLPredict) are installed.
configure.byom_install_location = getpass.getpass("byom_install_location: ")
byom_install_location: ········
Score the model.¶
In [24]:
# Perform prediction using PMMLPredict() and the PMML model stored in Vantage.
# accumulate: input columns copied through to the scored output.
# overwrite_cached_models='*': presumably invalidates all in-database
# cached models so the freshly saved model is loaded — confirm in BYOM docs.
result = PMMLPredict(
modeldata = modeldata,
newdata = insurance_test,
accumulate = ['male_sex', 'bmi', 'children'],
overwrite_cached_models = '*',
)
In [25]:
# Print the query.
# Shows the SQL that teradataml generated for the PMMLPredict call
# (useful for debugging / running the scoring manually).
print(result.show_query())
SELECT * FROM "mldb".PMMLPredict( ON "MLDB"."ml__select__1646289308222356" AS InputTable PARTITION BY ANY ON (select model_id,model from "MLDB"."ml__filter__1646284961549474") AS ModelTable DIMENSION USING Accumulate('male_sex','bmi','children') OverwriteCachedModel('*') ) as sqlmr
In [26]:
# Print the result.
# Output contains the accumulated input columns, the model's prediction,
# and a per-row JSON report column.
result.result
Out[26]:
male_sex | bmi | children | prediction | json_report |
---|---|---|---|---|
0 | 21.09 | 0 | 8496.226240089216 | {"predicted_charges":8496.226240089216} |
0 | 25.46 | 1 | 10814.194870937952 | {"predicted_charges":10814.194870937952} |
0 | 28.12 | 1 | 11905.440626739286 | {"predicted_charges":11905.440626739286} |
1 | 20.425 | 0 | 9079.054672356815 | {"predicted_charges":9079.054672356815} |
0 | 17.8 | 0 | 7146.527542124409 | {"predicted_charges":7146.527542124409} |
0 | 33.25 | 1 | 14009.986012927573 | {"predicted_charges":14009.986012927573} |
0 | 23.56 | 0 | 9509.525870476169 | {"predicted_charges":9509.525870476169} |
1 | 42.13 | 2 | 19033.78938667244 | {"predicted_charges":19033.78938667244} |
1 | 28.4 | 1 | 12875.948472252094 | {"predicted_charges":12875.948472252094} |
1 | 26.315 | 1 | 12020.592306520599 | {"predicted_charges":12020.592306520599} |
Cleanup.¶
In [27]:
# Delete the model from table "byom_models", using the model id 'pmml_LRM_insurance'.
delete_byom("pmml_LRM_insurance", "byom_models")
Model is deleted.
In [28]:
# Drop the models table created by save_byom() above.
db_drop_table("byom_models")
Out[28]:
True
In [29]:
# Drop the input data table loaded by load_example_data() above.
db_drop_table("insurance")
Out[29]:
True
In [30]:
# One must run remove_context() to close the connection and garbage collect internally generated objects.
remove_context()
Out[30]:
True