ONNXPredict using Linear Regression model.¶
Setup¶
In [ ]:
# Import required libraries.
import tempfile
import getpass
from teradataml import DataFrame, load_example_data, create_context, \
db_drop_table, remove_context, save_byom, retrieve_byom, delete_byom, list_byom
from teradataml.options.configure import configure
In [ ]:
# Create the connection.
host = getpass.getpass("Host: ")
username = getpass.getpass("Username: ")
password = getpass.getpass("Password: ")
con = create_context(host=host, username=username, password=password)
Load example data¶
In [ ]:
# Load the example data.
load_example_data("dataframe", "insurance")
In [ ]:
# Create teradataml DataFrames.
insurance_input = DataFrame("insurance")
insurance_input
Out[ ]:
age | sex | bmi | children | smoker | region | charges |
---|---|---|---|---|---|---|
19 | female | 28.6 | 5 | no | southwest | 4687.797 |
40 | male | 26.315 | 1 | no | northwest | 6389.37785 |
40 | female | 36.19 | 0 | no | southeast | 5920.1041 |
34 | female | 31.92 | 1 | yes | northeast | 37701.8768 |
34 | female | 27.5 | 1 | no | southwest | 5003.853 |
61 | female | 39.1 | 2 | no | southwest | 14235.072 |
61 | female | 29.92 | 3 | yes | southeast | 30942.1918 |
61 | female | 22.04 | 0 | no | northeast | 13616.3586 |
34 | female | 37.335 | 2 | no | northwest | 5989.52365 |
40 | female | 28.69 | 3 | no | northwest | 8059.6791 |
One Hot Encoder¶
In [ ]:
# Import required library and configure val install location.
from teradataml import OneHotEncoder, valib, Retain, FillNa
configure.val_install_location = getpass.getpass("val_install_location: ")
In [ ]:
# Perform One Hot Encoding on the categorical features.
dc1 = OneHotEncoder(values=["southwest", "southeast", "northwest", "northeast"], columns="region")
dc2 = OneHotEncoder(style="contrast", values="yes", reference_value=0, columns="smoker")
dc3 = OneHotEncoder(style="contrast", values="male", reference_value=0, columns="sex")
In [ ]:
# Perform FillNa on the children column.
fn1 = FillNa(style="median_wo_mean", columns="children", out_columns="mod_children")
In [ ]:
# Retaining unaltered columns.
retain = Retain(columns=["charges", "bmi"])
In [ ]:
# Execute Transform() function.
obj = valib.Transform(data=insurance_input, one_hot_encode=[dc1,dc2,dc3], fillna=fn1, key_columns="ID", retain=retain)
insurance_input = obj.result
insurance_input
Out[ ]:
age | charges | bmi | southwest_region | southeast_region | northwest_region | northeast_region | yes_smoker | male_sex | mod_children |
---|---|---|---|---|---|---|---|---|---|
61 | 12950.0712 | 38.38 | 0 | 0 | 1 | 0 | 0 | 1 | 0 |
61 | 13041.921 | 28.2 | 1 | 0 | 0 | 0 | 0 | 0 | 0 |
61 | 30942.1918 | 29.92 | 0 | 1 | 0 | 0 | 1 | 0 | 3 |
61 | 24513.09126 | 25.08 | 0 | 1 | 0 | 0 | 0 | 0 | 0 |
61 | 13143.86485 | 33.915 | 0 | 0 | 0 | 1 | 0 | 1 | 0 |
61 | 27941.28758 | 36.1 | 1 | 0 | 0 | 0 | 0 | 1 | 3 |
61 | 46599.1084 | 35.86 | 0 | 1 | 0 | 0 | 1 | 1 | 0 |
61 | 13415.0381 | 21.09 | 0 | 0 | 1 | 0 | 0 | 0 | 0 |
61 | 14119.62 | 32.3 | 0 | 0 | 1 | 0 | 0 | 1 | 2 |
61 | 13635.6379 | 35.91 | 0 | 0 | 0 | 1 | 0 | 0 | 0 |
use sample() for splitting input data into testing and training dataset.¶
In [ ]:
# Create 2 samples of input data - sample 1 will have 80% of total rows and sample 2 will have 20% of total rows.
insurance_sample = insurance_input.sample(frac=[0.8, 0.2])
insurance_sample
Out[ ]:
age | charges | bmi | southwest_region | southeast_region | northwest_region | northeast_region | yes_smoker | male_sex | mod_children | sampleid |
---|---|---|---|---|---|---|---|---|---|---|
61 | 12950.0712 | 38.38 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 1 |
34 | 5385.3379 | 26.41 | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 1 |
34 | 3935.1799 | 34.21 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 1 |
40 | 7196.867 | 35.3 | 1 | 0 | 0 | 0 | 0 | 1 | 3 | 1 |
40 | 22331.5668 | 28.12 | 0 | 0 | 0 | 1 | 1 | 0 | 1 | 2 |
19 | 13844.506 | 21.7 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 1 |
19 | 1639.5631 | 30.59 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 1 |
19 | 1743.214 | 28.9 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
40 | 17179.522 | 19.8 | 0 | 1 | 0 | 0 | 1 | 1 | 1 | 1 |
34 | 14358.36437 | 32.8 | 1 | 0 | 0 | 0 | 0 | 1 | 1 | 1 |
In [ ]:
# Create train dataset from sample 1 by filtering on "sampleid" and drop "sampleid" column as it is not required for training model.
insurance_train = insurance_sample[insurance_sample.sampleid == "1"].drop("sampleid", axis = 1)
insurance_train
Out[ ]:
age | charges | bmi | southwest_region | southeast_region | northwest_region | northeast_region | yes_smoker | male_sex | mod_children |
---|---|---|---|---|---|---|---|---|---|
61 | 13415.0381 | 21.09 | 0 | 0 | 1 | 0 | 0 | 0 | 0 |
34 | 5385.3379 | 26.41 | 0 | 0 | 1 | 0 | 0 | 0 | 1 |
34 | 3935.1799 | 34.21 | 0 | 1 | 0 | 0 | 0 | 1 | 0 |
40 | 7196.867 | 35.3 | 1 | 0 | 0 | 0 | 0 | 1 | 3 |
40 | 22331.5668 | 28.12 | 0 | 0 | 0 | 1 | 1 | 0 | 1 |
19 | 13844.506 | 21.7 | 1 | 0 | 0 | 0 | 1 | 0 | 0 |
19 | 2130.6759 | 32.11 | 0 | 0 | 1 | 0 | 0 | 0 | 0 |
19 | 16884.924 | 27.9 | 1 | 0 | 0 | 0 | 1 | 0 | 0 |
40 | 17179.522 | 19.8 | 0 | 1 | 0 | 0 | 1 | 1 | 1 |
34 | 14358.36437 | 32.8 | 1 | 0 | 0 | 0 | 0 | 1 | 1 |
In [ ]:
# Create test dataset from sample 2 by filtering on "sampleid" and drop "sampleid" column as it is not required for scoring.
insurance_test = insurance_sample[insurance_sample.sampleid == "2"].drop("sampleid", axis = 1)
insurance_test
Out[ ]:
age | charges | bmi | southwest_region | southeast_region | northwest_region | northeast_region | yes_smoker | male_sex | mod_children |
---|---|---|---|---|---|---|---|---|---|
38 | 6373.55735 | 40.565 | 0 | 0 | 1 | 0 | 0 | 0 | 1 |
32 | 4673.3922 | 37.18 | 0 | 1 | 0 | 0 | 0 | 1 | 2 |
32 | 19719.6947 | 28.93 | 0 | 1 | 0 | 0 | 1 | 1 | 1 |
40 | 6600.361 | 29.9 | 1 | 0 | 0 | 0 | 0 | 1 | 2 |
40 | 6600.20595 | 34.105 | 0 | 0 | 0 | 1 | 0 | 1 | 1 |
19 | 1261.442 | 34.1 | 1 | 0 | 0 | 0 | 0 | 1 | 0 |
19 | 1263.249 | 35.4 | 1 | 0 | 0 | 0 | 0 | 1 | 0 |
19 | 36219.40545 | 36.955 | 0 | 0 | 1 | 0 | 1 | 1 | 0 |
40 | 7173.35995 | 22.705 | 0 | 0 | 0 | 1 | 0 | 1 | 2 |
34 | 43943.8761 | 30.21 | 0 | 0 | 1 | 0 | 1 | 0 | 1 |
Prepare dataset for creating a linear regression model.¶
In [ ]:
# Convert teradataml dataframe to pandas dataframe.
# features : Training data.
# target : Training targets.
train_pd = insurance_train.to_pandas()
features = train_pd.columns.drop('charges')
target = 'charges'
Train Model.¶
In [ ]:
# Import required libraries.
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import MinMaxScaler
In [ ]:
# Defining column tranformation pipeline.
col_trans_obj = ColumnTransformer([
("num_preprocess", MinMaxScaler(), ["bmi"])
])
In [ ]:
# Generate the Linear Regression model.
lin_reg_pipe_obj = Pipeline(steps=[
('col_trans', col_trans_obj),
("LRM", LinearRegression())
])
In [ ]:
lin_reg_pipe_obj.fit(train_pd[features], train_pd[target])
Out[ ]:
Pipeline(steps=[('col_trans', ColumnTransformer(transformers=[('num_preprocess', MinMaxScaler(), ['bmi'])])), ('LRM', LinearRegression())])In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
Pipeline(steps=[('col_trans', ColumnTransformer(transformers=[('num_preprocess', MinMaxScaler(), ['bmi'])])), ('LRM', LinearRegression())])
ColumnTransformer(transformers=[('num_preprocess', MinMaxScaler(), ['bmi'])])
['bmi']
MinMaxScaler()
LinearRegression()
Save the model in ONNX format.¶
In [ ]:
# Import required libraries.
from skl2onnx import to_onnx
In [ ]:
# Create temporary filepath to save model.
temp_dir = tempfile.TemporaryDirectory()
model_file_path = f"{temp_dir.name}/insurance_db_lin_reg_model.onnx"
In [ ]:
onx = to_onnx(lin_reg_pipe_obj, train_pd.iloc[:, :10].astype(np.float32))
In [ ]:
with open(model_file_path, "wb") as f:
f.write(onx.SerializeToString())
Save the model in Vantage.¶
In [ ]:
# Save the ONNX model in Vantage.
save_byom("onnx_lin_reg_insurance", model_file_path, "byom_models")
Created the model table 'byom_models' as it does not exist. Model is saved.
In [ ]:
# List the ONNX models in Vantage.
list_byom("byom_models")
model model_id onnx_lin_reg_insurance b'8081208736B6C326F...'
Retrieve the model from Vantage.¶
In [ ]:
# Retrieve the model from table "byom_models", using the model id 'onnx_lin_reg_insurance'.
modeldata = retrieve_byom("onnx_lin_reg_insurance", "byom_models")
In [ ]:
configure.byom_install_location = getpass.getpass("byom_install_location: ")
Score the model.¶
In [ ]:
# Import required libraries
from teradataml import ONNXPredict
In [ ]:
# Perform prediction using ONNXPredict() and the ONNX model stored in Vantage.
predict_output = ONNXPredict(
modeldata = modeldata,
newdata = insurance_test,
accumulate = ['male_sex', 'bmi', 'mod_children'],
overwrite_cached_models = '*',
show_model_input_fields_map=True,
model_output_fields="variable"
)
In [ ]:
# Print the query.
print(predict_output.show_query())
SELECT * FROM "mldb".ONNXPredict( ON "VAL"."ml__select__1666875162231665" AS InputTable PARTITION BY ANY ON (select model_id,model from "VAL"."ml__filter__1666873236968949") AS ModelTable DIMENSION USING Accumulate('male_sex','bmi','mod_children') ModelOutputFields('variable') OverwriteCachedModel('*') ShowModelInputFieldsMap('True') ) as sqlmr
In [ ]:
# Print the result.
predict_output.result
Out[ ]:
ModelInputFieldsMap | male_sex | bmi | mod_children | variable |
---|---|---|---|---|
ModelInputFieldsMap('male_sex=male_sex','mod_children=mod_children','charges=charges','southeast_region=southeast_region','yes_smoker=yes_smoker','northwest_region=northwest_region','northeast_region=northeast_region','age=age','bmi=bmi','southwest_region=southwest_region') | None | None | None | None |
ModelInputFieldsMap('male_sex=male_sex','mod_children=mod_children','charges=charges','southeast_region=southeast_region','yes_smoker=yes_smoker','northwest_region=northwest_region','northeast_region=northeast_region','age=age','bmi=bmi','southwest_region=southwest_region') | None | None | None | None |
ModelInputFieldsMap('male_sex=male_sex','mod_children=mod_children','charges=charges','southeast_region=southeast_region','yes_smoker=yes_smoker','northwest_region=northwest_region','northeast_region=northeast_region','age=age','bmi=bmi','southwest_region=southwest_region') | None | None | None | None |
ModelInputFieldsMap('male_sex=male_sex','mod_children=mod_children','charges=charges','southeast_region=southeast_region','yes_smoker=yes_smoker','northwest_region=northwest_region','northeast_region=northeast_region','age=age','bmi=bmi','southwest_region=southwest_region') | None | None | None | None |
Cleanup.¶
In [ ]:
# Drop input data tables.
# Delete the model from table "byom_models", using the model id 'onnx_lin_reg_insurance'.
delete_byom("onnx_lin_reg_insurance", "byom_models")
Model is deleted.
In [ ]:
db_drop_table("byom_models")
Out[ ]:
True
In [ ]:
db_drop_table("insurance")
Out[ ]:
True
In [ ]:
remove_context()
Out[ ]:
True