ModelTable: boston_db_trans_dtsize_model.py
"""
boston_db_rf_trans_reg_model.py: Creates Random Forest Regression model uses Discretization transformation
********************
* Generated model file is in PMML format.
* To score this model , user needs insert/upload PMML model into Vantage table
********************
"""
import pyodbc
import pandas as pd
from sklearn2pmml.pipeline import PMMLPipeline
from sklearn2pmml import sklearn2pmml
from sklearn.ensemble import RandomForestRegressor
from sklearn_pandas import DataFrameMapper
from sklearn.preprocessing import StandardScaler
from sklearn2pmml.preprocessing import CutTransformer
from sklearn.impute import SimpleImputer
import os
import time
from teradataml import *
display.print_sqlmr_query = True
passwd = "alice"
uid = "alice"
con = create_context(host="server123@mydomain.com", username=uid, password=passwd)
con
train_df = DataFrame.from_query("select * from boston_train")
train_pd = train_df.to_pandas()
test_df = DataFrame.from_query("select * from boston_test")
test_pd = test_df.to_pandas()
train_pd = train_pd.apply(pd.to_numeric, errors='ignore')
test_pd = test_pd.apply(pd.to_numeric, errors='ignore')
train_pd
type(train_pd)
features = train_pd.columns.drop('medv')
target = 'medv'
ct = CutTransformer(bins=np.linspace(1,10,10), right=True, labels=False)
ct.transform(train_pd['rm'])
mapper = DataFrameMapper([
(['crim', 'zn', 'indus', 'chas', 'nox', 'age', 'dis', 'rad', 'tax', 'ptratio', 'black', 'lstat'], StandardScaler()),
(['rm'], [SimpleImputer(), ct])
])
pipeline = PMMLPipeline([
("mapping", mapper),
("rfc", RandomForestRegressor(n_estimators = 100, random_state = 0))
])
pipeline.fit(train_pd[features], train_pd[[target]].values.ravel())
sklearn2pmml(pipeline, "../sql/boston_db_trans_dstize_py_model.pmml", with_repr = True)
#predict using test data
#........................
from pypmml import Model
model = Model.load('../sql/boston_db_trans_dstize_py_model.pmml')
predictions = model.predict(test_pd[features])
#print(predictions)
with pd.option_context('display.max_rows', None, 'display.max_columns', None):
print(predictions)
Query that Calls PMMLPredict Function
SELECT * FROM mldb.PMMLPredict (
ON boston_test
ON (SELECT * FROM pmml_models
WHERE model_id='boston_db_trans_dstize_py_model') DIMENSION
USING
Accumulate ('id')
) AS td;
Query Output
*** Query completed. 101 rows found. 3 columns returned.
*** Total elapsed time was 1 second.
id prediction json_report
----------- ------------------------- ----------------------------------
40 22.371999999999993 {"predicted_y":22.371999999999993}
265 36.622999999999976 {"predicted_y":36.622999999999976}
80 21.555 {"predicted_y":21.555}
305 34.488000000000014 {"predicted_y":34.488000000000014}
345 29.391 {"predicted_y":29.391}
120 20.758000000000003 {"predicted_y":20.758000000000003}
385 8.737999999999998 {"predicted_y":8.737999999999998}
160 28.87800000000002 {"predicted_y":28.87800000000002}
200 28.534999999999986 {"predicted_y":28.534999999999986}
425 14.989999999999998 {"predicted_y":14.989999999999998}
240 23.351000000000013 {"predicted_y":23.351000000000013}
...