2.0 - Python Example - Teradata Vantage

Teradata Vantage™ - Bring Your Own Model User Guide

Product
Teradata Vantage
Release Number
2.0
Release Date
October 2021
Content Type
User Guide
Publication ID
B700-1111-051K
Language
English (United States)

ModelTable: boston_db_trans_dstize_model.py

"""                                                                                                                   
boston_db_rf_trans_reg_model.py: Creates Random Forest Regression model uses Discretization transformation
********************
*  Generated model file is in PMML format.
*  To score this model , user needs insert/upload PMML model into Vantage table
********************
"""
import pyodbc
import pandas as pd
from sklearn2pmml.pipeline import PMMLPipeline
from sklearn2pmml import sklearn2pmml
from sklearn.ensemble import RandomForestRegressor
from sklearn_pandas import DataFrameMapper
from sklearn.preprocessing import StandardScaler
from sklearn2pmml.preprocessing import CutTransformer
from sklearn.impute import SimpleImputer
import os
import time
from teradataml import *
 
# Echo the SQL generated by teradataml analytic functions (useful for the guide).
display.print_sqlmr_query = True
# NOTE(review): credentials are hardcoded for this example only — never do this
# in production; read them from a secure store or environment instead.
passwd = "alice"
uid = "alice"
 
# Open a connection context to the Vantage system; `con` is echoed for
# interactive (notebook-style) confirmation.
con = create_context(host="server123@mydomain.com", username=uid, password=passwd)
con
 
# Pull the train/test tables from Vantage into local pandas DataFrames.
train_df = DataFrame.from_query("select * from boston_train")
train_pd = train_df.to_pandas()
test_df = DataFrame.from_query("select * from boston_test")
test_pd = test_df.to_pandas()
# Coerce columns to numeric where possible; non-convertible columns are left
# unchanged. NOTE(review): errors='ignore' is deprecated in pandas >= 2.2 —
# confirm the pandas version this example targets.
train_pd = train_pd.apply(pd.to_numeric, errors='ignore')
test_pd = test_pd.apply(pd.to_numeric, errors='ignore')
# Interactive echoes of the training frame and its type (notebook-style).
train_pd
type(train_pd)
# All columns except the target ('medv', median home value) are features.
features = train_pd.columns.drop('medv')
target = 'medv'
 
# Discretize the 'rm' (rooms) column into integer-coded bins over [1, 10].
bin_edges = np.linspace(1, 10, 10)
rm_discretizer = CutTransformer(bins=bin_edges, right=True, labels=False)
 
# Interactive sanity check of the discretization on the training column
# (result is displayed, not stored — notebook-style echo).
rm_discretizer.transform(train_pd['rm'])
 
# Standard-scale every numeric feature except 'rm'; impute then discretize 'rm'.
scaled_columns = ['crim', 'zn', 'indus', 'chas', 'nox', 'age', 'dis',
                  'rad', 'tax', 'ptratio', 'black', 'lstat']
column_mapper = DataFrameMapper([
    (scaled_columns, StandardScaler()),
    (['rm'], [SimpleImputer(), rm_discretizer]),
])
 
# Preprocessing + Random Forest regressor, wrapped for PMML export.
pipeline = PMMLPipeline([
    ("mapping", column_mapper),
    ("rfc", RandomForestRegressor(n_estimators=100, random_state=0)),
])
 
# Fit on the training features/target, then serialize the fitted pipeline
# to a PMML file that can be uploaded into a Vantage model table.
pipeline.fit(train_pd[features], train_pd[[target]].values.ravel())
sklearn2pmml(pipeline, "../sql/boston_db_trans_dstize_py_model.pmml", with_repr=True)
# Score the held-out test data with the exported PMML model to sanity-check
# it locally before uploading it to Vantage for in-database scoring.
from pypmml import Model

scoring_model = Model.load('../sql/boston_db_trans_dstize_py_model.pmml')
scored = scoring_model.predict(test_pd[features])

# Print every row and column of the prediction frame
# (pandas truncates wide/long output by default).
with pd.option_context('display.max_rows', None, 'display.max_columns', None):
    print(scored)

Query that Calls PMMLPredict Function

-- Score boston_test in-database with the uploaded PMML model.
-- The model row is selected by model_id from the pmml_models table and
-- broadcast to all AMPs via the DIMENSION clause; 'id' is carried through
-- into the output alongside the prediction and JSON report.
SELECT * FROM mldb.PMMLPredict (
  ON boston_test
  ON (SELECT * FROM pmml_models
      WHERE model_id='boston_db_trans_dstize_py_model') DIMENSION
  USING
    Accumulate ('id')
) AS td;

Query Output

 *** Query completed. 101 rows found. 3 columns returned.
 *** Total elapsed time was 1 second.
 
         id prediction                json_report
----------- ------------------------- ----------------------------------
         40 22.371999999999993        {"predicted_y":22.371999999999993}
        265 36.622999999999976        {"predicted_y":36.622999999999976}
         80 21.555                    {"predicted_y":21.555}
        305 34.488000000000014        {"predicted_y":34.488000000000014}
        345 29.391                    {"predicted_y":29.391}
        120 20.758000000000003        {"predicted_y":20.758000000000003}
        385 8.737999999999998         {"predicted_y":8.737999999999998}
        160 28.87800000000002         {"predicted_y":28.87800000000002}
        200 28.534999999999986        {"predicted_y":28.534999999999986}
        425 14.989999999999998        {"predicted_y":14.989999999999998}
        240 23.351000000000013        {"predicted_y":23.351000000000013}
        ...