2.0 - Python Example - Teradata Vantage

Teradata Vantageā„¢ - Bring Your Own Model User Guide

Product
Teradata Vantage
Release Number
2.0
Release Date
October 2021
Content Type
User Guide
Publication ID
B700-1111-051K
Language
English (United States)

ModelTable: adult_db_glm_b_model.py

import os
import time
from teradataml import *
from sklearn_pandas import DataFrameMapper
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.impute import SimpleImputer
display.print_sqlmr_query = True
passwd = "alice"
uid = "alice"
host = "server123@mydomain.com"
from sklearn.linear_model import LogisticRegression
con = create_context(host=host, username=uid, password=passwd)
con
train_df = DataFrame.from_query("select * from adultTrain")
train_pd = train_df.to_pandas()
X = train_pd[['age','workclass','marital_status','relationship','race','occupation','native_country']]
 
target='income'
column_trans = [
    (['age'], StandardScaler()),
    (['workclass'], LabelEncoder()),
    (['marital_status'], LabelEncoder()),
    (['relationship'], LabelEncoder()),
    (['race'], LabelEncoder()),
    (['occupation'], LabelEncoder()),
    (['native_country'], LabelEncoder())
    ]
 
mapper = DataFrameMapper(column_trans)
glm = LogisticRegression(solver='liblinear', multi_class='ovr',random_state=0, max_iter=1000)
glm_pipeline = PMMLPipeline([("mapping", mapper),('impute', SimpleImputer(strategy='most_frequent')),("clf",glm)])
glm_pipeline.fit(X,train_pd[target].ravel())
sklearn2pmml(glm_pipeline,"../sql/adult_db_glm_b_model.pmml",with_repr=True)

Query that Calls PMMLPredict Function

CREATE MULTISET TABLE glm_adult_binary AS (
  SELECT * FROM mldb.PMMLPredict (
    ON adultTest
    ON (SELECT * FROM pmml_models WHERE model_id='adult_glm_b_model') DIMENSION
    USING
    Accumulate ('ID')
  ) AS TD
) WITH DATA;

Query Output

SELECT * FROM glm_adult_binary WHERE ID < 10 order by ID;
 *** Query completed. 9 rows found. 3 columns returned.
 *** Total elapsed time was 1 second.
     
         ID           1
 prediction
json_report {"probability(>50K)":0.18904571478478394,"probability(<=50K)":0.8109542852152161}
         ID           2
 prediction
json_report {"probability(>50K)":0.4151643124884145,"probability(<=50K)":0.5848356875115854}
         ID           3
 prediction
json_report {"probability(>50K)":0.4016537833053354,"probability(<=50K)":0.5983462166946646}
         ID           4
 prediction
json_report {"probability(>50K)":0.38992771488797595,"probability(<=50K)":0.610072285112024}
         ID           5
 prediction
json_report {"probability(>50K)":0.06177199104145875,"probability(<=50K)":0.9382280089585413}
         ID           6
 prediction
json_report {"probability(>50K)":0.0792526350125319,"probability(<=50K)":0.9207473649874681}
         ID           7
 prediction
json_report {"probability(>50K)":0.162835954471736,"probability(<=50K)":0.8371640455282641}
         ID           8
 prediction
json_report {"probability(>50K)":0.4306800788902661,"probability(<=50K)":0.569319921109734}
         ID           9
 prediction
json_report {"probability(>50K)":0.10372474462398505,"probability(<=50K)":0.8962752553760149}