ModelTable: adult_db_glm_b_model.py
import os
import time
from teradataml import *
from sklearn_pandas import DataFrameMapper
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.impute import SimpleImputer
display.print_sqlmr_query = True
passwd = "alice"
uid = "alice"
host = "server123@mydomain.com"
from sklearn.linear_model import LogisticRegression
con = create_context(host=host, username=uid, password=passwd)
con
train_df = DataFrame.from_query("select * from adultTrain")
train_pd = train_df.to_pandas()
X = train_pd[['age','workclass','marital_status','relationship','race','occupation','native_country']]
target='income'
column_trans = [
(['age'], StandardScaler()),
(['workclass'], LabelEncoder()),
(['marital_status'], LabelEncoder()),
(['relationship'], LabelEncoder()),
(['race'], LabelEncoder()),
(['occupation'], LabelEncoder()),
(['native_country'], LabelEncoder())
]
mapper = DataFrameMapper(column_trans)
glm = LogisticRegression(solver='liblinear', multi_class='ovr',random_state=0, max_iter=1000)
glm_pipeline = PMMLPipeline([("mapping", mapper),('impute', SimpleImputer(strategy='most_frequent')),("clf",glm)])
glm_pipeline.fit(X,train_pd[target].ravel())
sklearn2pmml(glm_pipeline,"../sql/adult_db_glm_b_model.pmml",with_repr=True)
Query that Calls PMMLPredict Function
CREATE MULTISET TABLE glm_adult_binary AS (
SELECT * FROM mldb.PMMLPredict (
ON adultTest
ON (SELECT * FROM pmml_models WHERE model_id='adult_glm_b_model') DIMENSION
USING
Accumulate ('ID')
) AS TD
) WITH DATA;
Query Output
SELECT * FROM glm_adult_binary WHERE ID < 10 order by ID;
*** Query completed. 9 rows found. 3 columns returned.
*** Total elapsed time was 1 second.
ID 1
prediction
json_report {"probability(>50K)":0.18904571478478394,"probability(<=50K)":0.8109542852152161}
ID 2
prediction
json_report {"probability(>50K)":0.4151643124884145,"probability(<=50K)":0.5848356875115854}
ID 3
prediction
json_report {"probability(>50K)":0.4016537833053354,"probability(<=50K)":0.5983462166946646}
ID 4
prediction
json_report {"probability(>50K)":0.38992771488797595,"probability(<=50K)":0.610072285112024}
ID 5
prediction
json_report {"probability(>50K)":0.06177199104145875,"probability(<=50K)":0.9382280089585413}
ID 6
prediction
json_report {"probability(>50K)":0.0792526350125319,"probability(<=50K)":0.9207473649874681}
ID 7
prediction
json_report {"probability(>50K)":0.162835954471736,"probability(<=50K)":0.8371640455282641}
ID 8
prediction
json_report {"probability(>50K)":0.4306800788902661,"probability(<=50K)":0.569319921109734}
ID 9
prediction
json_report {"probability(>50K)":0.10372474462398505,"probability(<=50K)":0.8962752553760149}