#!/usr/bin/env python
# coding: utf-8
"""boston_db_glm_h2o_model.py: Creates GLM model using H2O framework.

********************
* Generated model file is in Mojo format.
* To score this model, the user needs to insert/upload the Mojo model into a
  Vantage table.
********************

Flow of the script:
    1. Connect to Teradata and pull ``data_gen.boston`` into pandas.
    2. Convert it to an H2OFrame, mark ``CHAS`` as categorical, and split
       80/20 into train/test frames.
    3. Train an H2O GLM (``alpha=0.25``) to predict ``price``.
    4. Save the model as a MOJO and drop the ``.zip`` extension from the
       saved file name.
    5. Print predictions for the test frame.
"""
import os
import time

import h2o
import pandas as pd
from h2o.estimators.glm import H2OGeneralizedLinearEstimator
from teradataml import DataFrame, create_context

# connect to teradata
# NOTE(review): credentials and host are hard-coded in source; consider
# loading them from the environment or a secrets store before sharing this.
passwd = "alice"
uid = "alice"
host = "sdt40744.labs.teradata.com"
zip_extension = '.zip'

connection = create_context(host=host, username=uid, password=passwd)

# Server must be running. Use command java -jar h2o.jar to start the server
# from ~/Projects/H2O/h2o...
h2o.init()

# Training dataset from TD
train_df = DataFrame.from_query("select * from data_gen.boston;")

# convert to pandas
train_df = train_df.to_pandas()

# convert to H2O dataframe
boston = h2o.H2OFrame(train_df)

# set the response column to "price", which is the median value of
# owner-occupied homes in $1000's
response = "price"

# set the predictor columns: every column except the last two.
# NOTE(review): presumably the last two columns are the response plus one
# other non-predictor column -- confirm against the data_gen.boston schema.
predictors = boston.columns[:-2]

# convert the `CHAS` column to a factor
# `CHAS` = Charles River dummy variable (= 1 if tract bounds river; 0 otherwise)
# This must happen BEFORE the split below: the train/test frames are created
# from `boston` at split time, so converting afterwards would leave them with
# a numeric CHAS column and the model would never see it as categorical.
boston['CHAS'] = boston['CHAS'].asfactor()

# this example will predict the price column
# you can run the following to see that price is indeed a numeric value
# (the result is discarded when this file is run as a script)
boston["price"].isnumeric()

# split into train and testing sets
train, test = boston.split_frame(ratios=[0.8], seed=1234)

# Initialize H2O GLM
# set the `alpha` parameter to 0.25
boston_glm = H2OGeneralizedLinearEstimator(alpha=0.25)

boston_glm.train(x=predictors, y=response, training_frame=train)

path = "../sql/boston_db_glm_h20_model"
model_path = boston_glm.save_mojo(path=path, force=True)

# Drop the ".zip" extension from the saved MOJO file name.
# str.strip(".zip") would remove any of the characters '.', 'z', 'i', 'p'
# from BOTH ends of the path (not the suffix), so slice the suffix off
# explicitly instead.
if model_path.endswith(zip_extension):
    os.rename(model_path, model_path[:-len(zip_extension)])

# predict using the model and the testing dataset
predictions = boston_glm.predict(test)
print(predictions)