This use case shows the steps to use SageMaker SKLearn Estimator with tdapiclient.
You can download the aws-usecases.zip file in the attachment as a reference. The sklearn folder in the zip file includes a Jupyter notebook file (ipynb) and a Python file (py) required to run this notebook file.
- Import necessary packages.
# Standard library
import getpass
import os

# Third-party / Teradata packages
import pandas as pd
from tdapiclient import TDApiClient, create_tdapi_context, remove_tdapi_context
from teradataml import DataFrame, copy_to_sql, create_context
# Import only the type this walkthrough uses instead of a wildcard import.
from teradatasqlalchemy.types import FLOAT
- Create the connection.
# Prompt for the Vantage connection details; getpass keeps the password
# from being echoed to the terminal.
host = input("Host: ")
username = input("Username: ")
password = getpass.getpass("Password: ")

# Open the teradataml connection used by copy_to_sql/DataFrame below.
td_context = create_context(host=host, username=username, password=password)
- Create TDAPI context and TDApiClient object.
# Collect the S3 bucket and AWS credentials used for staging data.
s3_bucket = input("S3 Bucket(Please provide just the bucket name, for example: test-bucket): ")
access_id = input("Access ID:")
access_key = getpass.getpass("Access Key: ")  # fixed typo: prompt previously read "Acess Key: "
region = input("AWS Region: ")

# Expose the credentials to the SageMaker/boto3 layers via the environment.
os.environ["AWS_ACCESS_KEY_ID"] = access_id
os.environ["AWS_SECRET_ACCESS_KEY"] = access_key
os.environ["AWS_REGION"] = region

# Create the API-integration context for AWS and the client object that
# exposes the SageMaker estimator classes.
tdapi_context = create_tdapi_context("aws", bucket_name=s3_bucket)
td_apiclient = TDApiClient(tdapi_context)
- Set up data.
- Import necessary libraries and data.
from sklearn.model_selection import train_test_split  # NOTE(review): unused below — sampling is done in-database
from sklearn.datasets import fetch_california_housing

# Load the California-housing dataset as a pandas DataFrame.
california_housing = fetch_california_housing(as_frame=True)
data = california_housing.frame
- Insert the dataframe in the tables.
# Persist the pandas frame into Vantage; every column is stored as FLOAT.
data_table = "housing_data"
all_columns = (
    "MedInc", "HouseAge", "AveRooms", "AveBedrms",
    "Population", "AveOccup", "Latitude", "Longitude", "MedHouseVal",
)
column_types = {name: FLOAT for name in all_columns}
copy_to_sql(df=data, table_name=data_table, if_exists="replace", types=column_types)
- Create a teradataml DataFrame using the table.
# Create a teradataml DataFrame that references the housing_data table in Vantage.
data = DataFrame(table_name=data_table)
# Display the rows.
data
The output:MedInc HouseAge AveRooms AveBedrms Population AveOccup Latitude Longitude MedHouseVal 7.2574 52.0 8.288135593220339 1.073446327683616 496.0 2.8022598870056497 37.85 -122.24 3.521 3.8462 52.0 6.281853281853282 1.0810810810810811 565.0 2.1814671814671813 37.85 -122.25 3.422 4.0368 52.0 4.761658031088083 1.1036269430051813 413.0 2.139896373056995 37.85 -122.25 2.697 3.6591 52.0 4.9319066147859925 0.9513618677042801 1094.0 2.1284046692607004 37.84 -122.25 2.992 2.0804 42.0 4.294117647058823 1.1176470588235294 1206.0 2.026890756302521 37.84 -122.26 2.267 3.6912 52.0 4.970588235294118 0.9901960784313726 1551.0 2.172268907563025 37.84 -122.25 2.611 3.12 52.0 4.797527047913447 1.061823802163833 1157.0 1.7882534775888717 37.84 -122.25 2.414 5.6431 52.0 5.8173515981735155 1.0730593607305936 558.0 2.547945205479452 37.85 -122.25 3.413 8.3014 21.0 6.238137082601054 0.9718804920913884 2401.0 2.109841827768014 37.86 -122.22 3.585 8.3252 41.0 6.984126984126984 1.0238095238095237 322.0 2.5555555555555554 37.88 -122.23 4.526
- Create two samples of input data - sample 1 has 80% of total rows and sample 2 has 20% of total rows.
# Split rows into sample 1 (80%) and sample 2 (20%); sample() tags each
# row with a "sampleid" column identifying which sample it belongs to.
housing_sample = data.sample(frac=[0.8, 0.2])
- Create train dataset from sample 1 by filtering on "sampleid" and drop "sampleid" column as it is not required for training model.
# Training set = sample 1; drop the bookkeeping "sampleid" column,
# which is not a model feature.
housing_train = housing_sample[housing_sample.sampleid == "1"].drop("sampleid", axis = 1)
# Display the training rows.
housing_train
The output:MedInc HouseAge AveRooms AveBedrms Population AveOccup Latitude Longitude MedHouseVal 7.2574 52.0 8.288135593220339 1.073446327683616 496.0 2.8022598870056497 37.85 -122.24 3.521 4.0368 52.0 4.761658031088083 1.1036269430051813 413.0 2.139896373056995 37.85 -122.25 2.697 3.6591 52.0 4.9319066147859925 0.9513618677042801 1094.0 2.1284046692607004 37.84 -122.25 2.992 3.12 52.0 4.797527047913447 1.061823802163833 1157.0 1.7882534775888717 37.84 -122.25 2.414 3.6912 52.0 4.970588235294118 0.9901960784313726 1551.0 2.172268907563025 37.84 -122.25 2.611 3.2031 52.0 5.477611940298507 1.0796019900497513 910.0 2.263681592039801 37.85 -122.26 2.815 2.0804 42.0 4.294117647058823 1.1176470588235294 1206.0 2.026890756302521 37.84 -122.26 2.267 3.8462 52.0 6.281853281853282 1.0810810810810811 565.0 2.1814671814671813 37.85 -122.25 3.422 8.3014 21.0 6.238137082601054 0.9718804920913884 2401.0 2.109841827768014 37.86 -122.22 3.585 8.3252 41.0 6.984126984126984 1.0238095238095237 322.0 2.5555555555555554 37.88 -122.23 4.526
- Create test dataset from sample 2 by filtering on "sampleid" and drop "sampleid" column as it is not required for scoring.
# Test set = sample 2; drop the bookkeeping "sampleid" column,
# which is not needed for scoring.
housing_test = housing_sample[housing_sample.sampleid == "2"].drop("sampleid", axis = 1)
# Display the test rows.
housing_test
The output:MedInc HouseAge AveRooms AveBedrms Population AveOccup Latitude Longitude MedHouseVal 2.6033 52.0 5.465454545454546 1.0836363636363637 690.0 2.5090909090909093 37.84 -122.27 1.629 1.4861 49.0 4.6022727272727275 1.0681818181818181 570.0 2.159090909090909 37.83 -122.27 0.972 2.0978 52.0 4.215189873417722 1.0607594936708862 946.0 2.3949367088607594 37.83 -122.26 1.554 1.025 49.0 3.7724867724867726 1.0687830687830688 462.0 2.4444444444444446 37.84 -122.26 1.188 1.1108 41.0 4.473611111111111 1.1847222222222222 1959.0 2.720833333333333 37.82 -122.27 0.975 2.5625 2.0 2.7719298245614037 0.7543859649122807 94.0 1.6491228070175439 37.82 -122.29 0.6 1.7348 43.0 3.9802371541501977 1.233201581027668 558.0 2.205533596837945 37.82 -122.27 1.375 1.3578 40.0 4.524096385542169 1.108433734939759 409.0 2.463855421686747 37.85 -122.27 1.475 3.2705 52.0 4.772479564032698 1.0245231607629428 1504.0 2.0490463215258856 37.85 -122.26 2.418 3.6591 52.0 4.9319066147859925 0.9513618677042801 1094.0 2.1284046692607004 37.84 -122.25 2.992
- Import necessary libraries and data.
- Create SkLearn SageMaker estimator instance through tdapiclient.
# SageMaker execution role and the scikit-learn framework version to run.
# NOTE(review): replace this account-specific role ARN with your own.
exec_role_arn = "arn:aws:iam::076782961461:role/service-role/AmazonSageMaker-ExecutionRole-20210112T215668"
FRAMEWORK_VERSION = "0.23-1"

# Create an estimator object based on sklearn sagemaker class.
sklearn_estimator = td_apiclient.SKLearn(
    entry_point="sklearn-script.py",
    role=exec_role_arn,
    instance_count=1,
    instance_type="ml.m5.large",
    framework_version=FRAMEWORK_VERSION,
    base_job_name="rf-scikit",
    metric_definitions=[
        {"Name": "median-AE", "Regex": "AE-at-50th-percentile: ([0-9.]+).*$"},
    ],
    hyperparameters={
        "n-estimators": 100,
        "min-samples-leaf": 3,
        "features": "MedInc HouseAge AveRooms AveBedrms Population AveOccup Latitude Longitude",
        "target": "MedHouseVal",
    },
)
- Create test and training DataFrames, and start training.
- Create test and training DataFrames.
# BUG FIX: the original called DataFrame(table_name=test_table) /
# DataFrame(table_name=train_table), but neither variable is defined
# anywhere in this walkthrough. Use the train/test teradataml DataFrames
# built from the 80/20 in-database sample above instead.
train_df = housing_train
test_df = housing_test

# Display both datasets.
test_df
train_df
- Start training using the DataFrame objects created in previous step.
# Start training with the teradataml DataFrames as the "train"/"test"
# channels; wait=True blocks until the SageMaker job completes.
sklearn_estimator.fit({"train": train_df, "test": test_df}, content_type="csv", wait=True)
- Create test and training DataFrames.
- Create Serializer and Deserializer, so predictor can handle CSV input and output.
# Create a serializer and deserializer so the predictor can handle
# CSV input and output.
from sagemaker.serializers import CSVSerializer
from sagemaker.deserializers import CSVDeserializer

csv_ser = CSVSerializer()
csv_dser = CSVDeserializer()

# Deploy the trained model to a SageMaker endpoint and obtain a predictor.
predictor = sklearn_estimator.deploy(
    "aws-endpoint",
    sagemaker_kw_args={
        "instance_type": "ml.m5.large",
        "initial_instance_count": 1,
        "serializer": csv_ser,
        "deserializer": csv_dser,
    },
)
- Try prediction integration with Teradata and the predictor object created in previous step.
- Try the predictor with simple CSV data to see if it works as expected.
# One CSV row with the eight feature columns (no target) to smoke-test
# the endpoint directly.
item = "1.6812, 25.0, 4.192201, 1.022284, 1392.0, 3.877437, 36.06, -119.01"

print(predictor.cloudObj.accept)
print(predictor.cloudObj.predict(item))
- Try prediction with the UDF and Client options. Input:
# NOTE(review): the table 'housing_data_test' is not created earlier in
# this walkthrough — confirm the intended table (e.g. persist housing_test
# to it first).
# Renamed the variable from `input` so the builtin input() is not shadowed.
input_df = DataFrame(table_name='housing_data_test')
column_list = ["MedInc", "HouseAge", "AveRooms", "AveBedrms", "Population", "AveOccup", "Latitude", "Longitude"]

# Score only a small sample of feature columns (no target).
input_df = input_df.sample(n=5).select(column_list)
input_df

# Prediction with the UDF option (scoring runs in-database):
output = predictor.predict(input_df, mode="UDF", content_type='csv')
output

# Prediction with the Client option (rows sent to the endpoint from the client):
output = predictor.predict(input_df, mode="Client", content_type='csv')
output
- Try the predictor with simple CSV data to see if it works as expected.
- Clean up.
# Clean up: tear down the SageMaker model and endpoint, then remove the
# TDAPI context so no cloud resources keep billing.
predictor.cloudObj.delete_model()
predictor.cloudObj.delete_endpoint()
remove_tdapi_context(tdapi_context)