This use case shows the steps to use SageMaker LDA with tdapiclient.
You can download the aws-usecases.zip file in the attachment as a reference. The lda folder in the zip file includes a Jupyter notebook file (ipynb) for this use case.
- Import required libraries.
# Standard library.
import getpass
import os

# Third-party / Teradata libraries.  The original import list repeated
# several names (create_context, DataFrame, pandas, ...); the duplicates
# have been removed — every previously imported name is still in scope.
import numpy as np
import pandas as pd
from tdapiclient import TDApiClient, create_tdapi_context, remove_tdapi_context
from teradataml import DataFrame, create_context, load_example_data
from teradatasqlalchemy.types import *
- Create the connection.
# Prompt interactively for the Vantage connection details; getpass hides
# the password while it is typed.
host = input("Host: ")
username = input("Username: ")
password = getpass.getpass("Password: ")

# Open the teradataml connection used by the rest of the workflow.
td_context = create_context(host=host, username=username, password=password)
- Create TDAPI context and TDApiClient object.
# Collect the AWS credentials and target S3 bucket interactively.
s3_bucket = input("S3 Bucket(Please provide just the bucket name): ")
access_id = input("Access ID:")
# Fixed typo in the user-facing prompt: "Acess Key" -> "Access Key".
access_key = getpass.getpass("Access Key: ")
region = input("AWS Region: ")

# Expose the credentials through the environment variables read by the
# AWS SDK and tdapiclient.
os.environ["AWS_ACCESS_KEY_ID"] = access_id
os.environ["AWS_SECRET_ACCESS_KEY"] = access_key
os.environ["AWS_REGION"] = region

# Create the AWS-backed TDAPI context and the TDApiClient entry point.
tdapi_context = create_tdapi_context("aws", bucket_name=s3_bucket)
td_apiclient = TDApiClient(tdapi_context)
- Generate dataset.
- Define a function to generate synthetic data.
def generate_vector_count(vocabulary=25, documents=1000, mid=5):
    """Generate synthetic word-count vectors from a triangular distribution.

    Each document is a NumPy array of length ``vocabulary`` whose entries
    are floored draws from a triangular distribution on [0, vocabulary]
    with mode ``mid`` (requires 0 < mid < vocabulary).

    Args:
        vocabulary: Number of features (vector length) per document.
        documents: Number of document vectors to generate.
        mid: Mode of the triangular distribution.

    Returns:
        list of np.ndarray: ``documents`` float arrays of length ``vocabulary``.
    """
    left, right = 0, vocabulary
    # One triangular draw of size `vocabulary` per document keeps the RNG
    # draw order identical to the original loop.  The original's
    # .astype(float) was redundant (triangular already returns float64)
    # and has been dropped.
    return [
        np.floor(np.random.triangular(left, mid, right, vocabulary))
        for _ in range(documents)
    ]
- Define a function to get labels.
def get_labels(vocabulary=25):
    """Return feature-column labels "f1" .. "f<vocabulary>".

    Args:
        vocabulary: Number of labels to generate.

    Returns:
        list of str: Column names for the synthetic dataset.
    """
    # The original built each label as "f" + str(i+1) + "" — the trailing
    # empty-string concatenation did nothing; an f-string says it directly.
    return [f"f{i + 1}" for i in range(vocabulary)]
- Define a function to get column types.
def get_columns_type(vocabulary=25):
    """Map each feature label to the Teradata FLOAT column type.

    Args:
        vocabulary: Number of feature columns.

    Returns:
        dict: {label: FLOAT} for every label from get_labels(vocabulary).
    """
    # FLOAT comes from the teradatasqlalchemy.types wildcard import at the
    # top of the file.  A dict comprehension replaces the original
    # loop-and-assign construction.
    return {label: FLOAT for label in get_labels(vocabulary)}
- Define a function to plot data.
def Plot_Data(Documents):
    """Render a step histogram of the generated document vectors."""
    # Imported lazily so the rest of the script runs without seaborn.
    import seaborn as sns

    sns.displot(Documents, element="step", legend=False)
- Initialize parameters.
# Size of the vocabulary (number of features per document vector).
vocabulary = 25
# Mode of the triangular distribution; must satisfy 0 < mid < vocabulary.
mid = 14
# Number of documents in the training and test sets.
train_doc_size = 1000
test_doc_size = 200
- Generate train dataset.
# Draw the synthetic training documents and visualize their distribution.
train_data = generate_vector_count(
    vocabulary=vocabulary,
    documents=train_doc_size,
    mid=mid,
)
Plot_Data(train_data)
- Generate test dataset.
# Draw the synthetic test documents and visualize their distribution.
test_data = generate_vector_count(
    vocabulary=vocabulary,
    documents=test_doc_size,
    mid=mid,
)
Plot_Data(test_data)
- Get training and test labels.
# Feature labels are identical for both splits: "f1" .. "f25".
train_labels = get_labels(vocabulary=vocabulary)
test_labels = get_labels(vocabulary=vocabulary)
- Define a function to generate synthetic data.
- Prepare train dataset.
- Convert train dataset to NumPy ndarray.
# SageMaker's record_set API expects a NumPy ndarray rather than a list
# of per-document arrays.
train_data_ndarray = np.asarray(train_data)
- Specify general training job information.
# SageMaker execution role ARN used to run the training job.
# NOTE(review): this is an example account/role — replace with your own
# AmazonSageMaker execution role ARN before running.
exec_role_arn = "arn:aws:iam::076782961461:role/service-role/AmazonSageMaker-ExecutionRole-20210112T215668"
# S3 key prefix under the bucket where training output is written.
prefix = "td_sagemaker/DEMO-lda-introduction"
- Set hyperparameters.
# Number of latent topics the LDA model should learn.
num_topics = 5
# Mini-batch size used during training.
batch_size = 1
- Define LDA SageMaker instance through tdapiclient.
# Build the SageMaker LDA estimator through the TDApiClient proxy, using
# the hyperparameters and S3 output location defined above.
lda = td_apiclient.LDA(
    role=exec_role_arn,
    output_path="s3://{}/{}/output".format(s3_bucket, prefix),
    train_instance_count=1,
    train_instance_type="ml.c4.2xlarge",
    num_topics=num_topics,
    feature_dim=vocabulary,
    mini_batch_size=batch_size,
    alpha0=1.0,
)
- Convert ndarray to RecordSet object.
# Package the training ndarray as a SageMaker RecordSet object.
train_recordset = lda.record_set(train_data_ndarray)
- Train model using RecordSet object.
# Launch the SageMaker training job with the RecordSet built above.
lda.fit(inputs=train_recordset, mini_batch_size=batch_size)
- Convert train dataset to NumPy ndarray.
- Deploy.
- Load necessary libraries.
# CSV (de)serializers used to talk to the deployed endpoint.
from sagemaker.serializers import CSVSerializer
from sagemaker.deserializers import CSVDeserializer
- Deploy LDA model.
# Deploy the trained model behind a SageMaker real-time endpoint.
lda_inference = lda.deploy(
    "aws-endpoint",
    sagemaker_kw_args={
        "instance_type": "ml.m4.xlarge",
        "initial_instance_count": 1,
    },
)
- Add Serializer and Deserializer.
# Exchange payloads with the endpoint in CSV format.
lda_inference.serializer = CSVSerializer()
lda_inference.deserializer = CSVDeserializer()
- Load necessary libraries.
- Prepare teradataml DataFrame.
- Convert NumPy ndarray to pandas DataFrame.
# Wrap the synthetic test documents in a pandas DataFrame with the
# generated feature labels as column names.
test_pdf = pd.DataFrame(test_data, columns=test_labels)
- Get column types.
# Column-name -> FLOAT mapping used when copying the frame to Vantage.
column_types = get_columns_type(vocabulary=vocabulary)
- Convert pandas DataFrame to teradataml DataFrame.
# Copy the pandas DataFrame into a Vantage table, replacing any previous
# contents and forcing the FLOAT column types built above.
test_table = "LDA_Test_Sample"

from teradataml import copy_to_sql

copy_to_sql(
    df=test_pdf,
    table_name=test_table,
    if_exists="replace",
    types=column_types,
)
- Test DataFrame loaded.
# Materialize a teradataml DataFrame over the newly created table and
# display it.
test_df = DataFrame(test_table)
test_df
The output:f1 f2 f3 f4 f5 f6 f7 f8 f9 f10 f11 f12 f13 f14 f15 f16 f17 f18 f19 f20 f21 f22 f23 f24 f25 23.0 18.0 15.0 12.0 23.0 7.0 13.0 17.0 6.0 9.0 17.0 12.0 19.0 9.0 12.0 20.0 10.0 19.0 1.0 8.0 12.0 15.0 8.0 10.0 11.0 12.0 10.0 12.0 14.0 14.0 15.0 8.0 11.0 3.0 11.0 15.0 9.0 11.0 15.0 8.0 20.0 13.0 9.0 8.0 20.0 1.0 24.0 20.0 18.0 7.0 9.0 9.0 18.0 15.0 18.0 13.0 6.0 13.0 10.0 12.0 15.0 7.0 23.0 20.0 8.0 17.0 18.0 5.0 9.0 12.0 12.0 12.0 9.0 12.0 17.0 14.0 19.0 16.0 8.0 18.0 9.0 11.0 20.0 21.0 17.0 19.0 17.0 23.0 16.0 10.0 11.0 8.0 11.0 1.0 8.0 16.0 21.0 16.0 16.0 9.0 5.0 20.0 20.0 19.0 17.0 13.0 15.0 14.0 3.0 16.0 18.0 18.0 12.0 7.0 4.0 16.0 14.0 23.0 13.0 11.0 22.0 19.0 15.0 9.0 4.0 9.0 20.0 7.0 18.0 15.0 11.0 9.0 14.0 20.0 7.0 14.0 14.0 11.0 13.0 14.0 6.0 11.0 14.0 12.0 6.0 8.0 14.0 13.0 10.0 15.0 6.0 15.0 1.0 17.0 19.0 12.0 14.0 16.0 9.0 20.0 7.0 17.0 10.0 16.0 12.0 17.0 12.0 20.0 10.0 3.0 14.0 13.0 16.0 6.0 13.0 10.0 14.0 7.0 8.0 19.0 19.0 6.0 10.0 15.0 17.0 14.0 14.0 11.0 22.0 13.0 22.0 9.0 15.0 12.0 9.0 7.0 4.0 11.0 1.0 14.0 4.0 15.0 15.0 4.0 19.0 0.0 16.0 8.0 14.0 1.0 3.0 6.0 12.0 10.0 12.0 9.0 9.0 14.0 9.0 16.0 18.0 10.0 11.0 11.0 5.0 11.0 13.0 11.0 11.0 10.0 2.0 5.0 12.0 14.0 17.0 9.0 2.0 16.0 12.0 15.0 14.0 15.0 7.0 3.0 8.0 12.0 9.0 18.0 17.0 13.0
- Convert NumPy ndarray to pandas DataFrame.
- Prediction.
- cloudObj prediction.
# Score a single raw document vector directly against the SageMaker
# endpoint via the underlying cloud predictor object.
item = test_data[0]
print(lda_inference.cloudObj.predict(item))
The output:[label { key: "topic_mixture" value { float32_tensor { values: 0.0 values: 0.1374109387397766 values: 0.0 values: 0.8625890612602234 values: 0.0 } } } ]
- Use the first two rows for prediction.
# Take the first two rows of the Vantage table for UDF-mode scoring and
# display them.
item = test_df.head(2)
item
The output:f1 f2 f3 f4 f5 f6 f7 f8 f9 f10 f11 f12 f13 f14 f15 f16 f17 f18 f19 f20 f21 f22 f23 f24 f25 2.0 7.0 10.0 1.0 10.0 5.0 10.0 2.0 9.0 17.0 19.0 10.0 20.0 23.0 2.0 13.0 16.0 12.0 4.0 7.0 7.0 11.0 12.0 18.0 13.0 1.0 7.0 18.0 13.0 5.0 13.0 4.0 8.0 20.0 12.0 15.0 15.0 7.0 13.0 8.0 22.0 10.0 13.0 14.0 20.0 10.0 13.0 8.0 10.0 14.0
- Prediction in UDF mode.
# Score inside Vantage (UDF mode): each row is sent to the endpoint as
# CSV and the JSON response comes back in an Output column.
output = lda_inference.predict(item, mode="udf", content_type='csv')
output
The output:f1 f2 f3 f4 f5 f6 f7 f8 f9 f10 f11 f12 f13 f14 f15 f16 f17 f18 f19 f20 f21 f22 f23 f24 f25 Output 2.0 7.0 10.0 1.0 10.0 5.0 10.0 2.0 9.0 17.0 19.0 10.0 20.0 23.0 2.0 13.0 16.0 12.0 4.0 7.0 7.0 11.0 12.0 18.0 13.0 {"predictions": [{"topic_mixture": [0.32196688652038574, 0.34312328696250916, 0.0, 0.0, 0.3349097967147827]}]} 1.0 7.0 18.0 13.0 5.0 13.0 4.0 8.0 20.0 12.0 15.0 15.0 7.0 13.0 8.0 22.0 10.0 13.0 14.0 20.0 10.0 13.0 8.0 10.0 14.0 {"predictions": [{"topic_mixture": [0.43715721368789673, 0.4980888068675995, 0.0, 0.0, 0.06475400924682617]}]}
- cloudObj prediction.
- Clean up.
# Tear down the cloud resources and the TDAPI context so the demo does
# not leave a billed model or endpoint running.
lda_inference.cloudObj.delete_model()
lda_inference.cloudObj.delete_endpoint()
remove_tdapi_context(tdapi_context)