import pandas as pd
import statsmodels.api as sm
import numpy as np
import sys
import pickle
import warnings
import random
import string
# Suppress every Python warning for the lifetime of this script.
warnings.filterwarnings("ignore")

# Field separator shared by the incoming rows and the emitted result row.
DELIMITER = ','

# Know your data: the number and data types of the columns streamed in from
# the SQL Engine database must be known in advance.
# Columns named (and therefore read) by this script:
#   0: p_id, 1-5: independent variables x1..x5, 6: dependent variable y
# NOTE(review): the original header also listed "7: nRow, 8: model (if
# nRow==1), NULL o/w"; those columns are not named here -- confirm against
# the calling query which layout is actually streamed in.
colNames = ['p_id', 'x1', 'x2', 'x3', 'x4', 'x5', 'y']


def sciStrToFloat(x):
    """Return ``x`` parsed as a float after stripping all whitespace.

    Numbers may arrive in a scientific format that contains blanks (such as
    "1 E002" for 100). Removing the blanks first lets Python interpret the
    value correctly.
    """
    return float("".join(x.split()))


# Every named input column is numeric, so each column position is mapped to
# the same whitespace-tolerant float parser.
converters = {position: sciStrToFloat for position in range(len(colNames))}
# Main script body: read the rows streamed on standard input, fit a binomial
# GLM of y on x1..x5 (plus an intercept), pickle the fitted result to a file
# under /lob, and emit one "<p_id><DELIMITER><model path>" row on standard
# output. Relies on colNames, converters and DELIMITER defined earlier in
# this file.
try:
    # Ingest all input rows delivered to this script instance.
    input_file = sys.stdin
    df = pd.read_csv(input_file, sep=DELIMITER, header=None, names=colNames,
                     index_col=False, iterator=False, converters=converters)

    # Optional diagnostics; they go to stderr so stdout carries result rows only.
    # print(f"Number of rows in df: {len(df)}", file=sys.stderr)
    # print(f"Number of columns in df: {len(df.columns)}", file=sys.stderr)

    # For AMPs that receive no data, exit the script instance gracefully.
    # sys.exit() raises SystemExit, which the `except Exception` below does
    # not intercept, so the exit status stays 0.
    if df.empty:
        sys.exit(0)

    # Design matrix: the independent variables plus an explicit intercept
    # column, which must be present for the StatsModels GLM() call below.
    # Work on a copy so the insert never touches a slice of `df`.
    dfx = df.loc[:, 'x1':'x5'].copy()
    dfx.insert(0, 'Intercept', 1.0)

    # Dependent variable.
    dfy = df.loc[:, 'y']

    # Use GLM in statsmodels for binomial general linear modeling.
    logit = sm.GLM(dfy, dfx, family=sm.families.Binomial())

    # Fit the model. disp=0 keeps the fitting routine from printing output.
    fitResult = logit.fit(disp=0)

    # Persist the fitted result under a random 32-character alphanumeric name.
    model_name = ''.join(random.choices(string.ascii_letters + string.digits, k=32))
    model_path = f"/lob/{model_name}.pkl"
    with open(model_path, "wb") as model_file:
        pickle.dump(fitResult, model_file)

    # Export results to the SQL Engine database through standard output.
    # p_id is taken from the first input row; because the column is parsed
    # as float, it is rendered in float form.
    print('{}{}{}'.format(df['p_id'].iloc[0], DELIMITER, model_path))
except Exception as e:
    # Report failures on stderr so the message cannot be mistaken for a
    # result row on stdout, then signal the failure through the exit status.
    print(f"Error : {e}", file=sys.stderr)
    sys.exit(1)