Skip to content

Commit fc85adc

Browse files
committed
changed "interval" to "prediction" + changed calculate_model_statistics to allow for prediction models
1 parent 3913c30 commit fc85adc

File tree

1 file changed

+75
-48
lines changed

1 file changed

+75
-48
lines changed

src/sasctl/pzmm/write_json_files.py

Lines changed: 75 additions & 48 deletions
Original file line numberDiff line numberDiff line change
@@ -1174,6 +1174,7 @@ def calculate_model_statistics(
11741174
train_data: Union[DataFrame, List[list], Type["numpy.array"]] = None,
11751175
test_data: Union[DataFrame, List[list], Type["numpy.array"]] = None,
11761176
json_path: Union[str, Path, None] = None,
1177+
target_type: str = "classification"
11771178
) -> Union[dict, None]:
11781179
"""
11791180
Calculates fit statistics (including ROC and Lift curves) from datasets and then
@@ -1214,6 +1215,9 @@ def calculate_model_statistics(
12141215
Dataset pertaining to the test data. The default value is None.
12151216
json_path : str or Path, optional
12161217
Location for the output JSON files. The default value is None.
1218+
target_type: str, optional
1219+
Type of target the model is trying to find. Currently supports "classification"
1220+
and "prediction" types. The default value is "classification".
12171221
12181222
Returns
12191223
-------
@@ -1260,18 +1264,26 @@ def calculate_model_statistics(
12601264
data,
12611265
casout={"name": "assess_dataset", "replace": True, "caslib": "Public"},
12621266
)
1263-
1264-
conn.percentile.assess(
1265-
table={"name": "assess_dataset", "caslib": "Public"},
1266-
response="predict",
1267-
pVar="predict_proba",
1268-
event=str(target_value),
1269-
pEvent=str(prob_value) if prob_value else str(0.5),
1270-
inputs="actual",
1271-
fitStatOut={"name": "FitStat", "replace": True, "caslib": "Public"},
1272-
rocOut={"name": "ROC", "replace": True, "caslib": "Public"},
1273-
casout={"name": "Lift", "replace": True, "caslib": "Public"},
1274-
)
1267+
if target_type == 'classification':
1268+
conn.percentile.assess(
1269+
table={"name": "assess_dataset", "caslib": "Public"},
1270+
response="predict",
1271+
pVar="predict_proba",
1272+
event=str(target_value),
1273+
pEvent=str(prob_value) if prob_value else str(0.5),
1274+
inputs="actual",
1275+
fitStatOut={"name": "FitStat", "replace": True, "caslib": "Public"},
1276+
rocOut={"name": "ROC", "replace": True, "caslib": "Public"},
1277+
casout={"name": "Lift", "replace": True, "caslib": "Public"},
1278+
)
1279+
else:
1280+
conn.percentile.assess(
1281+
table={"name": "assess_dataset", "caslib": "Public"},
1282+
response="predict",
1283+
inputs="actual",
1284+
fitStatOut={"name": "FitStat", "replace": True, "caslib": "Public"},
1285+
casout={"name": "Lift", "replace": True, "caslib": "Public"}
1286+
)
12751287

12761288
fitstat_dict = (
12771289
pd.DataFrame(conn.CASTable("FitStat", caslib="Public").to_frame())
@@ -1280,11 +1292,11 @@ def calculate_model_statistics(
12801292
.to_dict()
12811293
)
12821294
json_dict[0]["data"][i]["dataMap"].update(fitstat_dict)
1283-
1284-
roc_df = pd.DataFrame(conn.CASTable("ROC", caslib="Public").to_frame())
1285-
roc_dict = cls.apply_dataframe_to_json(json_dict[1]["data"], i, roc_df)
1286-
for j in range(len(roc_dict)):
1287-
json_dict[1]["data"][j].update(roc_dict[j])
1295+
if target_type == 'classification':
1296+
roc_df = pd.DataFrame(conn.CASTable("ROC", caslib="Public").to_frame())
1297+
roc_dict = cls.apply_dataframe_to_json(json_dict[1]["data"], i, roc_df)
1298+
for j in range(len(roc_dict)):
1299+
json_dict[1]["data"][j].update(roc_dict[j])
12881300

12891301
lift_df = pd.DataFrame(conn.CASTable("Lift", caslib="Public").to_frame())
12901302
lift_dict = cls.apply_dataframe_to_json(json_dict[2]["data"], i, lift_df, 1)
@@ -1293,19 +1305,26 @@ def calculate_model_statistics(
12931305

12941306
if json_path:
12951307
for i, name in enumerate([FITSTAT, ROC, LIFT]):
1296-
with open(Path(json_path) / name, "w") as json_file:
1297-
json_file.write(json.dumps(json_dict[i], indent=4, cls=NpEncoder))
1298-
if cls.notebook_output:
1299-
print(
1300-
f"{name} was successfully written and saved to "
1301-
f"{Path(json_path) / name}"
1302-
)
1308+
if not (name == ROC and target_type == "prediction"):
1309+
with open(Path(json_path) / name, "w") as json_file:
1310+
json_file.write(json.dumps(json_dict[i], indent=4, cls=NpEncoder))
1311+
if cls.notebook_output:
1312+
print(
1313+
f"{name} was successfully written and saved to "
1314+
f"{Path(json_path) / name}"
1315+
)
13031316
else:
1304-
return {
1305-
FITSTAT: json.dumps(json_dict[0], indent=4, cls=NpEncoder),
1306-
ROC: json.dumps(json_dict[1], indent=4, cls=NpEncoder),
1307-
LIFT: json.dumps(json_dict[2], indent=4, cls=NpEncoder),
1308-
}
1317+
if target_type == 'classification':
1318+
return {
1319+
FITSTAT: json.dumps(json_dict[0], indent=4, cls=NpEncoder),
1320+
ROC: json.dumps(json_dict[1], indent=4, cls=NpEncoder),
1321+
LIFT: json.dumps(json_dict[2], indent=4, cls=NpEncoder),
1322+
}
1323+
else:
1324+
return {
1325+
FITSTAT: json.dumps(json_dict[0], indent=4, cls=NpEncoder),
1326+
LIFT: json.dumps(json_dict[2], indent=4, cls=NpEncoder),
1327+
}
13091328

13101329
@staticmethod
13111330
def check_for_data(
@@ -2208,11 +2227,11 @@ def generate_model_card(
22082227
algorithm: str,
22092228
train_data: pd.DataFrame,
22102229
train_predictions: Union[pd.Series, list],
2211-
target_type: str = "interval",
2230+
target_type: str = "classificaiton",
22122231
target_value: Union[str, int, float, None] = None,
22132232
interval_vars: Optional[list] = [],
22142233
class_vars: Optional[list] = [],
2215-
selection_statistic: str = "_GINI_",
2234+
selection_statistic: str = None,
22162235
server: str = "cas-shared-default",
22172236
caslib: str = "Public",
22182237
):
@@ -2237,19 +2256,22 @@ def generate_model_card(
22372256
train_predictions : pandas.Series, list
22382257
List of predictions made by the model on the training data.
22392258
target_type : string
2240-
Type the model is targeting. Currently supports "classification" and "interval" types.
2241-
The default value is "Interval".
2259+
Type of target the model is trying to find. Currently supports "classification" and "prediction" types.
2260+
The default value is "classification".
22422261
target_value : string, int, float, optional
22432262
Value the model is targeting for classification models. This argument is not needed for
2244-
Interval models. The default value is None.
2263+
prediction models. The default value is None.
22452264
interval_vars : list, optional
22462265
A list of interval variables. The default value is an empty list.
22472266
class_vars : list, optional
22482267
A list of classification variables. The default value is an empty list.
22492268
selection_statistic: str, optional
2250-
The selection statistic chosen to score the model against other models. Can be any of the
2251-
following values: "_RASE_", "_NObs_", "_GINI_", "_GAMMA_", "_MCE_", "_ASE_", "_MCLL_",
2252-
"_KS_", "_KSPostCutoff_", "_DIV_", "_TAU_", "_KSCut_", or "_C_". The default value is "_GINI_".
2269+
The selection statistic chosen to score the model against other models. Classification
2270+
models can take any of the following values: "_RASE_", "_GINI_", "_GAMMA_", "_MCE_",
2271+
"_ASE_", "_MCLL_", "_KS_", "_KSPostCutoff_", "_DIV_", "_TAU_", "_KSCut_", or "_C_".
2272+
Prediction models can take any of the following values: "_ASE_", "_DIV_", "_RASE_", "_MAE_",
2273+
"_RMAE_", "_MSLE_", "_RMSLE_" The default value is "_KS_" for classification models and
2274+
"_ASE_" for prediction models.
22532275
server: str, optional
22542276
The CAS server the training data will be stored on. The default value is "cas-shared-default"
22552277
caslib: str, optional
@@ -2260,10 +2282,15 @@ def generate_model_card(
22602282
"For the model card data to be properly generated on a classification "
22612283
"model, a target value is required."
22622284
)
2263-
if target_type not in ["classification", "interval"]:
2285+
if target_type not in ["classification", "prediction"]:
22642286
raise RuntimeError(
2265-
"Only classification and interval target types are currently accepted."
2287+
"Only classification and prediction target types are currently accepted."
22662288
)
2289+
if selection_statistic is None:
2290+
if target_type is 'classification':
2291+
selection_statistic = '_KS_'
2292+
elif target_type is 'prediction':
2293+
selection_statistic = "_ASE_"
22672294
if selection_statistic not in cls.valid_params:
22682295
raise RuntimeError(
22692296
"The selection statistic must be a value generated in dmcas_fitstat.json. See "
@@ -2292,7 +2319,7 @@ def generate_model_card(
22922319
)
22932320

22942321
# Generates the event percentage for Classification targets, and the event average
2295-
# for Interval targets
2322+
# for prediction targets
22962323
update_dict = cls.generate_outcome_average(
22972324
train_data=train_data,
22982325
input_variables=interval_vars + class_vars,
@@ -2373,7 +2400,7 @@ def generate_outcome_average(
23732400
target_value: Union[str, int, float] = None
23742401
):
23752402
"""
2376-
Generates the outcome average of the training data. For Interval targets, the event average
2403+
Generates the outcome average of the training data. For prediction targets, the event average
23772404
is generated. For Classification targets, the event percentage is returned.
23782405
23792406
Parameters
@@ -2385,10 +2412,10 @@ def generate_outcome_average(
23852412
input_variables: list
23862413
A list of all input variables used by the model. Used to isolate the output variable.
23872414
target_type : string
2388-
Type the model is targeting. Currently supports "Classification" and "Interval" types.
2415+
Type the model is targeting. Currently supports "classification" and "prediction" types.
23892416
target_value : string, int, float, optional
23902417
Value the model is targeting for Classification models. This argument is not needed for
2391-
Interval models. The default value is None.
2418+
prediction models. The default value is None.
23922419
23932420
Returns
23942421
-------
@@ -2400,7 +2427,7 @@ def generate_outcome_average(
24002427
if target_type == "classification":
24012428
value_counts = output_var[output_var.columns[0]].value_counts()
24022429
return {'eventPercentage': value_counts[target_value]/sum(value_counts)}
2403-
elif target_type == "interval":
2430+
elif target_type == "prediction":
24042431
if not isinstance(output_var[output_var.columns[0]].iloc[0], numbers.Number):
24052432
raise ValueError("Detected output column is not numeric. Please ensure that " +
24062433
"the correct output column is being passed, and that no extra columns " +
@@ -2515,7 +2542,7 @@ def generate_variable_importance(
25152542
model_files: Union[str, Path, dict],
25162543
train_data: pd.DataFrame,
25172544
train_predictions: Union[pd.Series, list],
2518-
target_type: str = "interval",
2545+
target_type: str = "classification",
25192546
interval_vars: Optional[list] = [],
25202547
class_vars: Optional[list] = [],
25212548
caslib: str = "Public",
@@ -2535,8 +2562,8 @@ def generate_variable_importance(
25352562
train_predictions : pandas.Series, list
25362563
List of predictions made by the model on the training data.
25372564
target_type : string, optional
2538-
Type the model is targeting. Currently supports "Classification" and "Interval" types.
2539-
The default value is "Interval".
2565+
Type the model is targeting. Currently supports "classification" and "prediction" types.
2566+
The default value is "classification".
25402567
interval_vars : list, optional
25412568
A list of interval variables. The default value is an empty list.
25422569
class_vars : list, optional
@@ -2564,7 +2591,7 @@ def generate_variable_importance(
25642591
treeCrit = 'RSS'
25652592
else:
25662593
raise RuntimeError(
2567-
"The selected model type is unsupported. Currently, only models that have interval or classification target types are supported."
2594+
"The selected model type is unsupported. Currently, only models that have prediction or classification target types are supported."
25682595
)
25692596
request_packages = list()
25702597
if interval_vars:

0 commit comments

Comments
 (0)