From ee23c183b85067e9a4096c3b263a95bceb710626 Mon Sep 17 00:00:00 2001 From: djm21 Date: Tue, 15 Oct 2024 12:07:06 -0700 Subject: [PATCH 1/7] added capability to have preprocessing function in score code --- src/sasctl/pzmm/import_model.py | 3 ++ src/sasctl/pzmm/write_score_code.py | 48 +++++++++++++++++++++++++++++ tests/unit/test_write_score_code.py | 23 ++++++++++++++ 3 files changed, 74 insertions(+) diff --git a/src/sasctl/pzmm/import_model.py b/src/sasctl/pzmm/import_model.py index cc28f7e5..bfa9c6f7 100644 --- a/src/sasctl/pzmm/import_model.py +++ b/src/sasctl/pzmm/import_model.py @@ -213,6 +213,7 @@ def import_model( target_values: Optional[List[str]] = None, overwrite_project_properties: Optional[bool] = False, target_index: Optional[int] = None, + preprocess_function: Optional[Callable[DataFrame, DataFrame]] = None, **kwargs, ) -> Tuple[RestObj, Union[dict, str, Path]]: """ @@ -371,6 +372,7 @@ def import_model( missing_values=missing_values, score_cas=score_cas, target_index=target_index, + preprocess_function=preprocess_function, **kwargs, ) if score_code_dict: @@ -471,6 +473,7 @@ def import_model( missing_values=missing_values, score_cas=score_cas, target_index=target_index, + preprocess_function=preprocess_function, **kwargs, ) if score_code_dict: diff --git a/src/sasctl/pzmm/write_score_code.py b/src/sasctl/pzmm/write_score_code.py index 778d14e2..37bf5db3 100644 --- a/src/sasctl/pzmm/write_score_code.py +++ b/src/sasctl/pzmm/write_score_code.py @@ -36,6 +36,7 @@ def write_score_code( score_cas: Optional[bool] = True, score_code_path: Union[Path, str, None] = None, target_index: Optional[int] = None, + preprocess_function: Optional[Callable[DataFrame, DataFrame]] = None, **kwargs, ) -> Union[dict, None]: """ @@ -292,6 +293,9 @@ def score(var1, var2, var3, var4): if missing_values: self._impute_missing_values(input_data, missing_values) + if preprocess_function: + self._add_preprocess_code(preprocess_function) + # SAS Viya 3.5 model if model_id: 
mas_code, cas_code = self._viya35_score_code_import( @@ -759,6 +763,7 @@ def _predict_method( missing_values: Optional[Any] = None, statsmodels_model: Optional[bool] = False, tf_model: Optional[bool] = False, + preprocess_function: Optional[Callable[DataFrame, DataFrame]] = None, ) -> None: """ Write the model prediction section of the score code. @@ -809,6 +814,10 @@ def _predict_method( input_frame = f'{{{", ".join(input_dict)}}}, index=index' self.score_code += self._wrap_indent_string(input_frame, 8) self.score_code += f"\n{'':4})\n" + if preprocess_function: + self.score_code += ( + f"{'':4}input_array = {preprocess_function.__name__}(input_array)" + ) if missing_values: self.score_code += ( f"{'':4}input_array = impute_missing_values(input_array)\n" @@ -851,6 +860,10 @@ def _predict_method( input_frame = f'{{{", ".join(input_dict)}}}, index=index' self.score_code += self._wrap_indent_string(input_frame, 8) self.score_code += f"\n{'':4})\n" + if preprocess_function: + self.score_code += ( + f"{'':4}input_array = {preprocess_function.__name__}(input_array)" + ) if missing_values: self.score_code += ( f"{'':4}input_array = impute_missing_values(input_array)\n" @@ -872,6 +885,10 @@ def _predict_method( input_frame = f'{{{", ".join(input_dict)}}}, index=index' self.score_code += self._wrap_indent_string(input_frame, 8) self.score_code += f"\n{'':4})\n" + if preprocess_function: + self.score_code += ( + f"{'':4}input_array = {preprocess_function.__name__}(input_array)" + ) if missing_values: self.score_code += ( f"{'':4}input_array = impute_missing_values(input_array)\n" @@ -904,6 +921,10 @@ def _predict_method( input_frame = f'{{{", ".join(input_dict)}}}, index=index' self.score_code += self._wrap_indent_string(input_frame, 8) self.score_code += f"\n{'':4})\n" + if preprocess_function: + self.score_code += ( + f"{'':4}input_array = {preprocess_function.__name__}(input_array)" + ) if missing_values: self.score_code += ( f"{'':4}input_array = 
impute_missing_values(input_array)\n" @@ -2238,3 +2259,30 @@ def _viya35_score_code_import( model["scoreCodeType"] = "ds2MultiType" mr.update_model(model) return mas_code, cas_code + + def _add_preprocess_code( + self, + preprocess_function: Callable[DataFrame, DataFrame] + ): + """ + Places the given preprocess function, which must both take a DataFrame as an argument + and return a DataFrame, into the score code. If the preprocess function does not + return anything, an error is thrown. + + Parameters + ---------- + preprocess_function: function + The preprocess function to be added to the score code. + """ + import inspect + preprocess_code = inspect.getsource(preprocess_function) + if not "return" in preprocess_code: + raise ValueError( + "The given score code does not return a value. " + + "To allow for the score code to work correctly, please ensure the preprocessed " + + "data is returned." + ) + if self.score_code[-1] == '\n': + self.score_code += preprocess_code + else: + self.score_code += '\n' + preprocess_code \ No newline at end of file diff --git a/tests/unit/test_write_score_code.py b/tests/unit/test_write_score_code.py index f4f857ac..21fbf5b5 100644 --- a/tests/unit/test_write_score_code.py +++ b/tests/unit/test_write_score_code.py @@ -188,6 +188,29 @@ def test_impute_missing_values(): assert "'b': 'test'" in sc.score_code assert "'c': 1" in sc.score_code or "'c': np.int64(1)" in sc.score_code +def test_preprocess_function(): + """ + Test Cases: + - function + - function with no return + """ + test_df = pd.DataFrame( + data=[[0, "a", 1], [2, "b", 0]], columns=["num", "char", "bin"] + ) + sc = ScoreCode() + def preprocess_function_one(data: pd.DataFrame): + print("preprocessing happens here") + return data + sc._add_preprocess_code(preprocess_function_one) + assert "preprocessing happens here" in sc.score_code + assert "preprocess_function_one" in sc.score_code + + sc = ScoreCode() + def preprocess_function_two(data: pd.DataFrame): + 
print("preprocessing happens here?") + with pytest.raises(ValueError): + sc._add_preprocess_code(preprocess_function_two) + def test_predict_method(): """ From a7fdbfab04afba335c1a53a90efd83f0b75fb232 Mon Sep 17 00:00:00 2001 From: djm21 Date: Tue, 15 Oct 2024 12:11:15 -0700 Subject: [PATCH 2/7] black reformatting --- src/sasctl/pzmm/write_score_code.py | 20 +++++++++----------- 1 file changed, 9 insertions(+), 11 deletions(-) diff --git a/src/sasctl/pzmm/write_score_code.py b/src/sasctl/pzmm/write_score_code.py index 37bf5db3..9a6e80f8 100644 --- a/src/sasctl/pzmm/write_score_code.py +++ b/src/sasctl/pzmm/write_score_code.py @@ -295,7 +295,7 @@ def score(var1, var2, var3, var4): if preprocess_function: self._add_preprocess_code(preprocess_function) - + # SAS Viya 3.5 model if model_id: mas_code, cas_code = self._viya35_score_code_import( @@ -2260,13 +2260,10 @@ def _viya35_score_code_import( mr.update_model(model) return mas_code, cas_code - def _add_preprocess_code( - self, - preprocess_function: Callable[DataFrame, DataFrame] - ): + def _add_preprocess_code(self, preprocess_function: Callable[DataFrame, DataFrame]): """ Places the given preprocess function, which must both take a DataFrame as an argument - and return a DataFrame, into the score code. If the preprocess function does not + and return a DataFrame, into the score code. If the preprocess function does not return anything, an error is thrown. Parameters @@ -2275,14 +2272,15 @@ def _add_preprocess_code( The preprocess function to be added to the score code. """ import inspect + preprocess_code = inspect.getsource(preprocess_function) if not "return" in preprocess_code: raise ValueError( - "The given score code does not return a value. " + - "To allow for the score code to work correctly, please ensure the preprocessed " + - "data is returned." + "The given score code does not return a value. " + + "To allow for the score code to work correctly, please ensure the preprocessed " + + "data is returned." 
) - if self.score_code[-1] == '\n': + if self.score_code[-1] == "\n": self.score_code += preprocess_code else: - self.score_code += '\n' + preprocess_code \ No newline at end of file + self.score_code += "\n" + preprocess_code From d73f6b6a6d7100b730929b5304bca2bb71ae36c3 Mon Sep 17 00:00:00 2001 From: djm21 Date: Tue, 15 Oct 2024 12:17:42 -0700 Subject: [PATCH 3/7] update to preprocess unit tests --- tests/unit/test_write_score_code.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tests/unit/test_write_score_code.py b/tests/unit/test_write_score_code.py index 21fbf5b5..9a1de0ab 100644 --- a/tests/unit/test_write_score_code.py +++ b/tests/unit/test_write_score_code.py @@ -198,6 +198,7 @@ def test_preprocess_function(): data=[[0, "a", 1], [2, "b", 0]], columns=["num", "char", "bin"] ) sc = ScoreCode() + sc.score_code = " " def preprocess_function_one(data: pd.DataFrame): print("preprocessing happens here") return data @@ -206,6 +207,7 @@ def preprocess_function_one(data: pd.DataFrame): assert "preprocess_function_one" in sc.score_code sc = ScoreCode() + sc.score_code = " " def preprocess_function_two(data: pd.DataFrame): print("preprocessing happens here?") with pytest.raises(ValueError): From 9158ab26d11b7f504c490a41da8bee272a86ed90 Mon Sep 17 00:00:00 2001 From: djm21 Date: Mon, 21 Oct 2024 00:46:11 -0700 Subject: [PATCH 4/7] Updates to preprocess function --- src/sasctl/pzmm/import_model.py | 2 +- src/sasctl/pzmm/write_score_code.py | 6 +++--- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/src/sasctl/pzmm/import_model.py b/src/sasctl/pzmm/import_model.py index bfa9c6f7..3b2b9218 100644 --- a/src/sasctl/pzmm/import_model.py +++ b/src/sasctl/pzmm/import_model.py @@ -213,7 +213,7 @@ def import_model( target_values: Optional[List[str]] = None, overwrite_project_properties: Optional[bool] = False, target_index: Optional[int] = None, - preprocess_function: Optional[Callable[DataFrame, DataFrame]] = None, + preprocess_function: 
Optional[Callable[[DataFrame], DataFrame]] = None, **kwargs, ) -> Tuple[RestObj, Union[dict, str, Path]]: """ diff --git a/src/sasctl/pzmm/write_score_code.py b/src/sasctl/pzmm/write_score_code.py index 9a6e80f8..e990d461 100644 --- a/src/sasctl/pzmm/write_score_code.py +++ b/src/sasctl/pzmm/write_score_code.py @@ -36,7 +36,7 @@ def write_score_code( score_cas: Optional[bool] = True, score_code_path: Union[Path, str, None] = None, target_index: Optional[int] = None, - preprocess_function: Optional[Callable[DataFrame, DataFrame]] = None, + preprocess_function: Optional[Callable[[DataFrame], DataFrame]] = None, **kwargs, ) -> Union[dict, None]: """ @@ -763,7 +763,7 @@ def _predict_method( missing_values: Optional[Any] = None, statsmodels_model: Optional[bool] = False, tf_model: Optional[bool] = False, - preprocess_function: Optional[Callable[DataFrame, DataFrame]] = None, + preprocess_function: Optional[Callable[[DataFrame], DataFrame]] = None, ) -> None: """ Write the model prediction section of the score code. @@ -2260,7 +2260,7 @@ def _viya35_score_code_import( mr.update_model(model) return mas_code, cas_code - def _add_preprocess_code(self, preprocess_function: Callable[DataFrame, DataFrame]): + def _add_preprocess_code(self, preprocess_function: Callable[[DataFrame], DataFrame]): """ Places the given preprocess function, which must both take a DataFrame as an argument and return a DataFrame, into the score code. 
If the preprocess function does not From d55f57dbf1eab1d9202b5826c2d86f5f6deb57ac Mon Sep 17 00:00:00 2001 From: djm21 Date: Mon, 28 Oct 2024 14:36:23 -0700 Subject: [PATCH 5/7] updates to write_score_code to correct formatting errors --- src/sasctl/pzmm/write_score_code.py | 35 ++++++++++++++++------------- 1 file changed, 19 insertions(+), 16 deletions(-) diff --git a/src/sasctl/pzmm/write_score_code.py b/src/sasctl/pzmm/write_score_code.py index e990d461..b44df206 100644 --- a/src/sasctl/pzmm/write_score_code.py +++ b/src/sasctl/pzmm/write_score_code.py @@ -250,6 +250,7 @@ def score(var1, var2, var3, var4): input_var_list, missing_values=missing_values, dtype_list=input_dtypes_list, + preprocess_function=preprocess_function ) self._predictions_to_metrics( score_metrics, @@ -266,6 +267,7 @@ def score(var1, var2, var3, var4): missing_values=missing_values, statsmodels_model="statsmodels_model" in kwargs, tf_model="tf_keras_model" in kwargs or "tf_core_model" in kwargs, + preprocess_function=preprocess_function ) # Include check for numpy values and a conversion operation as needed self.score_code += ( @@ -814,14 +816,15 @@ def _predict_method( input_frame = f'{{{", ".join(input_dict)}}}, index=index' self.score_code += self._wrap_indent_string(input_frame, 8) self.score_code += f"\n{'':4})\n" - if preprocess_function: - self.score_code += ( - f"{'':4}input_array = {preprocess_function.__name__}(input_array)" - ) + if missing_values: self.score_code += ( f"{'':4}input_array = impute_missing_values(input_array)\n" ) + if preprocess_function: + self.score_code += ( + f"{'':4}input_array = {preprocess_function.__name__}(input_array)\n" + ) self.score_code += ( f"{'':4}column_types = {column_types}\n" f"{'':4}h2o_array = h2o.H2OFrame(input_array, " @@ -860,14 +863,14 @@ def _predict_method( input_frame = f'{{{", ".join(input_dict)}}}, index=index' self.score_code += self._wrap_indent_string(input_frame, 8) self.score_code += f"\n{'':4})\n" - if preprocess_function: - 
self.score_code += ( - f"{'':4}input_array = {preprocess_function.__name__}(input_array)" - ) if missing_values: self.score_code += ( f"{'':4}input_array = impute_missing_values(input_array)\n" ) + if preprocess_function: + self.score_code += ( + f"{'':4}input_array = {preprocess_function.__name__}(input_array)\n" + ) self.score_code += ( f"{'':4}prediction = model.{method.__name__}(input_array)\n" ) @@ -885,14 +888,14 @@ def _predict_method( input_frame = f'{{{", ".join(input_dict)}}}, index=index' self.score_code += self._wrap_indent_string(input_frame, 8) self.score_code += f"\n{'':4})\n" - if preprocess_function: - self.score_code += ( - f"{'':4}input_array = {preprocess_function.__name__}(input_array)" - ) if missing_values: self.score_code += ( f"{'':4}input_array = impute_missing_values(input_array)\n" ) + if preprocess_function: + self.score_code += ( + f"{'':4}input_array = {preprocess_function.__name__}(input_array)\n" + ) self.score_code += ( f"{'':4}prediction = model.{method.__name__}(input_array)\n\n" f"{'':4} # Check if model returns logits or probabilities\n" @@ -921,14 +924,14 @@ def _predict_method( input_frame = f'{{{", ".join(input_dict)}}}, index=index' self.score_code += self._wrap_indent_string(input_frame, 8) self.score_code += f"\n{'':4})\n" - if preprocess_function: - self.score_code += ( - f"{'':4}input_array = {preprocess_function.__name__}(input_array)" - ) if missing_values: self.score_code += ( f"{'':4}input_array = impute_missing_values(input_array)\n" ) + if preprocess_function: + self.score_code += ( + f"{'':4}input_array = {preprocess_function.__name__}(input_array)\n" + ) self.score_code += ( f"{'':4}prediction = model.{method.__name__}(input_array).tolist()\n" ) From 2bd7d8012ba54d00b04f358a1b318276449a3843 Mon Sep 17 00:00:00 2001 From: djm21 Date: Mon, 28 Oct 2024 14:43:25 -0700 Subject: [PATCH 6/7] updates to model card example to include preprocessing function example --- ...> pzmm_generate_complete_model_card.ipynb} | 279 
++++-------------- 1 file changed, 61 insertions(+), 218 deletions(-) rename examples/{pzmm_generate_complete_mode_card.ipynb => pzmm_generate_complete_model_card.ipynb} (93%) diff --git a/examples/pzmm_generate_complete_mode_card.ipynb b/examples/pzmm_generate_complete_model_card.ipynb similarity index 93% rename from examples/pzmm_generate_complete_mode_card.ipynb rename to examples/pzmm_generate_complete_model_card.ipynb index fb8373f7..7ebf0858 100644 --- a/examples/pzmm_generate_complete_mode_card.ipynb +++ b/examples/pzmm_generate_complete_model_card.ipynb @@ -493,7 +493,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "The data is looking better, but the education statuses are a bit too granular. Lets combine some of them to make the job easier for our model." + "The data is looking better, but the marital status, education and work class statuses are a bit too granular. Let's combine some of them to make the job easier for our model." ] }, { @@ -549,6 +549,53 @@ "df.columns" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "If we want the automatically generated score code to leverage these steps when scoring new data, we can put them in a preprocessing function and pass them into our import_model function call. 
" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "def preprocess_function(df):\n", + " cat_vals = df[[\"WorkClass\", \"Education\", \"MartialStatus\", \"Relationship\", \"Race\", \"Sex\"]]\n", + " df = pd.get_dummies(df, columns=[\"WorkClass\", \"Education\", \"MartialStatus\", \"Relationship\", \"Race\", \"Sex\"])\n", + " df.columns = df.columns.str.replace(' ', '')\n", + " df.columns = df.columns.str.replace('-', '_')\n", + " df = df.drop(['Sex_Male'], axis=1)\n", + " df = pd.concat([df, cat_vals], axis=1).drop('index', axis=1)\n", + " # For the model to score correctly, all OHE columns must exist\n", + " input_cols = [\n", + " \"Education_9th\", \"Education_10th\", \"Education_11th\", \"Education_12th\", \"Education_Assoc_voc\", \"Education_Assoc_acdm\", \"Education_Masters\", \"Education_Prof_school\",\n", + " \"Education_Doctorate\", \"Education_Preschool\", \"Education_1st_4th\", \"Education_5th_6th\", \"Education_7th_8th\", \"WorkClass_Self_emp_inc\", \"WorkClass_Self_emp_not_inc\",\n", + " \"WorkClass_Federal_gov\", \"WorkClass_Local_gov\", \"WorkClass_State_gov\", \"WorkClass_Without_pay\", \"WorkClass_Never_worked\", \"MartialStatus_Married_spouse_absent\",\n", + " \"MartialStatus_Married_AF_spouse\", 'MartialStatus_Married_civ_spouse', 'MartialStatus_Never_married', 'MartialStatus_Divorced', 'MartialStatus_Separated', \n", + " 'MartialStatus_Widowed', 'Race_White', 'Race_Black', 'Race_Asian_Pac_Islander', 'Race_Amer_Indian_Eskimo', 'Race_Other', 'Relationship_Husband', \n", + " 'Relationship_Not_in_family', 'Relationship_Own_child', 'Relationship_Unmarried', 'Relationship_Wife', 'Relationship_Other_relative', 'WorkClass_Private',\n", + " 'Education_Bachelors'\n", + " ]\n", + " for col in input_cols:\n", + " if col not in df.columns:\n", + " df[col] = 0\n", + " df[\"Education_Some_HS\"] = df[\"Education_9th\"] | df[\"Education_10th\"] | df[\"Education_11th\"] | df[\"Education_12th\"]\n", + 
" df[\"Education_Assoc\"] = df[\"Education_Assoc_voc\"] | df[\"Education_Assoc_acdm\"]\n", + " df[\"Education_Adv_Degree\"] = df[\"Education_Masters\"] | df[\"Education_Prof_school\"] | df[\"Education_Doctorate\"]\n", + " df[\"Education_No_HS\"] = df[\"Education_Preschool\"] | df[\"Education_1st_4th\"] | df[\"Education_5th_6th\"] | df[\"Education_7th_8th\"]\n", + "\n", + " df[\"WorkClass_Self\"] = df[\"WorkClass_Self_emp_inc\"] | df[\"WorkClass_Self_emp_not_inc\"]\n", + " df[\"WorkClass_Gov\"] = df[\"WorkClass_Federal_gov\"] | df[\"WorkClass_Local_gov\"] | df[\"WorkClass_State_gov\"]\n", + " df[\"WorkClass_Other\"] = df[\"WorkClass_Without_pay\"] | df[\"WorkClass_Never_worked\"]\n", + "\n", + " df[\"MartialStatus_Other\"] = df[\"MartialStatus_Married_spouse_absent\"] | df[\"MartialStatus_Married_AF_spouse\"]\n", + "\n", + " return df" + ] + }, { "cell_type": "markdown", "metadata": {}, @@ -663,9 +710,8 @@ "1. Write model performance statistics to a file. \n", "1. Assess model bias and fairness (if a potentially sensitive variable is available for assessment). \n", "1. Generate the last few model card files. \n", - "1. Write model score code (Writing the score code yourself is not necessary if you are using the import_model function to automatically generate the score code. Since we've heavily processed the data, I want to write this score code manually to include the data preprocessing code. ). \n", "1. Generate requirements file. \n", - "1. Import model to SAS Model Manager. \n", + "1. Import model to SAS Model Manager and automatically generate the score code. \n", "1. Open the model in SAS Model Manager and begin managing the model lifecycle there. \n", "\n", "So first, be sure that the variables in the block below match your use case." 
@@ -1608,194 +1654,6 @@ ")" ] }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "I'm going to write the score code myself because I want to include all the data preprocessing we did earlier, but that is not a requirement. If you prefer that this score code be automatically written, I'll provide example code before the conclusion you can use instead. " - ] - }, - { - "cell_type": "code", - "execution_count": 20, - "metadata": {}, - "outputs": [], - "source": [ - "# Step 13: Write score code\n", - "sn = 'score_' + model_prefix + \".py\"\n", - "sc = Path.cwd() / output_path / sn" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "%%writefile $sc\n", - "\n", - "import math\n", - "import pickle\n", - "import pandas as pd\n", - "import numpy as np\n", - "from pathlib import Path\n", - "\n", - "import settings\n", - "\n", - "with open(Path(settings.pickle_path) / \"dtc.pickle\", \"rb\") as pickle_model:\n", - " model = pickle.load(pickle_model)\n", - "\n", - "def score(Age, WorkClass, Education, MartialStatus, Relationship, Race, Sex, HoursPerWeek): \n", - " \"Output: EM_EVENTPROBABILITY, EM_CLASSIFICATION\" \n", - "\n", - " # Check if pickle file is loaded, load if unavailable\n", - " try:\n", - " global model\n", - " except NameError:\n", - " with open(settings.pickle_path + 'dtc.pickle', 'rb') as _pFile:\n", - " model = pickle.load(_pFile)\n", - "\n", - " # Encode WorkClass\n", - " WorkClass_Private = 0\n", - " WorkClass_Self = 0\n", - " WorkClass_Gov = 0 \n", - " WorkClass_Other = 0\n", - " if \"Private\" in WorkClass: WorkClass_Private = 1\n", - " elif \"Self-emp-not-inc\" in WorkClass: WorkClass_Self = 1\n", - " elif \"Self-emp-inc\" in WorkClass: WorkClass_Self = 1\n", - " elif \"Local-gov\" in WorkClass: WorkClass_Gov = 1\n", - " elif \"State-gov\" in WorkClass: WorkClass_Gov = 1\n", - " elif \"Federal-gov\" in WorkClass: WorkClass_Gov = 1\n", - " else: WorkClass_Other = 1\n", - 
"\n", - "\n", - " # Encode Education\n", - " Education_HS_grad = 0\n", - " Education_Some_HS = 0\n", - " Education_Assoc = 0\n", - " Education_Some_college = 0\n", - " Education_Bachelors = 0\n", - " Education_Adv_Degree = 0\n", - " Education_No_HS = 0\n", - "\n", - " if \"HS-grad\" in Education: Education_HS_grad = 1\n", - " elif \"11th\" in Education: Education_Some_HS = 1\n", - " elif \"10th\" in Education: Education_Some_HS = 1\n", - " elif \"9th\" in Education: Education_Some_HS = 1\n", - " elif \"12th\" in Education: Education_Some_HS = 1\n", - " elif \"Assoc-voc\" in Education: Education_Assoc = 1\n", - " elif \"Assoc-acdm\" in Education: Education_Assoc = 1\n", - " elif \"Some-college\" in Education: Education_Some_college = 1\n", - " elif \"Bachelors\" in Education: Education_Bachelors = 1\n", - " elif \"Masters\" in Education: Education_Adv_Degree = 1\n", - " elif \"Prof-school\" in Education: Education_Adv_Degree = 1\n", - " elif \"Doctorate\" in Education: Education_Adv_Degree = 1\n", - " else:Education_No_HS = 1\n", - " \n", - " # Encode MaritalStatus\n", - " MartialStatus_Married_civ_spouse = 0\n", - " MartialStatus_Never_married = 0\n", - " MartialStatus_Divorced = 0\n", - " MartialStatus_Separated = 0 \n", - " MartialStatus_Widowed = 0 \n", - " MartialStatus_Other = 0\n", - " if \"Married-civ-spouse\" in MartialStatus: MartialStatus_Married_civ_spouse = 1\n", - " elif \"Never-married\" in MartialStatus: MartialStatus_Never_married = 1\n", - " elif \"Divorced\" in MartialStatus: MartialStatus_Divorced = 1\n", - " elif \"Separated\" in MartialStatus: MartialStatus_Separated = 1\n", - " elif \"Widowed\" in MartialStatus: MartialStatus_Widowed = 1\n", - " else: MartialStatus_Other = 1\n", - " \n", - " # Encode Relationship\n", - " Relationship_Husband = 0\n", - " Relationship_Not_in_family = 0\n", - " Relationship_Own_child= 0\n", - " Relationship_Unmarried = 0 \n", - " Relationship_Wife = 0\n", - " Relationship_Other_relative = 0\n", - " if \"Husband\" 
in Relationship: Relationship_Husband = 1\n", - " elif \"Not-in-family\" in Relationship: Relationship_Not_in_family = 1\n", - " elif \"Own-child\" in Relationship: Relationship_Own_child = 1\n", - " elif \"Unmarried\" in Relationship: Relationship_Unmarried = 1\n", - " elif \"Wife\" in Relationship: Relationship_Wife = 1\n", - " else: Relationship_Other_relative = 1\n", - "\n", - " # Encode Race\n", - " Race_White = 0\n", - " Race_Black = 0\n", - " Race_Asian_Pac_Islander = 0\n", - " Race_Amer_Indian_Eskimo = 0\n", - " Race_Other = 0\n", - " if \"White\" in Race: Race_White = 1\n", - " elif \"Black\" in Race: Race_Black = 1\n", - " elif \"Asian-Pac-Islander\" in Race: Race_Asian_Pac_Islander = 1\n", - " elif \"Amer-Indian-Eskimo\" in Race: Race_Amer_Indian_Eskimo = 1\n", - " else: Race_Other = 1\n", - "\n", - " # Encode Sex\n", - " Sex_Female = 0\n", - " if \"Female\" in Sex: Sex_Female = 1\n", - "\n", - " try: \n", - " input_array = pd.DataFrame([[Age, HoursPerWeek, WorkClass_Private, WorkClass_Self, WorkClass_Gov, WorkClass_Other, \n", - " Education_HS_grad, Education_Some_HS, Education_Assoc, Education_Some_college, \n", - " Education_Bachelors, Education_Adv_Degree, Education_No_HS, \n", - " MartialStatus_Married_civ_spouse, MartialStatus_Never_married, \n", - " MartialStatus_Divorced, MartialStatus_Separated, MartialStatus_Widowed, \n", - " MartialStatus_Other, Relationship_Husband, Relationship_Not_in_family, \n", - " Relationship_Own_child, Relationship_Unmarried, Relationship_Wife, \n", - " Relationship_Other_relative, Race_White, Race_Black, Race_Asian_Pac_Islander, \n", - " Race_Amer_Indian_Eskimo, Race_Other, Sex_Female]], \n", - " columns = ['Age', 'HoursPerWeek', 'WorkClass_Private', 'WorkClass_Self', 'WorkClass_Gov', 'WorkClass_Other', \n", - " 'Education_HS_grad', 'Education_Some_HS', 'Education_Assoc', 'Education_Some_college', \n", - " 'Education_Bachelors','Education_Adv_Degree', 'Education_No_HS', \n", - " 'MartialStatus_Married_civ_spouse', 
'MartialStatus_Never_married', \n", - " 'MartialStatus_Divorced', 'MartialStatus_Separated', 'MartialStatus_Widowed', \n", - " 'MartialStatus_Other', 'Relationship_Husband', 'Relationship_Not_in_family', \n", - " 'Relationship_Own_child', 'Relationship_Unmarried', 'Relationship_Wife', \n", - " 'Relationship_Other_relative', 'Race_White', 'Race_Black', 'Race_Asian_Pac_Islander', \n", - " 'Race_Amer_Indian_Eskimo', 'Race_Other','Sex_Female'], \n", - " dtype = float)\n", - " except ValueError:\n", - " input_array = pd.DataFrame([[1.0, Age, HoursPerWeek, WorkClass_Private, WorkClass_Self, WorkClass_Gov, WorkClass_Other, \n", - " Education_HS_grad, Education_Some_HS, Education_Assoc, Education_Some_college, \n", - " Education_Bachelors, Education_Adv_Degree, Education_No_HS, \n", - " MartialStatus_Married_civ_spouse, MartialStatus_Never_married, \n", - " MartialStatus_Divorced, MartialStatus_Separated, MartialStatus_Widowed, \n", - " MartialStatus_Other, Relationship_Husband, Relationship_Not_in_family, \n", - " Relationship_Own_child, Relationship_Unmarried, Relationship_Wife, \n", - " Relationship_Other_relative, Race_White, Race_Black, Race_Asian_Pac_Islander, \n", - " Race_Amer_Indian_Eskimo, Race_Other, Sex_Female]], \n", - " columns = ['const', 'Age', 'HoursPerWeek', 'WorkClass_Private', 'WorkClass_Self', 'WorkClass_Gov', 'WorkClass_Other', \n", - " 'Education_HS_grad', 'Education_Some_HS', 'Education_Assoc', 'Education_Some_college', \n", - " 'Education_Bachelors', 'Education_Adv_Degree', 'Education_No_HS', \n", - " 'MartialStatus_Married_civ_spouse', 'MartialStatus_Never_married', \n", - " 'MartialStatus_Divorced', 'MartialStatus_Separated', 'MartialStatus_Widowed', \n", - " 'MartialStatus_Other', 'Relationship_Husband','Relationship_Not_in_family', \n", - " 'Relationship_Own_child', 'Relationship_Unmarried', 'Relationship_Wife', \n", - " 'Relationship_Other_relative', 'Race_White', 'Race_Black', 'Race_Asian_Pac_Islander', \n", - " 'Race_Amer_Indian_Eskimo', 
'Race_Other','Sex_Female'], \n", - " dtype = float)\n", - "\n", - " prediction = model.predict_proba(input_array).tolist()\n", - "\n", - " # Check for numpy values and convert to a CAS readable representation\n", - " if isinstance(prediction, np.ndarray):\n", - " prediction = prediction.tolist()\n", - "\n", - " if input_array.shape[0] == 1:\n", - " if prediction[0][1] > 0.5:\n", - " EM_CLASSIFICATION = \"1\"\n", - " else:\n", - " EM_CLASSIFICATION = \"0\"\n", - " return EM_CLASSIFICATION, prediction[0][1]\n", - " else:\n", - " df = pd.DataFrame(prediction)\n", - " proba = df[1]\n", - " classifications = np.where(df[1] > 0.5, '1', '0')\n", - " return pd.DataFrame({'EM_CLASSIFICATION': classifications, 'EM_EVENTPROBABILITY': proba})" - ] - }, { "cell_type": "markdown", "metadata": {}, @@ -1844,7 +1702,7 @@ } ], "source": [ - "# Step 14: Generate requirements files\n", + "# Step 13: Generate requirements files\n", "requirements_json = pzmm.JSONFiles.create_requirements_json(output_path)\n", "\n", "import json\n", @@ -1865,7 +1723,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "Alright, we should have all our files now, so let's import our model and its files into SAS Model Manager! " + "Alright, we should have all our files now, so let's import our model and its files into SAS Model Manager! With the following function, we can automatically generate the score code and import the model into SAS Model Manager in one step." 
] }, { @@ -1874,30 +1732,7 @@ "metadata": {}, "outputs": [], "source": [ - "# Step 15: Import model into SAS Model Manager \n", - "zipIOFile = pzmm.ZipModel.zip_files(model_files=output_path, model_prefix=model_prefix, is_viya4=True)\n", - "\n", - "mr.create_project(mm_project, 'Public')\n", - "\n", - "mr.import_model_from_zip(model_prefix, mm_project, zipIOFile)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "And now your model is available in SAS Model Manager where you can run the scoring test, update the model usage properties, add Key Performance Indicators (KPIs) thresholds, and run performance monitoring against the model. Managing the rest of the model lifecycle in SAS Model Manager will complete the rest of the model card.\n", - "\n", - "Instead of writing the score code manually and importing the model, you can run the following function to automatically generate the score code and import the model into SAS Model Manager in one step. Automatically generating the score code makes things easy, but manually writing allows greater customization and control. Only run the block below if you don't want to run steps 13 and 15. 
\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Alternate to Step 13 and Step 15: Automatically generate score code and import model \n", + "# Step 14: Automatically generate score code and import model \n", "pzmm.ImportModel.import_model(\n", " model_files=output_path, # Where are the model files?\n", " model_prefix=model_prefix, # What is the model name?\n", @@ -1910,12 +1745,20 @@ " target_values=[\"0\", \"1\"], # What are the expected values of the target variable?\n", " target_index=1, # What is the index of the target value in target_values?\n", " model_file_name=model_prefix + \".pickle\", # How was the model file serialized?\n", - " missing_values=False # Does the data include missing values?\n", + " missing_values=False, # Does the data include missing values?\n", + " preprocess_function=preprocess_function # What do we want to do to the data before we score it?\n", " )\n", " # Reinitialize the score_code variable when writing more than one model's score code\n", "pzmm.ScoreCode.score_code = \"\"" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "And now your model is available in SAS Model Manager where you can run the scoring test, update the model usage properties, add Key Performance Indicators (KPIs) thresholds, and run performance monitoring against the model. Managing the rest of the model lifecycle in SAS Model Manager will complete the rest of the model card." 
+ ] + }, { "cell_type": "markdown", "metadata": {}, From 9c2bd1e2581b687d8a75cb3bb121dfb31a79636e Mon Sep 17 00:00:00 2001 From: djm21 Date: Mon, 28 Oct 2024 14:51:43 -0700 Subject: [PATCH 7/7] black reformatting --- src/sasctl/pzmm/write_score_code.py | 8 +++++--- tests/unit/test_write_score_code.py | 7 ++++++- 2 files changed, 11 insertions(+), 4 deletions(-) diff --git a/src/sasctl/pzmm/write_score_code.py b/src/sasctl/pzmm/write_score_code.py index b44df206..86bd8119 100644 --- a/src/sasctl/pzmm/write_score_code.py +++ b/src/sasctl/pzmm/write_score_code.py @@ -250,7 +250,7 @@ def score(var1, var2, var3, var4): input_var_list, missing_values=missing_values, dtype_list=input_dtypes_list, - preprocess_function=preprocess_function + preprocess_function=preprocess_function, ) self._predictions_to_metrics( score_metrics, @@ -267,7 +267,7 @@ def score(var1, var2, var3, var4): missing_values=missing_values, statsmodels_model="statsmodels_model" in kwargs, tf_model="tf_keras_model" in kwargs or "tf_core_model" in kwargs, - preprocess_function=preprocess_function + preprocess_function=preprocess_function, ) # Include check for numpy values and a conversion operation as needed self.score_code += ( @@ -2263,7 +2263,9 @@ def _viya35_score_code_import( mr.update_model(model) return mas_code, cas_code - def _add_preprocess_code(self, preprocess_function: Callable[[DataFrame], DataFrame]): + def _add_preprocess_code( + self, preprocess_function: Callable[[DataFrame], DataFrame] + ): """ Places the given preprocess function, which must both take a DataFrame as an argument and return a DataFrame, into the score code. 
If the preprocess function does not diff --git a/tests/unit/test_write_score_code.py b/tests/unit/test_write_score_code.py index 9a1de0ab..25eb1aaf 100644 --- a/tests/unit/test_write_score_code.py +++ b/tests/unit/test_write_score_code.py @@ -188,6 +188,7 @@ def test_impute_missing_values(): assert "'b': 'test'" in sc.score_code assert "'c': 1" in sc.score_code or "'c': np.int64(1)" in sc.score_code + def test_preprocess_function(): """ Test Cases: @@ -199,20 +200,24 @@ def test_preprocess_function(): ) sc = ScoreCode() sc.score_code = " " + def preprocess_function_one(data: pd.DataFrame): print("preprocessing happens here") return data + sc._add_preprocess_code(preprocess_function_one) assert "preprocessing happens here" in sc.score_code assert "preprocess_function_one" in sc.score_code sc = ScoreCode() sc.score_code = " " + def preprocess_function_two(data: pd.DataFrame): print("preprocessing happens here?") + with pytest.raises(ValueError): sc._add_preprocess_code(preprocess_function_two) - + def test_predict_method(): """