fix: use the sanitized model prefix when building generated file names.

jlwalke2 · jlwalke2 · commit 5f79aebc078b · 2024-08-16T13:31:32.000-04:00
diff --git a/src/sasctl/pzmm/pickle_model.py b/src/sasctl/pzmm/pickle_model.py
@@ -2,7 +2,6 @@
 # SPDX-License-Identifier: Apache-2.0
 # %%
 import codecs
-import gzip
 import pickle
 import shutil
 from pathlib import Path
@@ -77,6 +76,9 @@ def pickle_trained_model(
             models.
 
         """
+        from .write_score_code import ScoreCode
+        sanitized_prefix = ScoreCode.sanitize_model_prefix(model_prefix)
+
         if is_binary_string:
             # For models that use a binary string representation
             binary_string = codecs.encode(
@@ -91,25 +93,25 @@ def pickle_trained_model(
                 # For models imported from MLFlow
                 shutil.copy(ml_pickle_path, pickle_path)
                 pzmm_pickle_path = Path(pickle_path) / mlflow_details["model_path"]
-                pzmm_pickle_path.rename(Path(pickle_path) / (model_prefix + PICKLE))
+                pzmm_pickle_path.rename(Path(pickle_path) / (sanitized_prefix + PICKLE))
             else:
                 with open(ml_pickle_path, "rb") as pickle_file:
-                    return {model_prefix + PICKLE: pickle.load(pickle_file)}
+                    return {sanitized_prefix + PICKLE: pickle.load(pickle_file)}
         else:
             # For all other model types
             if not is_h2o_model:
                 if pickle_path:
                     with open(
-                        Path(pickle_path) / (model_prefix + PICKLE), "wb"
+                        Path(pickle_path) / (sanitized_prefix + PICKLE), "wb"
                     ) as pickle_file:
                         pickle.dump(trained_model, pickle_file)
                     if cls.notebook_output:
                         print(
                             f"Model {model_prefix} was successfully pickled and saved "
-                            f"to {Path(pickle_path) / (model_prefix + PICKLE)}."
+                            f"to {Path(pickle_path) / (sanitized_prefix + PICKLE)}."
                         )
                 else:
-                    return {model_prefix + PICKLE: pickle.dumps(trained_model)}
+                    return {sanitized_prefix + PICKLE: pickle.dumps(trained_model)}
             # For binary H2O models, save the binary file as a "pickle" file
             elif is_h2o_model and is_binary_model and pickle_path:
                 if not h2o:
@@ -121,7 +123,7 @@ def pickle_trained_model(
                     model=trained_model,
                     force=True,
                     path=str(pickle_path),
-                    filename=f"{model_prefix}.pickle",
+                    filename=f"{sanitized_prefix}.pickle",
                 )
             # For MOJO H2O models, save as a mojo file and adjust the extension to .mojo
             elif is_h2o_model and pickle_path:
@@ -130,7 +132,7 @@ def pickle_trained_model(
                         "The h2o package is required to save the model as a mojo model."
                     )
                 trained_model.save_mojo(
-                    force=True, path=str(pickle_path), filename=f"{model_prefix}.mojo"
+                    force=True, path=str(pickle_path), filename=f"{sanitized_prefix}.mojo"
                 )
             elif is_binary_model or is_h2o_model:
                 raise ValueError(
diff --git a/src/sasctl/pzmm/write_json_files.py b/src/sasctl/pzmm/write_json_files.py
@@ -498,18 +498,22 @@ def write_file_metadata_json(
             Dictionary containing a key-value pair representing the file name and json
             dump respectively.
         """
+
+        from .write_score_code import ScoreCode
+        sanitized_prefix = ScoreCode.sanitize_model_prefix(model_prefix)
+
         dict_list = [
             {"role": "inputVariables", "name": INPUT},
             {"role": "outputVariables", "name": OUTPUT},
-            {"role": "score", "name": f"score_{model_prefix}.py"},
+            {"role": "score", "name": f"score_{sanitized_prefix}.py"},
         ]
         if is_h2o_model:
-            dict_list.append({"role": "scoreResource", "name": model_prefix + ".mojo"})
+            dict_list.append({"role": "scoreResource", "name": sanitized_prefix + ".mojo"})
         elif is_tf_keras_model:
-            dict_list.append({"role": "scoreResource", "name": model_prefix + ".h5"})
+            dict_list.append({"role": "scoreResource", "name": sanitized_prefix + ".h5"})
         else:
             dict_list.append(
-                {"role": "scoreResource", "name": model_prefix + ".pickle"}
+                {"role": "scoreResource", "name": sanitized_prefix + ".pickle"}
             )
 
         if json_path:
diff --git a/src/sasctl/pzmm/write_score_code.py b/src/sasctl/pzmm/write_score_code.py
@@ -153,6 +153,8 @@ def write_score_code(
 
         model_id = cls._check_viya_version(model)
 
+        sanitized_model_prefix = cls.sanitize_model_prefix(model_prefix)
+
         # Set the model_file_name based on kwargs input
         if "model_file_name" in kwargs and "binary_string" in kwargs:
             raise ValueError(
@@ -203,7 +205,7 @@ def write_score_code(
         else:
             model_load = None
 
-        model_prefix = cls._check_valid_model_prefix(model_prefix)
+
 
         # Define the score function using the variables found in input_data
         cls.score_code += f"def score({', '.join(input_var_list)}):\n"
@@ -295,7 +297,7 @@ def score(var1, var2, var3, var4):
             )
 
         if score_code_path:
-            py_code_path = Path(score_code_path) / f"score_{model_prefix}.py"
+            py_code_path = Path(score_code_path) / f"score_{sanitized_model_prefix}.py"
             with open(py_code_path, "w") as py_file:
                 py_file.write(cls.score_code)
             if model_id and score_cas:
@@ -306,7 +308,7 @@ def score(var1, var2, var3, var4):
                     # noinspection PyUnboundLocalVariable
                     sas_file.write(cas_code)
         else:
-            output_dict = {f"score_{model_prefix}.py": cls.score_code}
+            output_dict = {f"score_{sanitized_model_prefix}.py": cls.score_code}
             if model_id and score_cas:
                 # noinspection PyUnboundLocalVariable
                 output_dict[MAS_CODE_NAME] = mas_code
@@ -2139,7 +2141,7 @@ def _check_viya_version(cls, model: Union[str, dict, RestObj]) -> Union[str, Non
                 return None
 
     @staticmethod
-    def _check_valid_model_prefix(prefix: str) -> str:
+    def sanitize_model_prefix(prefix: str) -> str:
         """
         Check the model_prefix for a valid Python function name.
 
@@ -2153,6 +2155,7 @@ def _check_valid_model_prefix(prefix: str) -> str:
         -------
         model_prefix : str
             Returns a model_prefix, adjusted as needed for valid Python function names.
+
         """
         # Replace model_prefix if a valid function name is not provided
         if not prefix.isidentifier():
diff --git a/tests/unit/test_write_score_code.py b/tests/unit/test_write_score_code.py
@@ -1170,8 +1170,8 @@ def test_check_valid_model_prefix():
     - check model_prefix validity
         - raise warning and replace if invalid
     """
-    assert sc._check_valid_model_prefix("TestPrefix") == "TestPrefix"
-    assert sc._check_valid_model_prefix("Test Prefix") == "Test_Prefix"
+    assert sc.sanitize_model_prefix("TestPrefix") == "TestPrefix"
+    assert sc.sanitize_model_prefix("Test Prefix") == "Test_Prefix"
 
 
 def test_write_score_code(score_code_mocks):