Implement Custom Content-Disposition Header Parser to get rid of the CGI dependency (#1088)

mrDzurb · web-flow · commit 429464957b57 · 2025-02-27T14:04:21.000-08:00
diff --git a/ads/common/utils.py b/ads/common/utils.py
@@ -1,10 +1,8 @@
 #!/usr/bin/env python
-# -*- coding: utf-8; -*-
 
 # Copyright (c) 2020, 2024 Oracle and/or its affiliates.
 # Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/
 
-from __future__ import absolute_import, print_function
 
 import collections
 import contextlib
@@ -23,9 +21,8 @@
 from datetime import datetime
 from enum import Enum
 from io import DEFAULT_BUFFER_SIZE
-from pathlib import Path
 from textwrap import fill
-from typing import Dict, Optional, Union
+from typing import Dict, Optional, Tuple, Union
 from urllib import request
 from urllib.parse import urlparse
 
@@ -501,13 +498,13 @@ def print_user_message(
     if is_documentation_mode() and is_notebook():
         if display_type.lower() == "tip":
             if "\n" in msg:
-                t = "<b>{}:</b>".format(title.upper().strip()) if title else ""
+                t = f"<b>{title.upper().strip()}:</b>" if title else ""
 
                 user_message = "{}{}".format(
                     t,
                     "".join(
                         [
-                            "<br>&nbsp;&nbsp;+&nbsp;{}".format(x.strip())
+                            f"<br>&nbsp;&nbsp;+&nbsp;{x.strip()}"
                             for x in msg.strip().split("\n")
                         ]
                     ),
@@ -646,7 +643,7 @@ def ellipsis_strings(raw, n=24):
         else:
             n2 = int(n) // 2 - 3
             n1 = n - n2 - 3
-            result.append("{0}...{1}".format(s[:n1], s[-n2:]))
+            result.append(f"{s[:n1]}...{s[-n2:]}")
 
     return result
 
@@ -942,9 +939,9 @@ def generate_requirement_file(
     with open(os.path.join(file_path, file_name), "w") as req_file:
         for lib in requirements:
             if requirements[lib]:
-                req_file.write("{}=={}\n".format(lib, requirements[lib]))
+                req_file.write(f"{lib}=={requirements[lib]}\n")
             else:
-                req_file.write("{}\n".format(lib))
+                req_file.write(f"{lib}\n")
 
 
 def _get_feature_type_and_dtype(column):
@@ -966,7 +963,7 @@ def to_dataframe(
         pd.Series,
         np.ndarray,
         pd.DataFrame,
-    ]
+    ],
 ):
     """
     Convert to pandas DataFrame.
@@ -1391,7 +1388,7 @@ def remove_file(file_path: str, auth: Optional[Dict] = None) -> None:
     fs = fsspec.filesystem(scheme, **auth)
     try:
         fs.rm(file_path)
-    except FileNotFoundError as e:
+    except FileNotFoundError:
         raise FileNotFoundError(f"`{file_path}` not found.")
     except Exception as e:
         raise e
@@ -1786,3 +1783,36 @@ def get_log_links(
         console_link_url = f"https://cloud.oracle.com/logging/log-groups/{log_group_id}?region={region}"
 
     return console_link_url
+
+
+def parse_content_disposition(header: Optional[str]) -> Tuple[str, Dict[str, str]]:
+    """
+    Parses a Content-Disposition header into its main disposition and a dictionary of parameters.
+
+    For example:
+        'attachment; filename="example.txt"'
+    will be parsed into:
+        ('attachment', {'filename': 'example.txt'})
+
+    A ";" inside a double-quoted value does not end the parameter, so
+    'attachment; filename="a;b.txt"' yields {'filename': 'a;b.txt'}.
+
+    Parameters
+    ----------
+    header (Optional[str]): The Content-Disposition header string. May be None or empty.
+
+    Returns
+    -------
+    Tuple[str, Dict[str, str]]: The lowercased disposition and a dictionary of parameters
+        with lowercased keys. ("", {}) is returned when the header is None or empty.
+    """
+    if not header:
+        return "", {}
+
+    # Split on ";" only while outside a double-quoted string; inside quotes a
+    # backslash escapes the following character (so `\"` does not close the string).
+    parts = []
+    start, quoted, escaped = 0, False, False
+    for i, ch in enumerate(header):
+        if escaped or (quoted and ch == "\\"):
+            escaped = not escaped
+        elif ch == '"':
+            quoted = not quoted
+        elif ch == ";" and not quoted:
+            parts.append(header[start:i])
+            start = i + 1
+    parts.append(header[start:])
+
+    # The first part is the main disposition (e.g., "attachment").
+    disposition = parts[0].strip().lower()
+    params: Dict[str, str] = {}
+
+    # Process each subsequent part to extract key-value pairs; parts without "=" are ignored.
+    for part in parts[1:]:
+        key, sep, value = part.partition("=")
+        if not sep:
+            continue
+        value = value.strip()
+        if len(value) >= 2 and value[0] == value[-1] == '"':
+            # Remove exactly one pair of enclosing quotes, then undo `\\` and `\"` escapes.
+            value = value[1:-1].replace("\\\\", "\\").replace('\\"', '"')
+        params[key.strip().lower()] = value
+    return disposition, params
diff --git a/ads/model/datascience_model.py b/ads/model/datascience_model.py
@@ -3,7 +3,6 @@
 # Copyright (c) 2022, 2025 Oracle and/or its affiliates.
 # Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/
 
-import cgi
 import json
 import logging
 import os
@@ -1776,7 +1775,9 @@ def _update_from_oci_dsc_model(
         # Update artifact info
         try:
             artifact_info = self.dsc_model.get_artifact_info()
-            _, file_name_info = cgi.parse_header(artifact_info["Content-Disposition"])
+            _, file_name_info = utils.parse_content_disposition(
+                artifact_info["Content-Disposition"]
+            )
 
             if self.dsc_model._is_model_by_reference():
                 _, file_extension = os.path.splitext(file_name_info["filename"])
diff --git a/tests/unitary/default_setup/common/test_common_utils.py b/tests/unitary/default_setup/common/test_common_utils.py
@@ -28,6 +28,7 @@
     extract_region,
     folder_size,
     human_size,
+    parse_content_disposition,
     remove_file,
     upload_to_os,
 )
@@ -579,3 +580,55 @@ def __init__(self, status_code):
             progress_callback=ANY,
         )
         assert response.status == 200
+
+
+class TestParseContentDisposition:
+    """Exercises `parse_content_disposition` with typical and degenerate header values."""
+
+    def test_attachment_with_quotes(self):
+        # Double quotes around the value are not part of the parsed filename.
+        kind, parameters = parse_content_disposition(
+            'attachment; filename="example.txt"'
+        )
+        assert (kind, parameters) == ("attachment", {"filename": "example.txt"})
+
+    def test_attachment_without_quotes(self):
+        # An unquoted token value is returned as-is.
+        kind, parameters = parse_content_disposition("attachment; filename=example.txt")
+        assert (kind, parameters) == ("attachment", {"filename": "example.txt"})
+
+    def test_inline_no_params(self):
+        # A bare disposition produces an empty parameter dictionary.
+        kind, parameters = parse_content_disposition("inline")
+        assert (kind, parameters) == ("inline", {})
+
+    def test_multiple_params(self):
+        # Every `key=value` pair after the disposition is collected.
+        kind, parameters = parse_content_disposition(
+            'attachment; filename="example.txt"; size=12345'
+        )
+        assert (kind, parameters) == (
+            "attachment",
+            {"filename": "example.txt", "size": "12345"},
+        )
+
+    def test_extra_whitespace(self):
+        # Whitespace around the disposition, keys and values is discarded.
+        kind, parameters = parse_content_disposition(
+            '  attachment ;  filename =   "example.txt" ; param = value  '
+        )
+        assert (kind, parameters) == (
+            "attachment",
+            {"filename": "example.txt", "param": "value"},
+        )
+
+    def test_form_data(self):
+        # Only keys are lowercased; the case of values such as "fieldName" is preserved.
+        kind, parameters = parse_content_disposition(
+            'form-data; name="fieldName"; filename="filename.jpg"'
+        )
+        assert (kind, parameters) == (
+            "form-data",
+            {"name": "fieldName", "filename": "filename.jpg"},
+        )
+
+    def test_no_semicolon(self):
+        # A header without ";" consists of the disposition alone.
+        kind, parameters = parse_content_disposition("attachment")
+        assert (kind, parameters) == ("attachment", {})
+
+    def test_none(self):
+        # A missing header yields an empty disposition and no parameters.
+        kind, parameters = parse_content_disposition(None)
+        assert (kind, parameters) == ("", {})