add missing type hints to util code

gruebel · gruebel · commit d6846bd42ab1 · 2023-09-03T00:17:22.000+02:00
diff --git a/policy_sentry/util/arns.py b/policy_sentry/util/arns.py
@@ -8,17 +8,29 @@
 #     Case 5: arn:partition:service:region:account-id:resourcetype:resource
 #     Case 6: arn:partition:service:region:account-id:resourcetype:resource:qualifier
 #     Source: https://docs.aws.amazon.com/general/latest/gr/aws-arns-and-namespaces.html#genref-arns
+from __future__ import annotations
+
 import logging
 import re
 
+ARN_SEPARATOR_PATTERN = re.compile(r"[:/]")
+# Note: Each service can only have one of these, so these are definitely exceptions
+EXCLUSION_LIST = {
+    "${ObjectName}",
+    "${RepositoryName}",
+    "${BucketName}",
+    "table/${TableName}",
+    "${BucketName}/${ObjectName}",
+}
+
 logger = logging.getLogger(__name__)
 
 
 # pylint: disable=too-many-instance-attributes
 class ARN:
     """Class that helps to match ARN resource type formats neatly"""
 
-    def __init__(self, provided_arn):
+    def __init__(self, provided_arn: str) -> None:
         self.arn = provided_arn
         follows_arn_format = re.search(
             r"^arn:([^:]*):([^:]*):([^:]*):([^:]*):(.+)$", provided_arn
@@ -34,18 +46,20 @@ def __init__(self, provided_arn):
             self.account = elements[4]
             self.resource = elements[5]
         except IndexError as error:
-            raise Exception(f"The provided ARN is invalid. IndexError: {error}. Please provide a valid ARN.") from error
+            raise Exception(
+                f"The provided ARN is invalid. IndexError: {error}. Please provide a valid ARN."
+            ) from error
         if "/" in self.resource:
             self.resource, self.resource_path = self.resource.split("/", 1)
         elif ":" in self.resource:
             self.resource, self.resource_path = self.resource.split(":", 1)
         self.resource_string = self._resource_string()
 
-    def __repr__(self):
+    def __repr__(self) -> str:
         return self.arn
 
     # pylint: disable=too-many-return-statements
-    def _resource_string(self):
+    def _resource_string(self) -> str:
         """
         Given an ARN, return the string after the account ID, no matter the ARN format.
         Return:
@@ -62,7 +76,7 @@ def _resource_string(self):
         resource_string = ":".join(split_arn[5:])
         return resource_string
 
-    def same_resource_type(self, arn_in_database):
+    def same_resource_type(self, arn_in_database: str) -> bool:
         """Given an arn, see if it has the same resource type"""
 
         # 1. If the RAW ARN in the database is *, then it doesn't have a resource type
@@ -80,14 +94,16 @@ def same_resource_type(self, arn_in_database):
         #   Previously, this would fail and return empty results.
         #   Now it correctly returns the full list of matching ARNs and corresponding actions.
         resource_type_arn_to_test = parse_arn_for_resource_type(self.arn)
-        if resource_type_arn_to_test == '*':
+        if resource_type_arn_to_test == "*":
             return True
 
         # 4. Match patterns for complicated resource strings, leveraging the standardized format of the Raw ARN format
         # table/${TableName} should not match `table/${TableName}/backup/${BackupName}`
         resource_string_arn_in_database = get_resource_string(arn_in_database)
 
-        split_resource_string_in_database = re.split(':|/', resource_string_arn_in_database)
+        split_resource_string_in_database = re.split(
+            ARN_SEPARATOR_PATTERN, resource_string_arn_in_database
+        )
         # logger.debug(str(split_resource_string_in_database))
         arn_format_list = []
         for elem in split_resource_string_in_database:
@@ -97,47 +113,47 @@ def same_resource_type(self, arn_in_database):
                 # If an element says something like ${TableName}, normalize it to an empty string
                 arn_format_list.append("")
 
-        split_resource_string_to_test = re.split(':|/', self.resource_string)
+        split_resource_string_to_test = re.split(
+            ARN_SEPARATOR_PATTERN, self.resource_string
+        )
         # 4b: If we have a confusing resource string, the length of the split resource string list
         #  should at least be the same
         # Again, table/${TableName} (len of 2) should not match `table/${TableName}/backup/${BackupName}` (len of 4)
         # if len(split_resource_string_to_test) != len(arn_format_list):
         #     return False
 
-        non_empty_arn_format_list = []
-        for i in arn_format_list:
-            if i != "":
-                non_empty_arn_format_list.append(i)
-
-        lower_resource_string = list(map(lambda x:x.lower(),split_resource_string_to_test))
-        for i in non_empty_arn_format_list:
-            if i.lower() not in lower_resource_string:
+        lower_resource_string = [x.lower() for x in split_resource_string_to_test]
+        for elem in arn_format_list:
+            if elem and elem.lower() not in lower_resource_string:
                 return False
 
         # 4c: See if the non-normalized fields match
-        for i in range(len(arn_format_list)):
+        for idx, elem in enumerate(arn_format_list):
             # If the field is not normalized to empty string, then make sure the resource type segments match
             # So, using table/${TableName}/backup/${BackupName} as an example:
             # table should match, backup should match,
             # and length of the arn_format_list should be the same as split_resource_string_to_test
             # If all conditions match, then the ARN format is the same.
-            if arn_format_list[i] != "":
-                if arn_format_list[i] == split_resource_string_to_test[i]:
+            if elem:
+                if elem == split_resource_string_to_test[idx]:
                     pass
-                elif split_resource_string_to_test[i] == "*":
+                elif split_resource_string_to_test[idx] == "*":
                     pass
                 else:
                     return False
 
         # 4. Special type for S3 bucket objects and CodeCommit repos
-        # Note: Each service can only have one of these, so these are definitely exceptions
-        exclusion_list = ["${ObjectName}", "${RepositoryName}", "${BucketName}", "table/${TableName}", "${BucketName}/${ObjectName}"]
         resource_path_arn_in_database = elements[5]
-        if resource_path_arn_in_database in exclusion_list:
-            logger.debug("Special type: %s", resource_path_arn_in_database)
+        if resource_path_arn_in_database in EXCLUSION_LIST:
+            logger.debug(f"Special type: {resource_path_arn_in_database}")
             # handling special case table/${TableName}
-            if resource_string_arn_in_database in ["table/${TableName}", "${BucketName}"]:
-                return len(self.resource_string.split('/')) == len(elements[5].split('/'))
+            if resource_string_arn_in_database in (
+                "table/${TableName}",
+                "${BucketName}",
+            ):
+                return len(self.resource_string.split("/")) == len(
+                    elements[5].split("/")
+                )
             # If we've made it this far, then it is a special type
             # return True
             # Presence of / would mean it's an object in both so it matches
@@ -154,7 +170,7 @@ def same_resource_type(self, arn_in_database):
         return True
 
 
-def parse_arn(arn):
+def parse_arn(arn: str) -> dict[str, str]:
     """
     Given an ARN, split up the ARN into the ARN namespacing schema dictated by the AWS docs.
     """
@@ -167,53 +183,52 @@ def parse_arn(arn):
             "region": elements[3],
             "account": elements[4],
             "resource": elements[5],
-            "resource_path": None,
+            "resource_path": "",
         }
     except IndexError as error:
-        raise Exception(f"IndexError: The provided ARN '{arn}' is invalid. Please provide a valid ARN.") from error
+        raise Exception(
+            f"IndexError: The provided ARN '{arn}' is invalid. Please provide a valid ARN."
+        ) from error
     if "/" in result["resource"]:
         result["resource"], result["resource_path"] = result["resource"].split("/", 1)
     elif ":" in result["resource"]:
         result["resource"], result["resource_path"] = result["resource"].split(":", 1)
     return result
 
 
-def get_service_from_arn(arn):
-    """Given an ARN string, return the service """
+def get_service_from_arn(arn: str) -> str:
+    """Given an ARN string, return the service"""
     result = parse_arn(arn)
     return result["service"]
 
 
-def get_region_from_arn(arn):
+def get_region_from_arn(arn: str) -> str:
     """Given an ARN, return the region in the ARN, if it is available. In certain cases like S3 it is not"""
     result = parse_arn(arn)
     # Support S3 buckets with no values under region
     if result["region"] is None:
-        result = ""
-    else:
-        result = result["region"]
-    return result
+        return ""
+    return result["region"]
 
 
-def get_account_from_arn(arn):
+def get_account_from_arn(arn: str) -> str:
     """Given an ARN, return the account ID in the ARN, if it is available. In certain cases like S3 it is not"""
     result = parse_arn(arn)
     # Support S3 buckets with no values under account
     if result["account"] is None:
-        result = ""
-    else:
-        result = result["account"]
-    return result
+        return ""
+    return result["account"]
 
 
-def get_resource_path_from_arn(arn):
+def get_resource_path_from_arn(arn: str) -> str | None:
     """Given an ARN, parse it according to ARN namespacing and return the resource path. See
-    http://docs.aws.amazon.com/general/latest/gr/aws-arns-and-namespaces.html for more details on ARN namespacing."""
+    http://docs.aws.amazon.com/general/latest/gr/aws-arns-and-namespaces.html for more details on ARN namespacing.
+    """
     result = parse_arn(arn)
     return result["resource_path"]
 
 
-def get_resource_string(arn):
+def get_resource_string(arn: str) -> str:
     """
     Given an ARN, return the string after the account ID, no matter the ARN format.
 
@@ -229,7 +244,7 @@ def get_resource_string(arn):
 
 # In the meantime, we have to skip this pylint check (consider this as tech debt)
 # pylint: disable=inconsistent-return-statements
-def parse_arn_for_resource_type(arn):
+def parse_arn_for_resource_type(arn: str) -> str | None:
     """
     Parses the resource string (resourcetype/resource and other variants) and grab the resource type.
 
@@ -240,15 +255,17 @@ def parse_arn_for_resource_type(arn):
     """
     split_arn = arn.split(":")
     resource_string = ":".join(split_arn[5:])
-    split_resource = re.split("/|:", resource_string)
+    split_resource = re.split(ARN_SEPARATOR_PATTERN, resource_string)
     if len(split_resource) == 1:
         # logger.debug(f"split_resource length is 1: {str(split_resource)}")
         pass
     elif len(split_resource) > 1:
         return split_resource[0]
 
+    return None
+
 
-def does_arn_match(arn_to_test, arn_in_database):
+def does_arn_match(arn_to_test: str, arn_in_database: str) -> bool:
     """
     Given two ARNs, determine if they have the same resource type.
 
diff --git a/policy_sentry/util/conditions.py b/policy_sentry/util/conditions.py
@@ -44,7 +44,7 @@ def get_comma_separated_condition_keys(condition_keys: str) -> str:
 
 
 def is_condition_key_match(document_key: str, some_str: str) -> bool:
-    """ Given a documented condition key and one from a policy, determine if they match
+    """Given a documented condition key and one from a policy, determine if they match
     Examples:
     - s3:prefix and s3:prefix obviously match
     - s3:ExistingObjectTag/<key> and s3:ExistingObjectTag/backup match
diff --git a/policy_sentry/util/policy_files.py b/policy_sentry/util/policy_files.py
@@ -1,15 +1,20 @@
 """A few methods for parsing policies."""
+from __future__ import annotations
+
 import json
 import logging
-from operator import itemgetter
+from pathlib import Path
+from typing import Any
+
+
 from policy_sentry.querying.actions import get_action_data
 
 logger = logging.getLogger(__name__)
 
 
-def get_actions_from_statement(statement):
+def get_actions_from_statement(statement: dict[str, Any]) -> list[str]:
     """Given a statement dictionary, create a list of the actions"""
-    actions_list = []
+    actions_list: list[str] = []
     # We only want to evaluate policies that have Effect = "Allow"
     if statement.get("Effect") == "Deny":
         return actions_list
@@ -30,7 +35,7 @@ def get_actions_from_statement(statement):
 
 
 # pylint: disable=too-many-branches,too-many-statements
-def get_actions_from_policy(data):
+def get_actions_from_policy(data: dict[str, Any]) -> list[str]:
     """Given a policy dictionary, create a list of the actions"""
     actions_list = []
     statement_clause = data.get("Statement")
@@ -43,22 +48,20 @@ def get_actions_from_policy(data):
             actions_list.extend(get_actions_from_statement(statement))
     else:
         logger.critical("Unknown error: The 'Statement' is neither a dict nor a list")
-    actions_list = [x.lower() for x in actions_list]
 
     new_actions_list = []
     for action in actions_list:
         service, action_name = action.split(":")
         action_data = get_action_data(service, action_name)
-        if service in action_data:
-            if action_data[service]:
-                new_actions_list.append(action_data[service][0]["action"])
+        if service in action_data and action_data[service]:
+            new_actions_list.append(action_data[service][0]["action"])
 
     new_actions_list.sort()
     return new_actions_list
 
 
 # pylint: disable=too-many-branches,too-many-statements
-def get_actions_from_json_policy_file(file):
+def get_actions_from_json_policy_file(file: str | Path) -> list[str]:
     """
     read the json policy file and return a list of actions
     """
@@ -77,17 +80,23 @@ def get_actions_from_json_policy_file(file):
     return actions_list
 
 
-def get_sid_names_from_policy(policy_json):
+def get_sid_names_from_policy(policy_json: dict[str, Any]) -> list[str]:
     """
     Given a Policy JSON, get a list of the Statement IDs. This is helpful in unit tests.
     """
-    sid_names = list(map(itemgetter("Sid"), policy_json.get("Statement")))
+    sid_names = [
+        statement["Sid"]
+        for statement in policy_json.get("Statement", [])
+        if "Sid" in statement
+    ]
     return sid_names
 
 
-def get_statement_from_policy_using_sid(policy_json, sid):
+def get_statement_from_policy_using_sid(
+    policy_json: dict[str, Any], sid: str
+) -> dict[str, Any] | None:
     """
     Helper function to get a statement just by providing the policy JSON and the Statement ID
     """
-    res = next((sub for sub in policy_json["Statement"] if sub['Sid'] == sid), None)
+    res = next((sub for sub in policy_json["Statement"] if sub.get("Sid") == sid), None)
     return res
diff --git a/pyproject.toml b/pyproject.toml
@@ -1,13 +1,13 @@
 [tool.mypy]
 files = "policy_sentry"
 strict = true
+pretty = true
 
 exclude = [
     '^policy_sentry/analysis',
     '^policy_sentry/bin',
     '^policy_sentry/command',
     '^policy_sentry/querying',
     '^policy_sentry/shared',
-    '^policy_sentry/util/(arns|policy_files)',
     '^policy_sentry/writing',
 ]
diff --git a/tasks.py b/tasks.py
@@ -247,7 +247,7 @@ def run_linter(c):
 def run_mypy(c):
     """Type checking with `mypy`"""
     try:
-        c.run('mypy policy_sentry/')
+        c.run('mypy')
     except UnexpectedExit as u_e:
         logger.critical(f"FAIL! UnexpectedExit: {u_e}")
         sys.exit(1)