Skip to content

py3-spdx-tools: Apply performance patch #13501

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 5 additions & 1 deletion py3-spdx-tools.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
package:
name: py3-spdx-tools
version: 0.8.2
epoch: 0
epoch: 1
description: SPDX parser and tools.
copyright:
- license: Apache-2.0
Expand Down Expand Up @@ -36,6 +36,10 @@ pipeline:
tag: v${{package.version}}
expected-commit: 32e74cdc8a39bc3b4119bc6e77f3804d09a05418

- uses: patch
with:
patches: performance.patch

- name: Python Build
uses: python/build-wheel

Expand Down
334 changes: 334 additions & 0 deletions py3-spdx-tools/performance.patch
Original file line number Diff line number Diff line change
@@ -0,0 +1,334 @@
From 30ebcf3981a1a1d29260dfefa75690842b0fb2cb Mon Sep 17 00:00:00 2001
From: Jon Johnson <jon.johnson@chainguard.dev>
Date: Mon, 29 Jan 2024 12:12:26 -0800
Subject: [PATCH 1/3] Fix several accidentally quadratic functions

Signed-off-by: Jon Johnson <jon.johnson@chainguard.dev>
---
src/spdx_tools/spdx/model/document.py | 3 +++
src/spdx_tools/spdx/model/relationship.py | 3 +++
.../parser/jsonlikedict/relationship_parser.py | 16 +++++++---------
.../spdx/validation/spdx_id_validators.py | 11 +++++++++--
4 files changed, 22 insertions(+), 11 deletions(-)

diff --git a/src/spdx_tools/spdx/model/document.py b/src/spdx_tools/spdx/model/document.py
index 980c59ca5..e1ec51671 100644
--- a/src/spdx_tools/spdx/model/document.py
+++ b/src/spdx_tools/spdx/model/document.py
@@ -81,3 +81,6 @@ def __init__(
relationships = [] if relationships is None else relationships
extracted_licensing_info = [] if extracted_licensing_info is None else extracted_licensing_info
check_types_and_set_values(self, locals())
+
+ def __hash__(self):
+ return id(self)
diff --git a/src/spdx_tools/spdx/model/relationship.py b/src/spdx_tools/spdx/model/relationship.py
index 02b1326a9..1e6d7ae86 100644
--- a/src/spdx_tools/spdx/model/relationship.py
+++ b/src/spdx_tools/spdx/model/relationship.py
@@ -73,3 +73,6 @@ def __init__(
comment: Optional[str] = None,
):
check_types_and_set_values(self, locals())
+
+ def __hash__(self):
+ return hash("{} -> {} ({})".format(self.spdx_element_id, str(self.related_spdx_element_id), str(self.relationship_type)))
diff --git a/src/spdx_tools/spdx/parser/jsonlikedict/relationship_parser.py b/src/spdx_tools/spdx/parser/jsonlikedict/relationship_parser.py
index 17374bef5..6393470e3 100644
--- a/src/spdx_tools/spdx/parser/jsonlikedict/relationship_parser.py
+++ b/src/spdx_tools/spdx/parser/jsonlikedict/relationship_parser.py
@@ -1,7 +1,7 @@
# SPDX-FileCopyrightText: 2022 spdx contributors
#
# SPDX-License-Identifier: Apache-2.0
-from beartype.typing import Dict, List, Optional
+from beartype.typing import Dict, List, Optional, Set

from spdx_tools.common.typing.constructor_type_errors import ConstructorTypeErrors
from spdx_tools.spdx.model import Relationship, RelationshipType
@@ -35,9 +35,9 @@ def parse_all_relationships(self, input_doc_dict: Dict) -> List[Relationship]:
document_describes: List[str] = delete_duplicates_from_list(input_doc_dict.get("documentDescribes", []))
doc_spdx_id: Optional[str] = input_doc_dict.get("SPDXID")

- existing_relationships_without_comments: List[Relationship] = self.get_all_relationships_without_comments(
+ existing_relationships_without_comments: Set[Relationship] = set(self.get_all_relationships_without_comments(
relationships
- )
+ ))
relationships.extend(
parse_field_or_log_error(
self.logger,
@@ -52,9 +52,6 @@ def parse_all_relationships(self, input_doc_dict: Dict) -> List[Relationship]:
)

package_dicts: List[Dict] = input_doc_dict.get("packages", [])
- existing_relationships_without_comments: List[Relationship] = self.get_all_relationships_without_comments(
- relationships
- )

relationships.extend(
parse_field_or_log_error(
@@ -110,7 +107,7 @@ def parse_relationship_type(relationship_type_str: str) -> RelationshipType:
return relationship_type

def parse_document_describes(
- self, doc_spdx_id: str, described_spdx_ids: List[str], existing_relationships: List[Relationship]
+ self, doc_spdx_id: str, described_spdx_ids: List[str], existing_relationships: Set[Relationship]
) -> List[Relationship]:
logger = Logger()
describes_relationships = []
@@ -131,10 +128,11 @@ def parse_document_describes(
return describes_relationships

def parse_has_files(
- self, package_dicts: List[Dict], existing_relationships: List[Relationship]
+ self, package_dicts: List[Dict], existing_relationships: Set[Relationship]
) -> List[Relationship]:
# assume existing relationships are stripped of comments
logger = Logger()
+
contains_relationships = []
for package in package_dicts:
package_spdx_id: Optional[str] = package.get("SPDXID")
@@ -160,7 +158,7 @@ def parse_has_files(
return contains_relationships

def check_if_relationship_exists(
- self, relationship: Relationship, existing_relationships: List[Relationship]
+ self, relationship: Relationship, existing_relationships: Set[Relationship]
) -> bool:
# assume existing relationships are stripped of comments
if relationship in existing_relationships:
diff --git a/src/spdx_tools/spdx/validation/spdx_id_validators.py b/src/spdx_tools/spdx/validation/spdx_id_validators.py
index 6441236a9..2ae412ff3 100644
--- a/src/spdx_tools/spdx/validation/spdx_id_validators.py
+++ b/src/spdx_tools/spdx/validation/spdx_id_validators.py
@@ -4,7 +4,9 @@

import re

-from beartype.typing import List
+from beartype.typing import List, Set
+
+from functools import cache

from spdx_tools.spdx.document_utils import get_contained_spdx_element_ids
from spdx_tools.spdx.model import Document, File
@@ -22,11 +24,16 @@ def is_spdx_id_present_in_files(spdx_id: str, files: List[File]) -> bool:
return spdx_id in [file.spdx_id for file in files]


+@cache
def is_spdx_id_present_in_document(spdx_id: str, document: Document) -> bool:
- all_spdx_ids_in_document: List[str] = get_list_of_all_spdx_ids(document)
+ all_spdx_ids_in_document: Set[str] = get_set_of_all_spdx_ids(document)

return spdx_id in all_spdx_ids_in_document

+@cache
+def get_set_of_all_spdx_ids(document: Document) -> Set[str]:
+ return set(get_list_of_all_spdx_ids(document))
+

def get_list_of_all_spdx_ids(document: Document) -> List[str]:
all_spdx_ids_in_document: List[str] = [document.creation_info.spdx_id]

From 1c6cd54e52d6a3097ad9b4e9ce14c1019e8bcf8f Mon Sep 17 00:00:00 2001
From: paulgibert <paulgibert98@gmail.com>
Date: Wed, 14 Feb 2024 10:55:04 -0500
Subject: [PATCH 2/3] Created a document cache decorator and handled
Relationships as tuples in parsing to remove hash methods.

Signed-off-by: paulgibert <paulgibert98@gmail.com>
---
.../common/typing/dataclass_with_properties.py | 6 +++++-
src/spdx_tools/spdx/model/document.py | 18 +++++++++++++++---
src/spdx_tools/spdx/model/relationship.py | 3 ---
.../parser/jsonlikedict/relationship_parser.py | 16 +++++++++++-----
.../spdx/validation/spdx_id_validators.py | 5 +++--
5 files changed, 34 insertions(+), 14 deletions(-)

diff --git a/src/spdx_tools/common/typing/dataclass_with_properties.py b/src/spdx_tools/common/typing/dataclass_with_properties.py
index 3f13950d5..ba343af08 100644
--- a/src/spdx_tools/common/typing/dataclass_with_properties.py
+++ b/src/spdx_tools/common/typing/dataclass_with_properties.py
@@ -1,12 +1,16 @@
# SPDX-FileCopyrightText: 2022 spdx contributors
#
# SPDX-License-Identifier: Apache-2.0
-from dataclasses import dataclass
+from dataclasses import dataclass, astuple

from beartype import beartype
from beartype.roar import BeartypeCallHintException


+def freeze_dataclass_with_properties_list(items):
+ return {astuple(itm) for itm in items}
+
+
def dataclass_with_properties(cls):
"""Decorator to generate a dataclass with properties out of the class' value:type list.
Their getters and setters will be subjected to the @typechecked decorator to ensure type conformity."""
diff --git a/src/spdx_tools/spdx/model/document.py b/src/spdx_tools/spdx/model/document.py
index e1ec51671..4ed1b45ae 100644
--- a/src/spdx_tools/spdx/model/document.py
+++ b/src/spdx_tools/spdx/model/document.py
@@ -1,7 +1,7 @@
# SPDX-FileCopyrightText: 2022 spdx contributors
#
# SPDX-License-Identifier: Apache-2.0
-from dataclasses import field
+from dataclasses import field, astuple
from datetime import datetime

from beartype.typing import List, Optional
@@ -82,5 +82,17 @@ def __init__(
extracted_licensing_info = [] if extracted_licensing_info is None else extracted_licensing_info
check_types_and_set_values(self, locals())

- def __hash__(self):
- return id(self)
+
+def document_cache(func):
+ cache = {}
+
+ def cached_function(document: Document):
+ key = id(document)
+ if key in cache.keys():
+ return cache[key]
+ else:
+ value = func(document)
+ cache[key] = value
+ return value
+
+ return cached_function
\ No newline at end of file
diff --git a/src/spdx_tools/spdx/model/relationship.py b/src/spdx_tools/spdx/model/relationship.py
index 1e6d7ae86..02b1326a9 100644
--- a/src/spdx_tools/spdx/model/relationship.py
+++ b/src/spdx_tools/spdx/model/relationship.py
@@ -73,6 +73,3 @@ def __init__(
comment: Optional[str] = None,
):
check_types_and_set_values(self, locals())
-
- def __hash__(self):
- return hash("{} -> {} ({})".format(self.spdx_element_id, str(self.related_spdx_element_id), str(self.relationship_type)))
diff --git a/src/spdx_tools/spdx/parser/jsonlikedict/relationship_parser.py b/src/spdx_tools/spdx/parser/jsonlikedict/relationship_parser.py
index 6393470e3..9ab8a4755 100644
--- a/src/spdx_tools/spdx/parser/jsonlikedict/relationship_parser.py
+++ b/src/spdx_tools/spdx/parser/jsonlikedict/relationship_parser.py
@@ -1,9 +1,12 @@
# SPDX-FileCopyrightText: 2022 spdx contributors
#
# SPDX-License-Identifier: Apache-2.0
+from dataclasses import astuple
+
from beartype.typing import Dict, List, Optional, Set

from spdx_tools.common.typing.constructor_type_errors import ConstructorTypeErrors
+from spdx_tools.common.typing.dataclass_with_properties import freeze_dataclass_with_properties_list
from spdx_tools.spdx.model import Relationship, RelationshipType
from spdx_tools.spdx.parser.error import SPDXParsingError
from spdx_tools.spdx.parser.jsonlikedict.dict_parsing_functions import (
@@ -35,9 +38,12 @@ def parse_all_relationships(self, input_doc_dict: Dict) -> List[Relationship]:
document_describes: List[str] = delete_duplicates_from_list(input_doc_dict.get("documentDescribes", []))
doc_spdx_id: Optional[str] = input_doc_dict.get("SPDXID")

- existing_relationships_without_comments: Set[Relationship] = set(self.get_all_relationships_without_comments(
- relationships
- ))
+ relationship_hash = lambda r: hash("{} -> {} ({})" \
+ .format(r.spdx_element_id,
+ str(r.related_spdx_element_id),
+ str(r.relationship_type)))
+ existing_relationships_without_comments: Set[Relationship] = freeze_dataclass_with_properties_list(
+ self.get_all_relationships_without_comments(relationships))
relationships.extend(
parse_field_or_log_error(
self.logger,
@@ -161,10 +167,10 @@ def check_if_relationship_exists(
self, relationship: Relationship, existing_relationships: Set[Relationship]
) -> bool:
# assume existing relationships are stripped of comments
- if relationship in existing_relationships:
+ if astuple(relationship) in existing_relationships:
return True
relationship_inverted: Relationship = self.invert_relationship(relationship)
- if relationship_inverted in existing_relationships:
+ if astuple(relationship_inverted) in existing_relationships:
return True

return False
diff --git a/src/spdx_tools/spdx/validation/spdx_id_validators.py b/src/spdx_tools/spdx/validation/spdx_id_validators.py
index 2ae412ff3..de3a505b4 100644
--- a/src/spdx_tools/spdx/validation/spdx_id_validators.py
+++ b/src/spdx_tools/spdx/validation/spdx_id_validators.py
@@ -10,6 +10,7 @@

from spdx_tools.spdx.document_utils import get_contained_spdx_element_ids
from spdx_tools.spdx.model import Document, File
+from spdx_tools.spdx.model.document import document_cache


def is_valid_internal_spdx_id(spdx_id: str) -> bool:
@@ -24,13 +25,13 @@ def is_spdx_id_present_in_files(spdx_id: str, files: List[File]) -> bool:
return spdx_id in [file.spdx_id for file in files]


-@cache
+# @cache
def is_spdx_id_present_in_document(spdx_id: str, document: Document) -> bool:
all_spdx_ids_in_document: Set[str] = get_set_of_all_spdx_ids(document)

return spdx_id in all_spdx_ids_in_document

-@cache
+@document_cache
def get_set_of_all_spdx_ids(document: Document) -> Set[str]:
return set(get_list_of_all_spdx_ids(document))


From f45032d642aafc46f9aa59bf2572f434f5ba7813 Mon Sep 17 00:00:00 2001
From: Jon Johnson <jon.johnson@chainguard.dev>
Date: Wed, 14 Feb 2024 09:29:07 -0800
Subject: [PATCH 3/3] Fix lints

Signed-off-by: Jon Johnson <jon.johnson@chainguard.dev>
---
src/spdx_tools/spdx/model/document.py | 2 +-
src/spdx_tools/spdx/validation/spdx_id_validators.py | 3 ---
2 files changed, 1 insertion(+), 4 deletions(-)

diff --git a/src/spdx_tools/spdx/model/document.py b/src/spdx_tools/spdx/model/document.py
index 4ed1b45ae..33018c896 100644
--- a/src/spdx_tools/spdx/model/document.py
+++ b/src/spdx_tools/spdx/model/document.py
@@ -95,4 +95,4 @@ def cached_function(document: Document):
cache[key] = value
return value

- return cached_function
\ No newline at end of file
+ return cached_function
diff --git a/src/spdx_tools/spdx/validation/spdx_id_validators.py b/src/spdx_tools/spdx/validation/spdx_id_validators.py
index de3a505b4..d70c36bb7 100644
--- a/src/spdx_tools/spdx/validation/spdx_id_validators.py
+++ b/src/spdx_tools/spdx/validation/spdx_id_validators.py
@@ -6,8 +6,6 @@

from beartype.typing import List, Set

-from functools import cache
-
from spdx_tools.spdx.document_utils import get_contained_spdx_element_ids
from spdx_tools.spdx.model import Document, File
from spdx_tools.spdx.model.document import document_cache
@@ -25,7 +23,6 @@ def is_spdx_id_present_in_files(spdx_id: str, files: List[File]) -> bool:
return spdx_id in [file.spdx_id for file in files]


-# @cache
def is_spdx_id_present_in_document(spdx_id: str, document: Document) -> bool:
all_spdx_ids_in_document: Set[str] = get_set_of_all_spdx_ids(document)