Skip to content

Commit a7845f7

Browse files
Support patterns in ABOUT resource paths (#982)
* Support regex in ABOUT resource paths
  Signed-off-by: Ayan Sinha Mahapatra <ayansmahapatra@gmail.com>
* Refactor ABOUT file mapping in d2d for efficiency
  Reference: #1004
  Signed-off-by: Ayan Sinha Mahapatra <ayansmahapatra@gmail.com>
* Restructure map_about_files
  Signed-off-by: Ayan Sinha Mahapatra <ayansmahapatra@gmail.com>
* Address feedback and review comments
  Reference: #982
  Signed-off-by: Ayan Sinha Mahapatra <ayansmahapatra@gmail.com>
* Update docstrings and use dataclass
  Signed-off-by: Ayan Sinha Mahapatra <ayansmahapatra@gmail.com>
* Use license/notice files from About data
  Reference: #1004
  Signed-off-by: Ayan Sinha Mahapatra <ayansmahapatra@gmail.com>
* Add tests for AboutFileIndex methods
  Reference: #982
  Signed-off-by: Ayan Sinha Mahapatra <ayansmahapatra@gmail.com>
* Address feedback and update CHANGELOG
  Reference: #982
  Signed-off-by: Ayan Sinha Mahapatra <ayansmahapatra@gmail.com>
---------
Signed-off-by: Ayan Sinha Mahapatra <ayansmahapatra@gmail.com>
1 parent b7cd623 commit a7845f7

File tree

9 files changed

+411
-92
lines changed

9 files changed

+411
-92
lines changed

CHANGELOG.rst

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -64,6 +64,10 @@ v33.0.0 (2024-01-16)
6464
project pipeline.
6565
https://github.com/nexB/scancode.io/issues/997
6666

67+
- In "map_deploy_to_develop" pipeline, add support for path patterns
68+
in About file attributes documenting resource paths.
69+
https://github.com/nexB/scancode.io/issues/1004
70+
6771
- Fix an issue where the pipeline details cannot be fetched when using URLs that
6872
include credentials such as "user:pass@domain".
6973
https://github.com/nexB/scancode.io/issues/998

scanpipe/models.py

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1801,12 +1801,12 @@ def profile(self, print_results=False):
18011801
print(output_str)
18021802

18031803

1804-
def posix_regex_to_django_regex_lookup(regex_pattern):
1804+
def convert_glob_to_django_regex(glob_pattern):
18051805
"""
1806-
Convert a POSIX-style regex pattern to an equivalent pattern compatible with the
1807-
Django regex lookup.
1806+
Convert a glob pattern to an equivalent django regex pattern
1807+
compatible with the Django regex lookup.
18081808
"""
1809-
escaped_pattern = re.escape(regex_pattern)
1809+
escaped_pattern = re.escape(glob_pattern)
18101810
escaped_pattern = escaped_pattern.replace(r"\*", ".*") # Replace \* with .*
18111811
escaped_pattern = escaped_pattern.replace(r"\?", ".") # Replace \? with .
18121812
escaped_pattern = f"^{escaped_pattern}$" # Add start and end anchors
@@ -1914,8 +1914,8 @@ def has_value(self, field_name):
19141914
return self.filter(~Q((f"{field_name}__in", EMPTY_VALUES)))
19151915

19161916
def path_pattern(self, pattern):
    """
    Return the resources whose ``path`` matches the provided glob ``pattern``.

    The glob is first converted with ``convert_glob_to_django_regex`` and the
    result is used for a ``path__regex`` lookup.
    """
    path_regex = convert_glob_to_django_regex(pattern)
    return self.filter(path__regex=path_regex)
19191919

19201920
def has_directory_content_fingerprint(self):
19211921
"""

scanpipe/pipes/d2d.py

Lines changed: 236 additions & 64 deletions
Original file line numberDiff line numberDiff line change
@@ -23,7 +23,9 @@
2323
from collections import Counter
2424
from collections import defaultdict
2525
from contextlib import suppress
26+
from dataclasses import dataclass
2627
from pathlib import Path
28+
from re import match as regex_match
2729

2830
from django.contrib.postgres.aggregates.general import ArrayAgg
2931
from django.core.exceptions import MultipleObjectsReturned
@@ -43,6 +45,7 @@
4345
from scanpipe import pipes
4446
from scanpipe.models import CodebaseRelation
4547
from scanpipe.models import CodebaseResource
48+
from scanpipe.models import convert_glob_to_django_regex
4649
from scanpipe.pipes import LoopProgress
4750
from scanpipe.pipes import flag
4851
from scanpipe.pipes import get_resource_diff_ratio
@@ -781,94 +784,263 @@ def _map_javascript_resource(
781784
resource.update(status=flag.MAPPED)
782785

783786

784-
def _map_about_file_resource(project, about_file_resource, to_resources):
785-
about_file_location = str(about_file_resource.location_path)
786-
package_data = resolve.resolve_about_package(about_file_location)
787+
@dataclass
class AboutFileIndexes:
    """
    Indexes built from the ``from/`` About files.

    They are used to create packages from the About file data and to map the
    ``to/`` resources documented in each About file to those packages, using
    regex path patterns and other About file data.

    Every mapping is keyed by the path of the About file resource.
    """

    # About file path -> regex pattern string (converted from the glob found
    # in the "filename" package data value) for the documented files.
    regex_by_about_path: dict
    # About file path -> list of regex pattern strings for the files to ignore.
    ignore_regex_by_about_path: dict
    # About file path -> resource object of that About file.
    about_resources_by_path: dict
    # About file path -> package data resolved from that About file.
    about_pkgdata_by_path: dict
    # About file path -> list of to/ resources mapped to that About file.
    mapped_resources_by_aboutpath: dict

    @classmethod
    def create_indexes(cls, project, from_about_files, logger=None):
        """
        Return an ``AboutFileIndexes`` instance created from ``from_about_files``,
        the About file resources.

        For each About file, the package data is resolved and its path
        patterns are converted to regex strings. An error is added to the
        ``project`` when no package data can be resolved, or when the package
        data has no "filename" value; such About files are not indexed for
        mapping. When provided, ``logger`` is called with a summary message.
        """
        about_pkgdata_by_path = {}
        regex_by_about_path = {}
        ignore_regex_by_about_path = {}
        about_resources_by_path = {}
        mapped_resources_by_aboutpath = {}

        for about_file_resource in from_about_files:
            about_path = about_file_resource.path
            package_data = resolve.resolve_about_package(
                input_location=str(about_file_resource.location_path)
            )
            error_message_details = {
                "path": about_path,
                "package_data": package_data,
            }
            if not package_data:
                project.add_error(
                    description="Cannot create package from ABOUT file",
                    model="map_about_files",
                    details=error_message_details,
                )
                continue

            about_pkgdata_by_path[about_path] = package_data
            files_pattern = package_data.get("filename")
            if not files_pattern:
                # Cannot map anything without the about_resource value.
                project.add_error(
                    description="ABOUT file does not have about_resource",
                    model="map_about_files",
                    details=error_message_details,
                )
                continue

            regex_by_about_path[about_path] = convert_glob_to_django_regex(
                files_pattern
            )

            # Both "extra_data" and "ignored_resources" may be missing or
            # empty (None): fall back to empty containers in either case.
            extra_data = package_data.get("extra_data") or {}
            ignore_regex = [
                convert_glob_to_django_regex(pattern)
                for pattern in extra_data.get("ignored_resources") or []
            ]
            if ignore_regex:
                ignore_regex_by_about_path[about_path] = ignore_regex

            about_resources_by_path[about_path] = about_file_resource
            mapped_resources_by_aboutpath[about_path] = []

        if logger:
            # One regex entry is stored per successfully indexed About file.
            logger(
                f"Created mapping index from {len(regex_by_about_path):,d} .ABOUT "
                f"files in the from/ codebase."
            )

        return cls(
            about_pkgdata_by_path=about_pkgdata_by_path,
            regex_by_about_path=regex_by_about_path,
            ignore_regex_by_about_path=ignore_regex_by_about_path,
            about_resources_by_path=about_resources_by_path,
            mapped_resources_by_aboutpath=mapped_resources_by_aboutpath,
        )

    def get_matched_about_path(self, to_resource):
        """
        Return the path string of the About file ``to_resource`` is mapped to,
        or None when it is not mapped or is ignored.

        The first About file whose regex pattern matches ``to_resource.path``
        is selected. If one of the ignore patterns of that About file also
        matches the path, the resource is considered ignored.
        """
        resource_path = to_resource.path
        matched_about_path = next(
            (
                about_path
                for about_path, regex_pattern in self.regex_by_about_path.items()
                if regex_match(pattern=regex_pattern, string=resource_path)
            ),
            None,
        )
        if matched_about_path is None:
            return None

        ignore_patterns = self.ignore_regex_by_about_path.get(matched_about_path, [])
        if any(
            regex_match(pattern=ignore_pattern, string=resource_path)
            for ignore_pattern in ignore_patterns
        ):
            return None

        return matched_about_path

    def map_deployed_to_devel_using_about(self, to_resources):
        """
        Return the list of ``to_resources`` which are mapped to an About file
        using the path patterns of the indexes.

        Each mapped resource is recorded in ``mapped_resources_by_aboutpath``
        under its About file path and its status is updated to
        ``flag.ABOUT_MAPPED``.
        """
        mapped_to_resources = []

        for to_resource in to_resources:
            about_path = self.get_matched_about_path(to_resource)
            if not about_path:
                continue

            mapped_for_about = self.mapped_resources_by_aboutpath.setdefault(
                about_path, []
            )
            mapped_for_about.append(to_resource)
            mapped_to_resources.append(to_resource)
            to_resource.update(status=flag.ABOUT_MAPPED)

        return mapped_to_resources

    def get_about_file_companions(self, about_path):
        """
        Given an ``about_path`` path string to an About file, return the
        sibling resources of that About file whose name is the "license_file"
        or "notice_file" value found in the About file "extra_data".

        When the About file has no "extra_data", or declares neither file,
        the siblings are filtered with an empty list of names.
        """
        about_file_resource = self.about_resources_by_path.get(about_path)
        package_data = self.about_pkgdata_by_path.get(about_path) or {}
        # "extra_data" may be absent or None in the package data.
        extra_data = package_data.get("extra_data") or {}

        companion_names = [
            name
            for name in (extra_data.get("license_file"), extra_data.get("notice_file"))
            if name
        ]
        return about_file_resource.siblings().filter(name__in=companion_names)

    def create_about_packages_relations(self, project):
        """
        Create packages using the About file package data, for each About file
        with mapped resources on the to/ codebase, and create the relations
        between the About file resource and its mapped resources.

        A warning is added to the ``project`` for each indexed About file
        without any mapped resource, and no package is created for it.

        Return a 2-tuple of: the set of purls of the About packages, and the
        list of About file resources which have mapped resources.
        """
        about_purls = set()
        mapped_about_resources = []

        for about_path, mapped_resources in self.mapped_resources_by_aboutpath.items():
            about_file_resource = self.about_resources_by_path[about_path]
            package_data = self.about_pkgdata_by_path[about_path]

            if not mapped_resources:
                error_message_details = {
                    "path": about_path,
                    "package_data": package_data,
                }
                project.add_warning(
                    description=(
                        "Resource paths listed at about_resource is not found"
                        " in the to/ codebase"
                    ),
                    model="map_about_files",
                    details=error_message_details,
                )
                continue

            # Create the Package using .ABOUT data and assign related codebase_resources
            about_package = pipes.update_or_create_package(
                project=project,
                package_data=package_data,
                codebase_resources=mapped_resources,
            )
            about_purls.add(about_package.purl)
            mapped_about_resources.append(about_file_resource)

            # Map the .ABOUT file resource to all related resources in the ``to/`` side.
            for mapped_resource in mapped_resources:
                pipes.make_relation(
                    from_resource=about_file_resource,
                    to_resource=mapped_resource,
                    map_type="about_file",
                )

            about_file_resource.update(status=flag.ABOUT_MAPPED)

            # The license and notice files documented in the About file get
            # the same status as the About file itself.
            about_file_companions = self.get_about_file_companions(about_path)
            about_file_companions.update(status=flag.ABOUT_MAPPED)

        return about_purls, mapped_about_resources
8481003

8491004

8501005
def map_about_files(project, logger=None):
    """
    Map ``from/`` .ABOUT files to their related ``to/`` resources.

    Nothing is done when the ``from/`` codebase has no .ABOUT file. Otherwise,
    indexes are created from the .ABOUT files, the ``to/`` resources without a
    status are mapped using these indexes, and packages and relations are
    created for the .ABOUT files with mapped resources.

    When provided, ``logger`` is called with a progress message at each step.
    """
    project_resources = project.codebaseresources
    from_about_files = (
        project_resources.files().from_codebase().filter(extension=".ABOUT")
    )
    if not from_about_files.exists():
        return

    if logger:
        logger(
            f"Mapping {from_about_files.count():,d} .ABOUT files found in the from/ "
            f"codebase."
        )

    # Forward the logger so that the index creation summary is reported too.
    indexes = AboutFileIndexes.create_indexes(
        project=project,
        from_about_files=from_about_files,
        logger=logger,
    )

    # Ignoring empty or ignored files as they are not relevant anyway
    to_resources = project_resources.to_codebase().no_status()
    mapped_to_resources = indexes.map_deployed_to_devel_using_about(
        to_resources=to_resources,
    )
    if logger:
        logger(
            f"Mapped {len(mapped_to_resources):,d} resources from the "
            f"to/ codebase to the About files in the from/ codebase."
        )

    about_purls, mapped_about_resources = indexes.create_about_packages_relations(
        project=project,
    )
    if logger:
        logger(
            f"Created {len(about_purls):,d} new packages from "
            f"{len(mapped_about_resources):,d} About files which "
            f"were mapped to resources in the to/ side."
        )
8721044

8731045

8741046
def map_javascript_post_purldb_match(project, logger=None):

0 commit comments

Comments
 (0)