From 6d291a02f43e0c1dc461074c512b9597589db577 Mon Sep 17 00:00:00 2001
From: Keshav Priyadarshi
Date: Wed, 31 Jan 2024 21:25:03 +0530
Subject: [PATCH] Support extraction of JavaScript map files

fixes https://github.com/nexB/scancode-toolkit/issues/3637

Signed-off-by: Keshav Priyadarshi
---
 src/extractcode/__init__.py   |  4 ++
 src/extractcode/archive.py    | 22 ++++++++
 src/extractcode/source_map.py | 95 +++++++++++++++++++++++++++++++++++
 3 files changed, 121 insertions(+)
 create mode 100644 src/extractcode/source_map.py

diff --git a/src/extractcode/__init__.py b/src/extractcode/__init__.py
index fb6095d..efe2d88 100644
--- a/src/extractcode/__init__.py
+++ b/src/extractcode/__init__.py
@@ -42,6 +42,7 @@
 file_system = 5
 patches = 6
 special_package = 7
+sources_map = 8
 
 kind_labels = {
     1: 'docs',
@@ -51,6 +52,7 @@
     5: 'file_system',
     6: 'patches',
     7: 'special_package',
+    8: 'sources_map',
 }
 
 # note: we do not include special_package in all_kinds by default
@@ -62,6 +64,7 @@
     docs,
     patches,
     special_package,
+    sources_map,
 )
 
 default_kinds = (
@@ -79,6 +82,7 @@
     'doc': (docs,),
     'patch': (patches,),
     'special_package': (special_package,),
+    'sources_map': (sources_map,),
 }
 
 
diff --git a/src/extractcode/archive.py b/src/extractcode/archive.py
index d52399e..1ad180b 100644
--- a/src/extractcode/archive.py
+++ b/src/extractcode/archive.py
@@ -24,10 +24,12 @@
 from extractcode import regular_nested
 from extractcode import file_system
 from extractcode import patches
+from extractcode import sources_map
 from extractcode import special_package
 
 from extractcode import libarchive2
 from extractcode import patch
+from extractcode import source_map
 from extractcode import sevenzip
 from extractcode import vmimage
 
@@ -506,6 +508,7 @@ def try_to_extract(location, target_dir, extractor):
 extract_ishield = sevenzip.extract
 extract_Z = sevenzip.extract
 extract_xarpkg = sevenzip.extract
+extract_source_map = source_map.extract
 
 # Archive handlers.
 ####################
@@ -1133,6 +1136,24 @@ def try_to_extract(location, target_dir, extractor):
     strict=True
 )
 
+SourceMapFileHandler = Handler(
+    name='Source Map File',
+    filetypes=('json data',),
+    mimetypes=('application/json',),
+    extensions=(
+        '.js.map',
+        '.ts.map',
+        '.css.map',
+        '.less.map',
+        '.scss.map',
+        '.soy.map',
+        '.jsx.map',
+    ),
+    kind=sources_map,
+    extractors=[extract_source_map],
+    strict=True,
+)
+
 # Actual list of handlers
 
 archive_handlers = [
@@ -1194,6 +1215,7 @@ def try_to_extract(location, target_dir, extractor):
     QCOWHandler,
     VMDKHandler,
     VirtualBoxHandler,
+    SourceMapFileHandler,
 ]
 
 # only support extracting patches if patch is installed. This is not a default
diff --git a/src/extractcode/source_map.py b/src/extractcode/source_map.py
new file mode 100644
index 0000000..d3f46a8
--- /dev/null
+++ b/src/extractcode/source_map.py
@@ -0,0 +1,95 @@
+#
+# Copyright (c) nexB Inc. and others. All rights reserved.
+# ScanCode is a trademark of nexB Inc.
+# SPDX-License-Identifier: Apache-2.0
+# See http://www.apache.org/licenses/LICENSE-2.0 for the license text.
+# See https://github.com/nexB/extractcode for support or download.
+# See https://aboutcode.org for more information about nexB OSS projects.
+#
+
+import json
+import os.path
+import posixpath
+
+from commoncode import fileutils
+from commoncode import paths
+
+import extractcode
+
+"""
+Utilities to parse source map files and treat them as if they were
+archives containing files.
+"""
+
+
+def extract(location, target_dir):
+    """
+    Extract each source in the sourcesContent list of the map file at
+    `location` as a file in the `target_dir` directory tree, mimicking the
+    directory layout in which the sources would be present.
+
+    Entries whose content is not text (such as the JSON ``null`` placeholder
+    for a source whose content is not embedded in the map) are skipped.
+
+    Return a list of warning messages. Raise Exception errors.
+    """
+    for path, content in extract_source_content_from_map(location):
+        # Nothing to write for a source without embedded content.
+        if not isinstance(content, str):
+            continue
+
+        # Convert path to safe posix path
+        map_subfile_path = paths.safe_path(path, preserve_spaces=True)
+
+        # Create directories
+        parent_dir = posixpath.dirname(map_subfile_path)
+        parent_target_dir = os.path.join(target_dir, parent_dir)
+        fileutils.create_dir(parent_target_dir)
+
+        subfile_path = os.path.join(target_dir, map_subfile_path)
+        # Write as UTF-8 rather than the platform default encoding so that
+        # non-ASCII sources are written the same way on every platform.
+        with open(subfile_path, "w", encoding="utf-8") as subfile:
+            subfile.write(content)
+
+    return []
+
+
+def extract_source_content_from_map(location):
+    """
+    Return a list of (source, content) tuples, one for each entry in the
+    sourcesContent list of the map file at `location`.
+
+    Return an empty list if the map has no sourcesContent list.
+    Raise an ExtractErrorFailedToExtract if the file cannot be decoded as
+    UTF-8 JSON.
+    """
+    try:
+        # "utf-8-sig" also accepts a map file that starts with a UTF-8 BOM.
+        with open(location, encoding="utf-8-sig") as map_file:
+            map_data = json.load(map_file)
+    except (json.JSONDecodeError, UnicodeDecodeError) as e:
+        msg = f"Unable to decode map file:{location} {e}"
+        raise extractcode.ExtractErrorFailedToExtract(msg) from e
+
+    # A source map is a JSON object: any other top-level JSON value carries
+    # no sources to extract.
+    if not isinstance(map_data, dict):
+        return []
+
+    sources_content = map_data.get("sourcesContent")
+    if not isinstance(sources_content, list):
+        return []
+
+    sources = map_data.get("sources")
+
+    # Inconsistent source map. In a valid source map, each entry in the ``sources``
+    # list should have a corresponding entry in the ``sourcesContent`` list.
+    # Use dummy filenames as `source` path in such scenario.
+    if not isinstance(sources, list) or len(sources) != len(sources_content):
+        sources = [
+            f"source_content{index}.txt"
+            for index in range(1, len(sources_content) + 1)
+        ]
+
+    return list(zip(sources, sources_content))