diff --git a/.bumpversion.cfg b/.bumpversion.cfg index 95c3afba6..ba66cb5e2 100644 --- a/.bumpversion.cfg +++ b/.bumpversion.cfg @@ -1,5 +1,5 @@ [bumpversion] -current_version = 4.1.0 +current_version = 4.1.2 tag_name = {new_version} commit = True tag = True diff --git a/ingestors/__init__.py b/ingestors/__init__.py index 586621e70..4a8917cda 100644 --- a/ingestors/__init__.py +++ b/ingestors/__init__.py @@ -2,7 +2,7 @@ import logging -__version__ = "4.1.0" +__version__ = "4.1.2" logging.getLogger("chardet").setLevel(logging.INFO) logging.getLogger("PIL").setLevel(logging.INFO) diff --git a/ingestors/packages/__init__.py b/ingestors/packages/__init__.py index 4666d8a67..f1da6b99c 100644 --- a/ingestors/packages/__init__.py +++ b/ingestors/packages/__init__.py @@ -4,6 +4,9 @@ import tarfile from pathlib import PurePath +import py7zr +from py7zr.exceptions import ArchiveError + from ingestors.ingestor import Ingestor from ingestors.support.package import PackageSupport from ingestors.support.shell import ShellSupport @@ -24,9 +27,11 @@ def unpack(self, file_path, entity, temp_dir): *pure_file_path.parts[1:-1], reconstructed_filename ) - self.exec_command( - "7z", "x", str(pure_file_path), "-y", "-r", "-bb0", "-bd", f"-oc:{temp_dir}" - ) + try: + with py7zr.SevenZipFile(str(pure_file_path), mode="r") as z: + z.extractall(path=temp_dir) + except ArchiveError as e: + raise ProcessingException(f"Error: {e}") class SingleFilePackageIngestor(PackageSupport, Ingestor): diff --git a/requirements.txt b/requirements.txt index af4b11c5b..af83836b2 100644 --- a/requirements.txt +++ b/requirements.txt @@ -14,6 +14,7 @@ numpy<2.0.0 # pinned because otherwise spacy requires an incompatible numpy fingerprints==1.1.1 fasttext==0.9.2 pika==1.3.2 +py7zr==1.0.0 # Development pytest==8.2.0 diff --git a/setup.py b/setup.py index 9173a65ae..517c6dfc1 100644 --- a/setup.py +++ b/setup.py @@ -5,7 +5,7 @@ setup( name="ingest", - version="4.1.0", + version="4.1.2", author="Organized Crime and Corruption Reporting Project", packages=find_packages(exclude=["tests"]), package_dir={"ingestors": "ingestors"}, diff --git a/tests/fixtures/bad7zip.7z b/tests/fixtures/bad7zip.7z new file mode 100644 index 000000000..6e6ced3ab Binary files /dev/null and b/tests/fixtures/bad7zip.7z differ diff --git a/tests/fixtures/badrar.rar b/tests/fixtures/badrar.rar new file mode 100644 index 000000000..a9b8142ac Binary files /dev/null and b/tests/fixtures/badrar.rar differ diff --git a/tests/fixtures/badtar.tar b/tests/fixtures/badtar.tar new file mode 100644 index 000000000..4e00147b7 Binary files /dev/null and b/tests/fixtures/badtar.tar differ diff --git a/tests/fixtures/badzip.zip b/tests/fixtures/badzip.zip new file mode 100644 index 000000000..7a7a9e4a3 Binary files /dev/null and b/tests/fixtures/badzip.zip differ diff --git a/tests/fixtures/secret.txt b/tests/fixtures/secret.txt new file mode 100644 index 000000000..2bd51eeb3 --- /dev/null +++ b/tests/fixtures/secret.txt @@ -0,0 +1 @@ +This is a secret! \ No newline at end of file diff --git a/tests/test_packages.py b/tests/test_packages.py index 084b3a7de..05243aa82 100644 --- a/tests/test_packages.py +++ b/tests/test_packages.py @@ -1,5 +1,6 @@ # -*- coding: utf-8 -*- from pprint import pprint # noqa +from pathlib import Path from .support import TestCase @@ -11,14 +12,80 @@ def test_zip(self): self.assertEqual(entity.first("processingStatus"), self.manager.STATUS_SUCCESS) self.assertEqual(entity.schema.name, "Package") + def test_zip_symlink_escape(self): + fixture_path, entity = self.fixture("badzip.zip") + + # Ensure that the symlink target exists + target = Path("/ingestors/tests/fixtures/secret.txt") + assert target.read_text() == "This is a secret!" + + self.manager.ingest(fixture_path, entity) + + # Python’s zipfile handles symlinks that point to files outside of the archive root + # treating them as normal files + assert len(self.manager.entities) == 2 + assert self.manager.entities[0].first("fileName") == "secret.txt" + assert ( + self.manager.entities[0].first("bodyText") + == "/ingestors/tests/fixtures/secret.txt" + ) + assert self.manager.entities[1].first("fileName") == "badzip.zip" + def test_rar(self): fixture_path, entity = self.fixture("test-documents.rar") self.manager.ingest(fixture_path, entity) self.assertEqual(entity.first("processingStatus"), self.manager.STATUS_SUCCESS) self.assertEqual(entity.schema.name, "Package") + def test_rar_symlink_escape(self): + fixture_path, entity = self.fixture("badrar.rar") + + # Ensure that the symlink target exists + target = Path("/ingestors/tests/fixtures/secret.txt") + assert target.read_text() == "This is a secret!" + + self.manager.ingest(fixture_path, entity) + + # rarfile handles symlinks that point to files outside of the archive root + # treating them as normal files + assert len(self.manager.entities) == 2 + assert self.manager.entities[0].first("fileName") == "secret.txt" + assert ( + self.manager.entities[0].first("bodyText") + == "/ingestors/tests/fixtures/secret.txt" + ) + assert self.manager.entities[1].first("fileName") == "badrar.rar" + def test_tar(self): fixture_path, entity = self.fixture("test-documents.tar") self.manager.ingest(fixture_path, entity) self.assertEqual(entity.first("processingStatus"), self.manager.STATUS_SUCCESS) self.assertEqual(entity.schema.name, "Package") + + def test_tar_symlink_escape(self): + fixture_path, entity = self.fixture("badtar.tar") + + # Ensure that the symlink target exists + target = Path("/ingestors/tests/fixtures/secret.txt") + assert target.read_text() == "This is a secret!" + + self.manager.ingest(fixture_path, entity) + + # Python’s tarfile ignores symlinks that point to files outside of the archive root + assert len(self.manager.entities) == 1 + assert self.manager.entities[0].first("fileName") == "badtar.tar" + + def test_7zip_symlink_escape(self): + fixture_path, entity = self.fixture("bad7zip.7z") + + # Ensure that the symlink target exists + target = Path("/ingestors/tests/fixtures/secret.txt") + assert target.read_text() == "This is a secret!" + + self.manager.ingest(fixture_path, entity) + + # py7zr raises an exception if it encounters a symlink that points to a file + # outside of the archive root + assert len(self.manager.entities) == 1 + assert self.manager.entities[0].first("fileName") == "bad7zip.7z" + assert self.manager.entities[0].first("processingStatus") == "failure"