Skip to content

Release/4.1.2 #725

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 3 commits into from
Jun 12, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion .bumpversion.cfg
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
[bumpversion]
current_version = 4.1.0
current_version = 4.1.2
tag_name = {new_version}
commit = True
tag = True
Expand Down
2 changes: 1 addition & 1 deletion ingestors/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@

import logging

__version__ = "4.1.0"
__version__ = "4.1.2"

logging.getLogger("chardet").setLevel(logging.INFO)
logging.getLogger("PIL").setLevel(logging.INFO)
Expand Down
11 changes: 8 additions & 3 deletions ingestors/packages/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,9 @@
import tarfile
from pathlib import PurePath

import py7zr
from py7zr.exceptions import ArchiveError

from ingestors.ingestor import Ingestor
from ingestors.support.package import PackageSupport
from ingestors.support.shell import ShellSupport
Expand All @@ -24,9 +27,11 @@ def unpack(self, file_path, entity, temp_dir):
*pure_file_path.parts[1:-1], reconstructed_filename
)

self.exec_command(
"7z", "x", str(pure_file_path), "-y", "-r", "-bb0", "-bd", f"-oc:{temp_dir}"
)
try:
with py7zr.SevenZipFile(str(pure_file_path), mode="r") as z:
z.extractall(path=temp_dir)
except ArchiveError as e:
raise ProcessingException(f"Error: {e}")


class SingleFilePackageIngestor(PackageSupport, Ingestor):
Expand Down
1 change: 1 addition & 0 deletions requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@ numpy<2.0.0 # pinned because otherwise spacy requires an incompatible numpy
fingerprints==1.1.1
fasttext==0.9.2
pika==1.3.2
py7zr==1.0.0

# Development
pytest==8.2.0
Expand Down
2 changes: 1 addition & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@

setup(
name="ingest",
version="4.1.0",
version="4.1.2",
author="Organized Crime and Corruption Reporting Project",
packages=find_packages(exclude=["tests"]),
package_dir={"ingestors": "ingestors"},
Expand Down
Binary file added tests/fixtures/bad7zip.7z
Binary file not shown.
Binary file added tests/fixtures/badrar.rar
Binary file not shown.
Binary file added tests/fixtures/badtar.tar
Binary file not shown.
Binary file added tests/fixtures/badzip.zip
Binary file not shown.
1 change: 1 addition & 0 deletions tests/fixtures/secret.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
This is a secret!
67 changes: 67 additions & 0 deletions tests/test_packages.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
# -*- coding: utf-8 -*-
from pprint import pprint # noqa
from pathlib import Path

from .support import TestCase

Expand All @@ -11,14 +12,80 @@ def test_zip(self):
self.assertEqual(entity.first("processingStatus"), self.manager.STATUS_SUCCESS)
self.assertEqual(entity.schema.name, "Package")

def test_zip_symlink_escape(self):
    # Ingest a ZIP archive whose member is a symlink pointing outside of the
    # archive root, and check what ends up in the emitted entities.
    fixture_path, entity = self.fixture("badzip.zip")

    # Precondition: ensure that the symlink target exists and holds the
    # expected content before the archive is ingested.
    target = Path("/ingestors/tests/fixtures/secret.txt")
    self.assertEqual(target.read_text(), "This is a secret!")

    self.manager.ingest(fixture_path, entity)

    # Python's zipfile handles symlinks that point to files outside of the
    # archive root by treating them as normal files: the extracted member's
    # body is the link target path itself, not the content of the target.
    entities = self.manager.entities
    self.assertEqual(len(entities), 2)
    self.assertEqual(entities[0].first("fileName"), "secret.txt")
    self.assertEqual(
        entities[0].first("bodyText"),
        "/ingestors/tests/fixtures/secret.txt",
    )
    self.assertEqual(entities[1].first("fileName"), "badzip.zip")

def test_rar(self):
    # A well-formed RAR archive should ingest successfully and be
    # classified with the "Package" schema.
    archive_path, package = self.fixture("test-documents.rar")
    self.manager.ingest(archive_path, package)

    status = package.first("processingStatus")
    self.assertEqual(status, self.manager.STATUS_SUCCESS)

    schema_name = package.schema.name
    self.assertEqual(schema_name, "Package")

def test_rar_symlink_escape(self):
    # Ingest a RAR archive whose member is a symlink pointing outside of the
    # archive root, and check what ends up in the emitted entities.
    fixture_path, entity = self.fixture("badrar.rar")

    # Precondition: ensure that the symlink target exists and holds the
    # expected content before the archive is ingested.
    target = Path("/ingestors/tests/fixtures/secret.txt")
    self.assertEqual(target.read_text(), "This is a secret!")

    self.manager.ingest(fixture_path, entity)

    # rarfile handles symlinks that point to files outside of the archive
    # root by treating them as normal files: the extracted member's body is
    # the link target path itself, not the content of the target.
    entities = self.manager.entities
    self.assertEqual(len(entities), 2)
    self.assertEqual(entities[0].first("fileName"), "secret.txt")
    self.assertEqual(
        entities[0].first("bodyText"),
        "/ingestors/tests/fixtures/secret.txt",
    )
    self.assertEqual(entities[1].first("fileName"), "badrar.rar")

def test_tar(self):
    # A well-formed TAR archive should ingest successfully and be
    # classified with the "Package" schema.
    archive_path, package = self.fixture("test-documents.tar")
    self.manager.ingest(archive_path, package)

    status = package.first("processingStatus")
    self.assertEqual(status, self.manager.STATUS_SUCCESS)

    schema_name = package.schema.name
    self.assertEqual(schema_name, "Package")

def test_tar_symlink_escape(self):
    # Ingest a TAR archive whose member is a symlink pointing outside of the
    # archive root, and check what ends up in the emitted entities.
    fixture_path, entity = self.fixture("badtar.tar")

    # Precondition: ensure that the symlink target exists and holds the
    # expected content before the archive is ingested.
    target = Path("/ingestors/tests/fixtures/secret.txt")
    self.assertEqual(target.read_text(), "This is a secret!")

    self.manager.ingest(fixture_path, entity)

    # Python's tarfile ignores symlinks that point to files outside of the
    # archive root, so only the archive entity itself is emitted.
    entities = self.manager.entities
    self.assertEqual(len(entities), 1)
    self.assertEqual(entities[0].first("fileName"), "badtar.tar")

def test_7zip_symlink_escape(self):
    # Ingest a 7z archive whose member is a symlink pointing outside of the
    # archive root, and check that ingestion of the archive is reported as
    # failed.
    fixture_path, entity = self.fixture("bad7zip.7z")

    # Precondition: ensure that the symlink target exists and holds the
    # expected content before the archive is ingested.
    target = Path("/ingestors/tests/fixtures/secret.txt")
    self.assertEqual(target.read_text(), "This is a secret!")

    self.manager.ingest(fixture_path, entity)

    # py7zr raises an exception if it encounters a symlink that points to a
    # file outside of the archive root, so only the archive entity is
    # emitted and it carries a failure status.
    entities = self.manager.entities
    self.assertEqual(len(entities), 1)
    self.assertEqual(entities[0].first("fileName"), "bad7zip.7z")
    self.assertEqual(entities[0].first("processingStatus"), "failure")