From 321ef6f5db18560a97fd1b8a363c73c9d2f7aabc Mon Sep 17 00:00:00 2001 From: Till Prochaska <1512805+tillprochaska@users.noreply.github.com> Date: Thu, 12 Jun 2025 15:06:30 +0200 Subject: [PATCH 1/3] Properly handle 7z archives Co-authored-by: catileptic --- ingestors/packages/__init__.py | 11 ++++-- requirements.txt | 1 + tests/fixtures/bad7zip.7z | Bin 0 -> 170 bytes tests/fixtures/badrar.rar | Bin 0 -> 115 bytes tests/fixtures/badtar.tar | Bin 0 -> 10240 bytes tests/fixtures/badzip.zip | Bin 0 -> 206 bytes tests/fixtures/secret.txt | 1 + tests/test_packages.py | 67 +++++++++++++++++++++++++++++++++ 8 files changed, 77 insertions(+), 3 deletions(-) create mode 100644 tests/fixtures/bad7zip.7z create mode 100644 tests/fixtures/badrar.rar create mode 100644 tests/fixtures/badtar.tar create mode 100644 tests/fixtures/badzip.zip create mode 100644 tests/fixtures/secret.txt diff --git a/ingestors/packages/__init__.py b/ingestors/packages/__init__.py index 4666d8a67..f1da6b99c 100644 --- a/ingestors/packages/__init__.py +++ b/ingestors/packages/__init__.py @@ -4,6 +4,9 @@ import tarfile from pathlib import PurePath +import py7zr +from py7zr.exceptions import ArchiveError + from ingestors.ingestor import Ingestor from ingestors.support.package import PackageSupport from ingestors.support.shell import ShellSupport @@ -24,9 +27,11 @@ def unpack(self, file_path, entity, temp_dir): *pure_file_path.parts[1:-1], reconstructed_filename ) - self.exec_command( - "7z", "x", str(pure_file_path), "-y", "-r", "-bb0", "-bd", f"-oc:{temp_dir}" - ) + try: + with py7zr.SevenZipFile(str(pure_file_path), mode="r") as z: + z.extractall(path=temp_dir) + except ArchiveError as e: + raise ProcessingException(f"Error: {e}") class SingleFilePackageIngestor(PackageSupport, Ingestor): diff --git a/requirements.txt b/requirements.txt index af4b11c5b..af83836b2 100644 --- a/requirements.txt +++ b/requirements.txt @@ -14,6 +14,7 @@ numpy<2.0.0 # pinned because otherwise spacy requires an incompatible numpy fingerprints==1.1.1 fasttext==0.9.2 pika==1.3.2 +py7zr==1.0.0 # Development pytest==8.2.0 diff --git a/tests/fixtures/bad7zip.7z b/tests/fixtures/bad7zip.7z new file mode 100644 index 0000000000000000000000000000000000000000..6e6ced3ab5abd3a4d818e9798dcc4c975dac053d GIT binary patch literal 170 zcmXr7+Ou9=hJj_(BP#<91_(%k(&4k*&M`75>u2Vrrxusw7ZvN50O?}=w9JZ<(xTL2 z{o>T*qSO+-l8O=rMiw>(MotX|c5X%nMny$N1|Ag#4lc%n?tY;R46KZjJWw;il%P06 wF+(atGD8uNEMd?C;tC*UV31@1$%}9?G6V(5Y_qs^n^BaFkwKy1?Lr0y0H6yZI{*Lx literal 0 HcmV?d00001 diff --git a/tests/fixtures/badrar.rar b/tests/fixtures/badrar.rar new file mode 100644 index 0000000000000000000000000000000000000000..a9b8142acf4a8e9b2bc0adb094e720dcdbc8074c GIT binary patch literal 115 zcmWGaEK-zWXJjy*wDl<$BP$yND)YI0F( ziC#%X2^X_)-9(R!is@op8mx>AD*Bmu>8Zsf`9;P0B|y4ZKP|JOq_iltSRbKBwmdA5 InU#eB0QCSN5dZ)H literal 0 HcmV?d00001 diff --git a/tests/fixtures/badtar.tar b/tests/fixtures/badtar.tar new file mode 100644 index 0000000000000000000000000000000000000000..4e00147b7c4f29803132cb4a7f327d23851964da GIT binary patch literal 10240 zcmeIwK?=hl5QSln;t8U$7|&CRE!kAi3FP=Sg>F(Xi!`**e>Q_rQNNeYsKd+F3f`-w z*IM`Ix87Uldh3RMSr>vmDeVuxSv_DXa?V<-oRYN8+nR6f{o%RJlz7QeI+>f#-}8he z;1I$(_t*G6|N0{gHGgNu`z;npxfrRvnCm#plw Date: Thu, 12 Jun 2025 15:42:25 +0200 Subject: [PATCH 2/3] =?UTF-8?q?Bump=20version:=204.1.0=20=E2=86=92=204.1.2?= =?UTF-8?q?-rc1?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .bumpversion.cfg | 2 +- ingestors/__init__.py | 2 +- setup.py | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/.bumpversion.cfg b/.bumpversion.cfg index 95c3afba6..5810962dc 100644 --- a/.bumpversion.cfg +++ b/.bumpversion.cfg @@ -1,5 +1,5 @@ [bumpversion] -current_version = 4.1.0 +current_version = 4.1.2-rc1 tag_name = {new_version} commit = True tag = True diff --git a/ingestors/__init__.py b/ingestors/__init__.py index 586621e70..979ce9981 100644 --- a/ingestors/__init__.py +++ b/ingestors/__init__.py @@ -2,7 +2,7 @@ import logging -__version__ = "4.1.0" +__version__ = "4.1.2-rc1" logging.getLogger("chardet").setLevel(logging.INFO) logging.getLogger("PIL").setLevel(logging.INFO) diff --git a/setup.py b/setup.py index 9173a65ae..6c4b18af1 100644 --- a/setup.py +++ b/setup.py @@ -5,7 +5,7 @@ setup( name="ingest", - version="4.1.0", + version="4.1.2-rc1", author="Organized Crime and Corruption Reporting Project", packages=find_packages(exclude=["tests"]), package_dir={"ingestors": "ingestors"}, From ef90ce520e4def239b6eeb7a9718899854d47c2f Mon Sep 17 00:00:00 2001 From: Till Prochaska <1512805+tillprochaska@users.noreply.github.com> Date: Thu, 12 Jun 2025 18:11:54 +0200 Subject: [PATCH 3/3] =?UTF-8?q?Bump=20version:=204.1.2-rc1=20=E2=86=92=204?= =?UTF-8?q?.1.2?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .bumpversion.cfg | 2 +- ingestors/__init__.py | 2 +- setup.py | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/.bumpversion.cfg b/.bumpversion.cfg index 5810962dc..ba66cb5e2 100644 --- a/.bumpversion.cfg +++ b/.bumpversion.cfg @@ -1,5 +1,5 @@ [bumpversion] -current_version = 4.1.2-rc1 +current_version = 4.1.2 tag_name = {new_version} commit = True tag = True diff --git a/ingestors/__init__.py b/ingestors/__init__.py index 979ce9981..4a8917cda 100644 --- a/ingestors/__init__.py +++ b/ingestors/__init__.py @@ -2,7 +2,7 @@ import logging -__version__ = "4.1.2-rc1" +__version__ = "4.1.2" logging.getLogger("chardet").setLevel(logging.INFO) logging.getLogger("PIL").setLevel(logging.INFO) diff --git a/setup.py b/setup.py index 6c4b18af1..517c6dfc1 100644 --- a/setup.py +++ b/setup.py @@ -5,7 +5,7 @@ setup( name="ingest", - version="4.1.2-rc1", + version="4.1.2", author="Organized Crime and Corruption Reporting Project", packages=find_packages(exclude=["tests"]), package_dir={"ingestors": "ingestors"},