Skip to content

Commit 5fd1e91

Browse files
committed
MAINT: simplify file identifiers generation
1 parent 6df64af commit 5fd1e91

File tree

1 file changed

+28
-21
lines changed

1 file changed

+28
-21
lines changed

pypdf/_writer.py

Lines changed: 28 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -33,6 +33,7 @@
3333
import enum
3434
import hashlib
3535
import re
36+
import time
3637
import uuid
3738
import warnings
3839
from io import BytesIO, FileIO, IOBase
@@ -136,13 +137,6 @@ class ObjectDeletionFlag(enum.IntFlag):
136137
ALL_ANNOTATIONS = enum.auto()
137138

138139

139-
def _rolling_checksum(stream: BytesIO, blocksize: int = 65536) -> str:
140-
hash = hashlib.md5()
141-
for block in iter(lambda: stream.read(blocksize), b""):
142-
hash.update(block)
143-
return hash.hexdigest()
144-
145-
146140
class PdfWriter:
147141
"""
148142
Write a PDF file out, given pages produced by another class.
@@ -206,6 +200,7 @@ def __init__(
206200

207201
self._encryption: Optional[Encryption] = None
208202
self._encrypt_entry: Optional[DictionaryObject] = None
203+
self._ID: Union[ArrayObject, None] = None
209204

210205
def __enter__(self) -> "PdfWriter":
211206
"""Store that writer is initialized by 'with'."""
@@ -1128,25 +1123,35 @@ def cloneDocumentFromReader(
11281123
)
11291124
self.clone_document_from_reader(reader, after_page_append)
11301125

1131-
def _compute_document_identifier_from_content(self) -> ByteStringObject:
1132-
stream = BytesIO()
1133-
self._write_pdf_structure(stream)
1134-
stream.seek(0)
1135-
return ByteStringObject(_rolling_checksum(stream).encode("utf8"))
1126+
def _compute_document_identifier(self) -> ByteStringObject:
1127+
md5 = hashlib.md5()
1128+
md5.update(str(time.time()).encode("utf-8"))
1129+
md5.update(str(self.fileobj).encode("utf-8"))
1130+
md5.update(str(len(self._objects)).encode("utf-8"))
1131+
if hasattr(self, "_info"):
1132+
for k, v in cast(DictionaryObject, self._info.get_object()).items():
1133+
md5.update(f"{k}={v}".encode())
1134+
return ByteStringObject(md5.hexdigest().encode("utf-8"))
11361135

11371136
def generate_file_identifiers(self) -> None:
11381137
"""
11391138
Generate an identifier for the PDF that will be written.
11401139
11411140
The only point of this is ensuring uniqueness. Reproducibility is not
1142-
required; see 14.4 "File Identifiers".
1143-
"""
1144-
if hasattr(self, "_ID") and self._ID and len(self._ID) == 2:
1145-
ID_1 = self._ID[0]
1141+
required.
1142+
When a file is first written, both identifiers shall be set to the same value.
1143+
If both identifiers match when a file reference is resolved, it is very
1144+
likely that the correct and unchanged file has been found. If only the first
1145+
identifier matches, a different version of the correct file has been found.
1146+
See 14.4 "File Identifiers".
1147+
"""
1148+
if self._ID:
1149+
id1 = self._ID[0]
1150+
id2 = self._compute_document_identifier()
11461151
else:
1147-
ID_1 = self._compute_document_identifier_from_content()
1148-
ID_2 = self._compute_document_identifier_from_content()
1149-
self._ID = ArrayObject((ID_1, ID_2))
1152+
id1 = self._compute_document_identifier()
1153+
id2 = ByteStringObject(id1.original_bytes)
1154+
self._ID = ArrayObject((id1, id2))
11501155

11511156
def encrypt(
11521157
self,
@@ -1230,7 +1235,9 @@ def encrypt(
12301235
if not use_128bit:
12311236
alg = EncryptAlgorithm.RC4_40
12321237
self.generate_file_identifiers()
1233-
self._encryption = Encryption.make(alg, permissions_flag, self._ID[0])
1238+
self._encryption = Encryption.make(
1239+
alg, permissions_flag, cast(ArrayObject, self._ID)[0]
1240+
)
12341241
# in case call `encrypt` again
12351242
entry = self._encryption.write_entry(user_password, owner_password)
12361243
if self._encrypt_entry:
@@ -1331,7 +1338,7 @@ def _write_trailer(self, stream: StreamType, xref_location: int) -> None:
13311338
NameObject(TK.INFO): self._info,
13321339
}
13331340
)
1334-
if hasattr(self, "_ID"):
1341+
if self._ID:
13351342
trailer[NameObject(TK.ID)] = self._ID
13361343
if self._encrypt_entry:
13371344
trailer[NameObject(TK.ENCRYPT)] = self._encrypt_entry.indirect_reference

0 commit comments

Comments
 (0)