Skip to content

Commit c864f4e

Browse files
authored
BUG: Process 2bits and 4bits images (#1967)
Closes #1954
1 parent e897809 commit c864f4e

File tree

2 files changed

+55
-12
lines changed

2 files changed

+55
-12
lines changed

pypdf/filters.py

Lines changed: 36 additions & 12 deletions
Original file line number | Diff line number | Diff line change
@@ -641,7 +641,9 @@ def decodeStreamData(stream: Any) -> Union[str, bytes]: # deprecated
641641
return decode_stream_data(stream)
642642

643643

644-
mode_str_type: TypeAlias = Literal["", "1", "RGB", "P", "L", "RGBA", "CMYK"]
644+
mode_str_type: TypeAlias = Literal[
645+
"", "1", "RGB", "2bits", "4bits", "P", "L", "RGBA", "CMYK"
646+
]
645647

646648

647649
def _get_imagemode(
@@ -673,6 +675,8 @@ def _get_imagemode(
673675

674676
mode_map = {
675677
"1bit": "1", # 0 will be used for 1 bit
678+
"2bit": "2bits", # 2 bits images
679+
"4bit": "4bits", # 4 bits
676680
"/DeviceGray": "L",
677681
"palette": "P", # reserved for color_components alignment
678682
"/DeviceRGB": "RGB",
@@ -718,6 +722,24 @@ def _handle_flate(
718722
Process image encoded in flateEncode
719723
Returns img, image_format, extension
720724
"""
725+
726+
def bits2byte(data: bytes, size: Tuple[int, int], bits: int) -> bytes:
    """
    Expand packed sub-byte samples into one byte per sample.

    Samples are read most-significant-bit first from ``data``. Each image
    row starts on a fresh byte, so any unused low-order bits at the end of
    a row are skipped.

    Args:
        data: packed sample data.
        size: image size as ``(width, height)`` in samples.
        bits: number of bits per sample (the callers use 2 and 4).

    Returns:
        ``width * height`` bytes, each holding one sample value in the
        range ``0 .. 2**bits - 1``, in row-major order.
    """
    width, height = size
    # Exactly ``bits`` low-order ones; (2 << bits) - 1 would be one bit too
    # wide and let the neighbouring sample leak into the result.
    mask = (1 << bits) - 1
    first_shift = 8 - bits  # shift that selects the leftmost sample of a byte
    nbuff = bytearray(width * height)
    by = 0
    bit = first_shift
    for y in range(height):
        # Rows are byte aligned: if the previous row stopped part-way through
        # a byte (including with a single sample left, i.e. bit == 0), move
        # on to the next byte before reading this row.
        if bit != first_shift:
            by += 1
            bit = first_shift
        row_start = y * width
        for x in range(width):
            nbuff[row_start + x] = (data[by] >> bit) & mask
            bit -= bits
            if bit < 0:
                by += 1
                bit = first_shift
    return bytes(nbuff)
742+
721743
extension = ".png" # mime_type = "image/png"
722744
lookup: Any
723745
base: Any
@@ -726,6 +748,12 @@ def _handle_flate(
726748
color_space, base, hival, lookup = (
727749
value.get_object() for value in color_space
728750
)
751+
if mode == "2bits":
752+
mode = "P"
753+
data = bits2byte(data, size, 2)
754+
elif mode == "4bits":
755+
mode = "P"
756+
data = bits2byte(data, size, 4)
729757
img = Image.frombytes(mode, size, data)
730758
if color_space == "/Indexed":
731759
from .generic import ByteStringObject
@@ -820,8 +848,8 @@ def _handle_jpx(
820848
):
821849
# https://pillow.readthedocs.io/en/stable/handbook/concepts.html#modes
822850
mode: mode_str_type = "RGB"
823-
if x_object_obj.get("/BitsPerComponent", 8) == 1:
824-
mode = _get_imagemode("1bit", 0, "")
851+
if x_object_obj.get("/BitsPerComponent", 8) < 8:
852+
mode = _get_imagemode(f"{x_object_obj.get('/BitsPerComponent', 8)}bit", 0, "")
825853
else:
826854
mode = _get_imagemode(
827855
color_space,
@@ -842,7 +870,11 @@ def _handle_jpx(
842870
lfilters = filters[-1] if isinstance(filters, list) else filters
843871
if lfilters == FT.FLATE_DECODE:
844872
img, image_format, extension = _handle_flate(
845-
size, data, mode, color_space, colors
873+
size,
874+
data,
875+
mode,
876+
color_space,
877+
colors,
846878
)
847879
elif lfilters in (FT.LZW_DECODE, FT.ASCII_85_DECODE, FT.CCITT_FAX_DECODE):
848880
# I'm not sure if the following logic is correct.
@@ -898,14 +930,6 @@ def _handle_jpx(
898930
# TODO : implement mask
899931
if alpha.mode != "L":
900932
alpha = alpha.convert("L")
901-
scale = x_object_obj[IA.S_MASK].get("/Decode", [0.0, 1.0])
902-
if (scale[1] - scale[0]) != 1.0:
903-
alpha = alpha.point(
904-
[
905-
round(255.0 * (v / 255.0 * (scale[1] - scale[0]) + scale[0]))
906-
for v in range(256)
907-
]
908-
)
909933
if img.mode == "P":
910934
img = img.convert("RGB")
911935
img.putalpha(alpha)

tests/test_filters.py

Lines changed: 19 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -438,3 +438,22 @@ def test_cascaded_filters_images():
438438
for p in reader.pages:
439439
for i in p.images:
440440
_ = i.name, i.image
441+
442+
443+
@pytest.mark.enable_socket()
def test_2bits_image():
    """From #1954, test with 2bits image. TODO: 4bits also"""
    # Download the sample PDF attached to the issue and open it.
    pdf_url = "https://github.com/py-pdf/pypdf/files/12050253/tt.pdf"
    pdf_name = "paid.pdf"
    pdf_bytes = get_pdf_from_url(pdf_url, name=pdf_name)
    reader = PdfReader(BytesIO(pdf_bytes))

    # Download the reference rendering; the helper is named for PDFs but
    # it works for the PNG as well.
    png_url = "https://user-images.githubusercontent.com/4083478/253568117-ca95cc85-9dea-4145-a5e0-032f1c1aa322.png"
    png_name = "Paid.png"
    png_bytes = get_pdf_from_url(png_url, name=png_name)
    reference = Image.open(BytesIO(png_bytes))

    # Compare the first image of the first page with the reference.
    extracted = reader.pages[0].images[0]
    diff = ImageChops.difference(extracted.image, reference)

    # Root of the summed squared channel differences, divided by the
    # number of pixels.
    squared_total = sum(
        c0 * c0 + c1 * c1 + c2 * c2 + c3 * c3 for c0, c1, c2, c3 in diff.getdata()
    )
    pixel_count = diff.size[0] * diff.size[1]
    distance = sqrt(squared_total) / pixel_count
    assert distance < 0.01

0 commit comments

Comments
 (0)