BUG: Process 2bits and 4bits images (#1967)

pubpub-zz · web-flow · commit c864f4e90308 · 2023-07-15T17:26:39.000+02:00
Closes #1954
diff --git a/pypdf/filters.py b/pypdf/filters.py
@@ -641,7 +641,9 @@ def decodeStreamData(stream: Any) -> Union[str, bytes]:  # deprecated
     return decode_stream_data(stream)
 
 
-mode_str_type: TypeAlias = Literal["", "1", "RGB", "P", "L", "RGBA", "CMYK"]
+mode_str_type: TypeAlias = Literal[
+    "", "1", "RGB", "2bits", "4bits", "P", "L", "RGBA", "CMYK"
+]
 
 
 def _get_imagemode(
@@ -673,6 +675,8 @@ def _get_imagemode(
 
     mode_map = {
         "1bit": "1",  # 0 will be used for 1 bit
+        "2bit": "2bits",  # 2 bits images
+        "4bit": "4bits",  # 4 bits
         "/DeviceGray": "L",
         "palette": "P",  # reserved for color_components alignment
         "/DeviceRGB": "RGB",
@@ -718,6 +722,24 @@ def _handle_flate(
         Process image encoded in flateEncode
         Returns img, image_format, extension
         """
+
+        def bits2byte(data: bytes, size: Tuple[int, int], bits: int) -> bytes:
+            """
+            Unpack ``bits``-wide samples (MSB first) into one byte per pixel.
+
+            ``size`` is (width, height); each row restarts on a byte boundary.
+            """
+            width, height = size
+            mask = (1 << bits) - 1  # exactly ``bits`` low bits set
+            stride = (width * bits + 7) // 8  # bytes per padded row
+            nbuff = bytearray(width * height)
+            for y in range(height):
+                for x in range(width):
+                    by, off = divmod(x * bits, 8)
+                    shift = 8 - bits - off  # leftmost sample is in the high bits
+                    nbuff[y * width + x] = (data[y * stride + by] >> shift) & mask
+            return bytes(nbuff)
+
         extension = ".png"  # mime_type = "image/png"
         lookup: Any
         base: Any
@@ -726,6 +748,12 @@ def _handle_flate(
             color_space, base, hival, lookup = (
                 value.get_object() for value in color_space
             )
+        if mode == "2bits":
+            mode = "P"
+            data = bits2byte(data, size, 2)
+        elif mode == "4bits":
+            mode = "P"
+            data = bits2byte(data, size, 4)
         img = Image.frombytes(mode, size, data)
         if color_space == "/Indexed":
             from .generic import ByteStringObject
@@ -820,8 +848,8 @@ def _handle_jpx(
     ):
         # https://pillow.readthedocs.io/en/stable/handbook/concepts.html#modes
         mode: mode_str_type = "RGB"
-    if x_object_obj.get("/BitsPerComponent", 8) == 1:
-        mode = _get_imagemode("1bit", 0, "")
+    if x_object_obj.get("/BitsPerComponent", 8) < 8:
+        mode = _get_imagemode(f"{x_object_obj.get('/BitsPerComponent', 8)}bit", 0, "")
     else:
         mode = _get_imagemode(
             color_space,
@@ -842,7 +870,11 @@ def _handle_jpx(
     lfilters = filters[-1] if isinstance(filters, list) else filters
     if lfilters == FT.FLATE_DECODE:
         img, image_format, extension = _handle_flate(
-            size, data, mode, color_space, colors
+            size,
+            data,
+            mode,
+            color_space,
+            colors,
         )
     elif lfilters in (FT.LZW_DECODE, FT.ASCII_85_DECODE, FT.CCITT_FAX_DECODE):
         # I'm not sure if the following logic is correct.
@@ -898,14 +930,6 @@ def _handle_jpx(
             # TODO : implement mask
             if alpha.mode != "L":
                 alpha = alpha.convert("L")
-            scale = x_object_obj[IA.S_MASK].get("/Decode", [0.0, 1.0])
-            if (scale[1] - scale[0]) != 1.0:
-                alpha = alpha.point(
-                    [
-                        round(255.0 * (v / 255.0 * (scale[1] - scale[0]) + scale[0]))
-                        for v in range(256)
-                    ]
-                )
             if img.mode == "P":
                 img = img.convert("RGB")
             img.putalpha(alpha)
diff --git a/tests/test_filters.py b/tests/test_filters.py
@@ -438,3 +438,22 @@ def test_cascaded_filters_images():
     for p in reader.pages:
         for i in p.images:
             _ = i.name, i.image
+
+
+@pytest.mark.enable_socket()
+def test_2bits_image():
+    """Compare the 2-bit image extracted from the #1954 PDF with a reference PNG.
+
+    TODO: add the same check for a 4bits image.
+    """
+    pdf_url = "https://github.com/py-pdf/pypdf/files/12050253/tt.pdf"
+    png_url = "https://user-images.githubusercontent.com/4083478/253568117-ca95cc85-9dea-4145-a5e0-032f1c1aa322.png"
+    pdf = PdfReader(BytesIO(get_pdf_from_url(pdf_url, name="paid.pdf")))
+    # The reference is a PNG; the download helper is reused for it as-is.
+    expected = Image.open(BytesIO(get_pdf_from_url(png_url, name="Paid.png")))
+    extracted = pdf.pages[0].images[0]
+    delta = ImageChops.difference(extracted.image, expected)
+    # Root of the summed squared channel differences, divided by the pixel count.
+    squared = sum(r * r + g * g + b * b + a * a for r, g, b, a in delta.getdata())
+    width, height = delta.size
+    assert sqrt(squared) / (width * height) < 0.01