Converting PDF files to Zarr

We use Zarr files to store the image files extracted from the PDFs as they are, without an additional decoding and re-encoding step. This has two advantages:

  • Faster read of input data.

  • Works on all platforms. This is not the case with the pdf2image package, which requires PDF-Poppler to be installed.

The following script converts all PDF files under data/raw into Zarr archives under data/raw_zarr, preserving the sub-directory layout.

Requirements: pypdf pymupdf tqdm

"""Convert images in PDF as bytes in ZARR format using MsgPack library

# See also

- Package: https://ome-zarr.readthedocs.io/en/stable/
- Viewer: https://www.napari-hub.org/plugins/napari-ome-zarr
- Blog post: https://www.fabriziomusacchio.com/blog/2022-10-24-Zarr_and_images/

"""

from __future__ import annotations

from pathlib import Path
from typing import Iterator

import zarr
from dawsonia.io import get_station_name, get_year
from numcodecs import MsgPack


def iter_pdfimages(pdf_file: Path) -> Iterator[bytes]:
    """Yield the raw bytes of every image embedded in a PDF, page by page.

    The file is read with pypdf; each yielded value is the ``data``
    attribute of one entry of a page's ``images`` collection.
    """
    from pypdf import PdfReader

    for pdf_page in PdfReader(pdf_file).pages:
        yield from (embedded.data for embedded in pdf_page.images)


def iter_pdfimages2(pdf_file_path) -> Iterator[tuple[bytes, str]]:
    """Yield ``(image_bytes, extension)`` pairs for the pages of a PDF.

    The PDF is opened with PyMuPDF (``fitz``). For each page:

    * more than one embedded image: the whole page is rendered at 150 dpi
      and yielded as a single JPEG (``"jpg"``);
    * exactly one embedded image: the bytes and extension reported by
      ``extract_image`` are yielded, unless the image has a soft mask, in
      which case it is recombined with the mask by ``recover_pix``;
    * no embedded image: nothing is yielded for that page.

    The document is closed when the generator is exhausted or closed, so
    no file handle is left open per converted PDF.
    """
    import fitz

    with fitz.open(pdf_file_path) as pdf_document:
        for page in pdf_document:
            image_list = page.get_images(full=True)

            if len(image_list) > 1:
                # Several images on one page: rasterize the page instead of
                # extracting the individual pieces.
                pix = page.get_pixmap(dpi=150)
                ext = "jpg"
                yield pix.tobytes(ext), ext
                continue

            for img_info in image_list:
                xref = img_info[0]
                try:
                    smask = img_info[1]
                except IndexError:
                    smask = 0

                if smask == 0:
                    imgdict = pdf_document.extract_image(xref)
                    ext = imgdict["ext"]
                    image_bytes = imgdict["image"]
                else:
                    # Can be a thumbnail
                    image_bytes, ext = recover_pix(pdf_document, xref, smask)

                yield image_bytes, ext


def recover_pix(pdf_document, xref, smask):
    """Combine an image with its soft mask and return ``(bytes, ext)``.

    ``xref`` and ``smask`` are the cross-reference numbers of the base image
    and of its mask inside ``pdf_document``. The result is encoded as
    ``"pam"`` when the base pixmap's ``n`` is greater than 3, otherwise as
    ``"png"``.
    """
    import fitz

    # https://github.com/pymupdf/PyMuPDF-Utilities/blob/master/examples/extract-images/extract-from-xref.py
    # https://pymupdf.readthedocs.io/en/latest/recipes-images.html#how-to-handle-image-masks
    base = fitz.Pixmap(pdf_document.extract_image(xref)["image"])
    mask_pix = fitz.Pixmap(pdf_document.extract_image(smask)["image"])
    combined = fitz.Pixmap(base, mask_pix)

    ext = "pam" if base.n > 3 else "png"
    return combined.tobytes(ext), ext


def pdf_to_zarr(pdf_file, zarr_file):
    """Write the page images of ``pdf_file`` into a zipped Zarr archive.

    Every item produced by ``iter_pdfimages2`` becomes a group named
    ``page_<n>`` (numbered from 1) holding a one-element ``byteimage`` array
    with the encoded image bytes (MsgPack object codec) and an ``ext``
    attribute with the image extension. The root group's attributes record
    the page range, the station name and year obtained from ``pdf_file`` via
    ``get_station_name`` / ``get_year``, and the set of extensions found.

    Parameters
    ----------
    pdf_file
        Path of the PDF to convert.
    zarr_file
        ``pathlib.Path`` of the ``.zarr.zip`` file to create. An existing
        file is deleted first and missing parent directories are created.

    Notes
    -----
    If the PDF yields no image at all, an archive with ``last_page`` equal
    to 0 is written instead of failing on an unbound page counter.
    """
    if zarr_file.exists():
        zarr_file.unlink()
    zarr_file.parent.mkdir(parents=True, exist_ok=True)

    image_exts = set()
    last_page = 0  # stays 0 when the PDF yields no image
    store = zarr.ZipStore(zarr_file, mode="w")
    try:
        root = zarr.group(store=store, overwrite=True)

        pages_ext = iter_pdfimages2(pdf_file)
        for last_page, (page, ext) in enumerate(pages_ext, start=1):
            group = root.create_group(f"page_{last_page}", overwrite=True)
            group.array("byteimage", data=[page], object_codec=MsgPack())
            group.attrs["ext"] = ext
            image_exts.add(ext)

        root.attrs.update({
            "first_page": 1,
            "last_page": last_page,
            "station_name": get_station_name(pdf_file),
            "year": get_year(pdf_file),
            # Sorted so the stored metadata does not depend on set ordering.
            "byteimage_exts": tuple(sorted(image_exts)),
            "api": "dawsonia.io.read_book",
        })
    finally:
        # Required to persist metadata; also releases the zip file handle
        # when one of the steps above raises.
        store.close()


if __name__ == "__main__":
    import os

    # Walk data/raw and write one ``<name>.zarr.zip`` per PDF into the
    # matching directory under data/raw_zarr.
    for root, dirs, files in os.walk("data/raw"):
        for file in files:
            pdf_file = Path(root) / file
            if pdf_file.suffix != ".pdf":
                continue

            target_dir = Path(root.replace("data/raw", "data/raw_zarr"))
            zarr_file = target_dir / pdf_file.with_suffix(".zarr.zip").name
            pdf_to_zarr(pdf_file, zarr_file)