Converting PDF files to Zarr
We use Zarr files to store the image files extracted from each PDF, without any additional decoding/encoding step. This has two advantages:
Faster read of input data.
Works on all platforms. This is not the case with the
pdf2image package, which requires PDF-Poppler to be installed.
The following script converts all PDF files found under data/raw into Zarr archives under
data/raw_zarr, mirroring the directory layout.
Requirements: pypdf pymupdf tqdm (the script additionally imports zarr, numcodecs and dawsonia)
"""Convert images in PDF as bytes in ZARR format using MsgPack library
# See also
- Package: https://ome-zarr.readthedocs.io/en/stable/
- Viewer: https://www.napari-hub.org/plugins/napari-ome-zarr
- Blog post: https://www.fabriziomusacchio.com/blog/2022-10-24-Zarr_and_images/
"""
from __future__ import annotations
from pathlib import Path
from typing import Iterator
import zarr
from dawsonia.io import get_station_name, get_year
from numcodecs import MsgPack
def iter_pdfimages(pdf_file: Path) -> Iterator[bytes]:
    """Yield the stored bytes of every image embedded in a PDF, using pypdf.

    Pages are visited in document order and, within a page, images are
    yielded in the order pypdf reports them.

    Args:
        pdf_file: Path of the PDF to read.

    Yields:
        The ``data`` attribute of each embedded image object, i.e. the image
        bytes as stored, not the ``image`` attribute of the same object.
    """
    from pypdf import PdfReader

    for pdf_page in PdfReader(pdf_file).pages:
        yield from (embedded.data for embedded in pdf_page.images)
def iter_pdfimages2(pdf_file_path, dpi: int = 150) -> Iterator[tuple[bytes, str]]:
    """Yield ``(image_bytes, extension)`` pairs for the pages of a PDF, using PyMuPDF.

    For each page, in document order:

    * if the page references more than one image, the whole page is rendered
      to a pixmap at ``dpi`` and yielded once as JPEG (``"jpg"``);
    * otherwise each referenced image is extracted as stored; when it carries
      a soft mask, image and mask are recombined by :func:`recover_pix`.

    Args:
        pdf_file_path: Path of the PDF to open.
        dpi: Resolution used when a page has to be rendered. Defaults to 150.

    Yields:
        A tuple of the encoded image bytes and the matching file extension.
    """
    import fitz

    # Open the document as a context manager so the underlying file handle is
    # released once the generator is exhausted or closed.
    with fitz.open(pdf_file_path) as pdf_document:
        for page in pdf_document:
            image_list = page.get_images(full=True)

            if len(image_list) > 1:
                # Several images on one page: render the page as a whole.
                ext = "jpg"
                yield page.get_pixmap(dpi=dpi).tobytes(ext), ext
                continue

            # NOTE(review): a page that references no image yields nothing, so
            # downstream page numbers can drift from the PDF's - confirm that
            # this is intended.
            for img_info in image_list:
                xref = img_info[0]
                # Second entry is the xref of the soft mask; 0 means "none".
                smask = img_info[1] if len(img_info) > 1 else 0

                if smask == 0:
                    imgdict = pdf_document.extract_image(xref)
                    yield imgdict["image"], imgdict["ext"]
                else:
                    # Masked image (the original author notes it can be a
                    # thumbnail): merge base image and mask first.
                    yield recover_pix(pdf_document, xref, smask)
def recover_pix(pdf_document, xref, smask):
    """Combine a PDF image with its soft mask into one encoded image.

    Follows the PyMuPDF recipe for handling image masks:
    https://github.com/pymupdf/PyMuPDF-Utilities/blob/master/examples/extract-images/extract-from-xref.py
    https://pymupdf.readthedocs.io/en/latest/recipes-images.html#how-to-handle-image-masks

    Args:
        pdf_document: Open PyMuPDF document containing both objects.
        xref: Cross-reference number of the base image.
        smask: Cross-reference number of the image's soft mask.

    Returns:
        A tuple ``(image_bytes, ext)``. ``ext`` is ``"pam"`` when the base
        pixmap has more than 3 components per pixel, otherwise ``"png"``.
    """
    import fitz

    base = fitz.Pixmap(pdf_document.extract_image(xref)["image"])
    if base.alpha:
        # Irregular case handled by the upstream recipe: the base image
        # already has an alpha channel, which must be dropped before a mask
        # can be attached.
        base = fitz.Pixmap(base, 0)

    mask = fitz.Pixmap(pdf_document.extract_image(smask)["image"])
    pix = fitz.Pixmap(base, mask)

    ext = "pam" if base.n > 3 else "png"
    return pix.tobytes(ext), ext
def pdf_to_zarr(pdf_file, zarr_file):
    """Write the images of one PDF, as encoded bytes, into a zipped Zarr archive.

    Every item yielded by :func:`iter_pdfimages2` becomes a group named
    ``page_<n>`` (numbered from 1) holding a one-element array ``byteimage``
    and an ``ext`` attribute with the image format. The root group receives
    the attributes ``first_page``, ``last_page``, ``station_name``, ``year``,
    ``byteimage_exts`` and ``api``.

    Args:
        pdf_file: Path of the PDF to convert.
        zarr_file: Path of the archive to create. An existing file is deleted
            first and missing parent directories are created.
    """
    if zarr_file.exists():
        zarr_file.unlink()
    zarr_file.parent.mkdir(parents=True, exist_ok=True)

    image_exts = set()
    # Stays 0 when the PDF yields no image, instead of leaving the loop
    # variable unbound when the root attributes are written.
    last_page = 0

    store = zarr.ZipStore(zarr_file, mode="w")
    try:
        root = zarr.group(store=store, overwrite=True)
        pages_ext = iter_pdfimages2(pdf_file)
        for last_page, (page, ext) in enumerate(pages_ext, start=1):
            group = root.create_group(f"page_{last_page}", overwrite=True)
            # NOTE(review): without an explicit dtype=object NumPy may infer a
            # fixed-width bytes dtype for `[page]`, in which case object_codec
            # would go unused - confirm against the reader before changing.
            group.array("byteimage", data=[page], object_codec=MsgPack())
            group.attrs["ext"] = ext
            image_exts.add(ext)

        root.attrs.update(
            {
                "first_page": 1,
                "last_page": last_page,
                "station_name": get_station_name(pdf_file),
                "year": get_year(pdf_file),
                # Sorted so the stored metadata is reproducible between runs.
                "byteimage_exts": sorted(image_exts),
                "api": "dawsonia.io.read_book",
            }
        )
    finally:
        # Closing is required to persist the metadata; doing it in `finally`
        # also releases the zip file when a page fails to convert.
        store.close()
def zarr_path_for(pdf_file, raw_dir, zarr_dir):
    """Return the archive path under *zarr_dir* that mirrors *pdf_file* under *raw_dir*.

    Only the leading *raw_dir* part of the path is swapped, and the final
    suffix of the file name is replaced by ``.zarr.zip``.

    Args:
        pdf_file: Path of a PDF located somewhere below ``raw_dir``.
        raw_dir: Root directory of the input PDFs.
        zarr_dir: Root directory of the generated archives.

    Raises:
        ValueError: If ``pdf_file`` is not located below ``raw_dir``.
    """
    relative = Path(pdf_file).relative_to(raw_dir)
    return Path(zarr_dir) / relative.with_suffix(".zarr.zip")


def main(raw_dir="data/raw", zarr_dir="data/raw_zarr"):
    """Convert every ``.pdf`` file below *raw_dir* into an archive below *zarr_dir*.

    The directory layout of ``raw_dir`` is reproduced inside ``zarr_dir``.
    """
    import os

    for root, _dirs, files in os.walk(raw_dir):
        for file in files:
            pdf_file = Path(root) / file
            if pdf_file.suffix != ".pdf":
                continue
            pdf_to_zarr(pdf_file, zarr_path_for(pdf_file, raw_dir, zarr_dir))


if __name__ == "__main__":
    main()