# Source code for dawsonia.ml.cli

import os
from pathlib import Path
from typing import Annotated

import typer

from ..config import config_cli_names, config_kwargs

# Typer application object for this module; the `command` function below is
# registered on it as the callback (and runs even without a sub-command).
app = typer.Typer()



# [docs]
@app.callback(invoke_without_command=True)
def command(
    source: str = "washington",
    arch: str = "flor",
    transform: bool = False,
    opencv: bool = False,
    image: str = "",
    train: bool = False,
    test: bool = False,
    norm_accentuation: bool = False,
    norm_punctuation: bool = False,
    epochs: int = 1000,
    batch_size: int = 16,
    learning_rate: Annotated[float, typer.Option("-lr", "--learning-rate")] = None,  # type: ignore[assignment]
    model_path: Path = Path(
        "/local_disk/", "data", "ai-for-obs", "interim", "model_tmp"
    ),
    config: Annotated[Path, typer.Option(*config_cli_names, **config_kwargs)] = Path(
        "dawsonia.toml"
    ),
):  # pylint: disable=unused-argument, too-many-arguments, too-many-locals
    """MLops: transform data, train, test"""
    # NOTE: the docstring above is deliberately left untouched; Typer presumably
    # shows it as the CLI help text, so extended documentation lives in comments.
    #
    # Exactly one mode runs per invocation, chosen in this priority order:
    #   1. --transform : convert the raw dataset of `source` into
    #                    <model_path>/data/<source>.hdf5.
    #   2. --opencv    : display up to the first 256 test samples of the HDF5
    #                    file together with ground truth and, when
    #                    <output_path>/predict.txt exists, its "TE_P" lines.
    #   3. --image     : run `model_predict` on one image file and print the
    #                    predictions with their probabilities.
    #   4. otherwise   : build the data generator and model, then train
    #                    (--train) or, failing that, test (--test).
    #
    # Parameters:
    #   source, arch       : select the dataset name and model architecture;
    #                        both are used to build the paths below.
    #   norm_accentuation,
    #   norm_punctuation   : forwarded unchanged to `model_test`.
    #   epochs             : forwarded to `model_train`.
    #   batch_size         : forwarded to `make_datagen`.
    #   learning_rate      : forwarded to `make_htr_model`; None when not given.
    #   model_path         : root directory for raw data, HDF5 data and output.
    #   config             : not used in this function body (see pylint disable).
    #
    # Raises:
    #   NotADirectoryError : `source` is not "unified" and its raw directory
    #                        is missing.
    #   FileNotFoundError  : in train/test mode, neither the HDF5 dataset nor
    #                        the checkpoint file exists.

    # Paths
    model_path = model_path.expanduser()
    raw_path = Path(model_path, "raw", source)
    source_path = Path(model_path, "data", f"{source}.hdf5")
    output_path = Path(model_path, "output", source, arch)
    checkpoint_path = Path(output_path, "checkpoint_weights.hdf5")

    # The "unified" dataset is assembled from already-transformed data, so it
    # has no raw directory of its own to validate.
    if source != "unified" and not raw_path.resolve().exists():
        raise NotADirectoryError(f"Expected {raw_path = }. Check --source argument.")

    # Heavy imports are deferred until after the cheap path validation above.
    import tensorflow  # noqa
    from dawsonia.ml.data import preproc
    from dawsonia.ml.ml import (
        CHARSET_BASE,
        INPUT_SIZE,
        MAX_TEXT_LENGTH,
        make_datagen,
        make_htr_model,
        model_predict,
        model_test,
        model_train,
    )

    print("charset:", CHARSET_BASE)

    if transform:
        from dawsonia.ml.data.reader import Dataset

        print(f"{source} dataset will be transformed...")

        if source == "unified":
            Dataset(source=source_path.parent, name=source).create_unified_dataset(
                source_path,
                INPUT_SIZE,
                MAX_TEXT_LENGTH,
            )
        else:
            ds = Dataset(source=raw_path, name=source)
            ds.read_partitions()
            ds.save_partitions(source_path, INPUT_SIZE, MAX_TEXT_LENGTH)

    elif opencv:
        import cv2
        import h5py

        # Slicing copies the samples into memory, so they stay usable after
        # the file is closed.
        with h5py.File(source_path, "r") as hdf:
            hdf_data = hdf["test"]["dt"][:256]
            hdf_ground_truth = hdf["test"]["gt"][:256]

        predicts_hdf = [""] * len(hdf_data)

        if (predict_file := output_path / "predict.txt").is_file():
            with predict_file.open(encoding="utf-8") as fp:
                # Keep only prediction lines, dropping the 5-character prefix.
                predicts_hdf = [line[5:] for line in fp if line.startswith("TE_P")]

        # Iterate over samples, not over `hdf_data.size` (which counts every
        # array element and would index far past the last image).
        for idx, (sample, truth) in enumerate(zip(hdf_data, hdf_ground_truth)):
            # predict.txt may list fewer predictions than there are samples.
            prediction = predicts_hdf[idx] if idx < len(predicts_hdf) else ""

            print(f"Image shape:\t{sample.shape}")
            print(f"Ground truth:\t{truth.decode()}")
            print(f"Predict:\t{prediction}\n")

            cv2.imshow("img", preproc.adjust_to_see(sample))
            cv2.waitKey(0)

    elif image:
        import cv2

        predicts_image, probabilities = model_predict(
            image, arch, checkpoint_path, INPUT_SIZE, MAX_TEXT_LENGTH, CHARSET_BASE
        )

        # The same input image is shown in every window; read it only once.
        shown_image = cv2.imread(image)

        print("\n####################################")
        for i, (pred, prob) in enumerate(zip(predicts_image, probabilities)):
            print("\nProb.  - Predict")

            for pred_value, prob_value in zip(pred, prob):
                print(f"{prob_value:.4f} - {pred_value}")

            cv2.imshow(f"Image {i + 1}", shown_image)
        print("\n####################################")
        cv2.waitKey(0)

    else:
        # Explicit check instead of `assert`, which is stripped under -O and
        # gives no hint about which files were expected.
        if not (os.path.isfile(source_path) or os.path.isfile(checkpoint_path)):
            raise FileNotFoundError(
                f"Neither the dataset {source_path} nor the checkpoint "
                f"{checkpoint_path} exists. Run with --transform first or "
                "check --source / --arch / --model-path."
            )
        os.makedirs(output_path, exist_ok=True)

        dtgen = make_datagen(
            batch_size,
            source_path,
            MAX_TEXT_LENGTH,
            CHARSET_BASE,
        )

        model = make_htr_model(
            arch,
            checkpoint_path,
            vocab_size=dtgen.tokenizer.vocab_size,
            # Test mode only applies when testing without training.
            test_mode=test and not train,
            stop_tolerance=20,
            reduce_tolerance=15,
            learning_rate=learning_rate,
        )

        # --train takes precedence when both flags are given.
        if train:
            model_train(epochs, output_path, checkpoint_path, model, dtgen)

        elif test:
            model_test(norm_accentuation, norm_punctuation, output_path, model, dtgen)