Source code for dawsonia.ml.cli

import os
from pathlib import Path
from typing import Annotated

import typer

from ..config import config_cli_names, config_kwargs

app = typer.Typer()


@app.callback(invoke_without_command=True)
def command(
    source: str = "washington",
    arch: str = "flor",
    transform: bool = False,
    opencv: bool = False,
    image: str = "",
    train: bool = False,
    test: bool = False,
    norm_accentuation: bool = False,
    norm_punctuation: bool = False,
    epochs: int = 1000,
    batch_size: int = 16,
    learning_rate: Annotated[float, typer.Option("-lr", "--learning-rate")] = None,  # type: ignore[assignment]
    model_path: Path = Path(
        "/local_disk/", "data", "ai-for-obs", "interim", "model_tmp"
    ),
    config: Annotated[Path, typer.Option(*config_cli_names, **config_kwargs)] = Path(
        "dawsonia.toml"
    ),
):  # pylint: disable=unused-argument, too-many-arguments, too-many-locals
    """MLops: transform data, train, test"""
    # NOTE: the docstring above is kept short on purpose -- Typer uses the
    # callback docstring as the CLI help text.
    #
    # Exactly one mode runs per invocation, selected in this priority order:
    #   1. --transform : build ``<model_path>/data/<source>.hdf5`` from the raw
    #                    dataset (or from sibling datasets when source is
    #                    "unified").
    #   2. --opencv    : show up to the first 256 test images with their ground
    #                    truth and, if ``predict.txt`` exists, the predictions.
    #   3. --image     : run ``model_predict`` on a single image file and show it.
    #   4. otherwise   : build the data generator and model, then --train
    #                    (takes precedence) or --test.
    #
    # Parameters:
    #   source            dataset name; selects the raw, data and output paths.
    #   arch              architecture name forwarded to the model factory; also
    #                     part of the output path.
    #   norm_accentuation / norm_punctuation
    #                     forwarded unchanged to ``model_test``.
    #   epochs            forwarded to ``model_train``.
    #   batch_size        forwarded to ``make_datagen``.
    #   learning_rate     forwarded to ``make_htr_model``; ``None`` when omitted.
    #   model_path        root directory under which raw/, data/ and output/ live.
    #   config            not read in this function (presumably consumed by the
    #                     option's own callback via ``config_kwargs`` -- confirm).
    #
    # Raises:
    #   NotADirectoryError  the raw dataset directory is missing (non-unified).
    #   FileNotFoundError   train/test mode with neither dataset nor checkpoint.

    # Paths
    model_path = model_path.expanduser()
    raw_path = Path(model_path, "raw", source)
    source_path = Path(model_path, "data", f"{source}.hdf5")
    output_path = Path(model_path, "output", source, arch)
    checkpoint_path = Path(output_path, "checkpoint_weights.hdf5")

    # The "unified" dataset has no raw directory of its own, so it is exempt.
    if source != "unified" and not raw_path.resolve().exists():
        raise NotADirectoryError(f"Expected {raw_path = }. Check --source argument.")

    # Heavy ML dependencies are imported lazily, after the cheap path check.
    import tensorflow  # noqa

    from dawsonia.ml.data import preproc
    from dawsonia.ml.ml import (
        CHARSET_BASE,
        INPUT_SIZE,
        MAX_TEXT_LENGTH,
        make_datagen,
        make_htr_model,
        model_predict,
        model_test,
        model_train,
    )

    print("charset:", CHARSET_BASE)

    if transform:
        from dawsonia.ml.data.reader import Dataset

        print(f"{source} dataset will be transformed...")
        if source == "unified":
            Dataset(source=source_path.parent, name=source).create_unified_dataset(
                source_path,
                INPUT_SIZE,
                MAX_TEXT_LENGTH,
            )
        else:
            ds = Dataset(source=raw_path, name=source)
            ds.read_partitions()
            ds.save_partitions(source_path, INPUT_SIZE, MAX_TEXT_LENGTH)

    elif opencv:
        import cv2
        import h5py

        with h5py.File(source_path, "r") as hdf:
            hdf_data = hdf["test"]["dt"][:256]
            hdf_ground_truth = hdf["test"]["gt"][:256]

        predicts_hdf = [""] * len(hdf_data)

        if (predict_file := output_path / "predict.txt").is_file():
            with predict_file.open(encoding="utf-8") as fp:
                predicts_hdf = [line[5:] for line in fp if line.startswith("TE_P")]

        # Iterate over samples, not ``hdf_data.size``: on an array ``size`` is the
        # total element count, which would index past the last image.
        for x, (sample, truth) in enumerate(zip(hdf_data, hdf_ground_truth)):
            # predict.txt may hold fewer predictions than there are images.
            predicted = predicts_hdf[x] if x < len(predicts_hdf) else ""

            print(f"Image shape:\t{sample.shape}")
            print(f"Ground truth:\t{truth.decode()}")
            print(f"Predict:\t{predicted}\n")

            cv2.imshow("img", preproc.adjust_to_see(sample))
            cv2.waitKey(0)

    elif image:
        import cv2

        predicts_image, probabilities = model_predict(
            image, arch, checkpoint_path, INPUT_SIZE, MAX_TEXT_LENGTH, CHARSET_BASE
        )

        print("\n####################################")
        for i, (pred, prob) in enumerate(zip(predicts_image, probabilities)):
            print("\nProb. - Predict")

            for pred_value, prob_value in zip(pred, prob):
                print(f"{prob_value:.4f} - {pred_value}")

            cv2.imshow(f"Image {i + 1}", cv2.imread(image))
        print("\n####################################")
        cv2.waitKey(0)

    else:
        # Explicit raise instead of ``assert`` so the check survives ``python -O``.
        if not (os.path.isfile(source_path) or os.path.isfile(checkpoint_path)):
            raise FileNotFoundError(
                f"Neither dataset {source_path} nor checkpoint {checkpoint_path} "
                "exists. Run with --transform first or check --source/--arch."
            )

        os.makedirs(output_path, exist_ok=True)

        dtgen = make_datagen(
            batch_size,
            source_path,
            MAX_TEXT_LENGTH,
            CHARSET_BASE,
        )
        model = make_htr_model(
            arch,
            checkpoint_path,
            vocab_size=dtgen.tokenizer.vocab_size,
            # Test mode only when testing without training in the same run.
            test_mode=test and not train,
            stop_tolerance=20,
            reduce_tolerance=15,
            learning_rate=learning_rate,
        )

        if train:
            model_train(epochs, output_path, checkpoint_path, model, dtgen)
        elif test:
            model_test(norm_accentuation, norm_punctuation, output_path, model, dtgen)