import os
from pathlib import Path
from typing import Annotated
import typer
from ..config import config_cli_names, config_kwargs
app = typer.Typer()
[docs]
@app.callback(invoke_without_command=True)
def command(
    # Dataset name; selects raw/<source>, data/<source>.hdf5 and output/<source>/.
    # The value "unified" is special-cased (no raw directory is required).
    source: str = "washington",
    # Architecture name; forwarded to the model helpers and used in the output path.
    arch: str = "flor",
    # Mode flags, checked in this order: transform, opencv, image, then train/test.
    transform: bool = False,
    opencv: bool = False,
    # Path of a single image to run a prediction on (empty string disables it).
    image: str = "",
    train: bool = False,
    test: bool = False,
    # Forwarded unchanged to ``model_test``.
    norm_accentuation: bool = False,
    norm_punctuation: bool = False,
    # Forwarded unchanged to ``model_train``.
    epochs: int = 1000,
    # Forwarded unchanged to ``make_datagen``.
    batch_size: int = 16,
    # Forwarded unchanged to ``make_htr_model``; ``None`` when not given.
    learning_rate: Annotated[float, typer.Option("-lr", "--learning-rate")] = None,  # type: ignore[assignment]
    # Root directory holding the ``raw``, ``data`` and ``output`` sub-trees.
    model_path: Path = Path(
        "/local_disk/", "data", "ai-for-obs", "interim", "model_tmp"
    ),
    # Not read in this body; presumably consumed by the option's own
    # callback configured through ``config_kwargs`` -- TODO confirm.
    config: Annotated[Path, typer.Option(*config_cli_names, **config_kwargs)] = Path(
        "dawsonia.toml"
    ),
):  # pylint: disable=unused-argument, too-many-arguments, too-many-locals
    """MLops: transform data, train, test"""
    # NOTE: the docstring above is kept short on purpose; Typer displays the
    # callback docstring as the CLI help text, so details live in comments.
    #
    # Raises:
    #   NotADirectoryError: ``source`` is not "unified" and raw/<source> is missing.
    #   FileNotFoundError: train/test mode with neither the HDF5 dataset nor the
    #       checkpoint file present.

    # Paths
    model_path = model_path.expanduser()
    raw_path = Path(model_path, "raw", source)
    source_path = Path(model_path, "data", f"{source}.hdf5")
    output_path = Path(model_path, "output", source, arch)
    checkpoint_path = Path(output_path, "checkpoint_weights.hdf5")

    if source != "unified" and not raw_path.resolve().exists():
        raise NotADirectoryError(f"Expected {raw_path = }. Check --source argument.")

    # Heavy ML imports are deferred until the arguments have been validated.
    import tensorflow  # noqa

    from dawsonia.ml.data import preproc
    from dawsonia.ml.ml import (
        CHARSET_BASE,
        INPUT_SIZE,
        MAX_TEXT_LENGTH,
        make_datagen,
        make_htr_model,
        model_predict,
        model_test,
        model_train,
    )

    print("charset:", CHARSET_BASE)

    if transform:
        from dawsonia.ml.data.reader import Dataset

        print(f"{source} dataset will be transformed...")
        if source == "unified":
            # The unified dataset is built from the directory that holds the
            # per-source HDF5 files rather than from a raw directory.
            Dataset(source=source_path.parent, name=source).create_unified_dataset(
                source_path,
                INPUT_SIZE,
                MAX_TEXT_LENGTH,
            )
        else:
            ds = Dataset(source=raw_path, name=source)
            ds.read_partitions()
            ds.save_partitions(source_path, INPUT_SIZE, MAX_TEXT_LENGTH)

    elif opencv:
        import cv2
        import h5py

        # Only the first 256 test samples are loaded for inspection.
        with h5py.File(source_path, "r") as hdf:
            hdf_data = hdf["test"]["dt"][:256]
            hdf_ground_truth = hdf["test"]["gt"][:256]

        predicts_hdf = [""] * len(hdf_data)
        if (predict_file := output_path / "predict.txt").is_file():
            with predict_file.open(encoding="utf-8") as fp:
                # Keep the text after the 5-character prefix of "TE_P" lines.
                predicts_hdf = [line[5:] for line in fp if line.startswith("TE_P")]

        # Iterate over samples (first axis). The previous ``range(hdf_data.size)``
        # counted every array element and indexed past the last sample.
        for x, sample in enumerate(hdf_data):
            # predict.txt may hold fewer predictions than loaded samples.
            prediction = predicts_hdf[x] if x < len(predicts_hdf) else ""
            print(f"Image shape:\t{sample.shape}")
            print(f"Ground truth:\t{hdf_ground_truth[x].decode()}")
            print(f"Predict:\t{prediction}\n")
            cv2.imshow("img", preproc.adjust_to_see(sample))
            cv2.waitKey(0)  # wait for a key press before the next sample

    elif image:
        import cv2

        predicts_image, probabilities = model_predict(
            image, arch, checkpoint_path, INPUT_SIZE, MAX_TEXT_LENGTH, CHARSET_BASE
        )

        print("\n####################################")
        for i, (pred, prob) in enumerate(zip(predicts_image, probabilities)):
            print("\nProb. - Predict")
            for pred_value, prob_value in zip(pred, prob):
                print(f"{prob_value:.4f} - {pred_value}")
            cv2.imshow(f"Image {i + 1}", cv2.imread(image))
        print("\n####################################")
        cv2.waitKey(0)

    else:
        # Explicit check instead of ``assert`` so it still runs under ``python -O``
        # and reports which files were expected.
        if not (source_path.is_file() or checkpoint_path.is_file()):
            raise FileNotFoundError(
                f"Neither the dataset file {source_path} nor the checkpoint file "
                f"{checkpoint_path} exists. Run with --transform first or check "
                "--source / --arch / --model-path."
            )
        output_path.mkdir(parents=True, exist_ok=True)

        dtgen = make_datagen(
            batch_size,
            source_path,
            MAX_TEXT_LENGTH,
            CHARSET_BASE,
        )
        model = make_htr_model(
            arch,
            checkpoint_path,
            vocab_size=dtgen.tokenizer.vocab_size,
            # Test mode only when testing without training in the same run.
            test_mode=test and not train,
            stop_tolerance=20,
            reduce_tolerance=15,
            learning_rate=learning_rate,
        )

        # ``train`` takes precedence: with both flags set only training runs.
        if train:
            model_train(epochs, output_path, checkpoint_path, model, dtgen)
        elif test:
            model_test(norm_accentuation, norm_punctuation, output_path, model, dtgen)