Source code for dawsonia.ml.data.evaluation
"""Tools for computing metrics from predicted data and labels (string vs. string).
* Computes Optical Character Recognition (OCR) metrics with editdistance.
"""
# pylint: skip-file
import string
import unicodedata
import editdistance
import numpy as np
[docs]
def ocr_metrics(
    predicts, ground_truth, norm_accentuation=False, norm_punctuation=False
):
    """Calculate Character Error Rate (CER), Word Error Rate (WER) and Sequence
    Error Rate (SER).

    Each prediction is paired with the ground truth at the same position
    (pairing stops at the shorter of the two inputs). Both strings are
    lower-cased, optionally normalised, and compared at three granularities:
    characters, whitespace-separated words, and the whole string. Every
    per-pair rate is the edit distance divided by the length of the longer
    of the two token sequences.

    Args:
        predicts: Sequence of predicted strings.
        ground_truth: Sequence of reference strings.
        norm_accentuation: If true, decompose with NFKD and drop every
            non-ASCII character before comparing.
        norm_punctuation: If true, remove all ``string.punctuation``
            characters before comparing.

    Returns:
        ``(1, 1, 1)`` when either input is empty; otherwise a NumPy array
        ``[cer, wer, ser]`` holding the mean of each rate over all pairs.
    """
    # len() rather than truthiness: the inputs may be NumPy arrays, whose
    # truth value is ambiguous.
    if len(predicts) == 0 or len(ground_truth) == 0:
        return (1, 1, 1)

    # The translation table is loop-invariant, so build it once.
    punctuation_table = (
        str.maketrans("", "", string.punctuation) if norm_punctuation else None
    )

    def normalise(text):
        text = text.lower()
        if norm_accentuation:
            text = (
                unicodedata.normalize("NFKD", text)
                .encode("ASCII", "ignore")
                .decode("ASCII")
            )
        if punctuation_table is not None:
            text = text.translate(punctuation_table)
        return text

    cer, wer, ser = [], [], []
    for pd, gt in zip(predicts, ground_truth):
        pd, gt = normalise(pd), normalise(gt)

        # Character level
        cer.append(_error_rate(list(pd), list(gt)))
        # Word level
        wer.append(_error_rate(pd.split(), gt.split()))
        # Sequence level: the whole string is a single token
        ser.append(_error_rate([pd], [gt]))

    return np.mean([cer, wer, ser], axis=1)


def _error_rate(pred_tokens, truth_tokens):
    """Return the edit distance of two token lists divided by the longer length."""
    if pred_tokens == truth_tokens:
        # Identical sequences have distance 0. This also covers the case
        # where both are empty, which would otherwise divide by zero.
        return 0.0
    # The lists differ, so at least one is non-empty and the divisor is > 0.
    longest = max(len(pred_tokens), len(truth_tokens))
    return editdistance.eval(pred_tokens, truth_tokens) / longest