Source code for dawsonia.ml.data.evaluation

"""Tools for computing metrics between predictions and labels (string vs. string).

* Computes Optical Character Recognition (OCR) metrics using editdistance.
"""
# pylint: skip-file

import string
import unicodedata

import editdistance
import numpy as np


def _error_rate(pred_tokens, truth_tokens):
    """Return the edit distance of two token sequences over the longer length."""
    longest = max(len(pred_tokens), len(truth_tokens))
    if longest == 0:
        # Both sequences are empty (e.g. everything was stripped by the
        # normalization): they match exactly, and dividing would raise
        # ZeroDivisionError.
        return 0.0
    return editdistance.eval(pred_tokens, truth_tokens) / longest


def ocr_metrics(
    predicts, ground_truth, norm_accentuation=False, norm_punctuation=False
):
    """Calculate Character, Word and Sequence Error Rates (CER, WER, SER).

    Predictions and ground truths are paired positionally with ``zip``, so
    any surplus items in the longer input are ignored. Each pair is
    lower-cased before comparison.

    Parameters
    ----------
    predicts
        Sequence of predicted strings.
    ground_truth
        Sequence of reference strings.
    norm_accentuation
        If true, apply NFKD normalization and drop every character that
        cannot be encoded as ASCII before comparing.
    norm_punctuation
        If true, remove all ``string.punctuation`` characters before
        comparing.

    Returns
    -------
    The tuple ``(1, 1, 1)`` if either input is empty; otherwise a NumPy
    array ``[cer, wer, ser]`` holding the mean of each rate over all pairs.
    A pair in which both sides are empty counts as an error rate of 0.
    """
    if len(predicts) == 0 or len(ground_truth) == 0:
        return (1, 1, 1)

    # Build the translation table once instead of twice per sample.
    strip_punctuation = (
        str.maketrans("", "", string.punctuation) if norm_punctuation else None
    )

    cer, wer, ser = [], [], []

    for pred, truth in zip(predicts, ground_truth):
        pred, truth = pred.lower(), truth.lower()

        if norm_accentuation:
            pred = (
                unicodedata.normalize("NFKD", pred)
                .encode("ASCII", "ignore")
                .decode("ASCII")
            )
            truth = (
                unicodedata.normalize("NFKD", truth)
                .encode("ASCII", "ignore")
                .decode("ASCII")
            )

        if strip_punctuation is not None:
            pred = pred.translate(strip_punctuation)
            truth = truth.translate(strip_punctuation)

        # Character level: compare the strings character by character.
        cer.append(_error_rate(list(pred), list(truth)))

        # Word level: compare the whitespace-separated tokens.
        wer.append(_error_rate(pred.split(), truth.split()))

        # Sequence level: the edit distance between two one-element
        # sequences is 0 when the elements are equal and 1 otherwise.
        ser.append(float(pred != truth))

    # Average each metric over all compared pairs.
    return np.mean([cer, wer, ser], axis=1)