# coding=utf-8
# Copyright (c) 2020 Alibaba PAI team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import time
import torch
from torch import nn
import numpy as np
from scipy.stats import pearsonr, spearmanr
from sklearn.metrics import (classification_report, f1_score,
                             matthews_corrcoef, precision_score,
                             recall_score, roc_auc_score)
from torch.utils.data import DataLoader
from .. import losses
from ..utils.labeling_eval_utils import evaluate_sequence_labeling
from ..utils.logger import logger


class Evaluator(object):

    def __init__(self, metrics=("accuracy",)):
        self.metrics = metrics
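    # A minimal usage sketch (hedged: `my_model` and `my_dataset` are
    # hypothetical placeholders; the dataset is assumed to expose a
    # `batch_fn` collate function, as `evaluate` requires below):
    #
    #     evaluator = Evaluator(metrics=("accuracy", "f1"))
    #     eval_outputs = evaluator.evaluate(my_model,
    #                                       valid_dataset=my_dataset,
    #                                       eval_batch_size=32)
    #     # eval_outputs is a list of (metric_name, value) pairs.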

    def evaluate(self, model, valid_loader=None,
                 valid_dataset=None, eval_batch_size=32, teacher_model=None):
        assert valid_dataset is not None or valid_loader is not None
        if valid_loader is None:
            valid_loader = DataLoader(valid_dataset,
                                      batch_size=eval_batch_size,
                                      shuffle=False,
                                      collate_fn=valid_dataset.batch_fn)
        logger.info("=" * 10 + " Evaluate Start " + "=" * 10)
        logger.info("Eval batch size: {}".format(eval_batch_size))
        logger.info("Evaluation steps: {}".format(len(valid_loader)))
        model = model.cuda()
        model.eval()
        if teacher_model is not None:
            teacher_model = teacher_model.cuda()
            teacher_model.eval()
        if hasattr(model, "module"):
            model = model.module
        student_num_params = sum([p.nelement() for n, p in model.named_parameters()])
        logger.info("Total parameters = %s" % format(student_num_params, ","))
        evaluate_task = "none"
        if hasattr(model, "model_name"):
            if model.model_name.startswith("text_classify"):
                evaluate_task = "text_classify"
            elif model.model_name.startswith("text_multi_label_classify"):
                evaluate_task = "text_multi_label_classify"
            elif model.model_name.startswith("language_modeling"):
                evaluate_task = "language_modeling"
            elif model.model_name.startswith("sequence_labeling"):
                evaluate_task = "sequence_labeling"
        if evaluate_task == "text_classify":
            return self.evaluate_text_classify(model, valid_loader)
        elif evaluate_task == "text_multi_label_classify":
            return self.evaluate_multi_label_text_classify(model, valid_loader)
        elif evaluate_task == "language_modeling":
            return self.evaluate_language_modeling(model, valid_loader)
        elif evaluate_task == "sequence_labeling":
            return self.evaluate_sequence_labeling(model, valid_loader)
        else:
            return self.evaluate_none_task(model, teacher_model, valid_loader)

    def evaluate_text_classify(self, model, valid_loader):
        total_loss = 0
        total_steps = 0
        total_samples = 0
        hit_num = 0
        total_num = 0
        logits_list = list()
        y_trues = list()
        total_spent_time = 0.0
        for _step, batch in enumerate(valid_loader):
            batch = {key: val.cuda() if isinstance(val, torch.Tensor) else val
                     for key, val in batch.items()}
            infer_start_time = time.time()
            with torch.no_grad():
                student_outputs = model(batch)
            infer_end_time = time.time()
            total_spent_time += infer_end_time - infer_start_time
            assert "logits" in student_outputs and "label_ids" in batch
            logits, label_ids = student_outputs["logits"], batch["label_ids"]
            y_trues.extend(label_ids.tolist())
            logits_list.extend(logits.tolist())
            hit_num += torch.sum(torch.argmax(logits, dim=-1) == label_ids).item()
            total_num += label_ids.shape[0]
            if len(logits.shape) == 1 or logits.shape[-1] == 1:
                tmp_loss = losses.mse_loss(logits, label_ids)
            elif len(logits.shape) == 2:
                tmp_loss = losses.cross_entropy(logits, label_ids)
            else:
                raise RuntimeError
            total_loss += tmp_loss.mean().item()
            total_steps += 1
            total_samples += valid_loader.batch_size
            if (_step + 1) % 100 == 0:
                logger.info(
                    "Eval: %d/%d steps finished" % (_step + 1, len(valid_loader.dataset) // valid_loader.batch_size))
        logger.info("Inference time = {:.2f}s, [{:.4f} ms / sample] ".format(
            total_spent_time, total_spent_time * 1000 / total_samples))
        eval_loss = total_loss / total_steps
        logger.info("Eval loss: {}".format(eval_loss))
        logits_list = np.array(logits_list)
        eval_outputs = list()
        for metric in self.metrics:
            if metric.endswith("accuracy"):
                acc = hit_num / total_num
                logger.info("Accuracy: {}".format(acc))
                eval_outputs.append(("accuracy", acc))
            elif metric == "f1":
                if model.config.num_labels == 2:
                    f1 = f1_score(y_trues, np.argmax(logits_list, axis=-1))
                    logger.info("F1: {}".format(f1))
                    eval_outputs.append(("f1", f1))
                else:
                    f1 = f1_score(y_trues, np.argmax(logits_list, axis=-1), average="macro")
                    logger.info("Macro F1: {}".format(f1))
                    eval_outputs.append(("macro-f1", f1))
                    f1 = f1_score(y_trues, np.argmax(logits_list, axis=-1), average="micro")
                    logger.info("Micro F1: {}".format(f1))
                    eval_outputs.append(("micro-f1", f1))
            elif metric == "auc":
                auc_score = roc_auc_score(y_trues, np.argmax(logits_list, axis=-1))
                logger.info("AUC: {}".format(auc_score))
                eval_outputs.append(("auc", auc_score))
            elif metric == "matthews_corrcoef":
                mcc = matthews_corrcoef(y_trues, np.argmax(logits_list, axis=-1))
                logger.info("Matthews Corrcoef: {}".format(mcc))
                eval_outputs.append(("matthews_corrcoef", mcc))
            elif metric == "pearson_and_spearman":
                preds = logits_list[:, 0]
                pearson_corr = pearsonr(preds, y_trues)[0]
                spearman_corr = spearmanr(preds, y_trues)[0]
                logger.info("Pearson: {}".format(pearson_corr))
                logger.info("Spearmanr: {}".format(spearman_corr))
                corr = (pearson_corr + spearman_corr) / 2.0
                logger.info("Pearson_and_spearmanr: {}".format(corr))
                eval_outputs.append(("pearson_and_spearman", corr))
            elif metric == "classification_report":
                logger.info("\n{}".format(
                    classification_report(y_trues, np.argmax(logits_list, axis=-1), digits=4)))
            elif metric == "last_layer_mse":
                logger.info("Last layer MSE: {}".format(eval_loss))
                eval_outputs.append(("last_layer_mse", -eval_loss))
            else:
                raise NotImplementedError("Metric %s not implemented" % metric)
        return eval_outputs
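    # For example (hedged, illustrative values only, not from the source):
    # with metrics=("accuracy", "f1") on a binary task, the return value
    # looks like [("accuracy", 0.91), ("f1", 0.89)].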

    def evaluate_multi_label_text_classify(self, model, valid_loader):
        total_loss = 0
        total_steps = 0
        total_samples = 0
        preds_list = list()
        y_trues = list()
        total_spent_time = 0.0
        for _step, batch in enumerate(valid_loader):
            batch = {key: val.cuda() if isinstance(val, torch.Tensor) else val
                     for key, val in batch.items()}
            infer_start_time = time.time()
            with torch.no_grad():
                student_outputs = model(batch)
            infer_end_time = time.time()
            total_spent_time += infer_end_time - infer_start_time
            assert "logits" in student_outputs and "label_ids" in batch
            logits, label_ids = student_outputs["logits"], batch["label_ids"]
            y_trues.extend(label_ids.long().tolist())
            probs = nn.Sigmoid()(logits)
            predictions = (probs > 0.5).long()
            preds_list.extend(predictions.tolist())
            tmp_loss = losses.multi_label_sigmoid_cross_entropy(logits, label_ids)
            total_loss += tmp_loss.mean().item()
            total_steps += 1
            total_samples += valid_loader.batch_size
            if (_step + 1) % 100 == 0:
                logger.info(
                    "Eval: %d/%d steps finished" % (_step + 1, len(valid_loader.dataset) // valid_loader.batch_size))
        logger.info("Inference time = {:.2f}s, [{:.4f} ms / sample] ".format(
            total_spent_time, total_spent_time * 1000 / total_samples))
        eval_loss = total_loss / total_steps
        logger.info("Eval loss: {}".format(eval_loss))
        preds_list = np.array(preds_list)
        eval_outputs = list()
        n_class = preds_list.shape[1]
        y_trues = np.array(y_trues)
        precs = [precision_score(y_trues[:, i], preds_list[:, i]) for i in range(n_class)]
        recalls = [recall_score(y_trues[:, i], preds_list[:, i]) for i in range(n_class)]
        prec_ma = np.mean(precs)
        recall_ma = np.mean(recalls)
        if prec_ma == 0 and recall_ma == 0:
            f1_ma = 0
        else:
            f1_ma = 2 * prec_ma * recall_ma / (prec_ma + recall_ma)
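        # Worked example (illustrative numbers, not from the source): with
        # per-class precisions [0.8, 0.6] and recalls [0.5, 0.7], the macro
        # averages are prec_ma = 0.7 and recall_ma = 0.6, so
        # f1_ma = 2 * 0.7 * 0.6 / (0.7 + 0.6) ≈ 0.646. Note this is the
        # harmonic mean of the macro-averaged precision and recall, which
        # in general differs from averaging the per-class F1 scores.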
        for metric in self.metrics:
            if metric.endswith("accuracy"):
                logger.info("Precision macro average: {}".format(prec_ma))
                eval_outputs.append(("prec_ma", prec_ma))
            elif metric == "f1":
                logger.info("Macro F1: {}".format(f1_ma))
                eval_outputs.append(("macro-f1", f1_ma))
            else:
                raise NotImplementedError("Metric %s not implemented" % metric)
        return eval_outputs

    def evaluate_language_modeling(self, model, valid_loader):
        total_loss = 0
        total_steps = 0
        total_samples = 0
        hit_num = 0
        total_num = 0
        total_spent_time = 0.0
        for _step, batch in enumerate(valid_loader):
            batch = {key: val.cuda() if isinstance(val, torch.Tensor) else val
                     for key, val in batch.items()}
            infer_start_time = time.time()
            with torch.no_grad():
                student_outputs = model(batch)
            infer_end_time = time.time()
            total_spent_time += infer_end_time - infer_start_time
            assert "logits" in student_outputs and "label_ids" in batch
            logits, label_ids = student_outputs["logits"], batch["label_ids"]
            for b in range(label_ids.shape[0]):
                _logits = logits[b]
                _label_ids = label_ids[b]
                mask_span_indices = batch["mask_span_indices"][b]
                for span_indices in mask_span_indices:
                    pred = list()
                    label = list()
                    for span_idx in span_indices:
                        pred.append(torch.argmax(_logits[span_idx]).item())
                        label.append(_label_ids[span_idx].item())
                    hit_num += (tuple(pred) == tuple(label))
                    total_num += 1
            logits = logits.view(-1, logits.size(-1))
            label_ids = label_ids.view(-1)
            indices = (label_ids != -100)
            logits = logits[indices]
            label_ids = label_ids[indices]
            tmp_loss = losses.cross_entropy(logits, label_ids)
            total_loss += tmp_loss.mean().item()
            total_steps += 1
            total_samples += valid_loader.batch_size
            if (_step + 1) % 100 == 0:
                logger.info(
                    "Eval: %d/%d steps finished" % (_step + 1, len(valid_loader.dataset) // valid_loader.batch_size))
        logger.info("Inference time = {:.2f}s, [{:.4f} ms / sample] ".format(
            total_spent_time, total_spent_time * 1000 / total_samples))
        eval_loss = total_loss / total_steps
        logger.info("Eval loss: {}".format(eval_loss))
        acc = hit_num / total_num
        logger.info("Accuracy: {}".format(acc))
        eval_outputs = [("accuracy", acc)]
        return eval_outputs
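    # Span-level accuracy above counts a masked span as a hit only when
    # every position in the span is predicted correctly. Illustrative
    # example (hedged, not from the source): for a span of two masked
    # positions with labels (1012, 2054), a prediction of (1012, 2054)
    # scores a hit, while (1012, 9999) scores nothing, even though half
    # of the tokens match.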

    def evaluate_sequence_labeling(self, model, valid_loader):

        def predict_sequence_labeling(raw_preds, raw_label_ids, label_enumerate_values, tok_to_orig_indexes):
            new_preds = list()
            new_labels = list()
            idx_label_map = {idx: value for idx, value in enumerate(label_enumerate_values)}
            for idx, (raw_pred, tok_to_orig_index) in enumerate(zip(raw_preds, tok_to_orig_indexes)):
                raw_label = raw_label_ids[idx]
                final_pred = list()
                final_label = list()
                prev_token_idx = -1
                for k in range(min(len(raw_pred), len(tok_to_orig_index))):
                    token_pred = raw_pred[k]
                    token_label = raw_label[k]
                    token_orig_idx = tok_to_orig_index[k]
                    if token_orig_idx == -100 or token_label == -100:
                        continue
                    if token_orig_idx == prev_token_idx:
                        continue
                    final_pred.append(idx_label_map[token_pred])
                    final_label.append(idx_label_map[token_label])
                    prev_token_idx = token_orig_idx
                raw_sequence_length = max(tok_to_orig_index) + 1
                while len(final_pred) < raw_sequence_length:
                    final_pred.append(idx_label_map[len(idx_label_map) - 1])
                new_preds.extend(final_pred + ["O"])
                new_labels.extend(final_label + ["O"])
            return new_preds, new_labels
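        # Illustrative example (hedged, assumed tokenizer behavior): if the
        # words ["New", "Yorker"] are split into sub-tokens
        # ["New", "Yo", "##rker"], then tok_to_orig_index is [0, 1, 1] and
        # only the prediction on the first sub-token of each word is kept;
        # positions marked -100 (e.g. special tokens) are skipped entirely.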
        total_loss = 0
        total_steps = 0
        total_samples = 0
        true_seqs = list()
        pred_seqs = list()
        total_spent_time = 0.0
        for _step, batch in enumerate(valid_loader):
            batch = {key: val.cuda() if isinstance(val, torch.Tensor) else val
                     for key, val in batch.items()}
            infer_start_time = time.time()
            with torch.no_grad():
                student_outputs = model(batch)
            infer_end_time = time.time()
            total_spent_time += infer_end_time - infer_start_time
            assert "logits" in student_outputs and "label_ids" in batch
            logits, label_ids = student_outputs["logits"], batch["label_ids"]
            raw_preds = torch.argmax(logits, dim=-1).tolist()
            raw_label_ids = label_ids.tolist()
            new_preds, new_labels = predict_sequence_labeling(raw_preds,
                                                              raw_label_ids,
                                                              valid_loader.dataset.label_enumerate_values,
                                                              batch["tok_to_orig_index"])
            pred_seqs.extend(new_preds)
            true_seqs.extend(new_labels)
            logits = logits.view(-1, logits.size(-1))
            label_ids = label_ids.view(-1)
            tmp_loss = losses.cross_entropy(logits, label_ids)
            total_loss += tmp_loss.mean().item()
            total_steps += 1
            total_samples += valid_loader.batch_size
            if (_step + 1) % 100 == 0:
                logger.info(
                    "Eval: %d/%d steps finished" % (_step + 1, len(valid_loader.dataset) // valid_loader.batch_size))
        logger.info("Inference time = {:.2f}s, [{:.4f} ms / sample] ".format(
            total_spent_time, total_spent_time * 1000 / total_samples))
        eval_loss = total_loss / total_steps
        logger.info("Eval loss: {}".format(eval_loss))
        (prec, rec, f1) = evaluate_sequence_labeling(true_seqs, pred_seqs)
        logger.info("Labeling F1: {}".format(f1))
        logger.info("Labeling Precision: {}".format(prec))
        logger.info("Labeling Recall: {}".format(rec))
        eval_outputs = list()
        eval_outputs.append(("labeling_f1", f1))
        eval_outputs.append(("labeling_precision", prec))
        eval_outputs.append(("labeling_recall", rec))
        return eval_outputs

    def evaluate_none_task(self, model, teacher_model, valid_loader):
        if teacher_model is None:
            return [("no-metric", float("-inf"))]
        total_loss = 0
        total_steps = 0
        total_samples = 0
        total_spent_time = 0.0
        for _step, batch in enumerate(valid_loader):
            batch = {key: val.cuda() if isinstance(val, torch.Tensor) else val
                     for key, val in batch.items()}
            infer_start_time = time.time()
            with torch.no_grad():
                student_outputs = model(batch)
            infer_end_time = time.time()
            total_spent_time += infer_end_time - infer_start_time
            with torch.no_grad():
                teacher_outputs = teacher_model(batch)
            student_hidn = student_outputs["hidden"][-1]
            teacher_hidn = teacher_outputs["hidden"][-1]
            tmp_loss = losses.mse_loss(student_hidn, teacher_hidn)
            total_loss += tmp_loss.mean().item()
            total_steps += 1
            total_samples += valid_loader.batch_size
            if (_step + 1) % 100 == 0:
                logger.info(
                    "Eval: %d/%d steps finished" % (_step + 1, len(valid_loader.dataset) // valid_loader.batch_size))
        logger.info("Inference time = {:.2f}s, [{:.4f} ms / sample] ".format(
            total_spent_time, total_spent_time * 1000 / total_samples))
        eval_loss = total_loss / total_steps
        logger.info("Eval loss: {}".format(eval_loss))
        return [("last_layer_mse", -eval_loss)]