# Source code for chemicalchecker.util.performance.performance
"""Binary performances."""
import json
import numpy as np
from sklearn.metrics import calinski_harabasz_score
from sklearn.metrics import f1_score, roc_curve, auc, matthews_corrcoef
from sklearn.metrics import average_precision_score, davies_bouldin_score
from sklearn.metrics import confusion_matrix, precision_score, silhouette_score
class PerformanceBinary():
    """Performance metrics for a binary classifier.

    Given truth labels and predicted scores, computes ROC and PR AUCs,
    selects the decision threshold that maximizes balanced accuracy, and
    derives confusion-matrix based metrics at that threshold.
    """

    # Attribute name -> human-readable metric name (drives __str__/toJSON).
    # NOTE(review): `Bacc` and `M` are computed but deliberately not listed
    # here, matching the original behavior.
    metrics = {
        "auc_roc": "AUC-ROC",
        "auc_pr": "AUC-PR",
        "thr": "Threshold",
        "sens": "Sensitivity",
        "spec": "Specificity",
        "mcc": "MCC",
        "f1": "F1",
        "prec": "Precision"
    }

    def __init__(self, y_true, y_pred):
        """Initialize a PerformanceBinary instance.

        Args:
            y_true(array): Array of truth labels.
            y_pred(array): Array of predicted labels or scores.
        """
        fpr, tpr, thresholds = roc_curve(y_true, y_pred)
        self.auc_roc = auc(fpr, tpr)
        self.auc_pr = average_precision_score(y_true, y_pred)
        # Balanced accuracy at every candidate threshold; keep the first
        # threshold achieving the maximum (equivalent to the original
        # strict-">" scan that started from max_bacc = 0).
        bacc_all = (tpr + (1.0 - fpr)) / 2.0
        best = int(np.argmax(bacc_all))
        self.thr = thresholds[best] if bacc_all[best] > 0 else 0
        # Binarize predictions at the selected threshold.
        y_class = [1 if yp >= self.thr else 0 for yp in y_pred]
        self.M = confusion_matrix(y_true, y_class)
        TN, FP, FN, TP = self.M.ravel()
        self.sens = float(TP) / (TP + FN)  # a.k.a. recall_score
        self.spec = float(TN) / (TN + FP)
        self.mcc = matthews_corrcoef(y_true, y_class)
        self.Bacc = (self.sens + self.spec) / 2.
        self.f1 = f1_score(y_true, y_class)
        self.prec = precision_score(y_true, y_class)

    def __str__(self):
        """Return a two-column table of metric names and values."""
        to_str = ""
        for attr in sorted(self.metrics):
            to_str += "{:15}{:15}\n".format(
                self.metrics[attr], getattr(self, attr))
        return to_str

    def toJSON(self, filename):
        """Save the stats in JSON format.

        Args:
            filename(str): Destination path of the JSON file.
        """
        tmp = dict()
        for attr, name in self.metrics.items():
            # Values are stringified so any numpy scalar serializes cleanly.
            tmp[name] = str(getattr(self, attr))
        with open(filename, 'w') as fh:
            json.dump(tmp, fh)
class PerformanceCluster():
    """Performance metrics for a clustering.

    Computes cluster counts and internal validation scores (silhouette,
    Davies-Bouldin, Calinski-Harabasz), both on the full data and after
    excluding noisy samples (label -1, e.g. as produced by DBSCAN/HDBSCAN).
    """

    # Attribute name -> report/JSON key. The historical "harabaz" spelling
    # is kept in attribute and key names for backward compatibility.
    metrics = {
        "nr_clusters": "nr_clusters",
        "fraction_noisy": "fraction_noisy",
        "davies_bouldin_score": "davies_bouldin_score",
        "calinski_harabaz_score": "calinski_harabaz_score",
        "silhouette_score": "silhouette_score",
        "davies_bouldin_score_nonoisy": "davies_bouldin_score_nonoisy",
        "calinski_harabaz_score_nonoisy": "calinski_harabaz_score_nonoisy",
        "silhouette_score_nonoisy": "silhouette_score_nonoisy"
    }

    def __init__(self, X, labels, strengths):
        """Initialize a PerformanceCluster instance.

        Args:
            X(array): Data matrix (samples x features).
            labels(array): Cluster label per sample; -1 marks noise.
            strengths(array): Cluster membership strengths (currently
                unused; kept for interface compatibility).
        """
        labels = np.asarray(labels)
        # NB: the noise label (-1) counts as one "cluster" here.
        self.nr_clusters = len(set(labels))
        self.fraction_noisy = sum(labels == -1) / float(len(labels))
        self.silhouette_score = silhouette_score(X, labels)
        self.davies_bouldin_score = davies_bouldin_score(X, labels)
        # Bug fix: call the imported `calinski_harabasz_score` — the old
        # "harabaz" spelling was removed from scikit-learn and raised
        # NameError here. The attribute keeps the old spelling so the
        # `metrics` keys and any external consumers are unaffected.
        self.calinski_harabaz_score = calinski_harabasz_score(X, labels)
        # Same scores computed after dropping noisy samples.
        nonoisy = labels >= 0
        labels_nonoisy = labels[nonoisy]
        X_nonoisy = X[nonoisy]
        self.silhouette_score_nonoisy = silhouette_score(
            X_nonoisy, labels_nonoisy)
        self.davies_bouldin_score_nonoisy = davies_bouldin_score(
            X_nonoisy, labels_nonoisy)
        self.calinski_harabaz_score_nonoisy = calinski_harabasz_score(
            X_nonoisy, labels_nonoisy)

    def __str__(self):
        """Return a two-column table of metric names and values."""
        to_str = ""
        for attr in sorted(self.metrics):
            to_str += "{:15}{:15}\n".format(
                self.metrics[attr], getattr(self, attr))
        return to_str

    def toJSON(self, filename):
        """Save the stats in JSON format.

        Args:
            filename(str): Destination path of the JSON file.
        """
        tmp = dict()
        for attr, name in self.metrics.items():
            # Values are stringified so any numpy scalar serializes cleanly.
            tmp[name] = str(getattr(self, attr))
        with open(filename, 'w') as fh:
            json.dump(tmp, fh)