Source code for chemicalchecker.util.plot.ccstatsplot

"""CC statistics plots for CC web page."""
import os
import copy
import inspect
import numpy as np
import pandas as pd
import seaborn as sns
from matplotlib import cm
from matplotlib import patches
from .util import canvas, cc_grid, cc_colors, homogenous_ticks, set_style, cc_coords_name

from chemicalchecker.util import logged

# Module-level side effect: set_style() (imported from .util) runs once when
# this module is imported.
set_style()


@logged
class CCStatsPlot(object):
    """Plot Chemical Checker statistics for the CC web page.

    Each plotting method listed by :meth:`available` builds one figure and
    then either writes it to ``save_dir`` or returns it, depending on the
    ``save`` flag given to the constructor.
    """

    def __init__(self, cc, width=30, height=30, dpi=70, transparent=True,
                 save=True, save_format='png', save_dir='./'):
        """Initialize a CCStatsPlot instance.

        The plotter works on data precomputed mainly using
        :mod:`~chemicalchecker.core.diagnostics`.

        Args:
            cc (ChemicalChecker): A ChemicalChecker object.
            width: Figure width forwarded to ``canvas``.
            height: Figure height forwarded to ``canvas``.
            dpi: Figure resolution forwarded to ``canvas``.
            transparent (bool): Forwarded to ``fig.savefig`` when saving.
            save (bool): If True every plot is written to ``save_dir`` and
                the plotting method returns None; if False the figure is
                returned instead and nothing is written.
            save_format (str): File extension used for saved figures.
            save_dir (str): Directory where figures are saved.
        """
        self.cc = cc
        self.width = width
        self.height = height
        self.dpi = dpi
        self.transparent = transparent
        # Kept under a private name: ``self.save`` is the method that writes
        # figures, so testing ``self.save`` would always be truthy and the
        # caller's choice would be ignored.
        self._save = save
        self.save_format = save_format
        self.save_dir = save_dir

    def plot_all(self):
        """Run all plots reported in the 'available' table."""
        # TODO: take kwargs dict for each plot
        for method in self.available()['method']:
            self.__log.info('Plotting: %s', method)
            # Plain attribute lookup; no need to evaluate a code string.
            getattr(self, method)()

    def available(self):
        """Summarize the available plots.

        Returns:
            pandas.DataFrame: One row per plot, sorted by method name, with
            columns ``method`` (name of the plotting method) and
            ``description`` (caption text).
        """
        d = {
            "matrices": "Number of molecules and signature lengths in each of the 25 Chemical Checker datasets. Signature lengths can be read as a measure of complexity or sparsity of the data.",
            "moa_validations": "Chemical Checker datasets correlates with mechanisms of action (MoA). The receiver-operating characteristic (ROC) curves measure how similar molecules tend to share MoA. Note that the almost-perfect performance in the 'Mechanism of action' dataset is trivial.",
            "correlations": "Degree of correlation between data types measured as the ability to use one signature (x-axis) to recover neighbors defined with another signature (y-axis). Red denotes high correlation and blue denotes low correlation.",
        }
        return pd.DataFrame(sorted(d.items()),
                            columns=["method", "description"])

    def save(self, fig):
        """Write ``fig`` to ``save_dir``.

        The file is named ``<caller>.<save_format>`` where ``<caller>`` is
        the name of the function that called this method, so it must be
        called directly from the plotting method and not through a helper.

        Args:
            fig: Figure object exposing ``savefig``.
        """
        # stack()[1] is the caller's frame record; index 3 is its function
        # name.
        filename = '%s.%s' % (inspect.stack()[1][3], self.save_format)
        fig.savefig(os.path.join(self.save_dir, filename),
                    bbox_inches='tight', transparent=self.transparent)

    def matrices(self, cctype='sign0', molset='full', max_mols=6.5,
                  max_feat=4.5):
        """Plot the size of each exemplary dataset as a rectangle.

        For every dataset a rectangle is drawn with width
        ``log10(number of features)`` and height
        ``log10(number of molecules)``, as read from
        ``cc.metadata['dimensions'][molset][dataset][cctype]``. Datasets
        whose dimensions cannot be read are logged and left empty.

        Args:
            cctype (str): Signature type key used in the metadata lookup.
            molset (str): Molecule set key used in the metadata lookup.
            max_mols (float): Smallest allowed upper limit of the y axis
                (log10); raised if any dataset is larger.
            max_feat (float): Smallest allowed upper limit of the x axis
                (log10); raised if any dataset is larger.

        Returns:
            The figure when the instance was created with ``save=False``,
            otherwise None (the figure is written to disk).
        """
        # create the grid
        fig, grid = canvas(width=self.width, height=self.height, dpi=self.dpi)
        axes = cc_grid(fig, grid, legend_out=False, cc_space_names=True,
                       hspace=0.2, wspace=0.2,
                       shared_ylabel='Molecules (log10)',
                       shared_xlabel='Variables (log10)')
        # check maximum sizes
        dims = dict()
        for ds in self.cc.datasets_exemplary():
            try:
                nr_mol, nr_feat = self.cc.metadata[
                    'dimensions'][molset][ds][cctype]
            except Exception as ex:
                self.__log.error('Cannot fetch cc.metadata: %s', ex)
                continue
            max_mols = np.max([np.log10(nr_mol), max_mols])
            max_feat = np.max([np.log10(nr_feat), max_feat])
            dims[ds] = (nr_mol, nr_feat)
        # plot a rectangle for each space; zip keeps axes and datasets
        # aligned even when a dataset is skipped
        for ax, ds in zip(axes, self.cc.datasets_exemplary()):
            if ds not in dims:
                continue
            color = cc_colors(ds, lighness=.3, alternate=True, dark_first=True)
            nr_mol, nr_feat = dims[ds]
            rect = patches.Rectangle(
                (0, 0), np.log10(nr_feat), np.log10(nr_mol),
                facecolor=color, edgecolor='k', linewidth=1, alpha=0.9)
            ax.add_patch(rect)
            ax.set_ylim(0, max_mols)
            ax.set_xlim(0, max_feat)
            ax.set_aspect('auto')
            ax.set_yticks(range(int(np.ceil(max_mols))))
            ax.set_xticks(range(int(np.ceil(max_feat))))
            # NOTE(review): presumably 'E*' datasets are the bottom row and
            # '*1' datasets the left column of the grid - confirm in cc_grid.
            if 'E' in ds[:2]:
                ax.set_xticklabels(range(int(np.ceil(max_feat))))
            if '1' in ds[:2]:
                ax.set_yticklabels(range(int(np.ceil(max_mols))))
        # save or return (save() must be called from here: it names the
        # file after its caller)
        if self._save:
            self.save(fig)
        else:
            return fig

    def moa_validations(self, cctype='sign1'):
        """Draw the ``moa_roc`` diagnosis plot of every exemplary dataset.

        Each dataset's signature diagnosis plotter draws its ``moa_roc``
        plot on one axis of the grid. Datasets for which the plot fails are
        logged and skipped.

        Args:
            cctype (str): Signature type passed to ``cc.signature``.

        Returns:
            The figure when the instance was created with ``save=False``,
            otherwise None (the figure is written to disk).
        """
        fig, grid = canvas(width=self.width, height=self.height, dpi=self.dpi)
        axes = cc_grid(fig, grid, legend_out=False, cc_space_names=True,
                       hspace=0.2, wspace=0.2,
                       shared_ylabel='True positive rate',
                       shared_xlabel='False positive rate')
        for ax, ds in zip(axes, self.cc.datasets_exemplary()):
            color = cc_colors(ds, lighness=.3, alternate=True, dark_first=True)
            sign = self.cc.signature(ds, cctype)
            diag = sign.diagnosis(ref_cc=self.cc)
            try:
                diag.plotter.moa_roc(title=False, color=color,
                                     xylabels=False, ax=ax)
            except Exception as ex:
                self.__log.error('Cannot fetch moa_roc: %s', ex)
                continue
            ax.set_aspect('auto')
            ax.set_xticks([0, 0.5, 1])
            ax.set_xticklabels(['0', '.5', '1'])
            ax.set_yticks([0, 0.5, 1])
            ax.set_yticklabels(['0', '.5', '1'])
            # Blank the tick labels everywhere except on 'E*' datasets
            # (x axis) and '*1' datasets (y axis).
            if 'E' not in ds[:2]:
                ax.set_xticklabels('')
            if '1' not in ds[:2]:
                ax.set_yticklabels('')
        # save or return (save() must be called from here: it names the
        # file after its caller)
        if self._save:
            self.save(fig)
        else:
            return fig

    def correlations(self, cctype='sign1'):
        """Plot a heatmap of the 'auc' values stored across datasets.

        For every exemplary dataset (rows) the ``across_roc.pkl`` diagnosis
        result is loaded and the ``'auc'`` entry of every other dataset
        (columns) is placed in a square matrix. Missing values stay NaN and
        are masked in the heatmap.

        Args:
            cctype (str): Signature type passed to ``cc.signature``.

        Returns:
            The figure when the instance was created with ``save=False``,
            otherwise None (the figure is written to disk).
        """
        # Materialize once: the same ordering is needed for rows, columns
        # and labels, and the matrix must match the number of datasets.
        datasets = list(self.cc.datasets_exemplary())
        n_ds = len(datasets)
        rocauc_matrix = np.full((n_ds, n_ds), np.nan)
        for row, ds1 in enumerate(datasets):
            sign = self.cc.signature(ds1, cctype)
            diag = sign.diagnosis(ref_cc=self.cc)
            try:
                res = diag._load_diagnosis_pickle('across_roc.pkl')
            except Exception as ex:
                self.__log.error('Cannot fetch across_roc: %s', ex)
                continue
            for col, ds2 in enumerate(datasets):
                if ds2 not in res or res[ds2] is None:
                    continue
                rocauc_matrix[row, col] = res[ds2]['auc']
        fig, grid = canvas(width=self.width, height=self.height, dpi=self.dpi)
        ax = fig.add_subplot(grid[:])
        try:
            # Colormap registry; cm.get_cmap is gone in recent Matplotlib.
            from matplotlib import colormaps
            cmap = copy.copy(colormaps["RdYlBu_r"])
        except ImportError:
            # Older Matplotlib without the registry.
            cmap = copy.copy(cm.get_cmap("RdYlBu_r"))
        cmap.set_bad(".5")
        names = [cc_coords_name(ds[:2]) for ds in datasets]
        sns.heatmap(
            rocauc_matrix, vmin=0.5, vmax=1, square=True,
            mask=np.isnan(rocauc_matrix), xticklabels=names, yticklabels=names,
            linewidth=1, linecolor='.5', cbar=False, annot=False, cmap=cmap,
            ax=ax)
        # Thick separators every 5 cells, including both outer borders.
        for i in range(0, n_ds + 1, 5):
            ax.hlines(i, 0, n_ds, color='k', lw=2,
                      capstyle='round').set_clip_on(False)
            ax.vlines(i, 0, n_ds, color='k', lw=2,
                      capstyle='round').set_clip_on(False)
        # save or return (save() must be called from here: it names the
        # file after its caller)
        if self._save:
            self.save(fig)
        else:
            return fig