Source code for chemicalchecker.core.sign3

"""Signature type 3.

Fixed-length (e.g. 128-d) representation of the data, capturing and inferring
the original (signature type 1) similarity of the data. Signatures type 3 are
available for all molecules of the ChemicalChecker universe (~1M molecules) and
have a confidence/applicability measure assigned to them.
"""
import os
import h5py
import pickle
import shutil
import numpy as np
from tqdm import tqdm
from time import time
from scipy import stats
from functools import partial
from scipy.stats import gaussian_kde
from scipy.spatial.distance import pdist
from scipy.stats import pearsonr, ks_2samp
from sklearn.pipeline import make_pipeline
from sklearn.neighbors import LocalOutlierFactor
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import robust_scale, StandardScaler

from .signature_base import BaseSignature
from .signature_data import DataSignature
from .preprocess import Preprocess

from chemicalchecker.util import logged
from chemicalchecker.util.splitter import OldTripletSampler, TripletIterator
from chemicalchecker.util.splitter import BaseTripletSampler
from chemicalchecker.util.parser.converter import Converter


@logged
class sign3(BaseSignature, DataSignature):
    """Signature type 3 class."""

    def __init__(self, signature_path, dataset, **params):
        """Initialize a Signature.

        Args:
            signature_path(str): The signature root directory.
            dataset(`Dataset`): `chemicalchecker.database.Dataset` object.
            params(): Parameters, expected keys are:
                * 'sign2' for learning based on sign2
                * 'prior' for learning prior in predictions
        """
        # Calling init on the base class to trigger file existence checks
        BaseSignature.__init__(self, signature_path, dataset, **params)
        self.data_path = os.path.join(self.signature_path, 'sign3.h5')
        DataSignature.__init__(self, self.data_path)
        # get parameters or default values
        self.params = dict()
        # parameters to learn from sign2
        default_sign2 = {
            'epochs': 10,
            'cpu': 8,
            'layers': ['Dense', 'Dense', 'Dense', 'Dense'],
            'layers_sizes': [1024, 512, 256, 128],
            'activations': ['selu', 'selu', 'selu', 'tanh'],
            'dropouts': [0.2, 0.2, 0.2, None],
            'learning_rate': 'auto',
            'batch_size': 128,
            'patience': 200,
            'loss_func': 'only_self_loss',
            'margin': 1.0,
            'alpha': 1.0,
            'num_triplets': 1000000,
            't_per': 0.01,
            'onlyself_notself': True,
            'augment_fn': subsample,
            'augment_kwargs': {
                'dataset': [dataset],
                'p_self': 0.1,
            },
            'limit_mols': 100000
        }
        if not self.is_fit():
            # we load this param only if signature is not fitted yet
            s1_ref = self.get_sign('sign1').get_molset("reference")
            try:
                opt_t = s1_ref.optimal_t()
                default_sign2.update({'t_per': opt_t})
                self.t_per = opt_t
            except Exception as ex:
                self.__log.warning('Failed setting opt_t: %s' % str(ex))
                self.t_per = 0.01
        default_sign2.update(params.get('sign2', {}))
        self.params['sign2_lr'] = default_sign2.copy()
        self.params['sign2'] = default_sign2
        self._sharedx = None
        self._sharedx_trim = None
        self.traintest_file = None
        self.trim_mask = None
        # np.seterr(divide='ignore', invalid='ignore')

    @property
    def sharedx(self):
        if self._sharedx is None:
            if self.traintest_file is None:
                self.__log.debug("traintest_file is not set.")
                return None
            self.__log.debug("Reading sign2 universe lookup,"
                             " this should only be loaded once.")
            traintest_ds = DataSignature(self.traintest_file)
            self._sharedx = traintest_ds.get_h5_dataset('x')
            self.__log.debug("sharedx shape: %s" % str(self._sharedx.shape))
        return self._sharedx

    @property
    def sharedx_trim(self):
        if self._sharedx_trim is None:
            if self.traintest_file is None:
                self.__log.debug("traintest_file is not set.")
                return None
            if self.trim_mask is None:
                self.__log.debug("trim_mask is not set.")
                return None
            full_trim = np.argwhere(np.repeat(self.trim_mask, 128))
            self._sharedx_trim = self.sharedx[:, full_trim.ravel()]
            self.__log.debug("sharedx_trim shape: %s"
                             % str(self._sharedx_trim.shape))
        return self._sharedx_trim
    @staticmethod
    def save_sign2_universe(sign2_list, destination):
        """Create a file with all signatures 2 for each molecule in the CC.

        Args:
            sign2_list(list): List of signature 2 objects to learn from.
            destination(str): Path where the H5 is saved.
        """
        # get sorted universe inchikeys and CC signatures
        sign3.__log.info("Generating signature 2 universe matrix.")
        if os.path.isfile(destination):
            sign3.__log.warning("Skipping as destination %s already exists."
                                % destination)
            return
        inchikeys = set()
        for sign in sign2_list:
            inchikeys.update(sign.unique_keys)
        inchikeys = sorted(list(inchikeys))
        tot_inks = len(inchikeys)
        tot_ds = len(sign2_list)
        # build matrix stacking signatures horizontally
        with h5py.File(destination, "w") as fh:
            fh.create_dataset('x_test', (tot_inks, 128 * tot_ds),
                              dtype=np.float32)
            fh.create_dataset('keys', data=np.array(
                inchikeys, DataSignature.string_dtype()))
            for idx, sign in enumerate(sign2_list):
                sign3.__log.info("Fetching from %s" % sign.data_path)
                # including NaN we have the correct number of molecules
                _, vectors = sign.get_vectors(inchikeys, include_nan=True)
                fh['x_test'][:, idx * 128:(idx + 1) * 128] = vectors
                del vectors
    @staticmethod
    def save_sign2_coverage(sign2_list, destination):
        """Create a file with the signature 2 coverage of each molecule in the CC.

        Args:
            sign2_list(list): List of signature 2 objects to learn from.
            destination(str): Path where the H5 is saved.
        """
        # get sorted universe inchikeys and CC signatures
        sign3.__log.info("Generating signature 2 coverage matrix.")
        if os.path.isfile(destination):
            sign3.__log.warning("Skipping as destination %s already exists."
                                % destination)
            return
        inchikeys = set()
        for sign in sign2_list:
            inchikeys.update(sign.unique_keys)
        inchikeys = sorted(list(inchikeys))
        tot_inks = len(inchikeys)
        tot_ds = len(sign2_list)
        sign3.__log.info("Saving coverage for %s datasets and %s molecules."
                         % (tot_ds, tot_inks))
        # build matrix stacking coverage flags horizontally
        with h5py.File(destination, "w") as fh:
            fh.create_dataset('x_test', (tot_inks, tot_ds), dtype=np.float32)
            fh.create_dataset('keys', data=np.array(
                inchikeys, DataSignature.string_dtype()))
            for idx, sign in enumerate(sign2_list):
                sign3.__log.info("Fetching from %s" % sign.data_path)
                coverage = np.isin(
                    list(inchikeys), list(sign.keys), assume_unique=True)
                sign3.__log.info("%s has %s Signature 2."
                                 % (sign.dataset, np.count_nonzero(coverage)))
                fh['x_test'][:, idx:(idx + 1)] = np.expand_dims(coverage, 1)
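    # Illustrative sketch for the two static helpers above: building the
    # stacked sign2 universe and its coverage matrix from the exemplary
    # datasets, mirroring what `fit` does internally. Paths are placeholders.
    #
    # >>> from chemicalchecker import ChemicalChecker
    # >>> cc = ChemicalChecker('/path/to/cc_root')
    # >>> sign2_list = [cc.get_signature('sign2', 'full', ds)
    # ...               for ds in cc.datasets_exemplary()]
    # >>> sign3.save_sign2_universe(sign2_list, '/tmp/all_sign2.h5')
    # >>> sign3.save_sign2_coverage(sign2_list, '/tmp/all_sign2_coverage.h5')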
[docs] def complete_sign2_universe(self, sign2_self, sign2_universe, sign2_coverage, tmp_path=None, calc_ds_idx=[0, 1, 2, 3, 4], calc_ds_names=['A1.001', 'A2.001', 'A3.001', 'A4.001', 'A5.001'], ref_cc=None): """Completes the universe for extra molecules. Important if the dataset we are fitting is defined on molecules that largely do not overlap with CC molecules. In that case there is no orthogonal information to derive sign3. We should always have at least the chemistry (calculated) level available for all molecules of the dataset. Args: sign2_self(sign2): Signature 2 of the current space. sign2_universe(str): Path to the union of all signatures 2 for all molecules in the CC universe. (~1M x 3200) sign2_coverage(str): Path to the coverage of all signatures 2 for all molecules in the CC universe. (~1M x 25) tmp_path(str): Temporary path where to save extra molecules' signatures. calc_spaces(list): List of indexes for calculated spaces in the coverage matrix. Returns: Paths ot the new sign2 universe and coverage file. """ from chemicalchecker import ChemicalChecker # make a copy of the sign2 universe and coverage sign3.__log.info('Completing universe, making copy of original files') sign2_universe_ext = sign2_universe + '.complete.h5' sign2_coverage_ext = sign2_coverage + '.complete.h5' # shortcut if already computed if os.path.isfile(sign2_universe_ext): if os.path.isfile(sign2_coverage_ext): sign3.__log.debug('Completed universe already available') return sign2_universe_ext, sign2_coverage_ext shutil.copyfile(sign2_universe, sign2_universe_ext) shutil.copyfile(sign2_coverage, sign2_coverage_ext) # which molecules in which space should be calculated? cov = DataSignature(sign2_coverage) inks, cov_ds = cov.get_vectors(sign2_self.keys, dataset_name='x_test') sign3.__log.info('Coverage matrix shape: %s' % str(cov_ds.shape)) conv = Converter() # check coverage of calculated spaces missing = np.sum(~cov_ds[:, calc_ds_idx].astype(bool), axis=0) missing = missing.ravel().tolist() sign3.__log.info( "Completing universe for missing molecules: %s" % ', '.join(['%s: %s' % a for a in zip(calc_ds_names, missing)])) # reference CC if ref_cc is not None: cc_ref = ChemicalChecker(ref_cc) else: cc_ref = self.get_cc() sign3.__log.info( "Reference CC (for predict methods): %s" % cc_ref.cc_root) # create CC instance to host signatures cc_extra = ChemicalChecker(os.path.join(tmp_path, 'tmp_cc')) # for each calculated space for ds_id, ds in zip(calc_ds_idx, calc_ds_names): # prepare input file with missing inchikeys input_file = os.path.join(tmp_path, '%s_input.tsv' % ds) if not os.path.isfile(input_file): fh = open(input_file, 'w') miss_ink = inks[cov_ds[:, ds_id] == 0] mi = miss_ink # new inks must be sorted assert(all(mi[i] <= mi[i + 1] for i in range(len(mi) - 1))) sign3.__log.info('Getting InChI for %s molecules' % (len(miss_ink))) miss_count = 0 for ink in miss_ink: try: inchi = conv.inchikey_to_inchi(ink) except Exception as ex: sign3.__log.warning(str(ex)) continue fh.write('%s\t%s\n' % (ink, inchi)) miss_count += 1 fh.close() sign3.__log.info('Adding %s molecules to %s' % (miss_count, ds)) if miss_count == 0: continue # call preprocess with predict s0_ref = cc_ref.get_signature('sign0', 'full', ds) raw_file = os.path.join(tmp_path, '%s_raw.h5' % ds) if not os.path.isfile(raw_file): Preprocess.preprocess_predict(s0_ref, input_file, raw_file, 'inchi') # run sign0 predict s0_ext = cc_extra.get_signature('sign0', 'full', ds) if not os.path.isfile(s0_ext.data_path): s0_ref.predict(data_file=raw_file, 
destination=s0_ext) # run sign1 predict s1_ext = cc_extra.get_signature('sign1', 'full', ds) s1_ref = cc_ref.get_signature('sign1', 'reference', ds) if not os.path.isfile(s1_ext.data_path): s1_ref.predict(s0_ext, destination=s1_ext) # run sign2 predict s2_ext = cc_extra.get_signature('sign2', 'full', ds) s2_ref = cc_ref.get_signature('sign2', 'reference', ds) if not os.path.isfile(s2_ext.data_path): s2_ref.predict(s1_ext, destination=s2_ext) # update sign2_universe and sign2_coverage upd_uni = h5py.File(sign2_universe_ext, "a") upd_cov = h5py.File(sign2_coverage_ext, "a") rows = np.isin(upd_uni['keys'][:].astype(str), s2_ext.keys) rows_idx = np.argwhere(rows).ravel() sign3.__log.info('Updating %s universe rows in %s' % (len(rows_idx), ds)) # new keys must be a subset of universe assert(len(rows_idx) == len(s2_ext.keys)) for rid, sig in zip(rows_idx, s2_ext[:]): old_sig = upd_uni['x_test'][rid][ds_id * 128:(ds_id + 1) * 128] # old signature in universe must be nan assert(np.all(np.isnan(old_sig))) new_row = np.copy(upd_uni['x_test'][rid]) new_row[ds_id * 128:(ds_id + 1) * 128] = sig upd_uni['x_test'][rid] = new_row new_row = np.copy(upd_cov['x_test'][rid]) new_row[ds_id:ds_id + 1] = 1 upd_cov['x_test'][rid] = new_row upd_uni.close() upd_cov.close() # final report upd_uni = h5py.File(sign2_universe_ext, "r") upd_cov = h5py.File(sign2_coverage_ext, "r") old_cov = h5py.File(sign2_coverage, "r") sign3.__log.info('Checking updated universe...') for col, name in enumerate(self.src_datasets): # cc_ref.datasets tot_upd = sum(upd_cov['x_test'][:, col]) cov_delta = int(tot_upd - sum(old_cov['x_test'][:, col])) if cov_delta == 0: continue sign3.__log.info('Added %s molecules to %s' % (cov_delta, name)) # check head and tail of signature sig_head = upd_uni['x_test'][:, col * 128] sig_tail = upd_uni['x_test'][:, (col + 1) * 128 - 1] assert(sum(~np.isnan(sig_head)) == tot_upd) assert(sum(~np.isnan(sig_tail)) == tot_upd) return sign2_universe_ext, sign2_coverage_ext
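    # Illustrative sketch (assumed context: `s3` is this sign3 instance and
    # `sign2_self` its own space's sign2): completing the universe for the
    # chemistry levels only, as `fit` does when complete_universe='fast'.
    # The H5 paths are placeholders.
    #
    # >>> uni, cov = s3.complete_sign2_universe(
    # ...     sign2_self, '/tmp/all_sign2.h5', '/tmp/all_sign2_coverage.h5',
    # ...     tmp_path=s3.model_path,
    # ...     calc_ds_idx=[0, 2, 3, 4],
    # ...     calc_ds_names=['A1.001', 'A3.001', 'A4.001', 'A5.001'])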
    def save_sign2_matrix(self, destination):
        """Save matrix of pairs of horizontally stacked signature 2.

        This is the matrix for training the signature 3. It is defined for
        all molecules for which we have a signature 2 in the current space.
        It is a subset of the stacked sign2 universe file.

        Args:
            destination(str): Path where to save the matrix (HDF5 file).
        """
        self.__log.debug('Saving sign2 traintest to: %s' % destination)
        universe = DataSignature(self.sign2_universe)
        mask = np.isin(list(universe.keys), list(self.sign2_self.keys))
        universe.make_filtered_copy(destination, mask)
        # rename the dataset to what the splitter expects
        with h5py.File(destination, 'a') as hf:
            hf['x'] = hf['x_test']
            del hf['x_test']
[docs] def train_SNN(self, params, reuse=True, suffix=None, evaluate=True, plots_train=True, triplets_sampler=None): """Train the Siamese Neural Network model. This method is used twice. First to evaluate the performances of the Siamese model. Second to train the final model on the full set of data. Triplets file are generated and SNN are trained. When evaluating also save the confidence model. Args: params(dict): Dictionary with algorithm parameters. reuse(bool): Whether to reuse intermediate files (e.g. the aggregated signature 2 matrix). suffix(str): A suffix for the Siamese model path (e.g. 'sign3/models/siamese_<suffix>'). evaluate(bool): Whether we are performing a train-test split and evaluating the performances (N.B. this is required for complete confidence scores) plots_train(bool): plotting outcomes of train models. """ try: from chemicalchecker.tool.siamese import SiameseTriplets except ImportError: raise ImportError("requires tensorflow https://tensorflow.org") # get params and set folder self.update_status("Training %s model" % suffix) self.__log.debug('Siamese suffix %s' % suffix) if suffix: siamese_path = os.path.join(self.model_path, 'siamese_%s' % suffix) traintest_file = os.path.join(self.model_path, 'traintest_%s.h5' % suffix) params['traintest_file'] = params.get( 'traintest_file', traintest_file) else: siamese_path = os.path.join(self.model_path, 'siamese') if 'model_dir' in params: siamese_path = params.get('model_dir') if not reuse or not os.path.isdir(siamese_path): os.makedirs(siamese_path) # generate input matrix, how molecules will be represented sign2_matrix = os.path.join(self.model_path, 'train.h5') if not reuse or not os.path.isfile(sign2_matrix): self.save_sign2_matrix(sign2_matrix) X = DataSignature(sign2_matrix) # initialize triplet sampler self.traintest_file = params.get('traintest_file') num_triplets = params.get('num_triplets', 1e6) cpu = params.get('cpu', 1) if triplets_sampler is None: sampler_class = OldTripletSampler sampler_args = (self.triplet_sign, X, self.traintest_file) sample_obj = sampler_class(*sampler_args) sampler_kwargs = {'t_per': params['t_per'], 'limit': params['limit_mols'], 'cpu': cpu, 'num_triplets': num_triplets} else: sampler_class = triplets_sampler[0] sampler_args = triplets_sampler[1] if sampler_args is None: sampler_args = (self.triplet_sign, X, self.traintest_file) sample_obj = sampler_class(*sampler_args) sampler_kwargs = triplets_sampler[2] # if evaluating, perform the train-test split if evaluate: save_kwargs = { 'mean_center_x': True, 'shuffle': True, 'split_names': ['train', 'test'], 'split_fractions': [.8, .2], 'suffix': suffix } else: save_kwargs = { 'mean_center_x': True, 'shuffle': True, 'split_names': ['train'], 'split_fractions': [1.0], 'suffix': suffix } # generate triplets sample_obj = sampler_class(*sampler_args, save_kwargs=save_kwargs) if not reuse or not os.path.isfile(self.traintest_file): sample_obj.generate_triplets(**sampler_kwargs) # define the augment with the dataset subsampling parameter if 'augment_kwargs' in params: ds = params['augment_kwargs']['dataset'] dataset_idx = np.argwhere(np.isin(self.src_datasets, ds)).flatten() if len(dataset_idx) > 1: self.__log.warning('Dataset %s is repeated' % ds) dataset_idx = np.array(dataset_idx[:1]) # compute probabilities for subsampling trim_mask, p_nr_unknown, p_keep_unknown, p_nr_known, p_keep_known = subsampling_probs( self.sign2_coverage, dataset_idx) trim_dataset_idx = np.argwhere( np.arange(len(trim_mask))[trim_mask] == dataset_idx).ravel()[0] 
params['augment_kwargs']['p_nr'] = (p_nr_unknown, p_nr_known) params['augment_kwargs']['p_keep'] = (p_keep_unknown, p_keep_known) params['augment_kwargs']['dataset_idx'] = [trim_dataset_idx] params['augment_kwargs']['p_only_self'] = 0.0 params['trim_mask'] = trim_mask self.trim_mask = trim_mask # train siamese network self.__log.debug('Siamese training on %s' % traintest_file) siamese = SiameseTriplets( siamese_path, evaluate=evaluate, sharedx=self.sharedx, sharedx_trim=self.sharedx_trim, **params) self.__log.info('Plot Subsampling (what the SNN will get).') fname = 'known_unknown_sampling.png' plot_file = os.path.join(siamese.model_dir, fname) plot_subsample(self, plot_file, self.sign2_coverage, traintest_file, ds=self.dataset, sign2_list=self.sign2_list) siamese.fit() self.__log.debug('Model saved to: %s' % siamese_path) # if final we are done if not evaluate: return siamese # save validation plots try: self.plot_validations(siamese, dataset_idx, traintest_file) except Exception as ex: self.__log.debug('Plot problem: %s' % str(ex)) # when evaluating also save prior and confidence models conf_res = self.train_confidence(siamese, plots_train=plots_train) prior_model, prior_sign_model, confidence_model = conf_res # update the parameters with the new nr_of epochs and lr self.params['sign2']['epochs'] = siamese.last_epoch return siamese, prior_model, prior_sign_model, confidence_model
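    # Illustrative sketch (an assumption, not confirmed by this module): the
    # hyper-parameters used by train_SNN come from the 'sign2' entry of the
    # constructor params (see the defaults in __init__), so a shorter
    # training run could be requested when creating the signature, e.g.:
    #
    # >>> s3 = cc.get_signature('sign3', 'full', 'B1.001',
    # ...                       sign2={'epochs': 5, 'num_triplets': 100000})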
[docs] def train_confidence(self, siamese, suffix='eval', traintest_file=None, train_file=None, max_x=10000, max_neig=50000, p_self=0.0, plots_train=True): """Train confidence and prior models.""" # get sorted keys from siamese traintest file self.update_status('Training applicability') if traintest_file is None: traintest_file = os.path.join(self.model_path, 'traintest_%s.h5' % suffix) if not os.path.isfile(traintest_file): raise Exception('Traintest_file not found: %s' % traintest_file) if train_file is None: train_file = os.path.join(self.model_path, 'train.h5') if not os.path.isfile(train_file): raise Exception('Train_file not found: %s' % train_file) self.traintest_file = traintest_file traintest = DataSignature(self.traintest_file) test_inks = traintest.get_h5_dataset('keys_test')[:max_x] train_inks = traintest.get_h5_dataset('keys_train')[:max_neig] # confidence is going to be trained only on siamese test data test_mask = np.isin(list(self.sign2_self.keys), list(test_inks), assume_unique=True) train_mask = np.isin(list(self.sign2_self.keys), list(train_inks), assume_unique=True) train = DataSignature(train_file) confidence_train_x = train.get_h5_dataset('x', mask=test_mask) #s2_test = self.sign2_self.get_h5_dataset('V', mask=test_mask) _, s2_test = self.sign2_self.get_vectors(test_inks) s2_test_x = confidence_train_x[:, self.dataset_idx[0] * 128: (self.dataset_idx[0] + 1) * 128] assert(np.all(s2_test == s2_test_x)) # siamese train is going to be used for appticability domain known_x = train.get_h5_dataset('x', mask=train_mask) # generate train-test split for confidence estimation split_names = ['train', 'test'] split_fractions = [0.8, 0.2] def get_split_indeces(rows, fractions): """Get random indexes for different splits.""" if not sum(fractions) == 1.0: raise Exception("Split fractions should sum to 1.0") # shuffle indexes idxs = list(range(rows)) np.random.shuffle(idxs) # from frequencies to indices splits = np.cumsum(fractions) splits = splits[:-1] splits *= len(idxs) splits = splits.round().astype(np.int) return np.split(idxs, splits) split_idxs = get_split_indeces(confidence_train_x.shape[0], split_fractions) splits = list(zip(split_names, split_fractions, split_idxs)) # train prior model prior_path = os.path.join(self.model_path, 'prior_%s' % suffix) os.makedirs(prior_path, exist_ok=True) prior_model = self.train_prior_model(siamese, confidence_train_x, splits, prior_path, max_x=max_x, p_self=p_self, plots=plots_train) # train prior signature model prior_sign_path = os.path.join(self.model_path, 'prior_sign_%s' % suffix) os.makedirs(prior_sign_path, exist_ok=True) prior_sign_model = self.train_prior_signature_model( siamese, confidence_train_x, splits, prior_sign_path, max_x=max_x, p_self=p_self, plots=plots_train) # train confidence model confidence_path = os.path.join(self.model_path, 'confidence_%s' % suffix) os.makedirs(confidence_path, exist_ok=True) confidence_model = self.train_confidence_model( siamese, known_x, confidence_train_x, splits, prior_model, prior_sign_model, confidence_path, p_self=p_self, plots=plots_train) return prior_model, prior_sign_model, confidence_model
[docs] def rerun_confidence(self, cc, suffix, train=True, update_sign=True, chunk_size=10000, sign2_universe=None, sign2_coverage=None, plots_train=True): """Rerun confidence trainining and estimation""" try: import faiss from chemicalchecker.tool.siamese import SiameseTriplets except ImportError: raise ImportError("requires tensorflow https://tensorflow.org") triplet_sign = cc.get_signature("sign1", "full", self.dataset) sign2_self = cc.get_signature("sign2", "full", self.dataset) sign2_list = [cc.get_signature("sign2", "full", d) for d in cc.datasets_exemplary()] if sign2_universe is None: sign2_universe = os.path.join(cc.cc_root, 'full', 'all_sign2.h5') if sign2_coverage is None: sign2_coverage = os.path.join( cc.cc_root, 'full', 'all_sign2_coverage.h5') self.src_datasets = [sign.dataset for sign in sign2_list] self.triplet_sign = triplet_sign self.sign2_self = sign2_self self.sign2_list = sign2_list self.sign2_coverage = sign2_coverage self.sign2_universe = sign2_universe self.dataset_idx = np.argwhere( np.isin(self.src_datasets, self.dataset)).flatten() siamese_path = os.path.join(self.model_path, 'siamese_%s' % suffix) siamese = SiameseTriplets(siamese_path, predict_only=True) siamese_cp = SiameseTriplets(siamese.model_dir, predict_only=True) if train: prior_mdl, prior_sign_mdl, conf_mdl = self.train_confidence( siamese, suffix=suffix, plots_train=plots_train) if not update_sign: return else: # part of confidence is the priors prior_path = os.path.join(self.model_path, 'prior_eval') prior_file = os.path.join(prior_path, 'prior.pkl') prior_mdl = pickle.load(open(prior_file, 'rb')) # part of confidence is the priors based on signatures prior_sign_path = os.path.join( self.model_path, 'prior_sign_eval') prior_sign_file = os.path.join(prior_sign_path, 'prior.pkl') prior_sign_mdl = pickle.load(open(prior_sign_file, 'rb')) # and finally the linear combination of scores confidence_path = os.path.join(self.model_path, 'confidence_eval') confidence_file = os.path.join( confidence_path, 'confidence.pkl') calibration_file = os.path.join( confidence_path, 'calibration.pkl') conf_mdl = (pickle.load(open(confidence_file, 'rb')), pickle.load(open(calibration_file, 'rb'))) # another part of confidence is the applicability confidence_path = os.path.join(self.model_path, 'confidence_eval') neig_file = os.path.join(confidence_path, 'neig.index') app_neig = faiss.read_index(neig_file) known_dist = os.path.join(confidence_path, 'known_dist.h5') app_range = DataSignature(known_dist).get_h5_dataset( 'applicability_range') _, trim_mask = self.realistic_subsampling_fn() # get sorted universe inchikeys self.universe_inchikeys = self.get_universe_inchikeys() tot_inks = len(self.universe_inchikeys) known_mask = np.isin(list(self.universe_inchikeys), list(self.sign2_self.keys), assume_unique=True) with h5py.File(self.data_path, "a") as results: # the actual confidence value will be stored here safe_create(results, 'confidence', (tot_inks,), dtype=np.float32) # this is to store robustness safe_create(results, 'robustness', (tot_inks,), dtype=np.float32) # this is to store applicability safe_create(results, 'applicability', (tot_inks,), dtype=np.float32) # this is to store priors safe_create(results, 'prior', (tot_inks,), dtype=np.float32) # this is to store priors based on signature safe_create(results, 'prior_signature', (tot_inks,), dtype=np.float32) # predict signature 3 for universe molecules with h5py.File(self.sign2_universe, "r") as features: # reference prediction (based on no information) nan_feat = 
np.full((1, features['x_test'].shape[1]), np.nan, dtype=np.float32) nan_pred = siamese.predict(nan_feat) # read input in chunks for idx in tqdm(range(0, tot_inks, chunk_size), desc='Predicting'): chunk = slice(idx, idx + chunk_size) feat = features['x_test'][chunk] # save confidence natural scores # compute prior from coverage cov = ~np.isnan(feat[:, 0::128]) prior = prior_mdl.predict(cov[:, trim_mask]) results['prior'][chunk] = prior # and from prediction preds = siamese.predict(feat) prior_sign = prior_sign_mdl.predict(preds) results['prior_signature'][chunk] = prior_sign # conformal prediction ints, robs, cons = self.conformal_prediction( siamese_cp, feat, nan_pred=nan_pred) results['robustness'][chunk] = robs # distance from known predictions app, centrality, _ = self.applicability_domain( app_neig, feat, siamese, app_range=app_range, n_samples=1) results['applicability'][chunk] = app # and estimate confidence conf_feats = np.vstack( [app, robs, prior, prior_sign, ints]).T conf_estimate = conf_mdl[0].predict(conf_feats) conf_calib = conf_mdl[1].predict( np.expand_dims(conf_estimate, 1)) results['confidence'][chunk] = conf_calib # conpute confidence where self is known known_idxs = np.argwhere(known_mask).flatten() # iterate on chunks of knowns for idx in tqdm(range(0, len(known_idxs), 10000), desc='Computing Confidence'): chunk = slice(idx, idx + 10000) feat = features['x_test'][known_idxs[chunk]] # predict with all features preds_all = siamese.predict(feat) # predict with only-self features feat_onlyself = mask_keep(self.dataset_idx, feat) preds_onlyself = siamese.predict(feat_onlyself) # confidence is correlation ALL vs. ONLY-SELF corrs = row_wise_correlation( preds_onlyself, preds_all, scaled=True) results['confidence'][known_idxs[chunk]] = corrs
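    # Illustrative sketch: re-computing the confidence-related H5 datasets of
    # an already fitted sign3 with the saved 'eval' models, skipping model
    # retraining. `cc` is assumed to be the ChemicalChecker instance the
    # signature belongs to.
    #
    # >>> s3.rerun_confidence(cc, 'eval', train=False, update_sign=True)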
    def realistic_subsampling_fn(self):
        # realistic subsampling function
        res = subsampling_probs(self.sign2_coverage, self.dataset_idx)
        trim_mask, p_nr_unk, p_keep_unk, p_nr_kno, p_keep_kno = res
        p_nr = (p_nr_unk, p_nr_kno)
        p_keep = (p_keep_unk, p_keep_kno)
        trim_dataset_idx = np.argwhere(
            np.arange(len(trim_mask))[trim_mask] == self.dataset_idx)
        trim_dataset_idx = trim_dataset_idx.ravel()[0]
        realistic_fn = partial(subsample,
                               p_only_self=0.0,
                               p_self=0.0,
                               dataset_idx=trim_dataset_idx,
                               p_nr=p_nr,
                               p_keep=p_keep)
        return realistic_fn, trim_mask
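    # Illustrative sketch: the callable returned above is a `subsample`
    # partial emulating realistic dataset coverage; it can be passed as the
    # dropout function when predicting with the Siamese model, as done in
    # `applicability_domain` and `conformal_prediction` below. `siamese` and
    # `features` are assumed to come from the surrounding training code.
    #
    # >>> realistic_fn, trim_mask = s3.realistic_subsampling_fn()
    # >>> preds = siamese.predict(features, dropout_fn=realistic_fn,
    # ...                         dropout_samples=5)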
[docs] def train_prior_model(self, siamese, train_x, splits, save_path, max_x=10000, n_samples=5, p_self=0.0, plots=True): """Train prior predictor.""" def get_weights(y, p=2): h, b = np.histogram(y[~np.isnan(y)], 20) b = [np.mean([b[i], b[i + 1]]) for i in range(0, len(h))] w = np.interp(y, b, h).ravel() w = -(w / np.sum(w)) + 1e-10 w = (w - np.min(w)) / (np.max(w) - np.min(w)) w = w**p return w def histograms(ax, yp, yt, title): ax.hist(yp, 10, range=(-1, 1), color="red", label="Pred", alpha=0.5) ax.hist(yt, 10, range=(-1, 1), color="blue", label="True", alpha=0.5) ax.legend() ax.set_xlabel("Value") ax.set_ylabel("Counts") ax.set_title(title) def scatter(ax, yp, yt, joint_lim=True): x = yp y = yt xy = np.vstack([x, y]) try: z = gaussian_kde(xy)(xy) except Exception as ex: self.__log.warning(str(ex)) z = np.ones(x.shape) idx = z.argsort() x, y, z = x[idx], y[idx], z[idx] ax.scatter(x, y, c=z, s=10, linewidth=0) ax.set_xlabel("Pred") ax.set_ylabel("True") ax.set_xlim((-1, 1)) ax.set_ylim((-1, 1)) xlim = ax.get_xlim() ylim = ax.get_ylim() if joint_lim: lim = (np.min([xlim[0], ylim[0]]), np.max([xlim[1], ylim[1]])) ax.set_xlim(lim) ax.set_ylim(lim) ax.plot([lim[0], lim[1]], [lim[0], lim[1]], color="gray", ls='--', lw=1) slope, intercept, r, p_val, stde = stats.linregress(x, y) line = slope * x + intercept ax.plot(x, line, 'r', label='y={:.2f}x+{:.2f}'.format(slope, intercept)) nas = np.logical_or(np.isnan(x), np.isnan(y)) title = "rho = %.2f" % pearsonr(x[~nas], y[~nas])[0] ax.set_title(title) ax.legend() def importances(ax, mod, trim_mask): from chemicalchecker.util.plot import coord_color y = mod.feature_importances_ datasets = [s.dataset[:2] for s in self.sign2_list] if trim_mask is not None: datasets = np.array(datasets)[trim_mask] else: datasets = np.array(datasets) x = np.array([i for i in range(0, len(datasets))]) idxs = np.argsort(y) datasets = datasets[idxs] y = y[idxs] colors = [coord_color(ds) for ds in datasets] ax.scatter(y, x, color=colors) ax.set_xlabel("Importance") ax.set_yticks(x) ax.set_yticklabels(datasets) ax.set_title("Importance") ax.axvline(0, color="red", lw=1) def analyze(mod, x_tr, y_tr, x_te, y_te, trim_mask): import matplotlib.pyplot as plt y_tr_p = mod.predict(x_tr) y_te_p = mod.predict(x_te) plt.close('all') fig = plt.figure(constrained_layout=True, figsize=(8, 6)) gs = fig.add_gridspec(2, 3) ax = fig.add_subplot(gs[0, 0]) histograms(ax, y_tr_p, y_tr, "Train") ax = fig.add_subplot(gs[0, 1]) histograms(ax, y_te_p, y_te, "Test") ax = fig.add_subplot(gs[1, 0]) nas = np.logical_or(np.isnan(y_tr_p), np.isnan(y_tr)) scatter(ax, y_tr_p[~nas], y_tr[~nas]) ax = fig.add_subplot(gs[1, 1]) nas = np.logical_or(np.isnan(y_te_p), np.isnan(y_te)) scatter(ax, y_te_p[~nas], y_te[~nas]) ax = fig.add_subplot(gs[0:2, 2]) importances(ax, mod, trim_mask) if plots: try: plt.savefig(os.path.join(save_path, 'prior_stats.png')) plt.close() except Exception as ex: self.__log.warning('SKIPPING PLOT: %s' % str(ex)) def find_p(mod, x_tr, y_tr, x_te, y_te): import matplotlib.pyplot as plt test = ks_2samp ps = [] ss_te = [] for p in np.linspace(0, 3, 10): w = get_weights(y_tr, p=p) mod.fit(x_tr, y_tr, sample_weight=w) y_te_p = mod.predict(x_te) s_te = test(y_te, y_te_p)[0] ps += [p] ss_te += [s_te] p = ps[np.argmin(ss_te)] if plots: plt.close('all') plt.scatter(ps, ss_te) plt.title("%.2f" % p) plt.savefig(os.path.join(save_path, 'prior_p.png')) plt.close() return p self.__log.info('Training PRIOR model') # define subsampling realistic_fn, trim_mask = self.realistic_subsampling_fn() # generate 
train test split out_file = os.path.join(save_path, 'data.h5') with h5py.File(out_file, "w") as fh: for split_name, split_frac, split_idx in splits: split_x = train_x[split_idx] split_total_x = int(max_x * split_frac) available_x = split_x.shape[0] X = np.zeros((split_total_x, np.sum(trim_mask))) Y = np.zeros((split_total_x, 1)) preds_onlyselfs = np.zeros((split_total_x, 128)) preds_noselfs = np.zeros((split_total_x, 128)) feats = np.zeros((split_total_x, 128 * len(self.sign2_list))) # prepare X and Y in chunks chunk_size = max(10000, int(np.floor(available_x / 10))) reached_max = False for i in range(0, int(np.ceil(split_total_x / available_x))): for idx in range(0, available_x, chunk_size): # define source chunk src_start = idx src_end = idx + chunk_size if src_end > available_x: src_end = available_x # define destination chunk dst_start = src_start + (int(available_x) * i) dst_end = src_end + (available_x * i) if dst_end > split_total_x: dst_end = split_total_x reached_max = True src_end = dst_end - (int(available_x) * i) src_chunk = slice(src_start, src_end) dst_chunk = slice(dst_start, dst_end) # get only-self and not-self predictions feat = split_x[src_chunk] feats[dst_chunk] = feat preds_onlyself = siamese.predict( mask_keep(self.dataset_idx, feat)) preds_onlyselfs[dst_chunk] = preds_onlyself preds_noself = siamese.predict( mask_exclude(self.dataset_idx, feat)) preds_noselfs[dst_chunk] = preds_noself # the prior is only-self vs not-self predictions corrs = row_wise_correlation( preds_onlyself, preds_noself, scaled=True) Y[dst_chunk] = np.expand_dims(corrs, 1) # the X is the dataset presence in the not-self presence = ~np.isnan(feat[:, ::128])[:, trim_mask] X[dst_chunk] = presence.astype(int) # check if enough if reached_max: break variables = [X, Y, feat, preds_onlyselfs, preds_noselfs] names = ['x', 'y', 'feat', 'preds_onlyselfs', 'preds_noselfs'] for var, name in zip(variables, names): ds_name = '%s_%s' % (name, split_name) self.__log.debug('writing %s: %s' % (ds_name, var.shape)) fh.create_dataset(ds_name, data=var) traintest = DataSignature(out_file) x_tr = traintest.get_h5_dataset('x_train') y_tr = traintest.get_h5_dataset('y_train').ravel() x_te = traintest.get_h5_dataset('x_test') y_te = traintest.get_h5_dataset('y_test').ravel() # TODO: check the generation of data.h5 file: NaNs? if np.isnan(x_tr).any(): nans_xtr = np.argwhere(np.isnan(x_tr)) self.__log.debug("Len NaNs in x_tr: {}".format(len(nans_xtr))) if np.isnan(y_tr).any(): nans_ytr = np.argwhere(np.isnan(y_tr)) self.__log.debug("Len NaNs in y_tr: {}".format(len(nans_ytr))) if np.isnan(x_te).any(): nans_xte = np.argwhere(np.isnan(x_te)) self.__log.debug("Len NaNs in x_te: {}".format(len(nans_xte))) if np.isnan(y_te).any(): nans_yte = np.argwhere(np.isnan(y_te)) self.__log.debug("Len NaNs in y_te: {}".format(len(nans_yte))) # fit model model = RandomForestRegressor(n_estimators=1000, max_features=None, min_samples_leaf=0.01, n_jobs=4) p = find_p(model, x_tr, y_tr, x_te, y_te) model.fit(x_tr, y_tr, sample_weight=get_weights(y_tr, p=p)) if plots: analyze(model, x_tr, y_tr, x_te, y_te, trim_mask) predictor_path = os.path.join(save_path, 'prior.pkl') pickle.dump(model, open(predictor_path, 'wb')) return model
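    # Illustrative sketch: once trained, the prior model maps a boolean
    # dataset-presence vector (restricted to `trim_mask`) to an expected
    # only-self vs. not-self correlation, as used later in `rerun_confidence`.
    # `feat` is assumed to be a batch of stacked sign2 features.
    #
    # >>> presence = ~np.isnan(feat[:, 0::128])     # one flag per dataset
    # >>> prior = prior_model.predict(presence[:, trim_mask])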
[docs] def train_prior_signature_model(self, siamese, train_x, splits, save_path, max_x=10000, n_samples=5, p_self=0.0, plots=True): """Train prior predictor.""" def get_weights(y, p=2): h, b = np.histogram(y[~np.isnan(y)], 20) b = [np.mean([b[i], b[i + 1]]) for i in range(0, len(h))] w = np.interp(y, b, h).ravel() w = -(w / np.sum(w)) + 1e-10 w = (w - np.min(w)) / (np.max(w) - np.min(w)) w = w**p return w def histograms(ax, yp, yt, title): ax.hist(yp, 10, range=(-1, 1), color="red", label="Pred", alpha=0.5) ax.hist(yt, 10, range=(-1, 1), color="blue", label="True", alpha=0.5) ax.legend() ax.set_xlabel("Value") ax.set_ylabel("Counts") ax.set_title(title) def scatter(ax, yp, yt, joint_lim=True): x = yp y = yt xy = np.vstack([x, y]) try: z = gaussian_kde(xy)(xy) except Exception as ex: self.__log.warning(str(ex)) z = np.ones(x.shape) idx = z.argsort() x, y, z = x[idx], y[idx], z[idx] ax.scatter(x, y, c=z, s=10, linewidth=0) ax.set_xlabel("Pred") ax.set_ylabel("True") ax.set_xlim((-1, 1)) ax.set_ylim((-1, 1)) xlim = ax.get_xlim() ylim = ax.get_ylim() if joint_lim: lim = (np.min([xlim[0], ylim[0]]), np.max([xlim[1], ylim[1]])) ax.set_xlim(lim) ax.set_ylim(lim) ax.plot([lim[0], lim[1]], [lim[0], lim[1]], color="gray", ls='--', lw=1) slope, intercept, r, p_val, stde = stats.linregress(x, y) line = slope * x + intercept ax.plot(x, line, 'r', label='y={:.2f}x+{:.2f}'.format(slope, intercept)) nas = np.logical_or(np.isnan(x), np.isnan(y)) title = "rho = %.2f" % pearsonr(x[~nas], y[~nas])[0] ax.set_title(title) ax.legend() def importances(ax, mod): y = mod.feature_importances_ datasets = np.arange(128) datasets = np.array(datasets) x = np.array([i for i in range(0, len(datasets))]) idxs = np.argsort(y) datasets = datasets[idxs] y = y[idxs] ax.scatter(y, x) ax.set_xlabel("Importance") ax.set_yticks(x) ax.set_yticklabels(datasets) ax.set_title("Importance") ax.axvline(0, color="red", lw=1) def analyze(mod, x_tr, y_tr, x_te, y_te): import matplotlib.pyplot as plt y_tr_p = mod.predict(x_tr) y_te_p = mod.predict(x_te) plt.close('all') fig = plt.figure(constrained_layout=True, figsize=(8, 6)) gs = fig.add_gridspec(2, 3) ax = fig.add_subplot(gs[0, 0]) histograms(ax, y_tr_p, y_tr, "Train") ax = fig.add_subplot(gs[0, 1]) histograms(ax, y_te_p, y_te, "Test") ax = fig.add_subplot(gs[1, 0]) nas = np.logical_or(np.isnan(y_tr_p), np.isnan(y_tr)) scatter(ax, y_tr_p[~nas], y_tr[~nas]) ax = fig.add_subplot(gs[1, 1]) nas = np.logical_or(np.isnan(y_te_p), np.isnan(y_te)) scatter(ax, y_te_p[~nas], y_te[~nas]) ax = fig.add_subplot(gs[0:2, 2]) importances(ax, mod) if plots: try: plt.savefig(os.path.join(save_path, 'prior_stats.png')) plt.close() except Exception as ex: self.__log.warning('SKIPPING PLOT: %s' % str(ex)) def find_p(mod, x_tr, y_tr, x_te, y_te): import matplotlib.pyplot as plt test = ks_2samp ps = [] ss_te = [] for p in np.linspace(0, 3, 10): w = get_weights(y_tr, p=p) mod.fit(x_tr, y_tr, sample_weight=w) y_te_p = mod.predict(x_te) s_te = test(y_te, y_te_p)[0] ps += [p] ss_te += [s_te] p = ps[np.argmin(ss_te)] if plots: plt.close('all') plt.scatter(ps, ss_te) plt.title("%.2f" % p) plt.savefig(os.path.join(save_path, 'prior_p.png')) plt.close() return p self.__log.info('Training PRIOR SIGNATURE model') # define subsampling realistic_fn, trim_mask = self.realistic_subsampling_fn() # generate train test split out_file = os.path.join(save_path, 'data.h5') with h5py.File(out_file, "w") as fh: for split_name, split_frac, split_idx in splits: split_x = train_x[split_idx] split_total_x = int(max_x * split_frac) 
available_x = split_x.shape[0] X = np.zeros((split_total_x, 128)) Y = np.zeros((split_total_x, 1)) preds_onlyselfs = np.zeros((split_total_x, 128)) preds_noselfs = np.zeros((split_total_x, 128)) feats = np.zeros((split_total_x, 128 * len(self.sign2_list))) # prepare X and Y in chunks chunk_size = max(10000, int(np.floor(available_x / 10))) reached_max = False for i in range(0, int(np.ceil(split_total_x / available_x))): for idx in range(0, available_x, chunk_size): # define source chunk src_start = idx src_end = idx + chunk_size if src_end > available_x: src_end = available_x # define destination chunk dst_start = src_start + (int(available_x) * i) dst_end = src_end + (available_x * i) if dst_end > split_total_x: dst_end = split_total_x reached_max = True src_end = dst_end - (int(available_x) * i) src_chunk = slice(src_start, src_end) dst_chunk = slice(dst_start, dst_end) # get only-self and not-self predictions feat = split_x[src_chunk] feats[dst_chunk] = feat preds_onlyself = siamese.predict( mask_keep(self.dataset_idx, feat)) preds_onlyselfs[dst_chunk] = preds_onlyself preds_noself = siamese.predict( mask_exclude(self.dataset_idx, feat)) preds_noselfs[dst_chunk] = preds_noself # the prior is only-self vs not-self predictions corrs = row_wise_correlation( preds_onlyself, preds_noself, scaled=True) Y[dst_chunk] = np.expand_dims(corrs, 1) X[dst_chunk] = preds_noself # check if enought if reached_max: break variables = [X, Y, feat, preds_onlyselfs, preds_noselfs] names = ['x', 'y', 'feat', 'preds_onlyselfs', 'preds_noselfs'] for var, name in zip(variables, names): ds_name = '%s_%s' % (name, split_name) self.__log.debug('writing %s: %s' % (ds_name, var.shape)) fh.create_dataset(ds_name, data=var) traintest = DataSignature(out_file) x_tr = traintest.get_h5_dataset('x_train') y_tr = traintest.get_h5_dataset('y_train').ravel() x_te = traintest.get_h5_dataset('x_test') y_te = traintest.get_h5_dataset('y_test').ravel() # TODO: check the generation of data.h5 file: NaNs? if np.isnan(x_tr).any(): nans_xtr = np.argwhere(np.isnan(x_tr)) self.__log.debug("Len NaNs in x_tr: {}".format(len(nans_xtr))) if np.isnan(y_tr).any(): nans_ytr = np.argwhere(np.isnan(y_tr)) self.__log.debug("Len NaNs in y_tr: {}".format(len(nans_ytr))) if np.isnan(x_te).any(): nans_xte = np.argwhere(np.isnan(x_te)) self.__log.debug("Len NaNs in x_te: {}".format(len(nans_xte))) if np.isnan(y_te).any(): nans_yte = np.argwhere(np.isnan(y_te)) self.__log.debug("Len NaNs in y_te: {}".format(len(nans_yte))) # fit model model = RandomForestRegressor(n_estimators=1000, max_features='sqrt', min_samples_leaf=0.01, n_jobs=4) p = find_p(model, x_tr, y_tr, x_te, y_te) model.fit(x_tr, y_tr, sample_weight=get_weights(y_tr, p=p)) if plots: analyze(model, x_tr, y_tr, x_te, y_te) predictor_path = os.path.join(save_path, 'prior.pkl') pickle.dump(model, open(predictor_path, 'wb')) return model
def train_confidence_model(self, siamese, neig_x, train_x, splits, prior_model, prior_sign_model, save_path, p_self=0.0, plots=True): # save linear model combining confidence natural scores def get_weights(y, p=2): h, b = np.histogram(y, 20) b = [np.mean([b[i], b[i + 1]]) for i in range(0, len(h))] w = np.interp(y, b, h).ravel() w = -(w / np.sum(w)) + 1e-10 w = (w - np.min(w)) / (np.max(w) - np.min(w)) w = w**p return w def histograms(ax, yp, yt, title): ax.hist(yp, 10, range=(-1, 1), color="red", label="Pred", alpha=0.5) ax.hist(yt, 10, range=(-1, 1), color="blue", label="True", alpha=0.5) ax.legend() ax.set_xlabel("Value") ax.set_ylabel("Counts") ax.set_xlim((-1, 1)) ax.set_title(title) def scatter(ax, yp, yt, joint_lim=True): x = yp y = yt xy = np.vstack([x, y]) try: z = gaussian_kde(xy)(xy) except Exception as ex: self.__log.warning(str(ex)) z = np.ones(x.shape) idx = z.argsort() x, y, z = x[idx], y[idx], z[idx] ax.scatter(x, y, c=z, s=10, linewidth=0) ax.set_xlabel("Pred") ax.set_ylabel("True") ax.set_xlim((-1, 1)) ax.set_ylim((-1, 1)) xlim = ax.get_xlim() ylim = ax.get_ylim() if joint_lim: lim = (np.min([xlim[0], ylim[0]]), np.max([xlim[1], ylim[1]])) ax.set_xlim(lim) ax.set_ylim(lim) ax.plot([lim[0], lim[1]], [lim[0], lim[1]], color="gray", ls='--', lw=1) slope, intercept, r, p_val, stde = stats.linregress(x, y) line = slope * x + intercept ax.plot(x, line, 'r', label='y={:.2f}x+{:.2f}'.format(slope, intercept)) title = "rho = %.2f" % pearsonr(x, y)[0] ax.set_title(title) ax.legend() def analyze(mod, cal, x_tr, y_tr, x_te, y_te): import matplotlib.pyplot as plt y_tr_p = mod.predict(x_tr) y_te_p = mod.predict(x_te) y_tr_cal = cal.predict(np.expand_dims(y_tr_p, 1)) y_te_cal = cal.predict(np.expand_dims(y_te_p, 1)) plt.close('all') fig = plt.figure(constrained_layout=True, figsize=(14, 10)) gs = fig.add_gridspec(4, 5) ax = fig.add_subplot(gs[0, 0]) histograms(ax, y_tr_p, y_tr, "Train") ax = fig.add_subplot(gs[1, 0]) histograms(ax, y_te_p, y_te, "Test") ax = fig.add_subplot(gs[0, 1]) scatter(ax, y_tr_p, y_tr) ax = fig.add_subplot(gs[1, 1]) scatter(ax, y_te_p, y_te) ax = fig.add_subplot(gs[0, 2]) scatter(ax, y_tr_cal, y_tr) ax.set_xlabel("Pred Calibrated") ax = fig.add_subplot(gs[1, 2]) scatter(ax, y_te_cal, y_te) ax.set_xlabel("Pred Calibrated") ax = fig.add_subplot(gs[2, 0]) scatter(ax, y_tr, x_tr[:, 0].ravel(), joint_lim=False) ax.set_title('Applicability (%s) Train' % ax.get_title()) ax.set_ylabel("Applicability") ax.set_xlabel("Correlation") ax = fig.add_subplot(gs[2, 1]) scatter(ax, y_tr, x_tr[:, 1].ravel(), joint_lim=False) ax.set_title('Robustness (%s) Train' % ax.get_title()) ax.set_ylabel("Robustness") ax.set_xlabel("Correlation") ax = fig.add_subplot(gs[2, 2]) scatter(ax, y_tr, x_tr[:, 2].ravel(), joint_lim=False) ax.set_title('Prior (%s) Train' % ax.get_title()) ax.set_ylabel("Prior") ax.set_xlabel("Correlation") ax = fig.add_subplot(gs[2, 3]) scatter(ax, y_tr, x_tr[:, 3].ravel(), joint_lim=False) ax.set_title('Prior Signature (%s) Train' % ax.get_title()) ax.set_ylabel("Prior") ax.set_xlabel("Correlation") ax = fig.add_subplot(gs[2, 4]) scatter(ax, y_tr, x_tr[:, 4].ravel(), joint_lim=False) ax.set_title('Intensity (%s) Train' % ax.get_title()) ax.set_ylabel("Intensity") ax.set_xlabel("Correlation") ax = fig.add_subplot(gs[3, 0]) scatter(ax, y_te, x_te[:, 0].ravel(), joint_lim=False) ax.set_title('Applicability (%s) Test' % ax.get_title()) ax.set_ylabel("Applicability") ax.set_xlabel("Correlation") ax = fig.add_subplot(gs[3, 1]) scatter(ax, y_te, x_te[:, 1].ravel(), 
joint_lim=False) ax.set_title('Robustness (%s) Test' % ax.get_title()) ax.set_ylabel("Robustness") ax.set_xlabel("Correlation") ax = fig.add_subplot(gs[3, 2]) scatter(ax, y_te, x_te[:, 2].ravel(), joint_lim=False) ax.set_title('Prior (%s) Test' % ax.get_title()) ax.set_ylabel("Prior") ax.set_xlabel("Correlation") ax = fig.add_subplot(gs[3, 3]) scatter(ax, y_te, x_te[:, 3].ravel(), joint_lim=False) ax.set_title('Prior Signature (%s) Test' % ax.get_title()) ax.set_ylabel("Prior") ax.set_xlabel("Correlation") ax = fig.add_subplot(gs[3, 4]) scatter(ax, y_te, x_te[:, 4].ravel(), joint_lim=False) ax.set_title('Intensity (%s) Test' % ax.get_title()) ax.set_ylabel("Intensity") ax.set_xlabel("Correlation") # ax = fig.add_subplot(gs[1, 3]) if plots: try: plt.savefig(os.path.join( save_path, 'confidence_stats.png')) plt.close() except Exception as ex: self.__log.warning('SKIPPING PLOT: %s' % str(ex)) def find_p(mod, x_tr, y_tr, x_te, y_te): import matplotlib.pyplot as plt test = ks_2samp ps = [] ss_te = [] for p in np.linspace(0, 3, 10): w = get_weights(y_tr, p=p) mod.fit(x_tr, y_tr, linearregression__sample_weight=w) y_te_p = mod.predict(x_te) s_te = test(y_te, y_te_p)[0] ps += [p] ss_te += [s_te] p = ps[np.argmin(ss_te)] if plots: plt.close('all') plt.scatter(ps, ss_te) plt.title("%.2f" % p) plt.savefig(os.path.join(save_path, 'confidence_p.png')) plt.close() return p self.__log.info('Training CONFIDENCE model') X, Y = self.save_confidence_distributions( siamese, neig_x, train_x, prior_model, prior_sign_model, save_path, splits, p_self=p_self) # generate train test split out_file = os.path.join(save_path, 'data.h5') with h5py.File(out_file, "w") as fh: for split_name, split_frac, split_idx in splits: xs_name = "x_%s" % split_name ys_name = "y_%s" % split_name self.__log.debug('writing %s: %s' % (xs_name, str(X[split_idx].shape))) fh.create_dataset(xs_name, data=X[split_idx]) self.__log.debug('writing %s: %s' % (ys_name, str(Y[split_idx].shape))) fh.create_dataset(ys_name, data=Y[split_idx]) traintest = DataSignature(out_file) x_tr = traintest.get_h5_dataset('x_train') y_tr = traintest.get_h5_dataset('y_train').ravel() x_te = traintest.get_h5_dataset('x_test') y_te = traintest.get_h5_dataset('y_test').ravel() model = make_pipeline(StandardScaler(), LinearRegression()) p = find_p(model, x_tr, y_tr, x_te, y_te) model.fit(x_te, y_te, linearregression__sample_weight=get_weights(y_te, p=p)) calibration_model = make_pipeline(StandardScaler(), LinearRegression()) y_pr = np.expand_dims(model.predict(x_te), 1) calibration_model.fit(y_pr, y_te) if plots: analyze(model, calibration_model, x_tr, y_tr, x_te, y_te) model_file = os.path.join(save_path, 'confidence.pkl') pickle.dump(model, open(model_file, 'wb')) calibration_file = os.path.join(save_path, 'calibration.pkl') pickle.dump(calibration_model, open(calibration_file, 'wb')) return model, calibration_model def save_confidence_distributions(self, siamese, known_x, train_x, prior_model, prior_sign_model, save_path, splits, p_self=0.0): try: from chemicalchecker.tool.siamese import SiameseTriplets import faiss except ImportError as err: raise err realistic_fn, trim_mask = self.realistic_subsampling_fn() # save neighbors faiss index based on only self train prediction self.__log.info('Computing Neighbor Index') known_onlyself = mask_keep(self.dataset_idx, known_x) known_onlyself_pred = siamese.predict(known_onlyself) known_onlyself_neig = faiss.IndexFlatL2(known_onlyself_pred.shape[1]) known_onlyself_neig.add(known_onlyself_pred.astype(np.float32)) 
known_onlyself_neig_file = os.path.join(save_path, 'neig.index') faiss.write_index(known_onlyself_neig, known_onlyself_neig_file) self.__log.info('Neighbor Index saved: %s' % known_onlyself_neig_file) # only self prediction is the ground truth unk_onlyself = mask_keep(self.dataset_idx, train_x) unk_onlyself_pred = siamese.predict(unk_onlyself) unk_notself = mask_exclude(self.dataset_idx, train_x) unk_notself_pred = siamese.predict(unk_notself) # do applicability domain prediction self.__log.info('Computing Applicability Domain') applicability, app_range, _ = self.applicability_domain( known_onlyself_neig, train_x, siamese, p_self=p_self) self.__log.info('Applicability Domain DONE') # do conformal prediction (dropout) self.__log.info('Computing Conformal Prediction') siamese_cp = SiameseTriplets(siamese.model_dir, predict_only=True) intensities, robustness, consensus_cp = self.conformal_prediction( siamese_cp, train_x, p_self=p_self) self.__log.info('Conformal Prediction DONE') # predict expected prior unk_notself_presence = ~np.isnan(unk_notself[:, ::128])[:, trim_mask] prior = prior_model.predict(unk_notself_presence.astype(int)) prior_sign = prior_sign_model.predict(unk_notself_pred) # calculate the error log_mse = np.log10( np.mean(((unk_onlyself_pred - unk_notself_pred)**2), axis=1)) log_mse_ad = np.log10( np.mean(((unk_onlyself_pred - consensus_cp)**2), axis=1)) # get correlation between prediction and only self predictions correlation = row_wise_correlation( unk_onlyself_pred, unk_notself_pred, scaled=True) correlation_cp = row_wise_correlation( unk_onlyself_pred, consensus_cp, scaled=True) # we have all the data to train the confidence model self.__log.debug('Saving Confidence Features...') conf_features = ( ('applicability', applicability), ('robustness', robustness), ('prior', prior), ('prior_sign', prior_sign), ('intensities', intensities) ) for name, arr in conf_features: self.__log.debug('%s %s' % (name, arr.shape)) features = np.vstack([x[1] for x in conf_features]).T know_dist_file = os.path.join(save_path, 'known_dist.h5') with h5py.File(know_dist_file, "w") as hf: hf.create_dataset('robustness', data=robustness) hf.create_dataset('intensity', data=intensities) hf.create_dataset('consensus', data=consensus_cp) hf.create_dataset('applicability', data=applicability) hf.create_dataset('applicability_range', data=app_range) hf.create_dataset('prior', data=prior) hf.create_dataset('prior_sign', data=prior_sign) hf.create_dataset('correlation', data=correlation) hf.create_dataset('correlation_consensus', data=correlation_cp) hf.create_dataset('log_mse', data=log_mse) hf.create_dataset('log_mse_consensus', data=log_mse_ad) # save plot import pandas as pd import seaborn as sns import matplotlib.pyplot as plt variables = ['applicability', 'robustness', 'intensity', 'prior', 'prior_sign', 'correlation', 'correlation_consensus_cp'] def corr(x, y, **kwargs): coef = np.corrcoef(x, y)[0][1] label = r'$\rho$ = ' + str(round(coef, 2)) ax = plt.gca() ax.annotate(label, xy=(0.2, 0.95), size=20, xycoords=ax.transAxes) # Create a pair grid instance df = pd.DataFrame(columns=variables) df['applicability'] = applicability df['robustness'] = robustness df['intensity'] = intensities df['prior'] = prior df['prior_sign'] = prior_sign df['correlation'] = correlation df['correlation_consensus_cp'] = correlation_cp # Map the plots to the locations for split_name, split_frac, split_idx in splits: grid = sns.PairGrid( data=df.loc[split_idx], vars=variables, height=4) grid = grid.map_upper(plt.scatter, 
color='darkred') grid = grid.map_upper(corr) grid = grid.map_lower(sns.kdeplot, cmap='Reds') grid = grid.map_diag( plt.hist, bins=10, edgecolor='k', color='darkred') plot_file = os.path.join( save_path, 'known_dist_%s.png' % split_name) plt.savefig(plot_file) return features, correlation def applicability_domain(self, neig_index, features, siamese, dropout_fn=None, app_range=None, n_samples=1, p_self=0.0, subsampling=False): # applicability is whether not-self preds is close to only-self preds # neighbors between 5 and 25 depending on the size of the dataset app_thr = int(np.clip(np.log10(self.triplet_sign.shape[0])**2, 5, 25)) if subsampling: if dropout_fn is None: dropout_fn, _ = self.realistic_subsampling_fn() preds, dists, ranges = list(), list(), list() for i in range(n_samples): pred = siamese.predict(features, dropout_fn=partial( dropout_fn, p_self=p_self), dropout_samples=1) pred = np.mean(pred, axis=1) only_self_dists, _ = neig_index.search(pred, app_thr) if app_range is None: d_min = np.min(only_self_dists) d_max = np.max(only_self_dists) else: d_min = app_range[0] d_max = app_range[1] curr_app_range = np.array([d_min, d_max]) preds.append(pred) dists.append(only_self_dists) ranges.append(curr_app_range) consensus = np.mean(np.stack(preds, axis=2), axis=2) app_range = [np.min(np.vstack(ranges)[:, 0]), np.max(np.vstack(ranges)[:, 1])] # scale and invert distances to get applicability apps = list() for dist in dists: d_min = app_range[0] d_max = app_range[1] norm_dist = (dist - d_max) / (d_min - d_max) applicability = np.mean(norm_dist, axis=1).flatten() apps.append(applicability) applicability = np.mean(np.vstack(apps), axis=0) return applicability, app_range, consensus else: pred = siamese.predict(mask_exclude(self.dataset_idx, features)) dists, _ = neig_index.search(pred, app_thr) d_min = np.min(dists) d_max = np.max(dists) app_range = np.array([d_min, d_max]) norm_dist = (dists - d_max) / (d_min - d_max) applicability = np.mean(norm_dist, axis=1).flatten() return applicability, app_range, None def conformal_prediction(self, siamese_cp, features, dropout_fn=None, nan_pred=None, n_samples=5, p_self=0.0): # draw prediction with sub-sampling if dropout_fn is None: dropout_fn, _ = self.realistic_subsampling_fn() # reference prediction (based on no information) if nan_pred is None: nan_feat = np.full( (1, features.shape[1]), np.nan, dtype=np.float32) nan_pred = siamese_cp.predict(nan_feat) samples = siamese_cp.predict(mask_exclude(self.dataset_idx, features), dropout_fn=partial( dropout_fn, p_self=p_self), dropout_samples=n_samples, cp=True) # summarize the predictions as consensus consensus = np.mean(samples, axis=1) # zeros input (no info) as intensity reference centered = consensus - nan_pred # measure the intensity (mean of absolute comps) intensities = np.mean(np.abs(centered), axis=1).flatten() # summarize the standard deviation of components robustness = 1 - np.mean(np.std(samples, axis=1), axis=1).flatten() return intensities, robustness, consensus def plot_validations(self, siamese, dataset_idx, traintest_file, chunk_size=10000, imit=1000, dist_limit=1000): def no_mask(idxs, x1_data): return x1_data def read_h5(sign, idxs): with h5py.File(sign.data_path, "r") as hf: V = hf["x_test"][idxs] return V def read_unknown(sign, forbidden_idxs, max_n=100000): with h5py.File(sign.data_path, "r") as hf: V = hf["x_test"][:max_n] forbidden_idxs = set(forbidden_idxs) unknown_idxs = [i for i in range( 0, max_n) if i not in forbidden_idxs] return V[unknown_idxs] import matplotlib.pyplot as 
plt import seaborn as sns import itertools from MulticoreTSNE import MulticoreTSNE mask_fns = { 'ALL': partial(no_mask, dataset_idx), 'NOT-SELF': partial(mask_exclude, dataset_idx), 'ONLY-SELF': partial(mask_keep, dataset_idx), } all_inchikeys = self.get_universe_inchikeys() traintest = DataSignature(traintest_file) train_inks = traintest.get_h5_dataset('keys_train') test_inks = traintest.get_h5_dataset('keys_test') train_idxs = np.argwhere(np.isin(all_inchikeys, train_inks)).flatten() test_idxs = np.argwhere(np.isin(all_inchikeys, test_inks)).flatten() try: all_idxs = set(np.arange(len(all_inchikeys))) unknown_idxs = np.array( list(all_idxs - (set(train_idxs) | set(test_idxs)))) unknown_idxs = np.sort(np.random.choice( unknown_idxs, 5000, replace=False)) except Exception: unknown_idxs = np.array([]) # predict pred = dict() pred_file = os.path.join(siamese.model_dir, 'plot_preds.pkl') if not os.path.isfile(pred_file): self.__log.info('VALIDATION: Predicting train.') pred['train'] = dict() full_x = DataSignature(self.sign2_universe) train = read_h5(full_x, train_idxs[:4000]) for name, mask_fn in mask_fns.items(): pred['train'][name] = siamese.predict(mask_fn(train)) del train self.__log.info('VALIDATION: Predicting test.') pred['test'] = dict() test = read_h5(full_x, test_idxs[:1000]) for name, mask_fn in mask_fns.items(): pred['test'][name] = siamese.predict(mask_fn(test)) del test self.__log.info('VALIDATION: Predicting unknown.') pred['unknown'] = dict() if np.any(unknown_idxs): unknown = read_h5(full_x, unknown_idxs[:5000]) self.__log.info('Number of unknown %s' % len(unknown)) for name, mask_fn in mask_fns.items(): if name == 'ALL': pred['unknown'][name] = siamese.predict( mask_fn(unknown)) else: pred['unknown'][name] = [] del unknown else: for name, mask_fn in mask_fns.items(): pred['unknown'][name] = [] pickle.dump(pred, open(pred_file, "wb")) else: pred = pickle.load(open(pred_file, 'rb')) # Plot self.__log.info('VALIDATION: Plot correlations.') fig, axes = plt.subplots( 1, 3, sharex=True, sharey=False, figsize=(10, 3)) combos = itertools.combinations(mask_fns, 2) for ax, (n1, n2) in zip(axes.flatten(), combos): scaled_corrs = row_wise_correlation( pred['train'][n1], pred['train'][n2], scaled=True) sns.histplot(scaled_corrs, ax=ax, label='Train') scaled_corrs = row_wise_correlation( pred['test'][n1], pred['test'][n2], scaled=True) sns.histplot(scaled_corrs, ax=ax, label='Test') ax.legend() ax.set_title(label='%s vs. %s' % (n1, n2)) fname = 'known_unknown_correlations.png' plot_file = os.path.join(siamese.model_dir, fname) plt.savefig(plot_file) plt.close() for metric in ['euclidean', 'cosine']: self.__log.info('VALIDATION: Plot %s distances.' 
% metric) fig, axes = plt.subplots( 1, 3, sharex=True, sharey=True, figsize=(10, 3)) for ax, name in zip(axes.flatten(), mask_fns): ax.set_title(name) dist_known = pdist(pred['train'][name][:dist_limit], metric=metric) sns.histplot(dist_known, label='Train', ax=ax) dist_known = pdist(pred['test'][name][:dist_limit], metric=metric) sns.histplot(dist_known, label='Test', ax=ax) if len(pred['unknown'][name]) > 0: dist_unknown = pdist(pred['unknown'][name][:dist_limit], metric=metric) sns.histplot(dist_unknown, label='Unknown', ax=ax) ax.legend() fname = 'known_unknown_dist_%s.png' % metric plot_file = os.path.join(siamese.model_dir, fname) plt.savefig(plot_file) plt.close() self.__log.info('VALIDATION: Plot Projections.') fig, axes = plt.subplots(2, 2, sharex=True, sharey=True, figsize=(10, 10), dpi=200) proj_model = MulticoreTSNE(n_components=2) proj_limit = min(500, len(pred['test']['ALL'])) if np.any(pred['unknown']['ALL']): proj_train = np.vstack([ pred['train']['ALL'][:proj_limit], pred['test']['ALL'][:proj_limit], pred['unknown']['ALL'][:proj_limit], pred['test']['ONLY-SELF'][-proj_limit:] ]) proj = proj_model.fit_transform(proj_train) dist_tr = proj[:proj_limit] dist_ts = proj[proj_limit:(proj_limit * 2)] dist_uk = proj[(proj_limit * 2):(proj_limit * 3)] dist_os = proj[(proj_limit * 3):] else: proj_train = np.vstack([ pred['train']['ALL'][:proj_limit], pred['test']['ALL'][:proj_limit], pred['test']['ONLY-SELF'][-proj_limit:] ]) proj = proj_model.fit_transform(proj_train) dist_tr = proj[:proj_limit] dist_ts = proj[proj_limit:(proj_limit * 2)] dist_os = proj[(proj_limit * 2):] axes[0][0].set_title('Train') axes[0][0].scatter(dist_ts[:, 0], dist_ts[:, 1], s=10, color="grey") if np.any(pred['unknown']['ALL']): axes[0][0].scatter(dist_uk[:, 0], dist_uk[ :, 1], s=10, color="grey") axes[0][0].scatter(dist_os[:, 0], dist_os[:, 1], s=10, color="grey") axes[0][0].scatter(dist_tr[:, 0], dist_tr[:, 1], s=10, color="#1f77b4") axes[0][1].set_title('Test') axes[0][1].scatter(dist_tr[:, 0], dist_tr[:, 1], s=10, color="grey") if np.any(pred['unknown']['ALL']): axes[0][1].scatter(dist_uk[:, 0], dist_uk[ :, 1], s=10, color="grey") axes[0][1].scatter(dist_os[:, 0], dist_os[:, 1], s=10, color="grey") axes[0][1].scatter(dist_ts[:, 0], dist_ts[:, 1], s=10, color="#ff7f0e") axes[1][0].set_title('Unknown') axes[1][0].scatter(dist_tr[:, 0], dist_tr[:, 1], s=10, color="grey") axes[1][0].scatter(dist_ts[:, 0], dist_ts[:, 1], s=10, color="grey") axes[1][0].scatter(dist_os[:, 0], dist_os[:, 1], s=10, color="grey") if np.any(pred['unknown']['ALL']): axes[1][0].scatter(dist_uk[:, 0], dist_uk[ :, 1], s=10, color="#2ca02c") axes[1][1].set_title('ONLY-SELF') axes[1][1].scatter(dist_tr[:, 0], dist_tr[:, 1], s=10, color="grey") axes[1][1].scatter(dist_ts[:, 0], dist_ts[:, 1], s=10, color="grey") if np.any(pred['unknown']['ALL']): axes[1][1].scatter(dist_uk[:, 0], dist_uk[ :, 1], s=10, color="grey") axes[1][1].scatter(dist_os[:, 0], dist_os[:, 1], s=10, color="#d62728") fname = 'known_unknown_projection.png' plot_file = os.path.join(siamese.model_dir, fname) plt.savefig(plot_file) plt.close() def get_universe_inchikeys(self): # get sorted universe inchikeys if hasattr(self, 'universe_inchikeys'): return self.universe_inchikeys inchikeys = set() for ds, sign in zip(self.src_datasets, self.sign2_list): inchikeys.update(sign.unique_keys) inchikeys = sorted(list(inchikeys)) return inchikeys
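    # Illustrative sketch of the typical top-level call to the `fit` method
    # defined below: with default arguments it gathers the exemplary sign2
    # spaces, builds and completes the universe matrices, trains the
    # evaluation and final Siamese models, and fits the confidence models.
    #
    # >>> s3 = cc.get_signature('sign3', 'full', 'B1.001')
    # >>> s3.fit()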
[docs] def fit(self, sign2_list=None, sign2_self=None, triplet_sign=None, sign2_universe=None, complete_universe='full', sign2_coverage=None, model_confidence=True, save_correlations=False, predict_novelty=False, update_preds=True, chunk_size=1000, suffix=None, plots_train=True, triplets_sampler=None, **kwargs): """Fit signature 3 given a list of signature 2. Args: sign2_list(list): List of signature 2 objects to learn from. sign2_self(sign2): Signature 2 of the current space. triplet_sign(sign1): Signature used to define acnhor positive and negative in triplets. sign2_universe(str): Path to the union of all signatures 2 for all molecules in the CC universe. (~1M x 3200) complete_universe(str): add chemistry information for molecules not in the universe. 'full' use all A* spaces while, 'fast' skips A2 (3D conformation) which is slow. False by default, not adding any signature to the universe. sign2_coverage(str): Path to the coverage of all signatures 2 for all molecules in the CC universe. (~1M x 25) model_confidence(bool): Whether to model confidence. That is based on standard deviation of prediction with dropout. save_correlations(bool) Whether to save the correlation (average, tertile, max) for the given input dataset (result of the evaluation). predict_novelty(bool) Whether to predict molecule novelty score. update_preds(bool): Whether to write or update the sign3.h5 normalize_scores(bool): Whether to normalize confidence scores. chunk_size(int): Chunk size when writing to sign3.h5 suffix(str): Suffix of the generated model. plots_train(bool): plotting trained models outcomes defaulted to True. it applies to train_prior_model, train_prior_signature_model, train_confidence_model """ try: from chemicalchecker.tool.siamese import SiameseTriplets except ImportError: raise ImportError("requires tensorflow https://tensorflow.org") try: import faiss except ImportError as err: raise err BaseSignature.fit(self, **kwargs) # signature specific checks if self.molset != "full": self.__log.debug("Fit will be done with the full sign3") self = self.get_molset("full") # define datasets that will be used self.update_status("Getting data") cc = self.get_cc() if sign2_list is None: sign2_list = list() for ds in cc.datasets_exemplary(): sign2_list.append(cc.get_signature('sign2', 'full', ds)) self.sign2_list = sign2_list self.src_datasets = [sign.dataset for sign in sign2_list] if sign2_self is None: sign2_self = self.get_sign('sign2') self.sign2_self = sign2_self if triplet_sign is None: triplet_sign = self.get_sign('sign1') self.triplet_sign = triplet_sign self.sign2_coverage = sign2_coverage self.dataset_idx = np.argwhere( np.isin(self.src_datasets, self.dataset)).flatten() if self.sign2_coverage is None: self.sign2_coverage = os.path.join(self.model_path, 'all_sign2_coverage.h5') if not os.path.isfile(self.sign2_coverage): sign3.save_sign2_coverage(sign2_list, self.sign2_coverage) self.__log.debug('Siamese fit %s based on %s', self.dataset, str(self.src_datasets)) # build input universe sign2 matrix if not provided # In the update pipeline this is pre-computed self.sign2_universe = sign2_universe if self.sign2_universe is None: self.sign2_universe = os.path.join(self.model_path, 'all_sign2.h5') if not os.path.isfile(self.sign2_universe): sign3.save_sign2_universe(sign2_list, self.sign2_universe) # check if molecules are missing from chemistry spaces, complete if complete_universe: if complete_universe == 'full': calc_ds_idx = [0, 1, 2, 3, 4] calc_ds_names = ['A1.001', 'A2.001', 'A3.001', 'A4.001', 
'A5.001'] if complete_universe == 'fast': calc_ds_idx = [0, 2, 3, 4] calc_ds_names = ['A1.001', 'A3.001', 'A4.001', 'A5.001'] if complete_universe == 'custom': calc_ds_idx = kwargs['calc_ds_idx'] calc_ds_names = kwargs['calc_ds_names'] res = self.complete_sign2_universe( sign2_self, self.sign2_universe, self.sign2_coverage, tmp_path=self.model_path, calc_ds_idx=calc_ds_idx, calc_ds_names=calc_ds_names) self.sign2_universe, self.sign2_coverage = res # check if performance evaluations need to be done siamese = None prior_mdl = None prior_sign_mdl = None conf_mdl = None if suffix is None: eval_model_path = os.path.join(self.model_path, 'siamese_eval') eval_file = os.path.join(eval_model_path, 'siamesetriplets.h5') if not os.path.isfile(eval_file): res = self.train_SNN( self.params['sign2'].copy(), suffix='eval', evaluate=True, plots_train=plots_train, triplets_sampler=triplets_sampler) siamese, prior_mdl, prior_sign_mdl, conf_mdl = res else: eval_model_path = os.path.join(self.model_path, 'siamese_%s' % suffix) eval_file = os.path.join(eval_model_path, 'siamesetriplets.h5') if not os.path.isfile(eval_file): res = self.train_SNN( self.params['sign2'].copy(), suffix=suffix, evaluate=True, plots_train=plots_train, triplets_sampler=triplets_sampler) siamese, prior_mdl, prior_sign_mdl, conf_mdl = res return False # check if we have the final trained model final_model_path = os.path.join(self.model_path, 'siamese_final') final_file = os.path.join(final_model_path, 'siamesetriplets.h5') if not os.path.isfile(final_file): # get the learning rate from the siamese eval siamese_eval = SiameseTriplets(eval_model_path, predict_only=True) self.params['sign2']['learning_rate'] = siamese_eval.learning_rate siamese = self.train_SNN( self.params['sign2'].copy(), suffix='final', evaluate=False, triplets_sampler=triplets_sampler) # load models if not already available if siamese is None: siamese = SiameseTriplets(final_model_path, predict_only=True) siamese_cp = SiameseTriplets(siamese.model_dir, predict_only=True) if model_confidence: # part of confidence is the priors if prior_mdl is None: prior_path = os.path.join(self.model_path, 'prior_eval') prior_file = os.path.join(prior_path, 'prior.pkl') # if prior model is not there, retrain confidence if not os.path.isfile(prior_file): siamese_eval = SiameseTriplets(eval_model_path) self.train_confidence( siamese_eval, plots_train=plots_train) prior_mdl = pickle.load(open(prior_file, 'rb')) # part of confidence is the priors based on signatures if prior_sign_mdl is None: prior_sign_path = os.path.join( self.model_path, 'prior_sign_eval') prior_sign_file = os.path.join(prior_sign_path, 'prior.pkl') prior_sign_mdl = pickle.load(open(prior_sign_file, 'rb')) # and finally the linear combination of scores confidence_path = os.path.join(self.model_path, 'confidence_eval') if conf_mdl is None: confidence_file = os.path.join( confidence_path, 'confidence.pkl') if not os.path.isfile(confidence_file): self.rerun_confidence( cc, 'eval', train=True, update_sign=False, sign2_universe=self.sign2_universe, sign2_coverage=self.sign2_coverage, plots_train=plots_train) calibration_file = os.path.join( confidence_path, 'calibration.pkl') conf_mdl = (pickle.load(open(confidence_file, 'rb')), pickle.load(open(calibration_file, 'rb'))) # another part of confidence is the applicability neig_file = os.path.join(confidence_path, 'neig.index') app_neig = faiss.read_index(neig_file) known_dist = os.path.join(confidence_path, 'known_dist.h5') app_range = 
DataSignature(known_dist).get_h5_dataset( 'applicability_range') realistic_fn, trim_mask = self.realistic_subsampling_fn() # get sorted universe inchikeys with h5py.File(self.sign2_universe, "r") as features: self.universe_inchikeys = features['keys'][:] tot_inks = len(self.universe_inchikeys) # save universe sign3 if update_preds: self.update_status("Predicting universe Sign3") with h5py.File(self.data_path, "a") as results: # initialize V and keys datasets safe_create(results, 'V', (tot_inks, 128), dtype=np.float32) safe_create(results, 'keys', data=np.array(self.universe_inchikeys, DataSignature.string_dtype())) # dataset used to generate the signature safe_create(results, 'datasets', data=np.array(self.src_datasets, DataSignature.string_dtype())) known_mask = np.isin(list(self.universe_inchikeys), list(self.sign2_self.keys), assume_unique=True) # save the mask for know/inknown safe_create(results, 'known', data=known_mask) safe_create(results, 'shape', data=(tot_inks, 128)) # the actual confidence value will be stored here safe_create(results, 'confidence', (tot_inks,), dtype=np.float32) if model_confidence: # this is to store robustness safe_create(results, 'robustness', (tot_inks,), dtype=np.float32) # this is to store applicability safe_create(results, 'applicability', (tot_inks,), dtype=np.float32) # this is to store priors safe_create(results, 'prior', (tot_inks,), dtype=np.float32) # this is to store priors based on signature safe_create(results, 'prior_signature', (tot_inks,), dtype=np.float32) if predict_novelty: safe_create(results, 'novelty', (tot_inks, ), dtype=np.float32) safe_create(results, 'outlier', (tot_inks, ), dtype=np.float32) # predict signature 3 for universe molecules with h5py.File(self.sign2_universe, "r") as features: # reference prediction (based on no information) nan_feat = np.full((1, features['x_test'].shape[1]), np.nan, dtype=np.float32) nan_pred = siamese.predict(nan_feat) # read input in chunks for idx in tqdm(range(0, tot_inks, chunk_size), desc='Predicting'): chunk = slice(idx, idx + chunk_size) feat = features['x_test'][chunk] # predict with final model preds = siamese.predict(feat) results['V'][chunk] = preds # skip modelling confidence if not required if not model_confidence: continue # save confidence natural scores # compute prior from coverage cov = ~np.isnan(feat[:, 0::128]) prior = prior_mdl.predict(cov[:, trim_mask]) results['prior'][chunk] = prior # and from prediction prior_sign = prior_sign_mdl.predict(preds) results['prior_signature'][chunk] = prior_sign # conformal prediction ints, robs, _ = self.conformal_prediction( siamese_cp, feat, nan_pred=nan_pred, dropout_fn=realistic_fn) results['robustness'][chunk] = robs # distance from known predictions app, centrality, cons = self.applicability_domain( app_neig, feat, siamese, app_range=app_range, n_samples=1) results['applicability'][chunk] = app # and estimate confidence conf_feats = np.vstack( [app, robs, prior, prior_sign, ints]).T conf_estimate = conf_mdl[0].predict(conf_feats) conf_calib = conf_mdl[1].predict( np.expand_dims(conf_estimate, 1)) results['confidence'][chunk] = conf_calib # conpute confidence where self is known known_idxs = np.argwhere(known_mask).flatten() # iterate on chunks of knowns for idx in tqdm(range(0, len(known_idxs), 10000), desc='Computing Confidence'): chunk = slice(idx, idx + 10000) feat = features['x_test'][known_idxs[chunk]] # predict with all features preds_all = siamese.predict(feat) # predict with only-self features feat_onlyself = 
mask_keep(self.dataset_idx, feat) preds_onlyself = siamese.predict(feat_onlyself) # confidence is correlation ALL vs. ONLY-SELF corrs = row_wise_correlation( preds_onlyself, preds_all, scaled=True) results['confidence'][known_idxs[chunk]] = corrs # use semi-supervised anomaly detection algorithm to predict novelty if predict_novelty: self.update_status("Predicting novelty") self.predict_novelty() # save reference self.save_reference(overwrite=True) # finalize signature BaseSignature.fit_end(self, **kwargs)
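    # A minimal usage sketch of fit() (the CC root path and the dataset code
    # are illustrative assumptions): with the defaults above, the sign2 list,
    # the stacked sign2 universe and the coverage matrix are derived
    # automatically before training the Siamese networks.
    #
    #     from chemicalchecker import ChemicalChecker
    #     cc = ChemicalChecker('/path/to/cc_root')
    #     s3 = cc.get_signature('sign3', 'full', 'B1.001')
    #     s3.fit()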
[docs]    def predict_novelty(self, retrain=False, update_sign3=True, cpu=4):
        """Model novelty score via LocalOutlierFactor (semi-supervised).

        Args:
            retrain(bool): Drop old model and train again. (default: False)
            update_sign3(bool): Write novelty scores in h5. (default: True)
            cpu(int): Number of jobs used by LocalOutlierFactor. (default: 4)
        """
        novelty_path = os.path.join(self.model_path, 'novelty')
        if not os.path.isdir(novelty_path):
            os.mkdir(novelty_path)
        novelty_model = os.path.join(novelty_path, 'lof.pkl')
        s2_inks = self.sign2_self.keys
        model = None
        if not os.path.isfile(novelty_model) or retrain:
            self.__log.debug('Training novelty score predictor')
            # fit on molecules available in sign2
            _, predicted = self.get_vectors(s2_inks, dataset_name='V')
            t0 = time()
            model = LocalOutlierFactor(
                novelty=True, metric='euclidean', n_jobs=cpu)
            model.fit(predicted)
            delta = time() - t0
            self.__log.debug('Training took: %s' % delta)
            # serialize for later
            pickle.dump(model, open(novelty_model, 'wb'))
        if update_sign3:
            self.__log.debug('Updating novelty scores')
            if model is None:
                model = pickle.load(open(novelty_model, 'rb'))
            # get scores for known molecules and pair with indexes
            s2_idxs = np.argwhere(np.isin(list(self.keys), s2_inks,
                                          assume_unique=True))
            s2_novelty = model.negative_outlier_factor_
            s2_outlier = [0] * s2_novelty.shape[0]
            assert(s2_idxs.shape[0] == s2_novelty.shape[0])
            # predict scores for other molecules and pair with indexes
            s3_inks = sorted(self.unique_keys - set(s2_inks))
            # no new prediction to add
            if len(s3_inks) == 0:
                return
            s3_idxs = np.argwhere(np.isin(list(self.keys), s3_inks,
                                          assume_unique=True))
            _, s3_pred_sign = self.get_vectors(s3_inks)
            s3_novelty = model.score_samples(s3_pred_sign)
            s3_outlier = model.predict(s3_pred_sign)
            assert(len(s3_inks) == s3_novelty.shape[0])
            ordered_scores = np.array(sorted(
                list(zip(s2_idxs.flatten(), s2_novelty, s2_outlier)) +
                list(zip(s3_idxs.flatten(), s3_novelty, s3_outlier))))
            ordered_novelty = ordered_scores[:, 1]
            ordered_outlier = ordered_scores[:, 2]
            with h5py.File(self.data_path, "r+") as results:
                if 'novelty' in results:
                    del results['novelty']
                results['novelty'] = ordered_novelty
                if 'outlier' in results:
                    del results['outlier']
                results['outlier'] = ordered_outlier
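    # The novelty model above follows scikit-learn's semi-supervised
    # LocalOutlierFactor pattern, sketched here on toy arrays: fit on the
    # sign3 vectors of known molecules, then score unseen vectors, where
    # lower score_samples values are more anomalous and predict() flags
    # outliers with -1.
    #
    #     unseen = np.random.rand(10, 128)
    #     lof = LocalOutlierFactor(novelty=True, metric='euclidean')
    #     lof.fit(np.random.rand(200, 128))    # known molecules
    #     scores = lof.score_samples(unseen)   # analogous to 'novelty'
    #     flags = lof.predict(unseen)          # analogous to 'outlier' (+1/-1)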
[docs]    def predict(self, src_file, dst_file, src_h5_ds='x_test',
                dst_h5_ds='V', model_path=None, chunk_size=1000):
        """Predict signature 3 for the molecules in a stacked sign2 h5 file.

        Args:
            src_file(str): Path to the h5 file with the stacked sign2 matrix.
            dst_file(str): Path to the h5 file where predictions are written.
            src_h5_ds(str): Name of the input h5 dataset. (default: 'x_test')
            dst_h5_ds(str): Name of the output h5 dataset. (default: 'V')
            model_path(str): Directory of the trained Siamese model; when
                None, 'siamese_debug' under the signature model path is used.
            chunk_size(int): Number of molecules predicted per chunk.
        """
        try:
            from chemicalchecker.tool.siamese import SiameseTriplets
        except ImportError:
            raise ImportError("requires tensorflow https://tensorflow.org")
        if model_path is None:
            model_path = os.path.join(self.model_path, 'siamese_debug')
        self.__log.info('PREDICTING using model from: %s' % model_path)
        self.__log.info('INPUT from: %s' % src_file)
        self.__log.info('OUTPUT goes to: %s' % dst_file)
        siamese = SiameseTriplets(model_path, predict_only=True)
        with h5py.File(src_file, "r") as features:
            with h5py.File(dst_file, "w") as preds:
                # create destination h5 dataset
                tot_inks = features[src_h5_ds].shape[0]
                preds_shape = (tot_inks, 128)
                preds.create_dataset(dst_h5_ds, preds_shape, dtype=np.float32)
                if 'keys' in features:
                    preds.create_dataset('keys', data=features['keys'])
                # predict in chunks
                for idx in tqdm(range(0, tot_inks, chunk_size), desc='PRED'):
                    chunk = slice(idx, idx + chunk_size)
                    feat = features[src_h5_ds][chunk]
                    preds[dst_h5_ds][chunk] = siamese.predict(feat)
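    # A hedged usage sketch of predict() (paths are placeholders; it assumes
    # a ChemicalChecker instance `cc` and a 'siamese_final' model trained by
    # a previous fit()):
    #
    #     s3 = cc.get_signature('sign3', 'full', 'B1.001')
    #     s3.predict('/path/to/stacked_sign2.h5', '/path/to/sign3_pred.h5',
    #                model_path=os.path.join(s3.model_path, 'siamese_final'))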
[docs]def safe_create(h5file, *args, **kwargs):
    """Create an h5 dataset only if it does not exist yet."""
    if args[0] not in h5file:
        h5file.create_dataset(*args, **kwargs)
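# safe_create is an idempotent wrapper around h5py's create_dataset, so
# re-entering fit() does not fail on datasets written by a previous run.
# A tiny sketch (hypothetical file name):
#
#     with h5py.File('example.h5', 'a') as h5:
#         safe_create(h5, 'V', (10, 128), dtype=np.float32)
#         safe_create(h5, 'V', (10, 128), dtype=np.float32)  # no-op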
[docs]def mask_keep(idxs, x1_data):
    # start from an all-NaN matrix and copy over the spaces we want to keep
    x1_data_transf = np.zeros_like(x1_data, dtype=np.float32) * np.nan
    for idx in idxs:
        # copy the column block of the selected space from the original data
        col_slice = slice(idx * 128, (idx + 1) * 128)
        x1_data_transf[:, col_slice] = x1_data[:, col_slice]
    # every other space is left as NaN
    return x1_data_transf


[docs]def mask_exclude(idxs, x1_data):
    x1_data_transf = np.copy(x1_data)
    for idx in idxs:
        # set the column block of the excluded space to NaN
        col_slice = slice(idx * 128, (idx + 1) * 128)
        x1_data_transf[:, col_slice] = np.nan
    # all other spaces are returned untouched
    return x1_data_transf
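# A toy sketch (random data) of how the two masks above complement each
# other on a stacked matrix where each dataset occupies a contiguous block
# of 128 columns:
#
#     x = np.random.rand(4, 2 * 128).astype(np.float32)  # 4 molecules, 2 spaces
#     only_first = mask_keep([0], x)        # columns 128..255 become NaN
#     without_first = mask_exclude([0], x)  # columns 0..127 become NaN
#     assert np.all(np.isnan(only_first[:, 128:]))
#     assert np.all(np.isnan(without_first[:, :128]))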
[docs]def col_wise_correlation(X, Y, scaled=False):
    if scaled:
        X = robust_scale(X)
        Y = robust_scale(Y)
    return row_wise_correlation(X.T, Y.T)


[docs]def row_wise_correlation(X, Y, scaled=False):
    if scaled:
        X = robust_scale(X)
        Y = robust_scale(Y)
    var1 = (X.T - np.mean(X, axis=1)).T
    var2 = (Y.T - np.mean(Y, axis=1)).T
    cov = np.mean(var1 * var2, axis=1)
    return cov / (np.std(X, axis=1) * np.std(Y, axis=1))
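# Without scaling, row_wise_correlation is the per-row Pearson correlation,
# as this small sanity check on toy arrays illustrates:
#
#     X, Y = np.random.rand(5, 128), np.random.rand(5, 128)
#     ours = row_wise_correlation(X, Y)
#     ref = np.array([pearsonr(x, y)[0] for x, y in zip(X, Y)])
#     assert np.allclose(ours, ref)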
[docs]def linear_pow2(val, slope=0.33, intercept=4.):
    flat_log_size = int(np.floor(val * slope))
    return np.power(2, int(flat_log_size + intercept))


[docs]def inverse_linear_pow2(val, slope=100, intercept=4.):
    flat_log_size = int(np.floor(slope / val))
    return np.power(2, int(flat_log_size + intercept))


[docs]def layer_size_heuristic(samples, features, clip=(512, 2048)):
    log_size = np.log(samples * features)
    pow2 = linear_pow2(log_size, slope=0.3, intercept=5.)
    return np.clip(pow2, *clip)


[docs]def batch_size_heuristic(samples, features, clip=(32, 256)):
    log_size = np.log(samples * features)
    pow2 = linear_pow2(log_size, slope=0.5, intercept=-2.)
    return np.clip(pow2, *clip)


[docs]def epoch_per_iteration_heuristic(samples, features, clip=(16, 1024)):
    log_size = np.log(samples * features)
    pow2 = inverse_linear_pow2(log_size, slope=270, intercept=-8)
    return np.clip(pow2, *clip)
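# A worked example of the sizing heuristics above: for roughly 1e5 training
# samples with 3200 stacked features, log(samples * features) is about 19.6,
# so the helpers suggest a 1024-unit layer, a batch size of 128 and 32
# epochs per iteration, each clipped to its allowed power-of-two range.
#
#     layer_size_heuristic(100000, 3200)           # 1024
#     batch_size_heuristic(100000, 3200)           # 128
#     epoch_per_iteration_heuristic(100000, 3200)  # 32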
[docs]def subsampling_probs(sign2_coverage, dataset_idx, trim_threshold=0.1,
                      min_unknown=10000):
    """Extract probabilities for known and unknown of a given dataset."""
    if type(sign2_coverage) == str:
        with h5py.File(sign2_coverage, "r") as cov_matrix:
            cov = cov_matrix['x_test'][:]
    else:
        cov = sign2_coverage
    max_ds = cov.shape[1]
    unknown = cov[(cov[:, dataset_idx] == 0).ravel()]
    known = cov[(cov[:, dataset_idx] == 1).ravel()]
    if unknown.shape[0] < min_unknown:
        unknown = known[:min_unknown]
    # decide which spaces are frequent enough in the known set
    # (these are the ones used for training)
    trim_mask = (np.sum(known, axis=0) / known.shape[0]) > trim_threshold

    def compute_probs(coverage, max_nr=25):
        # how many datasets per molecule?
        nrs, freq_nrs = np.unique(
            np.sum(coverage, axis=1).astype(int), return_counts=True)
        # frequency based probabilities
        p_nrs = freq_nrs / coverage.shape[0]
        # add a minimum probability (corner cases where 1 or 2 datasets are used)
        min_p_nr = np.full(max_nr + 1, min(p_nrs), dtype=np.float32)
        for nr, p_nr in zip(nrs, p_nrs):
            min_p_nr[nr] = p_nr
        # but leave out too large nrs
        min_p_nr[max(nrs) + 1:] = 0.0
        min_p_nr[0] = 0.0
        # normalize (sum of probabilities must be one)
        min_p_nr = min_p_nr / np.sum(min_p_nr)
        # print(np.log10(min_p_nr + 1e-10).astype(int))
        # probability of keeping each dataset
        p_keep = np.sum(coverage, axis=0) / coverage.shape[0]
        return min_p_nr, p_keep

    p_nr_known, p_keep_known = compute_probs(known[:, trim_mask],
                                             max_nr=max_ds)
    unknown[:, dataset_idx] = 0
    p_nr_unknown, p_keep_unknown = compute_probs(unknown[:, trim_mask],
                                                 max_nr=max_ds)
    return trim_mask, p_nr_unknown, p_keep_unknown, p_nr_known, p_keep_known
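# A toy call (fake 6x4 coverage matrix, lowered min_unknown) showing the
# returned pieces: the mask of spaces frequent enough to train on, plus
# dataset-count and dataset-keep probabilities for unknown and known
# molecules with respect to the space at column index 1.
#
#     cov = np.array([[1, 1, 0, 1], [1, 0, 1, 1], [1, 1, 1, 0],
#                     [0, 1, 0, 1], [1, 1, 1, 1], [1, 0, 0, 1]], dtype=float)
#     res = subsampling_probs(cov, np.array([1]), min_unknown=2)
#     trim_mask, p_nr_unk, p_keep_unk, p_nr_kn, p_keep_kn = res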
[docs]def subsample(tensor, sign_width=128, p_nr=(np.array([1 / 25.] * 25), np.array([1 / 25.] * 25)), p_keep=(np.array([1 / 25.] * 25), np.array([1 / 25.] * 25)), p_only_self=0.0, p_self=0.1, dataset_idx=[0], **kwargs): """Function to subsample stacked data.""" # it is safe to make a local copy of the input matrix new_data = np.copy(tensor) # we will have a masking matrix at the end mask = np.zeros_like(new_data).astype(bool) # if new_data.shape[1] % sign_width != 0: # raise Exception('All signature should be of length %i.' % sign_width) for idx, row in enumerate(new_data): # the following assumes the stacked signature have a fixed width presence = ~np.isnan(row[0::sign_width]) # case where we show only the space itself if np.random.rand() < p_only_self: presence_add = np.full_like(presence, False) presence_add[dataset_idx] = True mask[idx] = np.repeat(presence_add, sign_width) continue # decide to use probabilities from known or unknown if np.random.rand() < p_self: p_nr_curr = p_nr[0] p_keep_curr = p_keep[0] else: p_nr_curr = p_nr[1] p_keep_curr = p_keep[1] # datasets that I can select present_idxs = np.argwhere(presence).flatten() # how many dataset at most? max_add = present_idxs.shape[0] # normalize nr dataset probabilities p_nr_row = p_nr_curr[1:max_add] / np.sum(p_nr_curr[1:max_add]) # how many dataset are we keeping? try: nr_keep = np.random.choice(np.arange(1, len(p_nr_row) + 1), p=p_nr_row) except Exception: nr_keep = 1 # normalize dataset keep probabilities p_keep_row = (p_keep_curr[presence] + 1e-10) / \ (np.sum(p_keep_curr[presence]) + 1e-10) nr_keep = np.min([nr_keep, np.sum(p_keep_row > 0)]) # which ones? to_add = np.random.choice( present_idxs, nr_keep, p=p_keep_row, replace=False) # dataset mask presence_add = np.zeros(presence.shape).astype(bool) presence_add[to_add] = True # from dataset mask to signature mask mask[idx] = np.repeat(presence_add, sign_width) # make masked dataset NaN new_data[~mask] = np.nan return new_data
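# An illustrative call to subsample with the default (uniform) probabilities
# on a toy tensor of 8 molecules x 25 stacked 128-d signatures: each row
# keeps a random non-empty subset of the datasets and the rest is masked to
# NaN, mimicking molecules with partial CC coverage.
#
#     tensor = np.random.rand(8, 25 * 128).astype(np.float32)
#     sub = subsample(tensor, dataset_idx=[0])
#     kept = np.sum(~np.isnan(sub[:, ::128]), axis=1)  # datasets left per row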
[docs]def plot_subsample(sign, plot_file, sign2_coverage, traintest_file, ds='B1.001', p_self=0.1, p_only_self=0.0, limit=10000, sign2_list=None): """ Validation plot for the subsampling procedure.""" import numpy as np import pandas as pd import seaborn as sns import matplotlib.pyplot as plt from chemicalchecker import ChemicalChecker cc = sign.get_cc() if sign2_list is not None: sign2_ds_list = [s.dataset for s in sign2_list] else: sign2_ds_list = list(cc.datasets_exemplary()) max_ds = len(sign2_list) # get triplet generator dataset_idx = np.argwhere( np.isin(sign2_ds_list, ds)).flatten() res = subsampling_probs(sign2_coverage, dataset_idx) trim_mask, p_nr_unknown, p_keep_unknown, p_nr_known, p_keep_known = res trim_dataset_idx = np.argwhere(np.arange(len(trim_mask))[ trim_mask] == dataset_idx).ravel()[0] augment_kwargs = { 'p_nr': (p_nr_unknown, p_nr_known), 'p_keep': (p_keep_unknown, p_keep_known), 'dataset_idx': [trim_dataset_idx], 'p_only_self': 0.0} realistic_fn, trim_mask = sign.realistic_subsampling_fn() tr_shape_type_gen = TripletIterator.generator_fn( traintest_file, 'train_train', batch_size=10, train=True, augment_fn=realistic_fn, augment_kwargs=augment_kwargs, trim_mask=trim_mask, onlyself_notself=True) tr_gen = tr_shape_type_gen[2] # get known unknown with h5py.File(sign2_coverage, "r") as cov_matrix: cov = cov_matrix['x_test'][:] unknown = cov[(cov[:, dataset_idx] == 0).flatten()] known = cov[(cov[:, dataset_idx] == 1).flatten()] # get dataset probabilities probs_ds = { 'space': np.array([d[:2] for d in sign2_ds_list])[trim_mask], 'p_keep_known': p_keep_known, 'p_keep_unknown': p_keep_unknown} df_probs_ds = pd.DataFrame(probs_ds) df_probs_ds = df_probs_ds.melt(id_vars=['space']) df_probs_ds['probabilities'] = df_probs_ds['value'] # get nr probabilities nnrs, freq_nrs = np.unique( np.sum(unknown, axis=1).astype(int), return_counts=True) unknown_nr = np.zeros((max_ds + 1,)) unknown_nr[nnrs] = freq_nrs nnrs, freq_nrs = np.unique( np.sum(known, axis=1).astype(int), return_counts=True) known_nr = np.zeros((max_ds + 1,)) known_nr[nnrs] = freq_nrs probs_nr = { 'nr_ds': np.arange(max_ds + 1), 'p_nr_known': p_nr_known, 'p_nr_unknown': p_nr_unknown} # == p_nr df_probs_nr = pd.DataFrame(probs_nr) df_probs_nr = df_probs_nr.melt(id_vars=['nr_ds']) df_probs_nr['probabilities'] = df_probs_nr['value'] # get sampled dataset presence counts ds_a = np.zeros((max_ds,))[trim_mask] ds_p = np.zeros((max_ds,))[trim_mask] ds_n = np.zeros((max_ds,))[trim_mask] ds_o = np.zeros((max_ds,))[trim_mask] ds_ns = np.zeros((max_ds,))[trim_mask] batch = 0 for (a, p, n, o, ns), y in tr_gen(): ds_a += np.sum(~np.isnan(a[:, ::128]), axis=0) ds_p += np.sum(~np.isnan(p[:, ::128]), axis=0) ds_n += np.sum(~np.isnan(n[:, ::128]), axis=0) ds_o += np.sum(~np.isnan(o[:, ::128]), axis=0) ds_ns += np.sum(~np.isnan(ns[:, ::128]), axis=0) batch += 1 if batch == 1000: break trimmed_ds = np.array(sign2_ds_list)[trim_mask] sampled_ds = { 'space': np.array([d[:2] for d in trimmed_ds]), 'anchor': ds_a, 'positive': ds_p, 'negative': ds_n, 'only-self': ds_o, 'not-self': ds_ns} df_sampled_ds = pd.DataFrame(sampled_ds) df_sampled_ds = df_sampled_ds.melt(id_vars=['space']) df_sampled_ds['sampled'] = df_sampled_ds['value'] # get sampled nr dataset nr_a = np.zeros((max_ds + 1,)) nr_p = np.zeros((max_ds + 1,)) nr_n = np.zeros((max_ds + 1,)) nr_o = np.zeros((max_ds + 1,)) nr_ns = np.zeros((max_ds + 1,)) batch = 0 for (a, p, n, o, ns), y in tr_gen(): nr_batch_a = np.sum(~np.isnan(a[:, ::128]), axis=1).astype(int) nnrs, freq_nrs = 
np.unique(nr_batch_a, return_counts=True) nr_a[nnrs] += freq_nrs nr_batch_p = np.sum(~np.isnan(p[:, ::128]), axis=1).astype(int) nnrs, freq_nrs = np.unique(nr_batch_p, return_counts=True) nr_p[nnrs] += freq_nrs nr_batch_n = np.sum(~np.isnan(n[:, ::128]), axis=1).astype(int) nnrs, freq_nrs = np.unique(nr_batch_n, return_counts=True) nr_n[nnrs] += freq_nrs nr_batch_o = np.sum(~np.isnan(o[:, ::128]), axis=1).astype(int) nnrs, freq_nrs = np.unique(nr_batch_o, return_counts=True) nr_o[nnrs] += freq_nrs nr_batch_ns = np.sum(~np.isnan(ns[:, ::128]), axis=1).astype(int) nnrs, freq_nrs = np.unique(nr_batch_ns, return_counts=True) nr_ns[nnrs] += freq_nrs batch += 1 if batch == 1000: break sampled_nr = { 'nr_ds': np.arange(max_ds + 1), 'anchor': nr_a, 'positive': nr_p, 'negative': nr_n, 'only-self': nr_o, 'not-self': nr_ns} df_sampled_nr = pd.DataFrame(sampled_nr) df_sampled_nr = df_sampled_nr.melt(id_vars=['nr_ds']) df_sampled_nr['sampled'] = df_sampled_nr['value'] # plot fig = plt.figure(figsize=(24, 12)) gs = fig.add_gridspec(2, 2) ax = fig.add_subplot(gs[0, 0]) sns.barplot(x="space", y="probabilities", hue='variable', data=df_probs_ds, ax=ax) ax = fig.add_subplot(gs[0, 1]) sns.barplot(x="nr_ds", y="probabilities", hue='variable', data=df_probs_nr, ax=ax) ax = fig.add_subplot(gs[1, 0]) sns.barplot(x="space", y="sampled", hue='variable', data=df_sampled_ds, ax=ax, palette=['gold', 'forestgreen', 'crimson', 'royalblue', 'k']) ax = fig.add_subplot(gs[1, 1]) sns.barplot(x="nr_ds", y="sampled", hue='variable', data=df_sampled_nr, ax=ax, palette=['gold', 'forestgreen', 'crimson', 'royalblue', 'k']) plt.tight_layout() plt.savefig(plot_file) plt.close()