# Source code for chemicalchecker.core.sign4

"""Signature type 4.

Fixed-length (e.g. 128-d) representation of the data, generalizing signature 3
to unseen molecules. Signatures type 4 are available for any molecule of
interest and have a confidence/applicability measure assigned to them.
"""
import os
import h5py
import numpy as np
from tqdm import tqdm

from .signature_base import BaseSignature
from .signature_data import DataSignature

from chemicalchecker.util import logged


@logged
class sign4(BaseSignature, DataSignature):
    """Signature type 4 class.

    Fixed-length representation of the data, generalizing signature 3 to
    unseen molecules via models trained on Morgan fingerprints.
    """

    def __init__(self, signature_path, dataset, **params):
        """Initialize a Signature.

        Args:
            signature_path(str): The signature root directory.
            dataset(`Dataset`): `chemicalchecker.database.Dataset` object.
            params(): Parameters, expected keys are:
                * 'sign0_params' for learning based on sign0 (Morgan
                  Fingerprint)
                * 'sign0_conf_params' for learning confidences based on MFP
        """
        # Calling init on the base class to trigger file existence checks
        BaseSignature.__init__(self, signature_path, dataset, **params)
        self.data_path = os.path.join(self.signature_path, 'sign4.h5')
        DataSignature.__init__(self, self.data_path)
        # defaults for the sign0 -> sign3 regressor
        sign0_defaults = {
            'epochs': 30,
            'cpu': 8,
            'learning_rate': 1e-3,
            'layers': ['Dense', 'Dense', 'Dense', 'Dense'],
            'layers_sizes': [1024, 512, 256, 128],
            'activations': ['relu', 'relu', 'relu', 'tanh'],
            'dropouts': [0.1, 0.1, 0.1, None],
        }
        # defaults for the sign0 -> confidence regressor
        conf_defaults = {
            'epochs': 30,
            'cpu': 8,
            'learning_rate': 1e-3,
            'layers': ['Dense', 'Dense', 'Dense', 'Dense'],
            'layers_sizes': [1024, 512, 256, 1],
            'activations': ['relu', 'relu', 'relu', 'linear'],
            'dropouts': [0.5, 0.2, 0.2, None],
        }
        # user-provided parameters override the defaults key by key
        self.params = {
            'sign0': {**sign0_defaults, **params.get('sign0_params', {})},
            'sign0_conf': {**conf_defaults,
                           **params.get('sign0_conf_params', {})},
        }
        # lazily-loaded matrices, see the *_vectors properties below
        self._sign0_V = None
        self._sign3_V = None

    @property
    def shared_keys(self):
        """Sorted keys present in both sign0 and sign3."""
        common = self.sign0.unique_keys & self.sign3.unique_keys
        return sorted(common)

    @property
    def sign0_vectors(self):
        """Matrix of sign0 vectors for the shared keys (loaded once)."""
        if self._sign0_V is not None:
            return self._sign0_V
        self.__log.debug("Reading sign0, this should only be loaded once.")
        _, self._sign0_V = self.sign0.get_vectors(self.shared_keys)
        # make sure the order of features is correct
        if 'features' in self.sign0.info_h5:
            order = np.argsort(
                self.sign0.get_h5_dataset('features').astype(int))
            self._sign0_V = self._sign0_V[:, order]
        self.__log.debug("sign0 shape: %s" % str(self._sign0_V.shape))
        return self._sign0_V

    @property
    def sign3_vectors(self):
        """Matrix of sign3 vectors for the shared keys (loaded once)."""
        if self._sign3_V is not None:
            return self._sign3_V
        self.__log.debug("Reading sign3, this should only be loaded once.")
        _, self._sign3_V = self.sign3.get_vectors(self.shared_keys)
        self.__log.debug("sign3 shape: %s" % str(self._sign3_V.shape))
        return self._sign3_V
    def learn_sign0(self, sign0, sign3, params, suffix=None, evaluate=True):
        """Learn to predict signature 3 from sign0 (Morgan fingerprint).

        This method is used twice. First to evaluate the performances of the
        model. Second to train the final model on the full set of data.

        Args:
            sign0(sign0): Signature 0 object to learn from.
                NOTE(review): not read directly here — the input matrix is
                taken from ``self.sign0_vectors``; confirm with callers.
            sign3(sign3): Signature 3 object providing the regression targets.
                NOTE(review): not read directly here — targets come from
                ``self.sign3_vectors``; confirm with callers.
            params(dict): Dictionary with algorithm parameters, passed
                through to the `Smilespred` constructor.
            suffix(str): A suffix for the model path (e.g.
                'models/smiles_<suffix>').
            evaluate(bool): Whether we are performing a train-test split and
                evaluating the performances (N.B. this is required for
                complete confidence scores).

        Raises:
            ImportError: If tensorflow (required by `Smilespred`) is missing.
        """
        try:
            from chemicalchecker.tool.smilespred import Smilespred
        except ImportError:
            raise ImportError("requires tensorflow https://tensorflow.org")
        # get params and set folder
        model_path = os.path.join(self.model_path, 'smiles_%s' % suffix)
        if not os.path.isdir(model_path):
            os.makedirs(model_path)
        # initialize model and start learning
        smpred = Smilespred(
            model_dir=model_path,
            sign0=self.sign0_vectors,
            sign3=self.sign3_vectors,
            evaluate=evaluate,
            **params)
        self.__log.debug('Smiles pred training on %s' % model_path)
        smpred.fit()
        self.smiles_predictor = smpred
        self.__log.debug('model saved to %s' % model_path)
        if evaluate:
            smpred.evaluate()
    def learn_sign0_conf(self, sign0, sign3, params, reuse=True, suffix=None,
                         evaluate=True):
        """Learn the signature 3 applicability from sign0.

        This method is used twice. First to evaluate the performances of the
        model. Second to train the final model on the full set of data.

        Args:
            sign0(sign0): Signature 0 object to learn from.
                NOTE(review): not read directly here — the input matrix is
                taken from ``self.sign0_vectors``; confirm with callers.
            sign3(sign3): Signature 3 object.
                NOTE(review): not read directly here — applicability targets
                come from ``self.sign3`` 'confidence' dataset; confirm.
            params(dict): Dictionary with algorithm parameters, passed
                through to the `ApplicabilityPredictor` constructor.
            reuse(bool): Whether to reuse an already-trained model. Forced to
                False when the model directory does not exist yet.
                NOTE(review): when the directory pre-exists, `fit()` is
                skipped even if no model file is inside it — confirm this is
                intended.
            suffix(str): A suffix for the model path (e.g.
                'models/smiles_applicability_<suffix>').
            evaluate(bool): Whether we are performing a train-test split and
                evaluating the performances (N.B. this is required for
                complete confidence scores).

        Raises:
            ImportError: If tensorflow (required by `ApplicabilityPredictor`)
                is missing.
        """
        try:
            from chemicalchecker.tool.smilespred import ApplicabilityPredictor
        except ImportError:
            raise ImportError("requires tensorflow https://tensorflow.org")
        # get params and set folder
        model_path = os.path.join(self.model_path,
                                  'smiles_applicability_%s' % suffix)
        if not os.path.isdir(model_path):
            reuse = False
            os.makedirs(model_path)
        # applicability targets: the 'confidence' dataset of sign3, flattened
        _, sign3_app_V = self.sign3.get_vectors(self.shared_keys,
                                                dataset_name='confidence')
        sign3_app_V = sign3_app_V.ravel()
        # initialize model and start learning
        apppred = ApplicabilityPredictor(
            model_dir=model_path,
            sign0=self.sign0_vectors,
            applicability=sign3_app_V,
            evaluate=evaluate,
            **params)
        self.__log.debug('Applicability pred training on %s' % model_path)
        if not reuse:
            apppred.fit()
        self.smiles_predictor = apppred
        self.__log.debug('model saved to %s' % model_path)
        if evaluate:
            apppred.evaluate()
    def fit(self, sign0=None, sign3=None, suffix=None,
            include_confidence=True, only_confidence=False, **kwargs):
        """Fit signature 4 from Morgan Fingerprint.

        This method is fitting a model that uses Morgan fingerprint as
        features to predict signature 3. In future other featurization
        approaches can be tested.

        Args:
            sign0(sign0): Signature 0 object (MFP, i.e. sign0 of A1.001);
                fetched from the CC when None.
            sign3(sign3): Signature 3 object providing the targets; fetched
                from the same dataset when None.
            suffix(str): When given, only the evaluation model with this
                suffix is trained and the method returns False early.
            include_confidence(bool): Whether to also train the
                confidence/applicability predictor.
            only_confidence(bool): Whether to only train the additional
                regressor exclusively devoted to confidence.

        Returns:
            False when `suffix` is given (early exit after the evaluation
            model); otherwise None after the full fit completes.
        """
        BaseSignature.fit(self, **kwargs)
        # signature specific checks: always operate on the 'full' molset
        if self.molset != "full":
            self.__log.debug("Fit will be done for the full sign4")
            # NOTE(review): rebinds the local `self` to the full-molset
            # signature; the object the caller holds is not mutated
            self = self.get_molset("full")
        if sign3 is None:
            sign3 = self.get_sign('sign3').get_molset("full")
        if sign0 is None:
            sign0 = self.get_cc().signature('A1.001', 'sign0')
        if sign0.molset != "full":
            self.__log.debug("Fit will be done using full sign0")
            sign0 = sign0.get_molset("full")
        self.sign0 = sign0
        if sign3.molset != "full":
            self.__log.debug("Fit will be done using full sign3")
            sign3 = sign3.get_molset("full")
        self.sign3 = sign3
        if sign0.shape[0] != sign3.shape[0]:
            self.__log.warning("sign3 and MFP do not have the same nr of "
                               "molecules. This might give bad sign0 recap.")
        # check if performance evaluations need to be done
        if not only_confidence:
            self.update_status("Training SMILES-based signature predictor")
            if suffix is not None:
                # custom-suffix run: train eval model only and stop here
                self.learn_sign0(sign0, sign3, self.params['sign0'].copy(),
                                 suffix=suffix, evaluate=True)
                return False
            else:
                self.learn_sign0(sign0, sign3, self.params['sign0'].copy(),
                                 suffix='eval', evaluate=True)
            # check if we have the final trained model
            self.update_status("Fitting final SMILES model")
            self.learn_sign0(sign0, sign3, self.params['sign0'].copy(),
                             suffix='final', evaluate=False)
        if include_confidence:
            self.update_status("Training SMILES-based confidence predictor")
            if suffix is not None:
                # custom-suffix run: train eval model only and stop here
                self.learn_sign0_conf(
                    sign0, sign3, self.params['sign0_conf'].copy(),
                    suffix=suffix, evaluate=True)
                return False
            else:
                # skip eval training if the model file is already there
                dest_file = os.path.join(
                    self.model_path, 'smiles_applicability_eval',
                    'applicabilitypredictor.h5')
                if not os.path.isfile(dest_file):
                    self.learn_sign0_conf(
                        sign0, sign3, self.params['sign0_conf'].copy(),
                        suffix='eval', evaluate=True)
            # check if we have the final trained model
            dest_file = os.path.join(self.model_path,
                                     'smiles_applicability_final',
                                     'applicabilitypredictor.h5')
            if not os.path.isfile(dest_file):
                self.update_status("Fitting final confidence model")
                self.learn_sign0_conf(
                    sign0, sign3, self.params['sign0_conf'].copy(),
                    suffix='final', evaluate=False)
        # predict for CC universe
        self.update_status("Predicting for CC universe")
        self.predict_from_sign0(sign0, self.data_path)
        # save reference
        self.save_reference(overwrite=True)
        # finalize signature
        BaseSignature.fit_end(self, **kwargs)
def get_predict_fn(self, model='smiles_final'): try: from chemicalchecker.tool.smilespred import Smilespred except ImportError as err: raise err model_path = os.path.join(self.model_path, model) model = Smilespred(model_path, save_params=False) return model.predict def get_applicability_predict_fn(self, model='smiles_applicability_final'): try: from chemicalchecker.tool.smilespred import ApplicabilityPredictor except ImportError as err: raise err model_path = os.path.join(self.model_path, model) model = ApplicabilityPredictor(model_path, save_params=False) return model.predict def predict_from_smiles(self, smiles, dest_file, **kwargs): return self.predict_from_string(smiles, dest_file, keytype='SMILES', **kwargs) def predict_from_sign0(self, sign0, dest_file, chunk_size=1000, y_order=None, **kwargs): # load NN predict_fn = self.get_predict_fn() appl_fn = self.get_applicability_predict_fn() # we return a simple DataSignature object (basic HDF5 access) pred_s3 = DataSignature(dest_file) # load novelty model for more accurate novelty scores (slower) with h5py.File(dest_file, "w") as results: results.create_dataset('keys', data=np.array( sign0.keys, DataSignature.string_dtype())) results.create_dataset( 'applicability', (len(sign0.keys), 1), dtype=np.float32) results.create_dataset( 'V', (len(sign0.keys), 128), dtype=np.float32) results.create_dataset("shape", data=(len(sign0.keys), 128)) # sign0 reorder if y_order is None: y_order = np.arange(2048) if 'features' in sign0.info_h5: y_order = np.argsort(sign0.get_h5_dataset('features').astype(int)) cs = chunk_size for chunk, rows in sign0.chunk_iter('V', cs, axis=0, chunk=True): rows = rows[:, y_order] preds = predict_fn(rows) # save chunk to H5 results['V'][chunk] = preds[:] # also run applicability prediction apreds = appl_fn(rows) results['applicability'][chunk] = apreds[:] return pred_s3
[docs] def predict_from_string(self, molecules, dest_file, keytype='SMILES', chunk_size=1000, predict_fn=None, keys=None, components=128, applicability=True, y_order=None): """Given molecuel string, generate MFP and predict sign3. Args: molecules(list): A list of molecules strings. dest_file(str): File where to save the predictions. keytype(str): Wether to interpret molecules as InChI or SMILES. Returns: pred_s3(DataSignature): The predicted signatures as DataSignature object. """ try: from rdkit import Chem from rdkit.Chem import AllChem except ImportError as err: raise err # input must be a list, otherwise we make it so if isinstance(molecules, str): molecules = [molecules] # reorder as sign0 A1 or leave it as is if y_order is None: y_order = np.arange(2048) # convert input molecules to InChI inchies = list() if keytype.upper() == 'SMILES': for smi in molecules: if smi == '': smi = 'INVALID SMILES' mol = Chem.MolFromSmiles(smi) if mol is None: self.__log.warning( "Cannot get molecule from SMILES: %s." 
% smi) inchies.append('INVALID SMILES') continue inchi = Chem.rdinchi.MolToInchi(mol)[0] self.__log.debug('CONVERTED: %s %s', smi, inchi) inchies.append(inchi) elif keytype.upper() == 'INCHI': inchies = molecules else: raise Exception('Keytype not recognized') # load NN if predict_fn is None: predict_fn = self.get_predict_fn() if applicability: appl_fn = self.get_applicability_predict_fn() # we return a simple DataSignature object (basic HDF5 access) pred_s3 = DataSignature(dest_file) # load novelty model for more accurate novelty scores (slower) with h5py.File(dest_file, "w") as results: # initialize V (with NaN in case of failing rdkit) and smiles keys results.create_dataset('molecules', data=np.array( molecules, DataSignature.string_dtype())) if keys is not None: results.create_dataset('keys', data=np.array( keys, DataSignature.string_dtype())) else: results.create_dataset('keys', data=np.array( molecules, DataSignature.string_dtype())) if applicability: results.create_dataset( 'applicability', (len(molecules), 1), dtype=np.float32) results.create_dataset( 'V', (len(molecules), components), dtype=np.float32) results.create_dataset("shape", data=(len(molecules), components)) # compute sign0 (i.e. 
Morgan fingerprint) nBits = 2048 radius = 2 # predict by chunk for i in tqdm(range(0, len(molecules), chunk_size)): chunk = slice(i, i + chunk_size) sign0s = list() failed = list() for idx, inchi in enumerate(inchies[chunk]): try: # read molecules inchi = inchi.encode('ascii', 'ignore') mol = Chem.inchi.MolFromInchi(inchi) if mol is None: raise Exception("Cannot get molecule from string.") info = {} fp = AllChem.GetMorganFingerprintAsBitVect( mol, radius, nBits=nBits, bitInfo=info) bin_s0 = [fp.GetBit(i) for i in range(fp.GetNumBits())] calc_s0 = np.array(bin_s0).astype(np.float32) except Exception as err: # in case of failure append a NaN vector self.__log.warn("%s: %s", inchi, str(err)) failed.append(idx) calc_s0 = np.full((nBits, ), np.nan) finally: sign0s.append(calc_s0) # stack input signatures and generate predictions sign0s = np.vstack(sign0s)[:, y_order] preds = predict_fn(sign0s) # add NaN when SMILES conversion failed if failed: preds[np.array(failed)] = np.full( (components, ), np.nan) # save chunk to H5 results['V'][chunk] = preds[:, :components] # also run applicability prediction if applicability: apreds = appl_fn(sign0s) if failed: apreds[np.array(failed)] = np.full( (1, ), np.nan) results['applicability'][chunk] = apreds[:] return pred_s3