"""Calculate data for molecules.
Each calc function here is iterating on a list of InChIKeys of standardised
molecules.
Each molecule is loaded, and properties/descriptors/features are computed.
The raw features are yielded in chunks as dictionaries.
These methods are used to populate the
:mod:`~chemicalchecker.database.calcdata`
database where the table has the same name as functions defined here.
"""
import os
import numpy as np
from chemicalchecker.util import logged
from chemicalchecker.util.decorator import timeout
[docs]@logged
class DataCalculator():
"""DataCalculator class."""
[docs] @staticmethod
def calc_fn(function):
"""Serve a calc function."""
try:
return eval('DataCalculator.' + function)
except Exception as ex:
DataCalculator.__log.error(
"Cannot find calculator function %s", function)
raise ex
@staticmethod
def morgan_fp_r2_2048(inchikey_inchi, chunks=1000, dense=True):
try:
from rdkit.Chem import AllChem as Chem
except ImportError:
raise ImportError("requires rdkit " +
"https://www.rdkit.org/")
iks = inchikey_inchi.keys()
nBits = 2048
radius = 2
chunk = list()
for ik in iks:
if ik is None:
continue
v = str(inchikey_inchi[ik])
try:
mol = Chem.rdinchi.InchiToMol(v)[0]
except Exception as ex:
DataCalculator.__log.debug("Skipping molecule %s" % ik)
DataCalculator.__log.debug(str(ex))
continue
info = {}
# print mol
fp = Chem.GetMorganFingerprintAsBitVect(
mol, radius, nBits=nBits, bitInfo=info)
if dense:
dense = ",".join("%d" % s for s in sorted(
[x for x in fp.GetOnBits()]))
result = {
"inchikey": ik,
"raw": dense
}
else:
result = {
"inchikey": ik,
"raw": np.array(fp)
}
chunk.append(result)
if len(chunk) == chunks:
yield chunk
chunk = list()
yield chunk
@staticmethod
def e3fp_3conf_1024(inchikey_inchi, chunks=1000):
try:
from rdkit.Chem import AllChem as Chem
from rdkit.Chem import Descriptors, rdMolDescriptors
except ImportError:
raise ImportError("requires rdkit " +
"https://www.rdkit.org/")
try:
from e3fp import pipeline
except ImportError:
raise ImportError("requires e3fp " +
"https://github.com/keiserlab/e3fp")
root = os.path.dirname(os.path.realpath(__file__))
params = pipeline.params_to_dicts(root + "/data/defaults.cfg")
@timeout(100, use_signals=False)
def fprints_from_inchi(inchi, inchikey, confgen_params={},
fprint_params={}, save=False):
mol = Chem.rdinchi.InchiToMol(inchi)[0]
if Descriptors.MolWt(mol) > 800 or \
rdMolDescriptors.CalcNumRotatableBonds(mol) > 11:
return None
smiles = Chem.MolToSmiles(mol, isomericSmiles=True)
result = pipeline.fprints_from_smiles(
smiles, inchikey, confgen_params, fprint_params, save)
return result
iks = inchikey_inchi.keys()
chunk = list()
for k in iks:
if k is None:
continue
try:
fps = fprints_from_inchi(
str(inchikey_inchi[k]), str(k), params[0], params[1])
except Exception:
DataCalculator.__log.warning('Timeout inchikey: ' + k)
fps = None
if not fps:
result = {
"inchikey": k,
"raw": fps
}
else:
s = ",".join([str(x) for fp in fps for x in fp.indices])
result = {
"inchikey": k,
"raw": s
}
chunk.append(result)
if len(chunk) == chunks:
yield chunk
chunk = list()
yield chunk
@staticmethod
def murcko_1024_cframe_1024(inchikey_inchi, chunks=1000):
try:
from rdkit.Chem import AllChem as Chem
from rdkit.Chem.Scaffolds import MurckoScaffold
except ImportError:
raise ImportError("requires rdkit " +
"https://www.rdkit.org/")
iks = inchikey_inchi.keys()
nBits = 1024
radius = 2
def murcko_scaffold(mol):
core = MurckoScaffold.GetScaffoldForMol(mol)
if not Chem.MolToSmiles(core, isomericSmiles=True):
core = mol
fw = MurckoScaffold.MakeScaffoldGeneric(core)
info = {}
c_fp = Chem.GetMorganFingerprintAsBitVect(
core, radius, nBits=nBits, bitInfo=info).GetOnBits()
f_fp = Chem.GetMorganFingerprintAsBitVect(
fw, radius, nBits=nBits, bitInfo=info).GetOnBits()
fp = ["c%d" % x for x in c_fp] + ["f%d" % y for y in f_fp]
return ",".join(fp)
chunk = list()
for k in iks:
if k is None:
continue
v = str(inchikey_inchi[k])
try:
mol = Chem.rdinchi.InchiToMol(v)[0]
except Exception as ex:
DataCalculator.__log.debug("Skipping molecule %s" % k)
DataCalculator.__log.debug(str(ex))
continue
try:
dense = murcko_scaffold(mol)
except Exception:
dense = None
result = {
"inchikey": k,
"raw": dense
}
chunk.append(result)
if len(chunk) == chunks:
yield chunk
chunk = list()
yield chunk
@staticmethod
def maccs_keys_166(inchikey_inchi, chunks=1000):
try:
from rdkit.Chem import AllChem as Chem
from rdkit.Chem import MACCSkeys
except ImportError:
raise ImportError("requires rdkit " +
"https://www.rdkit.org/")
iks = inchikey_inchi.keys()
chunk = list()
for k in iks:
if k is None:
continue
v = str(inchikey_inchi[k])
try:
mol = Chem.rdinchi.InchiToMol(v)[0]
except Exception as ex:
DataCalculator.__log.debug("Skipping molecule %s" % k)
DataCalculator.__log.debug(str(ex))
continue
fp = MACCSkeys.GenMACCSKeys(mol)
dense = ",".join("%d" % s for s in sorted(
[x for x in fp.GetOnBits()]))
result = {
"inchikey": k,
"raw": dense
}
chunk.append(result)
if len(chunk) == chunks:
yield chunk
chunk = list()
yield chunk
@staticmethod
def general_physchem_properties(inchikey_inchi, chunks=1000):
try:
from rdkit.Chem import AllChem as Chem
from rdkit.Chem import Descriptors, ChemicalFeatures, QED
except ImportError:
raise ImportError("requires rdkit " +
"https://www.rdkit.org/")
def descriptors(mol):
P = {}
P['ringaliph'] = Descriptors.NumAliphaticRings(mol)
P['mr'] = Descriptors.MolMR(mol)
P['heavy'] = Descriptors.HeavyAtomCount(mol)
P['hetero'] = Descriptors.NumHeteroatoms(mol)
P['rings'] = Descriptors.RingCount(mol)
props = QED.properties(mol)
P['mw'] = props[0]
P['alogp'] = props[1]
P['hba'] = props[2]
P['hbd'] = props[3]
P['psa'] = props[4]
P['rotb'] = props[5]
P['ringarom'] = props[6]
P['alerts_qed'] = props[7]
P['qed'] = QED.qed(mol)
# Ro5
ro5 = 0
if P['hbd'] > 5:
ro5 += 1
if P['hba'] > 10:
ro5 += 1
if P['mw'] >= 500:
ro5 += 1
if P['alogp'] > 5:
ro5 += 1
P['ro5'] = ro5
# Ro3
ro3 = 0
if P['hbd'] > 3:
ro3 += 1
if P['hba'] > 3:
ro3 += 1
if P['rotb'] > 3:
ro3 += 1
if P['mw'] >= 300:
ro3 += 1
if P['alogp'] > 3:
ro3 += 1
P['ro3'] = ro3
# Structural alerts from Chembl
P['alerts_chembl'] = len(
set([int(f.GetType()) for f
in alerts_chembl.GetFeaturesForMol(mol)]))
return P
root = os.path.dirname(os.path.realpath(__file__))
# Find how to create this file in netscreens/physchem
alerts_chembl = ChemicalFeatures.BuildFeatureFactory(
root + "/data/structural_alerts.fdef")
iks = inchikey_inchi.keys()
chunk = list()
for k in iks:
if k is None:
continue
v = str(inchikey_inchi[k])
try:
mol = Chem.rdinchi.InchiToMol(v)[0]
except Exception as ex:
DataCalculator.__log.debug("Skipping molecule %s" % k)
DataCalculator.__log.debug(str(ex))
continue
try:
P = descriptors(mol)
raw = "mw(%.2f),heavy(%d),hetero(%d),rings(%d),ringaliph(%d),ringarom(%d),alogp(%.3f),mr(%.3f)" + \
",hba(%d),hbd(%d),psa(%.3f),rotb(%d),alerts_qed(%d),alerts_chembl(%d),ro5(%d),ro3(%d),qed(%.3f)"
raw = raw % (P['mw'], P['heavy'], P['hetero'],
P['rings'], P['ringaliph'], P['ringarom'],
P['alogp'], P['mr'], P['hba'], P['hbd'], P['psa'],
P['rotb'], P['alerts_qed'], P['alerts_chembl'],
P['ro5'], P['ro3'], P['qed'])
except:
raw = None
result = {
"inchikey": k,
"raw": raw
}
chunk.append(result)
if len(chunk) == chunks:
yield chunk
chunk = list()
yield chunk
@staticmethod
def chembl_target_predictions_v23_10um(inchikey_inchi, chunks=1000):
try:
from rdkit.Chem import AllChem as Chem
from rdkit import DataStructs
except ImportError:
raise ImportError("requires rdkit " +
"https://www.rdkit.org/")
import joblib
import collections
import numpy as np
import pandas as pd
from chemicalchecker.database import Datasource
# Variables
max_targets = 20
min_targets = 1
min_proba = 0.5
# Paths
models_path = Datasource.get("chembl_target_predictions")[0].data_path
uniprot_mapping_path = Datasource.get(
"chembl_uniprot_mapping")[0].data_path
# Load models
morgan_nb = joblib.load(
models_path + "/models_23/10uM/mNB_10uM_all.pkl")
classes = list(morgan_nb.targets)
# Read target-to-uniprot mapping
chembltarg2prot = collections.defaultdict(set)
with open(uniprot_mapping_path + "/chembl_uniprot_mapping.txt", "r") as f:
for l in f:
if l[0] == "#":
continue
l = l.rstrip("\n").split("\t")
chembltarg2prot[l[1]].update([l[0]])
# Fuction for target prediction
def predict_targets(inchi):
mol = Chem.rdinchi.InchiToMol(inchi)[0]
fp = Chem.GetMorganFingerprintAsBitVect(
mol, 2, nBits=2048, bitInfo={})
res = np.zeros(len(fp), np.int32)
DataStructs.ConvertToNumpyArray(fp, res)
probas = list(morgan_nb.predict_proba(res.reshape(1, -1))[0])
predictions = pd.DataFrame(
zip(classes, probas), columns=['id', 'proba'])
top_preds = predictions.sort_values(
by='proba', ascending=False).head(max_targets)
top_preds = top_preds[top_preds["proba"] >= min_proba]
prots = []
for r in np.array(top_preds):
if r[0] not in chembltarg2prot:
continue
for p in chembltarg2prot[r[0]]:
prots += [(p, int(r[1] * 10))]
if len(prots) < min_targets:
return None
return prots
# Start iterating
iks = inchikey_inchi.keys()
chunk = list()
for ik in iks:
if ik is None:
continue
v = str(inchikey_inchi[ik])
# DataCalculator.__log.info( ik)
targs = predict_targets(v)
if not targs:
result = {
"inchikey": ik,
"raw": targs
}
else:
targs_ = collections.defaultdict(list)
for t in targs:
targs_[t[0]] += [t[1]]
targs_ = dict((k, np.max(v)) for k, v in targs_.items())
targs = sorted(targs_.items(), key=lambda x: -x[1])
dense = ",".join("%s(%d)" % x for x in targs)
result = {
"inchikey": ik,
"raw": dense
}
chunk.append(result)
if len(chunk) == chunks:
yield chunk
chunk = list()
yield chunk