# Source code for chemicalchecker.tool.targetmate.datasets.chembl.fetch_chembl

"""Get target data from a local PostgreSQL ChEMBL database.
"""
import os
import pandas as pd
from chemicalchecker.tool.targetmate.utils.chemistry import read_molecule
from chemicalchecker.util import psql
from chemicalchecker.util import logged


@logged
class ChemblDb:

    def __init__(self, dbname='chembl_27'):
        """Class to fetch data from a local ChEMBL database.

        Args:
            dbname (str): ChEMBL database name (default='chembl_27').
        """
        self.dbname = dbname

    def get_targets(self, target_chembl_ids=None):
        """Fetch target annotations from the `target_dictionary` table.

        Args:
            target_chembl_ids (str or iterable of str): ChEMBL identifier(s)
                of the targets to fetch. A single identifier may be given as
                a plain string. If None or empty, every target in the table
                is returned (default=None).

        Returns:
            pandas.DataFrame: One row per record returned by the query, with
                columns `target_id`, `target_type`, `pref_name`, `tax_id`
                and `organism`.
        """
        self.__log.info("Getting targets")
        query = '''
            SELECT chembl_id, target_type, pref_name, tax_id, organism
                FROM target_dictionary
        '''
        if target_chembl_ids:
            # A bare string would otherwise be iterated character by
            # character and produce a meaningless IN (...) list.
            if isinstance(target_chembl_ids, str):
                target_chembl_ids = [target_chembl_ids]
            # Identifiers are inlined as SQL string literals; doubling any
            # single quote keeps a stray quote from ending the literal.
            quoted_ids = ",".join(
                "'%s'" % str(t).replace("'", "''") for t in target_chembl_ids)
            query += " WHERE chembl_id IN (%s)" % quoted_ids
        results = psql.qstring(query, self.dbname)
        col_names = ["target_id", "target_type",
                     "pref_name", "tax_id", "organism"]
        return pd.DataFrame(results, columns=col_names)

    def _get_activities(self, chembl_ids, entity, only_pchembl):
        self.__log.debug("Getting activities")
        query = '''
            SELECT
                a.chembl_id,
                a.assay_type,
                t.chembl_id,
                m.chembl_id,
                s.canonical_smiles,
                act.pchembl_value
            FROM
                molecule_dictionary m,
                compound_structures s,
                activities act,
                assays a,
                target_dictionary t
            WHERE
                m.molregno = s.molregno
                AND m.molregno = act.molregno
                AND act.assay_id = a.assay_id
                AND a.tid = t.tid
                AND a.assay_type IN ('B', 'F')
                AND s.canonical_smiles IS NOT NULL
        '''
        if only_pchembl:
            query += " AND act.pchembl_value IS NOT NULL"
        if type(chembl_ids) is str:
            query += " AND %s.chembl_id = '%s'" % (entity, chembl_ids)
        else:
            chembl_ids_list = ",".join(["'%s'" % t for t in chembl_ids])
            query += " AND %s.chembl_id IN (%s)" % (entity, chembl_ids_list)
        results = psql.qstring(query, self.dbname)
        col_names = ["assay_id", "assay_type", "target_id", "molecule_id",
                     "canonical_smiles", "pchembl_value"]
        return pd.DataFrame(results, columns=col_names)

    def get_molecule_activities(self, molecule_chembl_ids, only_pchembl=False):
        return self._get_activities(molecule_chembl_ids, entity="m",
                                    only_pchembl=only_pchembl)

    def get_target_activities(self, target_chembl_ids, only_pchembl=False):
        return self._get_activities(target_chembl_ids, entity="t",
                                    only_pchembl=only_pchembl)

    def get_assay_activities(self, assay_chembl_ids, only_pchembl=False):
        return self._get_activities(assay_chembl_ids, entity="a",
                                    only_pchembl=only_pchembl)


@logged
class Chembl(ChemblDb):

    def __init__(self, output_folder,
                 min_actives=10,
                 pchembl_values=[5, 6, 7], only_pchembl=True,
                 standardize=True,
                 **kwargs):
        """Query ChEMBL and produce a hierarchy of active/inactive data.

        Args:
            output_folder (str): Output folder where to put the hierarchy.
            min_actives (int): Minimum number of actives for an training set to
                be considered (default=10).
            pchembl_cuts (list): Chosen pchembl scores to divide actives and
                inactives (default=[5,6,7]).
            only_pchembl (bool): Keep values without a pchembl score and assume
                they are positives (default=True).
            standardize (bool): Standardize molecules (default=True).
        """
        ChemblDb.__init__(self, **kwargs)
        self.output_folder = output_folder
        self.min_actives = min_actives
        self.only_pchembl = only_pchembl
        self.pchembl_values = sorted(
            set([round(x, 2) for x in pchembl_values]))
        if not self.only_pchembl:
            self.pchembl_values += [None]
        self.standardize = standardize
        self.activities = {}

    def _process_smiles(self, df):
        """Parse the SMILES of every record and drop the unparsable ones.

        Each `canonical_smiles` entry is passed to `read_molecule`. Records
        for which it returns a falsy value are discarded; for the others the
        first two items of its result are stored as `inchikey` and `smiles`
        (presumably an InChIKey and a possibly standardized SMILES; confirm
        against `read_molecule`).

        Args:
            df (pandas.DataFrame): Activity records with the columns
                `assay_id`, `assay_type`, `target_id`, `molecule_id`,
                `canonical_smiles` and `pchembl_value`.

        Returns:
            pandas.DataFrame: The kept records with columns `assay_id`,
                `assay_type`, `target_id`, `molecule_id`, `smiles`,
                `inchikey` and `pchembl_value`.
        """
        kept_positions = []
        inchikeys = []
        smiles = []
        for pos, raw_smiles in enumerate(df["canonical_smiles"]):
            mol = read_molecule(raw_smiles, standardize=self.standardize)
            if not mol:
                continue
            kept_positions.append(pos)
            inchikeys.append(mol[0])
            smiles.append(mol[1])
        # Select by position so duplicated index labels cannot multiply
        # rows, and copy so the new columns are written to an independent
        # frame rather than to a selection of the caller's frame.
        df = df.iloc[kept_positions].copy()
        df["inchikey"] = inchikeys
        df["smiles"] = smiles
        return df[["assay_id", "assay_type", "target_id",
                   "molecule_id", "smiles", "inchikey", "pchembl_value"]]

    @staticmethod
    def _to_set(df):
        values = [tuple(x)
                  for x in df[["smiles", "molecule_id", "inchikey"]].values]
        d = {}
        for v in values:
            d[v[-1]] = v
        values = set([v for k, v in d.items()])
        return values

    def get_molecule_activities(self, molecule_chembl_ids):
        df = super().get_molecule_activities(
            molecule_chembl_ids, only_pchembl=self.only_pchembl)
        return self._process_smiles(df)

    def get_target_activities(self, target_chembl_ids):
        df = super().get_target_activities(
            target_chembl_ids, only_pchembl=self.only_pchembl)
        return self._process_smiles(df)

    def get_assay_activities(self, assay_chembl_id):
        df = super().get_assay_activities(
            assay_chembl_id, only_pchembl=self.only_pchembl)
        return self._process_smiles(df)

    def decide_actives_inactives(self, actives, inactives):
        common_iks = set([smi[-1] for smi in actives]
                         ).intersection([smi[-1] for smi in inactives])
        actives = set([smi for smi in actives if smi[-1] not in common_iks])
        if len(actives) < self.min_actives:
            return None
        inactives = set(
            [smi for smi in inactives if smi[-1] not in common_iks])
        return actives, inactives

    def get_activities(self, df, pchembl_value=None):
        if pchembl_value:
            dfa = df[df["pchembl_value"] >= pchembl_value]
            dfi = df[df["pchembl_value"] < pchembl_value]
        else:
            dfa = df
            dfi = df[df["pchembl_value"] < -666]
        actives = self._to_set(dfa)
        inactives = self._to_set(dfi)
        results = self.decide_actives_inactives(actives, inactives)
        if not results:
            return None
        actives, inactives = results
        R = []
        for r in actives:
            R += [[1, r[0], r[1], r[2]]]
        for r in inactives:
            R += [(-1, r[0], r[1], r[2])]
        df_ = pd.DataFrame(R, columns=["activity", "smiles", "id", "inchikey"])
        df_ = df_.sample(frac=1).reset_index(drop=True)
        return df_

    @staticmethod
    def to_csv(df, file_name):
        file_name = os.path.abspath(file_name)
        df.to_csv(file_name, sep="\t", header=False, index=False)

    @staticmethod
    def pchembl_filename(pchembl_value):
        if not pchembl_value:
            return "pchembl_NA.tsv"
        else:
            return "pchembl_%d.tsv" % (pchembl_value * 100)

    def write_every_pchembl(self, df, folder):
        to_write = []
        done = []
        for pchembl_value in self.pchembl_values:
            df_ = self.get_activities(df, pchembl_value=pchembl_value)
            if df_ is None:
                continue
            to_write += [(df_, os.path.join(
                folder, self.pchembl_filename(pchembl_value)))]
            done += [pchembl_value]
        if not to_write:
            return done
        if not os.path.exists(folder):
            os.makedirs(folder)
        for tw in to_write:
            self.to_csv(tw[0], tw[1])
        return done

    @staticmethod
    def _summary_update(summary, sumr, sump):
        for sr in sumr:
            for sp in sump:
                summary += [sr + [sp]]
        return summary

    def write_folder_flat(self, target_chembl_ids = None):
        self.__log.info("Writing ")
        dft = self.get_targets(target_chembl_ids)
        for idx, target_chembl_id in dft["target_id"].items():
            self.__log.debug("Working on %s" % target_chembl_id)
            df = self.get_target_activities(target_chembl_id)
            folder = os.path.join(self.output_folder, target_chembl_id)
            self.write_every_pchembl(df, folder)

    def write_folder_hierarchy(self, target_chembl_ids = None):
        self.__log.info("Writing folder dictionary")
        summary = []
        dft = self.get_targets(target_chembl_ids)
        for idx, target_chembl_id in dft["target_id"].items():
            self.__log.debug("Working on %s" % target_chembl_id)
            df = self.get_target_activities(target_chembl_id)
            folder = os.path.join(self.output_folder, target_chembl_id)
            sumr = [[target_chembl_id, None, None]]
            sump = self.write_every_pchembl(df, folder)
            summary = self._summary_update(summary, sumr, sump)
            assay_types = pd.unique(df["assay_type"])
            for assay_type in assay_types:
                folder_a = os.path.join(folder, assay_type)
                df_a = df[df["assay_type"] == assay_type]
                sumr = [[target_chembl_id, assay_type, None]]
                sump = self.write_every_pchembl(df_a, folder_a)
                summary = self._summary_update(summary, sumr, sump)
                assay_ids = pd.unique(df_a["assay_id"])
                for assay_id in assay_ids:
                    folder_b = os.path.join(folder_a, assay_id)
                    df_b = df_a[df_a["assay_id"] == assay_id]
                    sumr = [[target_chembl_id, assay_type, assay_id]]
                    sump = self.write_every_pchembl(df_b, folder_b)
                    summary = self._summary_update(summary, sumr, sump)
        col_names = ["target_id", "assay_type", "assay_id", "pchembl_value"]
        summary = pd.DataFrame(summary, columns=col_names)
        summary_path=os.path.join(self.output_folder, "summary.tsv")
        summary.to_csv(summary_path,
                       sep="\t", na_rep="NA", header=True, index=False)