Source code for chemicalchecker.util.pipeline.tasks_web.task_web_showtargets

import os
import collections

from chemicalchecker.util import psql
from chemicalchecker.database import Dataset
from chemicalchecker.database import UniprotKB
from chemicalchecker.core import ChemicalChecker
from chemicalchecker.util.pipeline import BaseTask
from chemicalchecker.util import logged

# We got these strings by doing: pg_dump -t 'pubchem' --schema-only mosaic
# -h aloy-dbsrv

DROP_TABLE = "DROP TABLE IF EXISTS public.showtargets"
DROP_TABLE_DESC = "DROP TABLE IF EXISTS public.showtargets_description"

CREATE_TABLE_DESC = """CREATE TABLE public.showtargets_description (
    uniprot_ac text PRIMARY KEY,
    genename text,
    fullname text,
    taxid text,
    organism text
);"""

CREATE_TABLE = """CREATE TABLE public.showtargets (
    inchikey text,
    uniprot_ac text,
    rank integer,
    display text
);"""

CREATE_INDEX_DESC = """
CREATE INDEX genename_showtargets_description_idx ON public.showtargets_description USING btree (genename);
CREATE INDEX taxid_showtargets_description_idx ON public.showtargets_description USING btree (taxid);
"""

CREATE_INDEX = """
CREATE INDEX inchikey_showtargets_idx ON showtargets(inchikey);
CREATE INDEX uniprot_ac_showtargets_idx ON showtargets(uniprot_ac);
CREATE INDEX rank_showtargets_idx ON showtargets(rank);
"""

INSERT = "INSERT INTO showtargets VALUES %s"

ref_spaces = ['B1.001', 'B2.001', 'B4.001', 'B5.001']


[docs]@logged
class ShowTargets(BaseTask):

    def __init__(self, name=None, **params):
        task_id = params.get('task_id', None)
        if task_id is None:
            params['task_id'] = name
        BaseTask.__init__(self, name, **params)

        self.DB = params.get('DB', None)
        if self.DB is None:
            raise Exception('DB parameter is not set')
        self.CC_ROOT = params.get('CC_ROOT', None)
        if self.CC_ROOT is None:
            raise Exception('CC_ROOT parameter is not set')
        self.uniprot_db_version = params.get('uniprot_db_version', None)
        if self.uniprot_db_version is None:
            raise Exception('uniprot_db_version parameter is not set')

[docs]    def run(self):
        """Run the show targets step."""
        database_name = self.DB
        try:
            self.__log.info("Creating table")
            psql.query(DROP_TABLE, database_name)
            psql.query(CREATE_TABLE, database_name)
            psql.query(DROP_TABLE_DESC, database_name)
            psql.query(CREATE_TABLE_DESC, database_name)
            # psql.query(CREATE_INDEX, database_name)
        except Exception as e:
            self.__log.error("Error while creating tables")
            if not self.custom_ready():
                raise Exception(e)
            else:
                self.__log.error(e)
                return

        cc = ChemicalChecker(self.CC_ROOT)
        prots = set()
        for space in ref_spaces:
            s0 = cc.get_signature('sign0', 'full', space)
            features = s0.features
            prots.update([x.split('(')[0] for x in features if "Class:" not in x])
        prots = sorted(prots)

        self.__log.info("Querying UniprotKB...")
        ukb = UniprotKB(self.uniprot_db_version)
        showtarg_d = ukb.get_proteins(
            prots,
            limit_to_fields=["genename", "fullname", "taxid", "organism"])

        self.__log.info("Inserting proteins into database...")
        R = []
        for p in prots:
            if p in showtarg_d:
                d = showtarg_d[p]
                R += [(self.__pstr(p),
                       self.__pstr(d["genename"]),
                       self.__pstr(d["fullname"]),
                       self.__pstr(d["taxid"]),
                       self.__pstr(d["organism"]))]
            else:
                R += [("'%s'" % p, 'NULL', 'NULL', 'NULL', 'NULL')]
        R = ["(%s)" % ",".join(r) for r in R]

        try:
            for c in self.__chunker(R, 1000):
                psql.query("INSERT INTO showtargets_description VALUES %s" %
                           ",".join(c), database_name)
            psql.query(CREATE_INDEX_DESC, database_name)
            showtarg_d = {}
            for r in psql.qstring(
                    "SELECT uniprot_ac, genename, taxid FROM "
                    "showtargets_description", database_name):
                showtarg_d[r[0]] = [r[1], r[2]]
        except Exception as e:
            self.__log.error("Error while filling showtargets_description")
            if not self.custom_ready():
                raise Exception(e)
            else:
                self.__log.error(e)
                return

        self.__log.info("Getting orthologs from MetaPhors")
        dataset = Dataset.get('C3.001')
        map_files = {}
        for ds in dataset.datasources:
            map_files[ds.datasource_name] = ds.data_path

        id_conversion = os.path.join(
            map_files["metaphors_id_conversion"], "id_conversion.txt")
        file_9606 = os.path.join(map_files["metaphors_9606"], "9606.txt")
        human_proteome = os.path.join(
            map_files["human_proteome"], "human_proteome.tab")

        metaphorsid_uniprot = collections.defaultdict(set)
        f = open(id_conversion, "r")
        f.readline()
        for l in f:
            l = l.rstrip("\n").split("\t")
            if l[1] == "SwissProt" or l[1] == "TrEMBL":
                metaphorsid_uniprot[l[2]].update([l[0]])
        f.close()

        any_human = collections.defaultdict(set)
        f = open(file_9606, "r")
        f.readline()
        for l in f:
            l = l.rstrip("\n").split("\t")
            if l[3] not in metaphorsid_uniprot:
                continue
            if l[1] not in metaphorsid_uniprot:
                continue
            for po in metaphorsid_uniprot[l[3]]:
                for ph in metaphorsid_uniprot[l[1]]:
                    any_human[po].update([ph])
                    any_human[ph].update([ph])
        f.close()

        f = open(human_proteome, "r")
        f.readline()
        for l in f:
            p = l.split("\t")[0]
            any_human[p].update([p])
        f.close()

        self.__log.info(
            "First is always MOA, in any species... "
            "(only sorted alphabetically by gene name)")
        seens = collections.defaultdict(set)
        showtargs = collections.defaultdict(list)
        s0 = cc.get_signature('sign0', 'full', ref_spaces[0])
        features = s0.features
        keys = s0.keys
        i = 0
        for sig in s0:
            collected_features = features[sig > 0]
            key = keys[i]
            i += 1
            if len(collected_features) == 0:
                continue
            prots = [x.split('(')[0] for x in collected_features if "Class:" not in x]

            prots = self.__sort_alphabet(prots, showtarg_d)
            showtargs[key] += prots
            seens[key].update(prots)

        self.__log.info("Now it is human...")
        hp = set([r[0] for r in psql.qstring(
            "SELECT uniprot_ac FROM showtargets_description "
            "WHERE taxid = '9606'", database_name)])

        self.__log.info("...binding table... (sorted by potency)")
        s0 = cc.get_signature('sign0', 'full', ref_spaces[2])
        features = s0.features
        keys = s0.keys
        i = 0
        for sig in s0:
            collected_features_1 = features[sig == 1]
            collected_features_2 = features[sig == 2]
            if len(collected_features_1) == 0 and len(collected_features_2) == 0:
                i += 1
                continue
            key = keys[i]
            i += 1
            if key in seens:
                s = seens[key]
            else:
                s = set()
            prots0 = hp.intersection(
                [x.split('(')[0] for x in collected_features_2 if "Class:" not in x]).difference(s)
            seens[key].update(prots0)
            prots0 = self.__sort_alphabet(list(prots0), showtarg_d)
            showtargs[key] += prots0
            prots1 = hp.intersection(
                [x.split('(')[0] for x in collected_features_1 if "Class:" not in x]).difference(s)
            seens[key].update(prots1)
            prots1 = self.__sort_alphabet(list(prots1), showtarg_d)
            showtargs[key] += prots1

        self.__log.info("...metabolic genes table...")
        s0 = cc.get_signature('sign0', 'full', ref_spaces[1])
        features = s0.features
        keys = s0.keys
        i = 0
        for sig in s0:
            collected_features = features[sig > 0]
            if len(collected_features) == 0:
                i += 1
                continue
            key = keys[i]
            i += 1
            if key in seens:
                s = seens[key]
            else:
                s = set()
            prots = set(
                [x.split('(')[0] for x in collected_features if "Class:" not in x]).difference(s)
            seens[key].update(prots)
            prots = self.__sort_alphabet(prots, showtarg_d)
            showtargs[key] += prots

        self.__log.info("...HTS bioassays table...")
        s0 = cc.get_signature('sign0', 'full', ref_spaces[3])
        features = s0.features
        keys = s0.keys
        i = 0
        for sig in s0:
            collected_features = features[sig > 0]
            if len(collected_features) == 0:
                i += 1
                continue
            key = keys[i]
            i += 1
            if key in seens:
                s = seens[key]
            else:
                s = set()
            prots = hp.intersection(
                [x.split('(')[0] for x in collected_features if "Class:" not in x]).difference(s)
            seens[key].update(prots)
            prots = self.__sort_alphabet(prots, showtarg_d)
            showtargs[key] += prots

        self.__log.info(
            "And then the rest of species "
            "(where no human orthologs are already known)")

        self.__log.info("...binding table... (sorted by potency)")
        s0 = cc.get_signature('sign0', 'full', ref_spaces[2])
        features = s0.features
        keys = s0.keys
        i = 0
        for sig in s0:
            collected_features_1 = features[sig == 1]
            collected_features_2 = features[sig == 2]
            key = keys[i]
            i += 1
            if len(collected_features_1) == 0 and len(collected_features_2) == 0:
                continue
            if key in seens:
                s = seens[key]
            else:
                s = set()
            prots0 = set(
                [x.split('(')[0] for x in collected_features_2 if "Class:" not in x]).difference(s)
            ho = set()
            for p in prots0:
                if p in any_human:
                    ho.update(any_human[p])
            prots0 = prots0.difference(ho)
            seens[key].update(prots0)
            prots0 = self.__sort_alphabet(list(prots0), showtarg_d)
            showtargs[key] += prots0
            prots1 = set(
                [x.split('(')[0] for x in collected_features_1 if "Class:" not in x]).difference(s)
            ho = set()
            for p in prots1:
                if p in any_human:
                    ho.update(any_human[p])
            prots1 = prots1.difference(ho)
            seens[key].update(prots1)
            prots1 = self.__sort_alphabet(list(prots1), showtarg_d)
            showtargs[key] += prots1

        self.__log.info("...HTS bioassays table...")
        s0 = cc.get_signature('sign0', 'full', ref_spaces[3])
        features = s0.features
        keys = s0.keys
        i = 0
        for sig in s0:
            collected_features = features[sig > 0]
            key = keys[i]
            i += 1
            if len(collected_features) == 0:
                continue

            if key in seens:
                s = seens[key]
            else:
                s = set()
            prots = set(
                [x.split('(')[0] for x in collected_features if "Class:" not in x]).difference(s)
            ho = set()
            for p in prots:
                if p in any_human:
                    ho.update(any_human[p])
            prots = prots.difference(ho)
            seens[key].update(prots)
            prots = self.__sort_alphabet(prots, showtarg_d)
            showtargs[key] += prots

        self.__log.info("And finally the rest of species...")

        self.__log.info("...binding table... (sorted by potency)")
        s0 = cc.get_signature('sign0', 'full', ref_spaces[2])
        features = s0.features
        keys = s0.keys
        i = 0
        for sig in s0:
            collected_features_1 = features[sig == 1]
            collected_features_2 = features[sig == 2]
            key = keys[i]
            i += 1
            if len(collected_features_1) == 0 and len(collected_features_2) == 0:
                continue
            if key in seens:
                s = seens[key]
            else:
                s = set()
            prots0 = set(
                [x.split('(')[0] for x in collected_features_2 if "Class:" not in x]).difference(s)
            seens[key].update(prots0)
            prots0 = self.__sort_alphabet(list(prots0), showtarg_d)
            showtargs[key] += prots0
            prots1 = set(
                [x.split('(')[0] for x in collected_features_1 if "Class:" not in x]).difference(s)
            seens[key].update(prots1)
            prots1 = self.__sort_alphabet(list(prots1), showtarg_d)
            showtargs[key] += prots1

        self.__log.info("...HTS bioassays table...")
        s0 = cc.get_signature('sign0', 'full', ref_spaces[3])
        features = s0.features
        keys = s0.keys
        i = 0
        for sig in s0:
            collected_features = features[sig > 0]
            key = keys[i]
            i += 1
            if len(collected_features) == 0:
                continue
            if key in seens:
                s = seens[key]
            else:
                s = set()
            prots = set(
                [x.split('(')[0] for x in collected_features if "Class:" not in x]).difference(s)
            seens[key].update(prots)
            prots = self.__sort_alphabet(prots, showtarg_d)
            showtargs[key] += prots

        displays = {}
        for k, v in showtarg_d.items():
            if v[0] is None:
                displays[k] = k
            else:
                displays[k] = v[0]

        R = []
        for k, v in showtargs.items():
            already_disp = set()
            rank = 1
            for p in v:
                disp = displays[p]
                if disp.lower() in already_disp:
                    R += [("'%s'" % k, "'%s'" % p, 'NULL', 'NULL')]
                else:
                    R += [("'%s'" % k, "'%s'" % p, '%d' % rank, "'%s'" % disp)]
                    rank += 1
                    already_disp.update([disp.lower()])
        R = ["(%s)" % ",".join(r) for r in R]

        try:
            self.__log.info("Inserting into database...")

            for c in self.__chunker(R, 10000):
                psql.query(INSERT % ",".join(c), database_name)

            self.__log.info("Indexing table")
            psql.query(CREATE_INDEX, database_name)
            self.mark_ready()
        except Exception as e:

            self.__log.error("Error while saving tables")
            if not self.custom_ready():
                raise Exception(e)
            else:
                self.__log.error(e)
                return

    def __pstr(self, s):
        if s == "":
            return 'NULL'
        else:
            return "'%s'" % s.replace("'", "")

    def __sort_alphabet(self, prots, showtarg_d):
        def nonesorter(a):
            if not a:
                return ""
            return a
        P0 = []
        P1 = []
        for p in prots:
            if p not in showtarg_d:
                P1 += [(p, p)]
            else:
                P0 += [(p, showtarg_d[p][0])]
        return [r[0] for r in sorted(P0, key=lambda tup: nonesorter(tup[1]))] + [r[0] for r in sorted(P1, key=lambda tup: nonesorter(tup[1]))]

    def __chunker(self, data, size=2000):
        for i in range(0, len(data), size):
            yield data[slice(i, i + size)]

[docs]    def execute(self, context):
        """Run the molprops step."""
        self.tmpdir = context['params']['tmpdir']
        self.run()