"""Source code for chemicalchecker.util.pipeline.tasks_web.task_web_showtargets."""

import os
import collections

from chemicalchecker.util import psql
from chemicalchecker.database import Dataset
from chemicalchecker.database import UniprotKB
from chemicalchecker.core import ChemicalChecker
from chemicalchecker.util.pipeline import BaseTask
from chemicalchecker.util import logged

# We got these strings by doing: pg_dump -t 'pubchem' --schema-only mosaic
# -h aloy-dbsrv
# NOTE(review): the command above names the 'pubchem' table while the
# statements below define the showtargets tables -- presumably the comment was
# copied from another task; confirm the command that was actually used.

# Both tables are dropped and re-created at the start of ShowTargets.run(),
# so every run rebuilds them from scratch.
DROP_TABLE = "DROP TABLE IF EXISTS public.showtargets"
DROP_TABLE_DESC = "DROP TABLE IF EXISTS public.showtargets_description"

# One row per protein, keyed by UniProt accession. run() fills the remaining
# columns from UniprotKB, or with NULL when the accession is not returned.
CREATE_TABLE_DESC = """CREATE TABLE public.showtargets_description (
    uniprot_ac text PRIMARY KEY,
    genename text,
    fullname text,
    taxid text,
    organism text
);"""

# One row per (compound key, protein) pair. run() leaves `rank` and `display`
# as NULL for a protein whose display name was already used for that compound.
CREATE_TABLE = """CREATE TABLE public.showtargets (
    inchikey text,
    uniprot_ac text,
    rank integer,
    display text
);"""

# Executed by run() after showtargets_description has been filled.
CREATE_INDEX_DESC = """
CREATE INDEX genename_showtargets_description_idx ON public.showtargets_description USING btree (genename);
CREATE INDEX taxid_showtargets_description_idx ON public.showtargets_description USING btree (taxid);
"""

# Executed by run() only after all showtargets rows have been inserted.
CREATE_INDEX = """
CREATE INDEX inchikey_showtargets_idx ON showtargets(inchikey);
CREATE INDEX uniprot_ac_showtargets_idx ON showtargets(uniprot_ac);
CREATE INDEX rank_showtargets_idx ON showtargets(rank);
"""

# Template for bulk inserts; `%s` is replaced by comma-joined "(...)" tuples.
INSERT = "INSERT INTO showtargets VALUES %s"

# Spaces whose sign0 features provide the targets. run() addresses them by
# position: [0] mode of action, [1] metabolic genes, [2] binding,
# [3] HTS bioassays.
ref_spaces = ['B1.001', 'B2.001', 'B4.001', 'B5.001']


@logged
class ShowTargets(BaseTask):
    """Pipeline task that rebuilds the ``showtargets`` web tables.

    ``showtargets_description`` receives one row per protein found in the
    sign0 features of ``ref_spaces``; ``showtargets`` receives, for every
    compound key, its proteins in display order together with a rank.
    """

    def __init__(self, name=None, **params):
        """Store and validate the task parameters.

        Args:
            name: Task name; also used as ``task_id`` when none is given.
            **params: Must provide ``DB`` (database name), ``CC_ROOT`` (root
                passed to ``ChemicalChecker``) and ``uniprot_db_version``
                (passed to ``UniprotKB``).

        Raises:
            Exception: If any of the three required parameters is missing.
        """
        if params.get('task_id', None) is None:
            params['task_id'] = name
        BaseTask.__init__(self, name, **params)

        self.DB = params.get('DB', None)
        if self.DB is None:
            raise Exception('DB parameter is not set')
        self.CC_ROOT = params.get('CC_ROOT', None)
        if self.CC_ROOT is None:
            raise Exception('CC_ROOT parameter is not set')
        self.uniprot_db_version = params.get('uniprot_db_version', None)
        if self.uniprot_db_version is None:
            raise Exception('uniprot_db_version parameter is not set')

    def run(self):
        """Run the show targets step.

        Steps, in order:
          1. Drop and re-create both tables.
          2. Fill ``showtargets_description`` with every protein found in the
             reference spaces, annotated from UniprotKB.
          3. Load the MetaPhors human-ortholog mapping.
          4. Collect targets per compound: mode of action first, then human
             proteins, then other species, each protein listed only once.
          5. Insert the ranked rows into ``showtargets``, index the table and
             mark the task as ready.

        Raises:
            Exception: When a database step fails and ``custom_ready()``
                returns a false value.
        """
        database_name = self.DB
        try:
            self.__log.info("Creating table")
            psql.query(DROP_TABLE, database_name)
            psql.query(CREATE_TABLE, database_name)
            psql.query(DROP_TABLE_DESC, database_name)
            psql.query(CREATE_TABLE_DESC, database_name)
            # The showtargets indexes are created at the very end, once the
            # bulk insert is done.
        except Exception as e:
            self.__report_failure("Error while creating tables", e)
            return

        cc = ChemicalChecker(self.CC_ROOT)

        # Universe of proteins: every accession appearing in any ref space.
        prots = set()
        for space in ref_spaces:
            s0 = cc.get_signature('sign0', 'full', space)
            prots.update(self.__accessions(s0.features))
        prots = sorted(prots)

        self.__log.info("Querying UniprotKB...")
        ukb = UniprotKB(self.uniprot_db_version)
        showtarg_d = ukb.get_proteins(
            prots,
            limit_to_fields=["genename", "fullname", "taxid", "organism"])

        self.__log.info("Inserting proteins into database...")
        rows = []
        for p in prots:
            if p in showtarg_d:
                d = showtarg_d[p]
                rows.append((self.__pstr(p),
                             self.__pstr(d["genename"]),
                             self.__pstr(d["fullname"]),
                             self.__pstr(d["taxid"]),
                             self.__pstr(d["organism"])))
            else:
                rows.append(("'%s'" % p, 'NULL', 'NULL', 'NULL', 'NULL'))
        values = ["(%s)" % ",".join(r) for r in rows]
        try:
            for c in self.__chunker(values, 1000):
                psql.query(
                    "INSERT INTO showtargets_description VALUES %s"
                    % ",".join(c), database_name)
            psql.query(CREATE_INDEX_DESC, database_name)
            # From here on showtarg_d maps accession -> [genename, taxid],
            # as stored in the database.
            showtarg_d = {}
            for r in psql.qstring(
                    "SELECT uniprot_ac, genename, taxid FROM "
                    "showtargets_description", database_name):
                showtarg_d[r[0]] = [r[1], r[2]]
        except Exception as e:
            self.__report_failure(
                "Error while filling showtargets_description", e)
            return

        self.__log.info("Getting orthologs from MetaPhors")
        any_human = self.__load_human_orthologs()

        self.__log.info(
            "First is always MOA, in any species... "
            "(only sorted alphabetically by gene name)")
        seens = collections.defaultdict(set)       # key -> proteins listed
        showtargs = collections.defaultdict(list)  # key -> ordered proteins
        s0 = cc.get_signature('sign0', 'full', ref_spaces[0])
        features = s0.features
        for key, sig in zip(s0.keys, s0):
            collected_features = features[sig > 0]
            if len(collected_features) == 0:
                continue
            moa = self.__sort_alphabet(
                self.__accessions(collected_features), showtarg_d)
            showtargs[key] += moa
            seens[key].update(moa)

        self.__log.info("Now it is human...")
        hp = {r[0] for r in psql.qstring(
            "SELECT uniprot_ac FROM showtargets_description "
            "WHERE taxid = '9606'", database_name)}
        self.__log.info("...binding table... (sorted by potency)")
        self.__add_targets(cc, ref_spaces[2], showtarg_d, seens, showtargs,
                           potency_levels=(2, 1), keep=hp.intersection)
        self.__log.info("...metabolic genes table...")
        # Metabolic genes are not restricted to human proteins.
        self.__add_targets(cc, ref_spaces[1], showtarg_d, seens, showtargs)
        self.__log.info("...HTS bioassays table...")
        self.__add_targets(cc, ref_spaces[3], showtarg_d, seens, showtargs,
                           keep=hp.intersection)

        self.__log.info(
            "And then the rest of species "
            "(where no human orthologs are already known)")

        def drop_orthologs(found):
            return self.__drop_human_orthologs(found, any_human)

        self.__log.info("...binding table... (sorted by potency)")
        self.__add_targets(cc, ref_spaces[2], showtarg_d, seens, showtargs,
                           potency_levels=(2, 1), keep=drop_orthologs)
        self.__log.info("...HTS bioassays table...")
        self.__add_targets(cc, ref_spaces[3], showtarg_d, seens, showtargs,
                           keep=drop_orthologs)

        self.__log.info("And finally the rest of species...")
        self.__log.info("...binding table... (sorted by potency)")
        self.__add_targets(cc, ref_spaces[2], showtarg_d, seens, showtargs,
                           potency_levels=(2, 1))
        self.__log.info("...HTS bioassays table...")
        self.__add_targets(cc, ref_spaces[3], showtarg_d, seens, showtargs)

        # Display name: gene name when the database has one, else accession.
        displays = {}
        for ac, info in showtarg_d.items():
            displays[ac] = ac if info[0] is None else info[0]

        rows = []
        for key, targets in showtargs.items():
            shown = set()
            rank = 1
            for p in targets:
                disp = displays[p]
                label = disp.lower()
                if label in shown:
                    # Same display name already listed for this compound:
                    # keep the row but leave rank and display empty.
                    rows.append(("'%s'" % key, "'%s'" % p, 'NULL', 'NULL'))
                else:
                    rows.append(("'%s'" % key, "'%s'" % p,
                                 '%d' % rank, "'%s'" % disp))
                    rank += 1
                    shown.add(label)
        values = ["(%s)" % ",".join(r) for r in rows]
        try:
            self.__log.info("Inserting into database...")
            for c in self.__chunker(values, 10000):
                psql.query(INSERT % ",".join(c), database_name)
            self.__log.info("Indexing table")
            psql.query(CREATE_INDEX, database_name)
            self.mark_ready()
        except Exception as e:
            self.__report_failure("Error while saving tables", e)
            return

    def __report_failure(self, message, error):
        """Log a failed step; re-raise unless ``custom_ready()`` is true.

        The caller is expected to return right after this call.
        """
        self.__log.error(message)
        if not self.custom_ready():
            raise Exception(error)
        self.__log.error(error)

    @staticmethod
    def __accessions(features):
        """Return the text before '(' of every feature without 'Class:'."""
        return [x.split('(')[0] for x in features if "Class:" not in x]

    def __load_human_orthologs(self):
        """Build the accession -> human accessions map from dataset C3.001.

        Reads the ``metaphors_id_conversion``, ``metaphors_9606`` and
        ``human_proteome`` datasources. Every human accession maps at least
        to itself.
        """
        dataset = Dataset.get('C3.001')
        map_files = {}
        for ds in dataset.datasources:
            map_files[ds.datasource_name] = ds.data_path
        id_conversion = os.path.join(
            map_files["metaphors_id_conversion"], "id_conversion.txt")
        file_9606 = os.path.join(map_files["metaphors_9606"], "9606.txt")
        human_proteome = os.path.join(
            map_files["human_proteome"], "human_proteome.tab")

        # MetaPhors identifier -> UniProt accessions (SwissProt/TrEMBL only).
        metaphorsid_uniprot = collections.defaultdict(set)
        with open(id_conversion, "r") as f:
            f.readline()  # skip header
            for line in f:
                fields = line.rstrip("\n").split("\t")
                if fields[1] in ("SwissProt", "TrEMBL"):
                    metaphorsid_uniprot[fields[2]].add(fields[0])

        any_human = collections.defaultdict(set)
        with open(file_9606, "r") as f:
            f.readline()  # skip header
            for line in f:
                fields = line.rstrip("\n").split("\t")
                if fields[3] not in metaphorsid_uniprot:
                    continue
                if fields[1] not in metaphorsid_uniprot:
                    continue
                # Column 1 is used as the human side and column 3 as the
                # ortholog side of each pair.
                for po in metaphorsid_uniprot[fields[3]]:
                    for ph in metaphorsid_uniprot[fields[1]]:
                        any_human[po].add(ph)
                        any_human[ph].add(ph)

        with open(human_proteome, "r") as f:
            f.readline()  # skip header
            for line in f:
                p = line.split("\t")[0]
                any_human[p].add(p)
        return any_human

    @staticmethod
    def __drop_human_orthologs(prots, any_human):
        """Return ``prots`` minus the human orthologs of its own members.

        NOTE(review): this removes accessions that are themselves listed in
        ``any_human`` for some member of ``prots``; it does not test whether
        an ortholog was already shown for the compound. Confirm that this
        matches the intended "no human orthologs already known" rule.
        """
        ho = set()
        for p in prots:
            if p in any_human:  # membership test keeps the defaultdict intact
                ho.update(any_human[p])
        return prots.difference(ho)

    def __add_targets(self, cc, space, showtarg_d, seens, showtargs,
                      potency_levels=None, keep=None):
        """Append the not-yet-listed targets of one space to every compound.

        Args:
            cc: ChemicalChecker instance used to fetch the sign0 signature.
            space: Dataset code of the space to read.
            showtarg_d: accession -> [genename, taxid], used for sorting.
            seens: key -> set of accessions already listed (updated).
            showtargs: key -> ordered list of accessions (updated).
            potency_levels: Signature values to process, in output order;
                ``None`` means a single group with every value > 0.
            keep: Optional callable receiving the set of new accessions and
                returning the subset to keep.
        """
        s0 = cc.get_signature('sign0', 'full', space)
        features = s0.features
        for key, sig in zip(s0.keys, s0):
            if potency_levels is None:
                groups = [features[sig > 0]]
            else:
                groups = [features[sig == level] for level in potency_levels]
            if all(len(group) == 0 for group in groups):
                continue
            # Always use the live set, so a protein added for one potency
            # level is never added again for the next one.
            seen = seens[key]
            for group in groups:
                fresh = set(self.__accessions(group)).difference(seen)
                if keep is not None:
                    fresh = keep(fresh)
                seen.update(fresh)
                showtargs[key] += self.__sort_alphabet(fresh, showtarg_d)

    def __pstr(self, s):
        """Format ``s`` as a quoted SQL literal, or NULL when empty or None.

        Single quotes are removed from the value; run() later embeds the
        stored gene name in another statement without any escaping.
        """
        if s is None or s == "":
            return 'NULL'
        return "'%s'" % s.replace("'", "")

    def __sort_alphabet(self, prots, showtarg_d):
        """Return ``prots`` ordered by name.

        Proteins present in ``showtarg_d`` come first, sorted by its first
        value (missing names sort as the empty string); the others follow,
        sorted by accession.
        """
        def by_name(pair):
            return pair[1] or ""

        known = []
        unknown = []
        for p in prots:
            if p in showtarg_d:
                known.append((p, showtarg_d[p][0]))
            else:
                unknown.append((p, p))
        return ([pair[0] for pair in sorted(known, key=by_name)] +
                [pair[0] for pair in sorted(unknown, key=by_name)])

    def __chunker(self, data, size=2000):
        """Yield consecutive slices of ``data`` holding up to ``size`` items."""
        for start in range(0, len(data), size):
            yield data[start:start + size]

    def execute(self, context):
        """Run the show targets step.

        Stores ``context['params']['tmpdir']`` as ``self.tmpdir`` and then
        calls ``run()``.
        """
        self.tmpdir = context['params']['tmpdir']
        self.run()