Source code for chemicalchecker.tool.targetmate.datasets.drugbank.fetch_drugbank

"""Get target data from a local xml DrugBank database.
Adapted from CC B1.001 preprocessing.
"""
import os
import pandas as pd
from chemicalchecker.tool.targetmate.utils.chemistry import read_smiles
from chemicalchecker.util import psql
from chemicalchecker.util import logged

from chemicalchecker.database import Molrepo
import xml.etree.ElementTree as ET
import collections
import numpy as np

[docs]class DrugBankDb(): def __init__(self, drugbank_xml="/aloy/scratch/ptorren/databases/drugbank/full_database.xml", dirs = None): self.drugbank_xml = os.path.abspath(drugbank_xml) if dirs is None: self.dirs = { 'Inhibitor': -1, 'acetylation': -1, 'activator': +1, 'agonist': +1, 'antagonist': -1, 'binder': -1, 'binding': -1, 'blocker': -1, 'cofactor': +1, 'inducer': +1, 'inhibitor': -1, 'inhibitor, competitive': -1, 'inhibitory allosteric modulator': -1, 'intercalation': -1, 'inverse agonist': +1, 'ligand': -1, 'negative modulator': -1, 'partial agonist': +1, 'partial antagonist': -1, 'positive allosteric modulator': +1, 'positive modulator': +1, 'potentiator': +1, 'stimulator': -1, 'suppressor': -1} else: self.dirs = dirs self.prefix = "{http://www.drugbank.ca}" dbid_inchikey = {} inchikey_srcid = {} inchikey_smiles = {} molrepos = Molrepo.get_by_molrepo_name("drugbank") for molrepo in molrepos: if not molrepo.inchikey: continue dbid_inchikey[molrepo.src_id] = molrepo.inchikey inchikey_srcid[molrepo.inchikey] = molrepo.src_id inchikey_smiles[molrepo.inchikey] = molrepo.smiles self.dbid_inchikey = dbid_inchikey self.inchikey_srcid = inchikey_srcid self.inchikey_smiles = inchikey_smiles def decide(self, acts): if len(acts) == 0: return 0 m = np.mean(acts) if m > 0: return 1 else: return -1 def get_inchikey(self, drug): db_id = None for child in drug.findall(self.prefix + "drugbank-id"): if "primary" in child.attrib: if child.attrib["primary"] == "true": db_id = child.text if db_id not in self.dbid_inchikey: return None inchikey = self.dbid_inchikey[db_id] return inchikey def get_actions(self, targ): actions = [] for action in targ.findall(self.prefix + "actions"): for child in action: actions += [child.text] return actions def get_uniprot(self, targ): uniprot_ac = None prot = targ.find(self.prefix + "polypeptide") if not prot: return None if "source" in prot.attrib: if prot.attrib["source"] == "Swiss-Prot": uniprot_ac = prot.attrib["id"] if not uniprot_ac: return None return uniprot_ac def get_targets(self, drug): targets = collections.defaultdict(list) uniprot_ac = None for targs in drug.findall(self.prefix + "targets"): for targ in targs.findall(self.prefix + "target"): actions = self.get_actions(targ) uniprot_ac = self.get_uniprot(targ) if not uniprot_ac: continue targets[uniprot_ac] = actions return targets def parse_drugbank(self): ACTS = collections.defaultdict(list) tree = ET.parse(self.drugbank_xml) root = tree.getroot() DB = {} for drug in root: inchikey = self.get_inchikey(drug) ## LOOK INTO THIS targets = self.get_targets(drug) if not inchikey: continue if not targets: continue DB[inchikey] = targets return DB def store_activities(self, explicit): ACTS = [] DB = self.parse_drugbank() for inchikey, targs in DB.items(): for uniprot_ac, actions in targs.items(): d = [] for action in actions: if action in self.dirs: d += [self.dirs[action]] if explicit: if not d: continue act = self.decide(d) ACTS.append((inchikey, uniprot_ac, self.inchikey_srcid[inchikey], act)) return ACTS
[docs]class DrugBank(DrugBankDb): def __init__(self, output_folder, min_actives=10, uniprot_acs = None, multiclass = True ): DrugBankDb.__init__(self) if output_folder: self.output_folder = os.path.abspath(output_folder) if not os.path.isdir(self.output_folder): os.mkdir(self.output_folder) self.min_actives = min_actives self.uniprot_acs = uniprot_acs self.multiclass = multiclass if self.multiclass: self.file_name = "multiclass" else: self.file_name = "general" def create_folder_hierarchy(self, uniprot_ac): path = os.path.join(self.output_folder, uniprot_ac) if not os.path.isdir(path): os.mkdir(path) return path def get_activities(self, data): if self.multiclass: dat = [[act, self.inchikey_smiles[inchi], self.inchikey_srcid[inchi], inchi] for (inchi, act) in data] else: dat = [[1, self.inchikey_smiles[inchi], self.inchikey_srcid[inchi], inchi] for (inchi, act) in data] return pd.DataFrame(dat) def write_to_file(self, df, path): df.to_csv(os.path.join(path, "{:s}.tsv".format(self.file_name)), sep='\t', header=False, index=False) def parse_activities(self, explicit=True, store=True): molrepos = Molrepo.get_by_molrepo_name("drugbank") ACTS = super().store_activities(explicit) by_protein = collections.defaultdict(list) for (inchikey, uniprot_ac, srcid, act) in ACTS: if by_protein[uniprot_ac]: by_protein[uniprot_ac].append((inchikey, act)) else: by_protein[uniprot_ac] = [(inchikey, act)] if not store: return by_protein data = [] for target in by_protein: if self.uniprot_acs is not None: if target not in self.uniprot_acs: continue if self.min_actives: if len(by_protein[target]) < self.min_actives: continue path = self.create_folder_hierarchy(target) df = self.get_activities(by_protein[target]) self.write_to_file(df, path)