Source code for chemicalchecker.util.pipeline.tasks_web.task_web_similars

import os
import h5py
import json
import shutil
import tempfile
from tqdm import tqdm

from chemicalchecker.util import psql
from chemicalchecker.util.pipeline import BaseTask
from chemicalchecker.util import logged, HPC


# We got these strings by doing: pg_dump -t 'scores' --schema-only mosaic
# -h aloy-dbsrv


@logged
class Similars(BaseTask):
    """Pipeline task that builds the per-molecule 'explore' JSON files.

    The task prepares two helper JSON files (bioactive library membership
    and inchikey -> PubChem name), launches a job array that runs
    ``scripts/similars.py`` over every molecule of the universe, and then
    verifies that each molecule got its ``explore_<version>.json`` file
    under ``MOLECULES_PATH``.

    Required parameters (passed through ``**params``):
        DB: name of the web database, also used to derive the version.
        CC_ROOT: forwarded as-is to the job script.
        MOLECULES_PATH: root directory holding the per-molecule folders.
    """

    def __init__(self, name=None, **params):
        """Initialize the task and validate its mandatory parameters.

        Args:
            name: Task name; reused as ``task_id`` when none is provided.
            **params: Task parameters, forwarded to ``BaseTask.__init__``.

        Raises:
            Exception: If ``DB``, ``CC_ROOT`` or ``MOLECULES_PATH`` is
                missing (or ``None``).
        """
        if params.get('task_id', None) is None:
            params['task_id'] = name
        BaseTask.__init__(self, name, **params)

        # Same check order and messages as before: DB, CC_ROOT,
        # MOLECULES_PATH. The attribute is set before the check so that it
        # exists (as None) even when the exception is raised.
        for key in ('DB', 'CC_ROOT', 'MOLECULES_PATH'):
            setattr(self, key, params.get(key, None))
            if getattr(self, key) is None:
                raise Exception('%s parameter is not set' % key)

    def run(self):
        """Generate and verify the 'explore' JSON file of every molecule.

        Steps:
            1. Read the universe keys from ``universe.h5`` (dataset
               ``keys``) in ``self.cachedir``.
            2. Dump the bioactive library membership to ``lib_bio.json``
               in ``self.tmpdir`` (skipped when the file already exists).
            3. Dump the inchikey -> PubChem name mapping to
               ``inchies_names.json`` (skipped when the file exists).
            4. Submit the job array running ``scripts/similars.py``.
            5. Check that every molecule has its
               ``explore_<version>.json``; on success remove the job
               directory and mark the task as ready.

        Raises:
            Exception: If the names file could not be created or some
                explore files are missing while ``self.custom_ready()``
                is falsy. When it is truthy the problem is only logged.
        """
        script_path = os.path.join(
            os.path.dirname(os.path.realpath(__file__)),
            "scripts/similars.py")

        universe_file = os.path.join(self.cachedir, "universe.h5")
        with h5py.File(universe_file, 'r') as hf:
            universe_keys = hf["keys"][:]
        num_keys = len(universe_keys)

        # get all bioactive compounds from libraries (with pubchem names)
        lib_bio_file = os.path.join(self.tmpdir, "lib_bio.json")
        if not os.path.exists(lib_bio_file):
            text_bio = (
                "select library_description.name as lib,lib.inchikey "
                "from libraries as lib "
                "INNER JOIN library_description "
                "on lib.lib = library_description.lib "
                "where lib.is_bioactive = '1' "
                "order by library_description.rank")
            lib_bio = psql.qstring(text_bio, self.DB)
            # Group inchikeys per library; a set removes duplicates.
            grouped = dict()
            for row in lib_bio:
                grouped.setdefault(row[0], set()).add(row[1])
            # Sets are not JSON serializable: convert once per library
            # (the previous code re-converted for every database row).
            ref_bioactive = {
                lib_name: list(inchikeys)
                for lib_name, inchikeys in grouped.items()
            }
            with open(lib_bio_file, 'w') as outfile:
                json.dump(ref_bioactive, outfile)

        # save chunks of inchikey pubmed synonyms
        ik_names_file = os.path.join(self.tmpdir, "inchies_names.json")
        if not os.path.exists(ik_names_file):
            names_map = {}
            # Query in chunks to keep each VALUES list bounded.
            for input_data in self.__chunker(universe_keys):
                values = ', '.join(
                    "('{0}')".format(w) for w in input_data)
                data = psql.qstring(
                    "select inchikey_pubchem as inchikey,name from pubchem "
                    "INNER JOIN( VALUES " + values +
                    ") vals(v) ON (inchikey_pubchem = v)", self.DB)
                for row in data:
                    inchi, name = row[0], row[1]
                    # Fall back to the inchikey when no name is stored.
                    names_map[inchi] = inchi if name is None else name
            if names_map:
                with open(ik_names_file, 'w') as outfile:
                    json.dump(names_map, outfile)
            else:
                message = "Inchikeys name JSON file was not created"
                if not self.custom_ready():
                    raise Exception(message)
                self.__log.error(message)
                return

        self.__log.info("Launching jobs to create json files for " +
                        str(num_keys) + " molecules")

        job_path = tempfile.mkdtemp(
            prefix='jobs_similars_', dir=self.tmpdir)
        version = self.DB.replace("cc_web_", '')
        mol_path = self.MOLECULES_PATH

        job_params = {
            # Roughly 200 molecules per job. Floor division keeps the
            # count an integer (``/`` yields a float on Python 3) and the
            # clamp avoids requesting zero jobs for small universes.
            # NOTE(review): assumes submitMultiJob expects a positive
            # integer here - confirm against the HPC implementation.
            "num_jobs": max(1, num_keys // 200),
            "jobdir": job_path,
            "job_name": "CC_JSONSIM",
            "elements": universe_keys,
            "memory": 2,
            "wait": True,
        }

        # job command
        cc_config_path = self.config.config_path
        cc_package = os.path.join(self.config.PATH.CC_REPO, 'package')
        singularity_image = self.config.PATH.SINGULARITY_IMAGE
        command = (
            "SINGULARITYENV_PYTHONPATH={} SINGULARITYENV_CC_CONFIG={} "
            "singularity exec {} python {} <TASK_ID> <FILE> "
            "{} {} {} {} {} {}"
        ).format(
            cc_package, cc_config_path, singularity_image, script_path,
            ik_names_file, lib_bio_file, mol_path, self.DB, version,
            self.CC_ROOT)

        # submit jobs
        cluster = HPC.from_config(self.config)
        cluster.submitMultiJob(command, **job_params)

        self.__log.info("Checking results")
        explore_name = 'explore_' + version + '.json'
        missing_keys = list()
        for inchikey in tqdm(universe_keys):
            # Layout: <mol_path>/<ik[:2]>/<ik[2:4]>/<ik>/explore_<v>.json
            explore_path = mol_path + "/%s/%s/%s/%s" % (
                inchikey[:2], inchikey[2:4], inchikey, explore_name)
            if not os.path.exists(explore_path):
                missing_keys.append(inchikey)

        if missing_keys:
            message = (
                "Not all molecules have their json explore file (%d/%d)"
                % (len(missing_keys), num_keys))
            if not self.custom_ready():
                raise Exception(message)
            self.__log.error(message)
        else:
            # Everything was produced: clean up and flag completion.
            shutil.rmtree(job_path, ignore_errors=True)
            self.mark_ready()

    def __chunker(self, data, size=2000):
        """Yield consecutive slices of ``data`` with at most ``size`` items.

        Args:
            data: Any sliceable sequence supporting ``len()``.
            size: Maximum number of items per yielded chunk.
        """
        for start in range(0, len(data), size):
            yield data[start:start + size]

    def execute(self, context):
        """Set ``tmpdir`` from the execution context and run the task.

        Args:
            context: Mapping that provides ``context['params']['tmpdir']``.
        """
        self.tmpdir = context['params']['tmpdir']
        self.run()