import os
import h5py
import json
import shutil
import tempfile
from tqdm import tqdm
from chemicalchecker.util import psql
from chemicalchecker.util.pipeline import BaseTask
from chemicalchecker.util import logged, HPC
# We got these strings by doing: pg_dump -t 'scores' --schema-only mosaic
# -h aloy-dbsrv
@logged
class Similars(BaseTask):
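    """Pipeline task generating per-molecule 'explore' JSON files.

    The task gathers bioactive library membership and PubChem names for
    the universe of inchikeys, then launches HPC jobs running
    ``scripts/similars.py`` to write an ``explore_<version>.json`` file
    for every molecule under ``MOLECULES_PATH``.
    """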
    def __init__(self, name=None, **params):
        task_id = params.get('task_id', None)
        if task_id is None:
            params['task_id'] = name
        BaseTask.__init__(self, name, **params)

        self.DB = params.get('DB', None)
        if self.DB is None:
            raise Exception('DB parameter is not set')
        self.CC_ROOT = params.get('CC_ROOT', None)
        if self.CC_ROOT is None:
            raise Exception('CC_ROOT parameter is not set')
        self.MOLECULES_PATH = params.get('MOLECULES_PATH', None)
        if self.MOLECULES_PATH is None:
            raise Exception('MOLECULES_PATH parameter is not set')
    def run(self):
        """Run the similars step."""
        script_path = os.path.join(os.path.dirname(
            os.path.realpath(__file__)), "scripts/similars.py")
        universe_file = os.path.join(self.cachedir, "universe.h5")
        with h5py.File(universe_file, 'r') as hf:
            # h5py string datasets may come back as bytes; decode to str
            # so the keys can be used in SQL queries and file paths
            universe_keys = [k.decode() if isinstance(k, bytes) else k
                             for k in hf["keys"][:]]
        # get all bioactive compounds from libraries (with pubchem names)
        lib_bio_file = os.path.join(self.tmpdir, "lib_bio.json")
        if not os.path.exists(lib_bio_file):
            text_bio = (
                "select library_description.name as lib, lib.inchikey "
                "from libraries as lib "
                "INNER JOIN library_description "
                "on lib.lib = library_description.lib "
                "where lib.is_bioactive = '1' "
                "order by library_description.rank")
            lib_bio = psql.qstring(text_bio, self.DB)
            # group inchikeys by library name
            ref_bioactive = dict()
            for lib in lib_bio:
                if lib[0] not in ref_bioactive:
                    ref_bioactive[lib[0]] = set()
                ref_bioactive[lib[0]].add(lib[1])
            # sets are not JSON serializable, convert them to lists
            for lib_name in ref_bioactive:
                ref_bioactive[lib_name] = list(ref_bioactive[lib_name])
            with open(lib_bio_file, 'w') as outfile:
                json.dump(ref_bioactive, outfile)
        # save inchikey to pubchem name mapping (queried in chunks)
        ik_names_file = os.path.join(self.tmpdir, "inchies_names.json")
        if not os.path.exists(ik_names_file):
            names_map = {}
            for input_data in self.__chunker(universe_keys):
                data = psql.qstring(
                    "select inchikey_pubchem as inchikey,name from pubchem "
                    "INNER JOIN( VALUES " +
                    ', '.join("('{0}')".format(w) for w in input_data) +
                    ") vals(v) ON (inchikey_pubchem = v)", self.DB)
                for inchikey, name in data:
                    # fall back to the inchikey itself when no name is known
                    if name is None:
                        name = inchikey
                    names_map[inchikey] = name
            if len(names_map) > 0:
                with open(ik_names_file, 'w') as outfile:
                    json.dump(names_map, outfile)
            else:
                if not self.custom_ready():
                    raise Exception(
                        "Inchikeys name JSON file was not created")
                else:
                    self.__log.error(
                        "Inchikeys name JSON file was not created")
                    return
self.__log.info("Launching jobs to create json files for " +
str(len(universe_keys)) + " molecules")
job_path = tempfile.mkdtemp(
prefix='jobs_similars_', dir=self.tmpdir)
version = self.DB.replace("cc_web_", '')
mol_path = self.MOLECULES_PATH
params = {}
params["num_jobs"] = len(universe_keys) / 200
params["jobdir"] = job_path
params["job_name"] = "CC_JSONSIM"
params["elements"] = universe_keys
params["memory"] = 2
params["wait"] = True
        # job command
        cc_config_path = self.config.config_path
        cc_package = os.path.join(self.config.PATH.CC_REPO, 'package')
        singularity_image = self.config.PATH.SINGULARITY_IMAGE
        command = ("SINGULARITYENV_PYTHONPATH={} SINGULARITYENV_CC_CONFIG={} "
                   "singularity exec {} python {} <TASK_ID> <FILE> "
                   "{} {} {} {} {} {}")
        command = command.format(
            cc_package, cc_config_path, singularity_image, script_path,
            ik_names_file, lib_bio_file, mol_path, self.DB, version,
            self.CC_ROOT)
        # submit jobs
        cluster = HPC.from_config(self.config)
        jobs = cluster.submitMultiJob(command, **params)
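        # with wait=True the submission should block until all jobs finish
        # (assumption), so the per-molecule output files can be checked now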
self.__log.info("Checking results")
missing_keys = list()
for i in tqdm(range(len(universe_keys))):
inchikey = universe_keys[i]
PATH = mol_path + "/%s/%s/%s/%s" % (
inchikey[:2], inchikey[2:4], inchikey, 'explore_' + version + '.json')
if not os.path.exists(PATH):
missing_keys.append(inchikey)
if len(missing_keys) != 0:
if not self.custom_ready():
raise Exception(
"Not all molecules have their json explore file (%d/%d)" % (len(missing_keys), len(universe_keys)))
else:
self.__log.error(
"Not all molecules have their json explore file (%d/%d)" % (len(missing_keys), len(universe_keys)))
else:
shutil.rmtree(job_path, ignore_errors=True)
self.mark_ready()
    def __chunker(self, data, size=2000):
        # yield successive chunks of 'size' elements
        for i in range(0, len(data), size):
            yield data[i:i + size]
    def execute(self, context):
        """Run the similars step."""
        self.tmpdir = context['params']['tmpdir']
        self.run()