Source code for chemicalchecker.core.examples

"""Example data.

This class fetches examples to be used in tutorials and notebooks.
"""
import os
import csv
import pickle
import numpy as np
import pandas as pd

from chemicalchecker.util import logged


class BaseExample(object):
    """Base class that perturbs example datasets.

    The helpers in this class take clean example data (either a matrix with
    its row keys and column features, or a list of ``(key, feature)`` pairs)
    and inject imperfections into it: duplicated keys/features, NaN and
    infinite values, and all-zero rows/columns. The amount of each
    perturbation is controlled by the proportions given at construction.
    All random choices use the global ``np.random`` state.
    """

    def __init__(self, dup_keys_prop, dup_features_prop, nan_prop, inf_prop,
                 empty_keys_prop, empty_features_prop, force):
        """Initialize a BaseExample instance.

        Args:
            dup_keys_prop(float): Proportion of keys to duplicate.
            dup_features_prop(float): Proportion of features to duplicate.
            nan_prop(float): Proportion of matrix cells to draw for NaN
                replacement.
            inf_prop(float): Proportion of matrix cells to draw for +/-inf
                replacement.
            empty_keys_prop(float): Proportion of rows to set to zero.
            empty_features_prop(float): Proportion of columns to set to zero.
            force(bool): If True, NaN/inf are injected into integer matrices
                too (the matrix is cast to float); if False integer matrices
                are returned untouched by `add_nans` and `add_infs`.
        """
        # Example files live in an "examples" folder next to this module.
        self.path = os.path.join(os.path.dirname(
            os.path.abspath(__file__)), "examples")
        self.dup_keys_prop = dup_keys_prop
        self.dup_features_prop = dup_features_prop
        self.nan_prop = nan_prop
        self.inf_prop = inf_prop
        self.empty_keys_prop = empty_keys_prop
        self.empty_features_prop = empty_features_prop
        self.force = force

    def check_type(self, key_type):
        """Validate a key type.

        Args:
            key_type(str): One of 'inchikey', 'smiles', 'src'.

        Raises:
            Exception: If `key_type` is not one of the accepted values.
        """
        if key_type not in ["inchikey", "smiles", "src"]:
            raise Exception(
                "key_type must be one of 'inchikey', 'smiles' and 'src'")

    def duplicate_keys_pairs(self, pairs):
        """Append pairs that reuse already existing keys.

        Args:
            pairs(list): List of (key, feature) tuples. It is extended in
                place.

        Returns:
            pairs: The same list, with the extra pairs appended.
        """
        # dict.fromkeys keeps first-seen order, so the outcome is
        # reproducible under a fixed numpy seed (set order is not).
        keys = list(dict.fromkeys(p[0] for p in pairs))
        num = int(len(keys) * self.dup_keys_prop)
        if num == 0:
            return pairs
        features = [p[1] for p in pairs]
        keys_ = np.random.choice(keys, num, replace=False)
        features_ = np.random.choice(features, num, replace=True)
        pairs += list(zip(keys_, features_))
        return pairs

    def duplicate_features_pairs(self, pairs):
        """Append pairs that reuse already existing features.

        Args:
            pairs(list): List of (key, feature) tuples. It is extended in
                place.

        Returns:
            pairs: The same list, with the extra pairs appended.
        """
        features = list(dict.fromkeys(p[1] for p in pairs))
        num = int(len(features) * self.dup_features_prop)
        if num == 0:
            return pairs
        keys = [p[0] for p in pairs]
        features_ = np.random.choice(features, num, replace=False)
        keys_ = np.random.choice(keys, num, replace=True)
        pairs += list(zip(keys_, features_))
        return pairs

    def duplicate_keys_matrix(self, X, keys):
        """Stack extra rows labelled with already existing keys.

        The repeated keys and the repeated rows are drawn independently.

        Args:
            X(array): Data matrix, one row per key.
            keys(array): Row labels.

        Returns:
            X, keys
        """
        num = int(X.shape[0] * self.dup_keys_prop)
        if num == 0:
            return X, keys
        keys_ = np.random.choice(keys, num, replace=False)
        idxs_ = np.random.choice(X.shape[0], num, replace=False)
        X = np.vstack([X, X[idxs_]])
        keys = np.array(list(keys) + list(keys_))
        return X, keys

    def duplicate_features_matrix(self, X, features):
        """Stack extra columns labelled with already existing features.

        The repeated features and the repeated columns are drawn
        independently.

        Args:
            X(array): Data matrix, one column per feature.
            features(array): Column labels.

        Returns:
            X, features
        """
        num = int(X.shape[1] * self.dup_features_prop)
        if num == 0:
            return X, features
        features_ = np.random.choice(features, num, replace=False)
        idxs_ = np.random.choice(X.shape[1], num, replace=False)
        X = np.hstack([X, X[:, idxs_]])
        features = np.array(list(features) + list(features_))
        return X, features

    def _keeps_integers(self, X):
        """Tell whether X is an integer matrix that must not be cast."""
        return not self.force and isinstance(X[0, 0], np.integer)

    def add_nans(self, X):
        """Replace randomly drawn cells by NaN.

        Cells are drawn with replacement, so fewer distinct cells than the
        requested proportion may end up being NaN.

        Args:
            X(array): Data matrix.

        Returns:
            X: A float copy with NaNs, or the input itself when nothing is
                to be done (zero cells requested, or integer matrix and
                `force` is False).
        """
        num = int(X.shape[0] * X.shape[1] * self.nan_prop)
        # num is checked first so that X[0, 0] is never read on an empty X.
        if num == 0 or self._keeps_integers(X):
            return X
        X = X.astype(float)
        idxs1 = np.random.choice(X.shape[0], num, replace=True)
        idxs2 = np.random.choice(X.shape[1], num, replace=True)
        X[idxs1, idxs2] = np.nan
        return X

    def add_infs(self, X):
        """Replace randomly drawn cells by +inf or -inf.

        Cells are drawn with replacement, so fewer distinct cells than the
        requested proportion may end up being infinite.

        Args:
            X(array): Data matrix.

        Returns:
            X: A float copy with infinities, or the input itself when
                nothing is to be done (zero cells requested, or integer
                matrix and `force` is False).
        """
        num = int(X.shape[0] * X.shape[1] * self.inf_prop)
        if num == 0 or self._keeps_integers(X):
            return X
        X = X.astype(float)
        idxs1 = np.random.choice(X.shape[0], num, replace=True)
        idxs2 = np.random.choice(X.shape[1], num, replace=True)
        # 0 -> +inf, 1 -> -inf
        dires = np.random.choice(2, num, replace=True)
        X[idxs1, idxs2] = np.where(dires == 0, np.inf, -np.inf)
        return X

    def empty_keys(self, X):
        """Set randomly chosen rows to zero (in place).

        Args:
            X(array): Data matrix.

        Returns:
            X
        """
        num = int(X.shape[0] * self.empty_keys_prop)
        if num == 0:
            return X
        idxs = np.random.choice(X.shape[0], num, replace=False)
        X[idxs, :] = 0
        return X

    def empty_features(self, X):
        """Set randomly chosen columns to zero (in place).

        Args:
            X(array): Data matrix.

        Returns:
            X
        """
        num = int(X.shape[1] * self.empty_features_prop)
        if num == 0:
            return X
        idxs = np.random.choice(X.shape[1], num, replace=False)
        X[:, idxs] = 0
        return X

    def returner_matrix(self, X, keys, features):
        """Apply every matrix perturbation and return the result.

        Order: duplicated keys, duplicated features, NaNs, infinities,
        empty rows, empty columns.

        Returns:
            X, keys, features
        """
        X, keys = self.duplicate_keys_matrix(X, keys)
        X, features = self.duplicate_features_matrix(X, features)
        X = self.add_nans(X)
        X = self.add_infs(X)
        X = self.empty_keys(X)
        X = self.empty_features(X)
        return X, keys, features

    def returner_pairs(self, pairs):
        """Apply every pairs perturbation and return the result.

        Returns:
            pairs
        """
        pairs = self.duplicate_keys_pairs(pairs)
        pairs = self.duplicate_features_pairs(pairs)
        return pairs


@logged
class Example(BaseExample):
    """Example class.

    Gives access to the example datasets shipped with the package. Every
    accessor passes the loaded data through the perturbation helpers
    inherited from `BaseExample`, using the proportions set at construction.
    """

    def __init__(self, dup_keys_prop=0, dup_features_prop=0, nan_prop=0,
                 inf_prop=0, empty_keys_prop=0, empty_features_prop=0,
                 force=False):
        """Initialize an Example instance.

        All arguments are forwarded unchanged to `BaseExample.__init__`;
        with the defaults no perturbation is applied.
        """
        BaseExample.__init__(
            self,
            dup_keys_prop,
            dup_features_prop,
            nan_prop,
            inf_prop,
            empty_keys_prop,
            empty_features_prop,
            force,
        )

    def toy_matrix(self, key_type='inchikey', fit_case=True):
        """A toy matrix, just to test the sign0.

        Args:
            key_type(str): One of 'inchikey', 'smiles', 'src' (source)
                (default='inchikey')
            fit_case(bool): Get a fit case or a predict/transform case
                (default=True)
        Returns:
            X, keys, features
        """
        self.check_type(key_type)
        self.__log.info("Getting a toy matrix dataset")
        pkl_file = os.path.join(self.path, "toy_matrix.pkl")
        with open(pkl_file, "rb") as f:
            d = pickle.load(f)
        # Entries ending in 1 hold the fit case, those ending in 2 the
        # predict/transform case.
        suf = 1 if fit_case else 2
        X = d["X%d" % suf]
        features = d["features%d" % suf]
        keys_entry = {
            "inchikey": "key_inks%d",
            "smiles": "key_smis%d",
            "src": "key_srcs%d",
        }
        keys = d[keys_entry[key_type] % suf]
        return self.returner_matrix(X, keys, features)

    def toy_pairs(self, key_type='inchikey', fit_case=True):
        """Toy pairs, just to test the sign0.

        Args:
            key_type(str): One of 'inchikey', 'smiles', 'src' (source)
                (default='inchikey')
            fit_case(bool): Get a fit case or a predict/transform case
                (default=True)
        Returns:
            pairs
        """
        self.check_type(key_type)
        self.__log.info("Getting a toy pairs dataset")
        pkl_file = os.path.join(self.path, "toy_pairs.pkl")
        with open(pkl_file, "rb") as f:
            d = pickle.load(f)
        suf = 1 if fit_case else 2
        pairs_entry = {
            "inchikey": "pairs_inks%d",
            "smiles": "pairs_smis%d",
            "src": "pairs_srcs%d",
        }
        pairs = d[pairs_entry[key_type] % suf]
        return self.returner_pairs(pairs)

    def drug_targets(self):
        """Drug targets from PharmacoDB.

        Reads the first two columns of a tab-separated file, skipping its
        first row.

        Returns:
            pairs
        """
        self.__log.info("Getting drug targets (example of unweighted pairs)")
        tsv_file = os.path.join(self.path, "pharmacodb_targets.tsv")
        with open(tsv_file, "r") as f:
            reader = csv.reader(f, delimiter="\t")
            next(reader)
            pairs = [(row[0], row[1]) for row in reader]
        return self.returner_pairs(pairs)

    def cell_sensitivity(self):
        """Cell sensitivity data from CTRP.

        The first column of the CSV file provides the keys; the remaining
        column names are the features and their values the matrix.

        Returns:
            X, keys, features
        """
        self.__log.info(
            "Getting cell sensitivity data (example of continuous data)")
        csv_file = os.path.join(self.path, "ctrp_auc.csv")
        df = pd.read_csv(csv_file, delimiter=",")
        key_column = df.columns[0]
        feature_columns = df.columns[1:]
        keys = np.array(df[key_column]).astype(str)
        features = np.array(feature_columns).astype(str)
        X = np.array(df[feature_columns]).astype(float)
        return self.returner_matrix(X, keys, features)

    def fingerprints(self):
        """Morgan fingerprints from LINCS.

        Returns:
            X, keys, features
        """
        self.__log.info("Getting fingerprint data for LINCS molecules")
        pkl_file = os.path.join(self.path, "fps.pkl")
        with open(pkl_file, "rb") as f:
            d = pickle.load(f)
        X = d["V"]
        keys = np.array(d["keys"])
        features = np.array(d["features"])
        return self.returner_matrix(X, keys, features)