"""Molecule InChIKey-InChI mapping.

Simple table storing the correspondence between InChIKey and InChI.

Example::

    from chemicalchecker.database import Molecule
    mol = Molecule.get('RZVAJINKPMORJF-UHFFFAOYSA-N')
    mol.inchi
    >>> 'InChI=1S/C8H9NO2/c1-6(10)9-7-2-4-8(11)5-3-7/h2-5,11H,1H3,(H,9,10)'
"""
from tqdm import trange
import sqlalchemy
from sqlalchemy import Column, Text, VARCHAR
from sqlalchemy.dialects import postgresql
from .database import Base, get_session, get_engine
from chemicalchecker.util import logged
@logged
class Molecule(Base):
    """Molecule table: one row per InChIKey with its InChI string.

    Parameters:
        inchikey(str): primary key, the InChIKey of the molecule
            (stored as ``VARCHAR(27)``).
        inchi(str): the InChI string associated with the InChIKey.
    """

    __tablename__ = 'molecule'
    inchikey = Column(VARCHAR(27), primary_key=True, index=True)
    inchi = Column(Text)

    @staticmethod
    def add(kwargs):
        """Add a single new row to the table.

        Args:
            kwargs(dict): Column values for the new row, keyed by column
                name (``inchikey`` and ``inchi``). Anything that is not a
                dict is ignored (a warning is logged).
        """
        Molecule.__log.debug(type(kwargs))
        if not isinstance(kwargs, dict):
            Molecule.__log.warning(
                "Molecule.add expects a dict, got %s: nothing added",
                type(kwargs))
            return
        struct = Molecule(**kwargs)
        Molecule.__log.debug(struct.inchikey)
        session = get_session()
        try:
            session.add(struct)
            session.commit()
        finally:
            # Always release the session, even when the commit fails.
            session.close()

    @staticmethod
    def add_bulk(data, chunk=1000, on_conflict_do_nothing=True):
        """Add a lot of rows to the table.

        The rows are inserted chunk by chunk inside a single
        ``engine.begin()`` block.

        Args:
            data(list): The data in list format. Each list member is a new
                row given as a sequence whose first item is the InChIKey
                and whose second item is the InChI. The order is important.
            chunk(int): The size of the chunks to load data to the database.
            on_conflict_do_nothing(bool): If True use a PostgreSQL
                ``INSERT ... ON CONFLICT DO NOTHING`` on the ``inchikey``
                column; otherwise use a plain insert.
        """
        engine = get_engine()
        table = Molecule.__table__
        with engine.begin() as conn:
            for pos in range(0, len(data), chunk):
                rows = [{"inchikey": row[0], "inchi": row[1]}
                        for row in data[pos:pos + chunk]]
                if on_conflict_do_nothing:
                    statement = postgresql.insert(table).values(rows)
                    conn.execute(statement.on_conflict_do_nothing(
                        index_elements=[Molecule.inchikey]))
                else:
                    conn.execute(table.insert(), rows)

    @staticmethod
    def get(key):
        """Query the table by InChIKey.

        Args:
            key(str): The InChIKey to look for.
        Returns:
            Molecule: The matching row, or None when there is no match.
        """
        session = get_session()
        try:
            query = session.query(Molecule).filter_by(inchikey=key)
            return query.one_or_none()
        finally:
            session.close()

    @staticmethod
    def get_inchikey_inchi_mapping(inchikeys, batch=10000):
        """Fetch the InChI of each of the given InChIKeys.

        Args:
            inchikeys(list): The InChIKeys to look for.
            batch(int): How many InChIKeys are requested per query.
        Returns:
            dict: One entry per requested InChIKey; the value is the InChI
                found in the table, or None when no row was returned for
                that InChIKey.
        """
        # Materialise once so that any iterable can be sliced in batches.
        inchikeys = list(inchikeys)
        mapping = dict.fromkeys(inchikeys)
        desc = 'Fetching InChIKey-InChI mapping'
        # No progress bar when everything fits in a single batch.
        dis = len(inchikeys) < batch
        session = get_session()
        try:
            for idx in trange(0, len(inchikeys), batch, desc=desc,
                              disable=dis):
                query = session.query(Molecule).filter(
                    Molecule.inchikey.in_(inchikeys[idx:idx + batch]))
                res = query.with_entities(
                    Molecule.inchikey, Molecule.inchi).all()
                mapping.update(dict(res))
        finally:
            session.close()
        return mapping

    @staticmethod
    def get_missing_from_set(keys):
        """Return the InChIKeys that are not in the table yet.

        Args:
            keys(set): The InChIKeys to check.
        Returns:
            set: The members of `keys` for which no row was found.
        """
        size = 1000
        present = set()
        vec = list(keys)
        session = get_session()
        try:
            for pos in range(0, len(vec), size):
                query = session.query(Molecule).filter(
                    Molecule.inchikey.in_(vec[pos:pos + size]))
                res = query.with_entities(Molecule.inchikey).all()
                present.update(ele[0] for ele in res)
        finally:
            session.close()
        Molecule.__log.debug("Found already present: %s", len(present))
        # Sets are used as given; any other iterable is converted first.
        if isinstance(keys, (set, frozenset)):
            wanted = keys
        else:
            wanted = set(vec)
        return wanted.difference(present)

    @staticmethod
    def add_missing_only(data):
        """Add data to the table if not already present.

        Args:
            data(dict): The data in dict format, mapping each InChIKey to
                its InChI.
        """
        set_inks = set(data.keys())
        Molecule.__log.debug("Size initial data to add: %s", len(set_inks))
        todo_iks = Molecule.get_missing_from_set(set_inks)
        Molecule.__log.debug("Size final data to add: %s", len(todo_iks))
        list_inchikey_inchi = [(ik, inchi) for ik, inchi in data.items()
                               if ik in todo_iks]
        if list_inchikey_inchi:
            Molecule.add_bulk(list_inchikey_inchi)

    @staticmethod
    def _create_table():
        """Call ``create_all`` on the metadata of ``Base``."""
        engine = get_engine()
        Base.metadata.create_all(engine)

    @staticmethod
    def _table_exists():
        """Return whether the engine reports a table with this name."""
        engine = get_engine()
        return sqlalchemy.inspect(engine).has_table(Molecule.__tablename__)

    @staticmethod
    def _drop_table():
        """Drop this table from the database."""
        engine = get_engine()
        Molecule.__table__.drop(engine)