# Source code for chemicalchecker.database.molecule

"""Molecule InChIKey-InChI mapping.

Simple table storing the correspondence between InChIKey and InChI.

Example::

    from chemicalchecker.database import Molecule
    mol = Molecule.get('RZVAJINKPMORJF-UHFFFAOYSA-N')
    mol.inchi
    >>> 'InChI=1S/C8H9NO2/c1-6(10)9-7-2-4-8(11)5-3-7/h2-5,11H,1H3,(H,9,10)'

"""
from tqdm import trange
import sqlalchemy
from sqlalchemy import Column, Text, VARCHAR
from sqlalchemy.dialects import postgresql

from .database import Base, get_session, get_engine

from chemicalchecker.util import logged


@logged
class Molecule(Base):
    """Molecule table: one row per InChIKey with its InChI string.

    Parameters:
        inchikey(str): primary key, the InChIKey of the molecule
            (stored as VARCHAR(27)).
        inchi(str): the InChI string of the molecule.
    """
    __tablename__ = 'molecule'
    inchikey = Column(VARCHAR(27), primary_key=True, index=True)
    inchi = Column(Text)

    @staticmethod
    def add(kwargs):
        """Add a new row to the table.

        Args:
            kwargs(dict): Column values of the new row keyed by column
                name (``inchikey`` and ``inchi``).

        Raises:
            TypeError: if ``kwargs`` is not a dictionary.
        """
        Molecule.__log.debug(type(kwargs))
        # Fail early with a clear message instead of hitting an unbound
        # local variable further down.
        if not isinstance(kwargs, dict):
            raise TypeError(
                "Molecule.add expects a dict, got %s" % type(kwargs).__name__)
        struct = Molecule(**kwargs)

        Molecule.__log.debug(struct.inchikey)
        session = get_session()
        try:
            session.add(struct)
            session.commit()
        finally:
            # Always release the session, also when the commit fails.
            session.close()

    @staticmethod
    def add_bulk(data, chunk=1000, on_conflict_do_nothing=True):
        """Add a large number of rows to the table.

        Rows are sent to the database in chunks, all inside the single
        ``engine.begin()`` block.

        Args:
            data(list): The rows to add. Each member is a sequence whose
                first element is the InChIKey and second is the InChI.
                The order is important.
            chunk(int): The number of rows sent per INSERT statement.
            on_conflict_do_nothing(bool): If True use the PostgreSQL
                ``ON CONFLICT DO NOTHING`` clause on the ``inchikey``
                column; if False issue a plain INSERT.
        """
        engine = get_engine()
        with engine.begin() as conn:
            for pos in range(0, len(data), chunk):
                # Both insert flavours take the same list of row dicts.
                rows = [{"inchikey": row[0], "inchi": row[1]}
                        for row in data[pos:pos + chunk]]
                if on_conflict_do_nothing:
                    insert_stmt = postgresql.insert(Molecule.__table__)
                    conn.execute(
                        insert_stmt.values(rows).on_conflict_do_nothing(
                            index_elements=[Molecule.inchikey]))
                else:
                    conn.execute(Molecule.__table__.insert(), rows)

    @staticmethod
    def get(key):
        """Fetch the row with the given InChIKey.

        Args:
            key(str): The InChIKey to look up.

        Returns:
            Molecule: the matching row, or None when there is no match.
        """
        session = get_session()
        try:
            query = session.query(Molecule).filter_by(inchikey=key)
            return query.one_or_none()
        finally:
            session.close()

    @staticmethod
    def get_inchikey_inchi_mapping(inchikeys, batch=10000):
        """Map each requested InChIKey to its stored InChI.

        Args:
            inchikeys(iterable): The InChIKeys to look up.
            batch(int): The number of InChIKeys queried per statement.

        Returns:
            dict: one entry per requested InChIKey; the value is the InChI
                returned by the query, or None when no row was returned
                for that key.
        """
        # Materialise once so any iterable can be measured and sliced.
        inchikeys = list(inchikeys)
        mapping = dict.fromkeys(inchikeys)

        session = get_session()
        try:
            desc = 'Fetching InChIKey-InChI mapping'
            # No progress bar when everything fits in a single batch.
            dis = len(inchikeys) < batch
            for idx in trange(0, len(inchikeys), batch, desc=desc,
                              disable=dis):
                query = session.query(Molecule).filter(
                    Molecule.inchikey.in_(inchikeys[idx:idx + batch]))
                res = query.with_entities(
                    Molecule.inchikey, Molecule.inchi).all()
                mapping.update(dict(res))
        finally:
            # The session used to be left open here.
            session.close()

        return mapping

    @staticmethod
    def get_missing_from_set(keys, size=1000):
        """Return the InChIKeys that are not yet in the table.

        Args:
            keys(iterable): The InChIKeys to check, typically a set.
            size(int): The number of InChIKeys queried per statement.

        Returns:
            set: the members of ``keys`` not returned by the queries.
        """
        # Keep a set passed by the caller as is, coerce anything else.
        if isinstance(keys, (set, frozenset)):
            key_set = keys
        else:
            key_set = set(keys)
        vec = list(key_set)
        present = set()

        session = get_session()
        try:
            for pos in range(0, len(vec), size):
                query = session.query(Molecule).filter(
                    Molecule.inchikey.in_(vec[pos:pos + size]))
                res = query.with_entities(Molecule.inchikey).all()
                present.update(ele[0] for ele in res)
        finally:
            session.close()

        Molecule.__log.debug("Found already present: " + str(len(present)))

        return key_set.difference(present)

    @staticmethod
    def add_missing_only(data):
        """Add data to the table if not already present.

        Args:
            data(dict): The data in dict format, mapping each InChIKey to
                its InChI.
        """
        set_inks = set(data.keys())

        Molecule.__log.debug(
            "Size initial data to add: " + str(len(set_inks)))

        todo_iks = Molecule.get_missing_from_set(set_inks)

        Molecule.__log.debug("Size final data to add: " + str(len(todo_iks)))

        list_inchikey_inchi = [(ik, inchi) for ik, inchi in data.items()
                               if ik in todo_iks]

        if list_inchikey_inchi:
            Molecule.add_bulk(list_inchikey_inchi)

    @staticmethod
    def _create_table():
        """Create the tables registered on ``Base.metadata``.

        NOTE(review): ``create_all`` is called on the shared metadata, so
        this is presumably not limited to the molecule table -- confirm.
        """
        engine = get_engine()
        Base.metadata.create_all(engine)

    @staticmethod
    def _table_exists():
        """Return whether the molecule table exists in the database."""
        engine = get_engine()
        return sqlalchemy.inspect(engine).has_table(Molecule.__tablename__)

    @staticmethod
    def _drop_table():
        """Drop the molecule table from the database."""
        engine = get_engine()
        Molecule.__table__.drop(engine)