Source code for chemicalchecker.tool.node2vec.word2vec

from gensim.models import Word2Vec
from chemicalchecker.util import logged
from gensim.models import KeyedVectors
from gensim.test.utils import datapath
import h5py
import numpy as np


@logged
class Word2VecWrapper():
    """Wrapper around gensim Word2Vec training and word-vector I/O.

    The wrapper keeps the trained (or loaded) vectors in ``word_vectors``
    and the configuration object it was built with in ``recipe``.
    """

    def __init__(self, recipe):
        """Store the recipe that configures the embedding.

        Args:
            recipe: Configuration object. ``learn_embedding`` reads
                ``recipe.params.embedding.{dimensions, window_size,
                min_count, workers}`` from it. May be ``None`` when the
                instance is only used to hold loaded vectors (see ``load``).
        """
        self.__log.info("INIT")
        self.recipe = recipe

    def learn_embedding(self, corpus):
        """Train a skip-gram Word2Vec model and keep its word vectors.

        Args:
            corpus: Iterable of sentences passed straight to
                ``gensim.models.Word2Vec``.
        """
        params = self.recipe.params.embedding
        # NOTE(review): `size` is the gensim<4 keyword; gensim>=4 renamed it
        # to `vector_size`. Confirm against the pinned gensim version.
        embedding = Word2Vec(
            corpus,
            size=params.dimensions,
            window=params.window_size,
            min_count=params.min_count,
            workers=params.workers,
            sg=1)  # sg=1 selects skip-gram
        self.word_vectors = embedding.wv

    def save(self, filepath):
        """Save the current word vectors to ``filepath`` (gensim format)."""
        self.word_vectors.save(filepath)

    @classmethod
    def load(cls, filepath, embedding_format='gensim'):
        """Build a wrapper (with ``recipe=None``) from stored word vectors.

        Args:
            filepath: Location of the stored vectors.
            embedding_format: ``'gensim'`` (native ``KeyedVectors`` file,
                memory-mapped read-only), ``'c_txt'`` (word2vec C text
                format) or ``'c_bin'`` (word2vec C binary format).

        Returns:
            A new instance whose ``word_vectors`` attribute is populated.

        Raises:
            Exception: If ``embedding_format`` is not one of the above.
        """
        w2v = cls(None)
        if embedding_format == 'gensim':
            w2v.word_vectors = KeyedVectors.load(filepath, mmap='r')
        elif embedding_format in ('c_txt', 'c_bin'):
            # NOTE(review): gensim.test.utils.datapath joins its argument
            # onto gensim's bundled test-data directory; only absolute paths
            # survive that join unchanged. Kept as-is; confirm callers always
            # pass absolute paths.
            w2v.word_vectors = KeyedVectors.load_word2vec_format(
                datapath(filepath), binary=(embedding_format == 'c_bin'))
        else:
            # Previously an unknown format silently returned a wrapper
            # without `word_vectors`; fail loudly like `convert` does.
            raise Exception("Unrecognized embedding format.")
        return w2v

    @staticmethod
    def convert(in_file, out_file, in_format='c_txt', out_format='h5',
                names_map=None, limit_words=set()):
        """Convert a word2vec text embedding into an HDF5 file.

        The input is read line by line after discarding the first (header)
        line. The first field of each line is parsed as an integer word id
        and the remaining fields as the embedding vector. Rows are written
        sorted by word id into two datasets: ``'inchikeys'`` (names looked
        up by word id) and ``'V'`` (the vectors).

        Args:
            in_file: Path of the embedding to read.
            out_file: Path of the HDF5 file to write.
            in_format: Only ``'c_txt'`` is supported.
            out_format: Only ``'h5'`` is supported.
            names_map: Text file whose second whitespace-separated column
                holds the names (read as 27-byte strings); the integer word
                id is used as the row index into that column. Required.
            limit_words: Word ids to leave out of the output. The default
                set is never mutated here.
                NOTE(review): despite the name, ids in this collection are
                skipped, not kept; confirm this is the intended meaning.

        Raises:
            Exception: On an unrecognized ``in_format`` / ``out_format`` or
                a missing ``names_map``.
        """
        log = Word2VecWrapper.__log
        log.info("Converting %s to %s", in_file, out_file)

        # Validate everything up front so a bad argument does not cost a
        # full pass over a potentially large embedding file.
        if in_format != 'c_txt':
            raise Exception("Unrecognized input format.")
        if out_format != 'h5':
            raise Exception("Unrecognized output format.")
        if names_map is None:
            raise Exception("names_map is required for 'h5' output.")

        words = list()
        vectors = list()
        skipped = 0
        with open(in_file, 'r') as fh:
            fh.readline()  # skip first row
            for line in fh:
                fields = line.split()
                if not fields:
                    # tolerate blank lines (e.g. a trailing newline)
                    continue
                # first column is the id
                word = int(fields[0])
                if word in limit_words:
                    skipped += 1
                    continue
                # then the embedding; builtin `float` replaces the removed
                # `np.float` alias (gone since NumPy 1.24), same dtype.
                vector = np.fromiter((float(x) for x in fields[1:]),
                                     dtype=float)
                words.append(word)
                vectors.append(vector)

        # to numpy arrays; force an integer dtype so that an empty list
        # still yields a valid index array
        words = np.array(words, dtype=int)
        matrix = np.array(vectors)
        # get them sorted
        sorted_idx = np.argsort(words)
        log.info('words: %s', str(words.shape))
        log.info('matrix: %s', str(matrix.shape))
        log.info('skipped: %s', skipped)

        # ndmin=1 keeps `names` indexable when the file has a single row
        names = np.loadtxt(names_map, dtype='|S27', usecols=[1], ndmin=1)
        with h5py.File(out_file, "w") as fh:
            fh.create_dataset('inchikeys', data=names[words[sorted_idx]])
            fh.create_dataset('V', data=matrix[sorted_idx])