Source code for chemicalchecker.util.download.download

"""Systematize download and decompression from external repositories."""
import os
import sys
import shutil
import tempfile
from glob import glob
from time import sleep
from ftplib import FTP
from subprocess import call
from six.moves import urllib
from six.moves.urllib import request
from six.moves.urllib.parse import urlparse

from chemicalchecker.util import logged
from chemicalchecker.util import Config
from chemicalchecker.util import psql


@logged
class Downloader():
    """Download a resource, unpack it and move it to its destination.

    A resource is fetched from an ``ftp``, ``http(s)`` or ``file`` url into
    a temporary directory, extracted when it is a recognised archive and
    then moved to ``data_path``. Optionally an SQL dump found in the
    extracted content is loaded into a freshly re-created PostgreSQL
    database.

    The private ``__log`` attribute used throughout is not defined in this
    class; presumably it is injected by the ``logged`` decorator.
    """

    def __init__(self, url, data_path, tmp_dir=None, dbname=None, file=None):
        """Initialize a Download instance.

        Download from url, unpack in tmp_dir, and copy to data_path.
        We allow wild-char in url only for FTP links. We resolve the link
        and make sure there is only one file matching the regex.

        If the url cannot be validated a warning is logged and the url is
        kept as given, so construction itself does not fail on that.

        Args:
            url(str): The external link to a file.
            data_path(str): Final destination for downloaded stuff.
            tmp_dir(str): Temp download path, from config by default.
            dbname(str): When given, `download` drops and re-creates the
                database with this name and loads `file` into it.
            file(str): With `dbname`: path (relative to the extracted
                content, `*` allowed) of the SQL dump to load. Without
                `dbname`: name given inside `data_path` to the downloaded
                item when the extracted content is a single entry.
        """
        self.__log.debug('%s to %s', url, data_path)
        self.data_path = data_path
        self.dbname = dbname
        self.file = file
        if not tmp_dir:
            tmp_dir = Config().PATH.CC_TMP
        self.tmp_dir = tmp_dir
        try:
            self.url = Downloader.validate_url(url)
        except Exception as err:
            # Deliberate best-effort: an unreachable server at construction
            # time must not prevent a later download attempt.
            self.__log.warning("Cannot validate url: %s", str(err))
            self.url = url

    @staticmethod
    def validate_url(url):
        """Validate and check if url is working.

        Args:
            url(str): An ``ftp``, ``http``, ``https`` or ``file`` url.

        Returns:
            str: The url to use for the download. For FTP this is the url
                with its path replaced by the single entry the server
                listed for it; otherwise it is `url` unchanged.

        Raises:
            RuntimeError: If the url resolves to no file or to several
                files, contains `*` in an HTTP(S) path, or uses an
                unrecognized protocol.
            Exception: If every FTP connection attempt fails.
        """
        parsed = urlparse(url)
        if parsed.scheme == 'ftp':
            # Connecting can fail transiently, retry a few times.
            attempts = 0
            connected = False
            while attempts < 5:
                try:
                    ftp = FTP(parsed.netloc)
                    connected = True
                    break
                except Exception as err:
                    attempts += 1
                    Downloader.__log.warning('Attempt failed: %s', str(err))
                    request.urlcleanup()
                    sleep(5)
            if not connected:
                raise Exception('All attempts to connect failed.')
            try:
                ftp.login()
                files = ftp.nlst(parsed.path)
            finally:
                # Always release the control connection, also on errors.
                ftp.close()
            if len(files) > 1:
                raise RuntimeError('Url resolving to multiple files.')
            if len(files) == 0:
                raise RuntimeError('Url not resolving to file.')
            parsed = parsed._replace(path=files[0])
            new_url = parsed.geturl()
            Downloader.__log.debug('Resolved to as: %s', new_url)
        elif parsed.scheme == 'http' or parsed.scheme == 'https':
            if '*' in parsed.path:
                raise RuntimeError('Wild-char `*` in url not accepted.')
            headers = {'User-Agent': 'Mozilla/5.0'}
            req = request.Request(url, headers=headers)
            # Only the headers are needed to know the resource exists.
            req.get_method = lambda: 'HEAD'
            try:
                response = request.urlopen(req, timeout=60)
            except urllib.error.HTTPError:
                raise RuntimeError('Url not resolving to file.')
            response.close()
            new_url = url
        elif parsed.scheme == 'file':
            if os.path.isfile(parsed.path):
                new_url = url
            else:
                raise RuntimeError(
                    'Url not resolving to file: %s.' % parsed.path)
        else:
            raise RuntimeError('Unrecognized URL protocol.')
        return new_url

    def download(self):
        """Perform the download.

        The resource is fetched (or copied, for ``file`` urls) into a fresh
        temporary directory below `self.tmp_dir`, extracted into an
        ``unzip`` sub-directory when its MIME type is a known archive type,
        optionally loaded into a database (when `self.dbname` is set) and
        finally moved to `self.data_path`. The temporary directory is
        removed afterwards, also when a step fails.

        Raises:
            ImportError: If `wget` or `patoolib` are not installed.
            Exception: If all download attempts fail or the `*` pattern in
                `self.file` does not match exactly one file.
            patoolib.util.PatoolError: If the archive cannot be extracted.
            RuntimeError: If the database created seems to be empty.
        """
        try:
            import wget
        except ImportError:
            raise ImportError("requires wget " +
                              "http://bitbucket.org/techtonik/python-wget/src")
        try:
            import patoolib
        except ImportError:
            raise ImportError("requires patoolib " +
                              "http://wummel.github.io/patool/")

        # create temp dir
        tmp_dir = tempfile.mkdtemp(prefix='tmp_', dir=self.tmp_dir)
        self.__log.debug('temp download dir %s', tmp_dir)
        try:
            # set wget user agent
            # NOTE(review): FancyURLopener is deprecated in Python 3;
            # confirm it is still available on the target interpreter.
            class MyOpener(urllib.request.FancyURLopener):
                version = 'Mozilla/5.0'
            wget.ulib.urlretrieve = MyOpener().retrieve

            # determine file name
            parsed = urlparse(self.url)
            tmp_file = os.path.join(tmp_dir, wget.detect_filename(self.url))

            # download
            if parsed.scheme == 'file':
                # file can be on local filesystem
                shutil.copy(parsed.path, tmp_dir)
            else:
                # or has to be downloaded, in case try several times
                attempts = 0
                downloaded = False
                while attempts < 5:
                    try:
                        wget.download(self.url, tmp_file)
                        downloaded = True
                        break
                    except Exception as err:
                        attempts += 1
                        self.__log.warning('Attempt failed: %s', str(err))
                        request.urlcleanup()
                        sleep(5)
                if not downloaded:
                    raise Exception('All attempts to download failed.')

            # unzip
            tmp_unzip_dir = os.path.join(tmp_dir, 'unzip')
            os.mkdir(tmp_unzip_dir)
            self.__log.debug('temp unzip dir %s', tmp_unzip_dir)
            # not a clear way to check if file is compressed, just try
            mime, compression = patoolib.util.guess_mime(tmp_file)
            self.__log.debug("MIME %s COMPRESSION %s", mime, compression)
            if mime not in patoolib.ArchiveMimetypes:
                self.__log.debug('no need to uncompress %s, copying',
                                 tmp_file)
                shutil.move(tmp_file, tmp_unzip_dir)
            else:
                try:
                    patoolib.extract_archive(tmp_file, outdir=tmp_unzip_dir)
                except patoolib.util.PatoolError as err:
                    self.__log.error('problem uncompressing %s', tmp_file)
                    self.__log.error('error was: %s', str(err))
                    # bare raise keeps the original traceback
                    raise

            if self.dbname is not None:
                self._load_sql_dump(tmp_unzip_dir)

            # move to final destination
            source = tmp_unzip_dir
            destination = self.data_path
            entries = os.listdir(source)
            if self.file is not None and self.dbname is None \
                    and len(entries) == 1:
                # a single extracted entry is stored under the given name
                source = os.path.join(source, entries[0])
                if not os.path.exists(self.data_path):
                    os.makedirs(self.data_path, 0o775)
                destination = os.path.join(self.data_path, self.file)
            shutil.move(source, destination)
            self.__log.debug('downloaded to %s', self.data_path)
        except BaseException:
            # Do not leak (possibly large) temporary data when a step
            # fails; cleanup errors must not mask the original one.
            shutil.rmtree(tmp_dir, ignore_errors=True)
            raise

        # remove temp dir
        shutil.rmtree(tmp_dir)
        self.__log.debug('removed temp dir %s', tmp_dir)

    def _load_sql_dump(self, unzip_dir):
        """Re-create database `self.dbname` and load the SQL dump into it.

        Runs ``dropdb --if-exists``, ``createdb`` and ``psql -f`` in
        sequence, stopping at the first command that returns a non-zero
        code, then checks the size PostgreSQL reports for the database.

        Args:
            unzip_dir(str): Directory holding the extracted content;
                `self.file` is resolved relative to it.

        Raises:
            Exception: If `*` in `self.file` matches no or several files.
            RuntimeError: If the reported database size is in ``kB``.
        """
        file_path = os.path.join(unzip_dir, self.file)
        if '*' in self.file:
            # resolve the path
            paths = glob(file_path)
            if len(paths) > 1:
                raise Exception(
                    "`*` in %s db_file is ambiguous." % self.file)
            if not paths:
                raise Exception(
                    "`*` in %s db_file matches no file." % self.file)
            file_path = paths[0]

        db_conf = Config().DB
        # The password travels through the environment so that it never
        # appears in the command line nor in the log.
        env = dict(os.environ, PGPASSWORD=db_conf.password)
        connection = ['-h', db_conf.host, '-U', db_conf.user]
        # Argument lists (no shell) keep odd characters in the database or
        # file name from being interpreted by a shell.
        commands = [
            ['dropdb', '--if-exists'] + connection + [self.dbname],
            ['createdb'] + connection + [self.dbname],
            ['psql'] + connection + ['-d', self.dbname, '-f', file_path],
        ]
        try:
            for cmd in commands:
                printable = ' '.join(cmd)
                self.__log.debug('calling script: %s', printable)
                retcode = call(cmd, env=env)
                self.__log.debug("FINISHED! %s returned code %d",
                                 printable, retcode)
                if retcode != 0:
                    # same short-circuit the `&&` chain used to provide
                    break
        except OSError as e:
            self.__log.critical("Execution failed: %s" % e)
            sys.exit(1)

        R = psql.qstring("SELECT pg_size_pretty(pg_database_size('" +
                         self.dbname + "'))", Config().DB.database)
        size = R[0][0].split(" ")
        self.__log.debug("Size of the new DB in: " + size[0])
        if size[1] == 'kB':
            raise RuntimeError(
                'DB created seems to be empty. Please check.')