Mercurial > repos > melissacline > ucsc_xena_platform
view ucsc_xena_download.py @ 36:d64a002c3b0c
modify
author | jingchunzhu |
---|---|
date | Fri, 24 Jul 2015 11:39:31 -0700 |
parents | 8bb037f88ed2 |
children |
line wrap: on
line source
#!/usr/bin/env python
"""Download a tarball from a remote data source and unpack it into Galaxy outputs.

Usage: ucsc_xena_download.py <genomic_output> <clinical_output> <max_file_size>

On entry the genomic output file holds the request parameters (a JSON
document with a ``param_dict`` entry, or tab-separated key/value lines).
The script fetches ``URL`` from those parameters, overwrites the genomic
output file with the response, opens it as a tar archive, extracts it into
the current directory, and renames the extracted data files onto the two
output paths according to the ``type`` field of their ``.json`` metadata.
"""
import os
import re
import socket
import sys
import tarfile
import urllib

from galaxy import eggs  # eggs needs to be imported so that galaxy.util can find docutils egg...
from galaxy.util.json import from_json_string, to_json_string
from galaxy.util import get_charset_from_http_headers
import galaxy.model  # need to import model before sniff to resolve a circular import dependency
from galaxy.datatypes import sniff

# (output key, regex matched against the metadata 'type' field)
filemap = [
    ('genomic', r'genomic(Segment|Matrix)$'),
    ('clinical', r'clinicalMatrix$'),
]

# Output paths supplied on the command line.
files = {
    'genomic': sys.argv[1],
    'clinical': sys.argv[2],
}

# Must be an integer: it is compared against Content-Length and formatted
# with %d.  A value of 0 disables the size check.
max_file_size = int(sys.argv[3])


def file_type(file):
    """Return the 'type' entry of the JSON document stored at path *file*."""
    with open(file) as f:
        return from_json_string(f.read())['type']


def stop_err(msg):
    """Write *msg* to stderr and terminate the process."""
    sys.stderr.write(msg)
    sys.exit()


def load_input_parameters(filename, erase_file=True):
    """Read the request parameters stored in *filename*.

    The file is first parsed as JSON, taking the parameters from its
    'param_dict' entry.  If that fails, the file is re-read as
    tab-separated ``key<TAB>value`` lines; lines without a second field
    are skipped.

    When *erase_file* is true the file is truncated afterwards.

    Returns a ``(json_params, datasource_params)`` tuple; ``json_params``
    is None when the JSON parse failed.
    """
    datasource_params = {}
    try:
        with open(filename, 'r') as handle:
            json_params = from_json_string(handle.read())
        datasource_params = json_params.get('param_dict')
    except Exception:
        json_params = None
        with open(filename, 'r') as handle:
            for line in handle:
                fields = line.strip().split('\t')
                if len(fields) > 1:
                    datasource_params[fields[0]] = fields[1]
    if erase_file:
        # Open for writing, then close: removes the params from the file.
        open(filename, 'w').close()
    return json_params, datasource_params


def load_file(files):
    """Fetch the remote data and store it in ``files['genomic']``.

    The request parameters are read from that same file before it is
    overwritten.  Any failure is reported through stop_err().
    """
    filename = files['genomic']
    job_params, params = load_input_parameters(filename, False)
    # Using exactly URL indicates that only one dataset is being downloaded.
    URL = params.get('URL', None)
    URL_method = params.get('URL_method', None)
    socket.setdefaulttimeout(600)

    if URL_method and URL_method not in ('get', 'post'):
        # Otherwise `page` would never be bound and fail later with NameError.
        stop_err('Unknown URL_method specified: %s' % URL_method)

    try:
        if not URL_method or URL_method == 'get':
            page = urllib.urlopen(URL)
        else:
            page = urllib.urlopen(URL, urllib.urlencode(params))
    except Exception as e:
        stop_err('The remote data source application may be off line, please try again later. Error: %s' % str(e))

    if max_file_size:
        file_size = int(page.info().get('Content-Length', 0))
        if file_size > max_file_size:
            stop_err('The size of the data (%d bytes) you have requested exceeds the maximum allowed (%d bytes) on this server.' % (file_size, max_file_size))

    try:
        # O_TRUNC: the file still contains the request parameters, which
        # must not survive past the end of a shorter download.
        out_fd = os.open(filename, os.O_WRONLY | os.O_CREAT | os.O_TRUNC)
        cur_filename, is_multi_byte = sniff.stream_to_open_named_file(
            page,
            out_fd,
            filename,
            source_encoding=get_charset_from_http_headers(page.headers),
        )
    except Exception as e:
        stop_err('Unable to fetch %s:\n%s' % (URL, e))


def _safe_extract(tar, dest='.'):
    """Extract *tar* into *dest*, refusing members that resolve outside it.

    The archive comes from a remote server, so member names are untrusted.
    NOTE: this checks member paths only; it does not inspect link targets.
    """
    root = os.path.realpath(dest)
    prefix = root.rstrip(os.sep) + os.sep
    for member in tar.getmembers():
        target = os.path.realpath(os.path.join(root, member.name))
        if target != root and not target.startswith(prefix):
            stop_err('Unsafe path in tarball: %s' % member.name)
    tar.extractall(dest)


def unpack_datasets(files):
    """Extract the downloaded tarball and move its data files onto the outputs.

    Every ``<name>.json`` member is treated as metadata for the member
    ``<name>``.  For each entry of ``filemap`` the first data file whose
    metadata type matches the pattern is renamed to the corresponding
    path in *files*.
    """
    tar = tarfile.open(files['genomic'])
    try:
        metafiles = [n for n in tar.getnames() if n.endswith('.json')]
        _safe_extract(tar)
    finally:
        # Close before the rename below replaces the archive file itself.
        tar.close()

    withtype = [(file_type(meta), meta[:-len('.json')]) for meta in metafiles]
    try:
        renames = [
            (next(n for (t, n) in withtype if re.search(pat, t)), name)
            for (name, pat) in filemap
        ]
    except StopIteration:
        stop_err('Missing required file type in tarball')
    for (frm, to) in renames:
        os.rename(frm, files[to])


load_file(files)
unpack_datasets(files)