comparison ucsc_xena_download.py @ 0:8bb037f88ed2

Uploaded
author melissacline
date Tue, 13 Jan 2015 23:37:23 -0500
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:8bb037f88ed2
1 #!/usr/bin/env python
2 import socket, urllib, sys, os
3 from galaxy import eggs #eggs needs to be imported so that galaxy.util can find docutils egg...
4 from galaxy.util.json import from_json_string, to_json_string
5 from galaxy.util import get_charset_from_http_headers
6 import galaxy.model # need to import model before sniff to resolve a circular import dependency
7 from galaxy.datatypes import sniff
8 import tarfile
9 import re
10
# Map each Galaxy output key to a regex matched against a dataset's
# declared Xena type (from its .json metadata sidecar).
filemap = [
    ('genomic', r'genomic(Segment|Matrix)$'),
    ('clinical', r'clinicalMatrix$'),
]

# Destination paths for each dataset kind, supplied by the tool wrapper.
files = {
    'genomic': sys.argv[1],
    'clinical': sys.argv[2]
}

# BUG FIX: sys.argv[3] is a string; in Python 2 an int Content-Length
# compared against a string is always False, so the size limit was never
# enforced (and the '%d' format in the error message would raise).
# Convert once here; an empty/blank argument means "no limit".
max_file_size = int(sys.argv[3]) if sys.argv[3].strip() else 0
22
def file_type(file):
    """Return the 'type' field of the JSON metadata stored in *file*."""
    with open(file) as handle:
        metadata = from_json_string(handle.read())
    return metadata['type']
26
def stop_err( msg ):
    """Report *msg* on stderr and terminate the script.

    Exits via sys.exit() with no argument (status 0): Galaxy treats any
    stderr output as tool failure, regardless of exit status.
    """
    sys.stderr.write( msg )
    sys.exit()
30
def load_input_parameters( filename, erase_file = True ):
    """Read Galaxy data-source parameters from *filename*.

    The file normally holds a JSON object with a 'param_dict' entry.  If it
    cannot be parsed that way, it is re-read as legacy tab-separated
    "key<TAB>value" lines.

    Returns (json_params, datasource_params): json_params is the parsed JSON
    object (None for the legacy format) and datasource_params is a dict of
    parameter name -> value.  When erase_file is true the file is truncated
    afterwards so parameters do not linger on disk.
    """
    datasource_params = {}
    try:
        with open( filename, 'r' ) as fh:  # 'with' fixes the leaked handle
            json_params = from_json_string( fh.read() )
        # BUG FIX: 'param_dict' may be missing; fall back to {} instead of
        # handing callers None (which would crash on params.get()).
        datasource_params = json_params.get( 'param_dict' ) or {}
    except Exception:
        # Not parseable as JSON: fall back to the legacy tab-separated form.
        json_params = None
        with open( filename, 'r' ) as fh:
            for line in fh:
                fields = line.strip().split( '\t' )
                if len( fields ) >= 2:  # skip malformed lines, as before
                    datasource_params[ fields[0] ] = fields[1]
    if erase_file:
        # Open for writing and immediately close: truncates the file.
        open( filename, 'w' ).close()
    return json_params, datasource_params
48
def load_file(files):
    """Download the Xena dataset named in the Galaxy parameter file.

    files['genomic'] initially holds the parameter file written by the
    Galaxy data-source machinery; its contents are replaced in place with
    the downloaded data.  Any failure is reported via stop_err().
    """
    filename = files['genomic']
    job_params, params = load_input_parameters( filename, False )
    URL = params.get( 'URL', None )  # exactly 'URL' indicates a single dataset download
    URL_method = params.get( 'URL_method', None )
    socket.setdefaulttimeout( 600 )  # generous timeout: remote hubs can be slow
    try:
        if not URL_method or URL_method == 'get':
            page = urllib.urlopen( URL )
        elif URL_method == 'post':
            page = urllib.urlopen( URL, urllib.urlencode( params ) )
    except Exception as e:  # 'as' form replaces the Python 2-only 'except X, e'
        stop_err( 'The remote data source application may be off line, please try again later. Error: %s' % str( e ) )
    if max_file_size:
        # BUG FIX: max_file_size may arrive from sys.argv as a string; in
        # Python 2 'int > str' is always False, so the limit was silently
        # never enforced (and the '%d' format below would raise).  Compare
        # as ints; a limit of 0 means "no limit", matching prior behavior.
        size_limit = int( max_file_size )
        file_size = int( page.info().get( 'Content-Length', 0 ) )
        if size_limit and file_size > size_limit:
            stop_err( 'The size of the data (%d bytes) you have requested exceeds the maximum allowed (%d bytes) on this server.' % ( file_size, size_limit ) )
    try:
        # Stream the response into the parameter file, overwriting it.
        cur_filename, is_multi_byte = sniff.stream_to_open_named_file( page, os.open( filename, os.O_WRONLY | os.O_CREAT ), filename, source_encoding=get_charset_from_http_headers( page.headers ) )
    except Exception as e:
        stop_err( 'Unable to fetch %s:\n%s' % ( URL, e ) )
70
load_file(files)

# The download is a tarball holding the data matrices plus one '*.json'
# metadata file per matrix describing its type.
# NOTE(security): members are extracted as-is; a malicious archive with
# absolute or '../' paths could write outside the working directory.
tar = tarfile.open(files['genomic'])
try:
    metafiles = [name for name in tar.getnames() if name.endswith('.json')]
    tar.extractall()
finally:
    tar.close()  # BUG FIX: the archive handle was never closed

# Pair each dataset's declared type with its base name (metadata filename
# minus the '.json' suffix).
withtype = [(file_type(meta), meta[0:-len(".json")]) for meta in metafiles]

# For each required kind, take the first dataset whose type matches and
# rename it to the output path Galaxy expects.  The builtin next() replaces
# the Python 2-only generator .next() method; StopIteration still escapes a
# list comprehension, so the except clause below continues to work.
try:
    renames = [(next(n for (t, n) in withtype if re.search(pat, t)), name)
               for (name, pat) in filemap]
except StopIteration:
    stop_err( 'Missing required file type in tarball' )
for (frm, to) in renames:
    os.rename(frm, files[to])