comparison ucsc_xena_download.py @ 0:8bb037f88ed2

Uploaded
author melissacline
date Tue, 13 Jan 2015 23:37:23 -0500
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:8bb037f88ed2
1 #!/usr/bin/env python
2 import socket, urllib, sys, os
3 from galaxy import eggs #eggs needs to be imported so that galaxy.util can find docutils egg...
4 from galaxy.util.json import from_json_string, to_json_string
5 from galaxy.util import get_charset_from_http_headers
6 import galaxy.model # need to import model before sniff to resolve a circular import dependency
7 from galaxy.datatypes import sniff
8 import tarfile
9 import re
10
# Map each Galaxy output key to a regex matched against a dataset's
# declared Xena type (from its .json metadata sidecar).
filemap = [
    ('genomic', r'genomic(Segment|Matrix)$'),
    ('clinical', r'clinicalMatrix$'),
]

# Destination paths for each dataset kind, supplied by the tool wrapper.
files = {
    'genomic': sys.argv[1],
    'clinical': sys.argv[2]
}

# BUG FIX: sys.argv[3] is a string; in Python 2 an int Content-Length
# compared against a string is always False, so the size limit was never
# enforced (and the '%d' format in the error message would raise).
# Convert once here; an empty/blank argument means "no limit".
max_file_size = int(sys.argv[3]) if sys.argv[3].strip() else 0
22
def file_type(file):
    """Return the 'type' field of the JSON metadata stored in *file*."""
    with open(file) as handle:
        metadata = from_json_string(handle.read())
    return metadata['type']
26
def stop_err( msg ):
    """Report *msg* on stderr and terminate the script.

    Exits via sys.exit() with no argument (status 0): Galaxy treats any
    stderr output as tool failure, regardless of exit status.
    """
    sys.stderr.write( msg )
    sys.exit()
30
def load_input_parameters( filename, erase_file = True ):
    """Read Galaxy data-source parameters from *filename*.

    The file normally holds a JSON object with a 'param_dict' entry.  If it
    cannot be parsed that way, it is re-read as legacy tab-separated
    "key<TAB>value" lines.

    Returns (json_params, datasource_params): json_params is the parsed JSON
    object (None for the legacy format) and datasource_params is a dict of
    parameter name -> value.  When erase_file is true the file is truncated
    afterwards so parameters do not linger on disk.
    """
    datasource_params = {}
    try:
        with open( filename, 'r' ) as fh:  # 'with' fixes the leaked handle
            json_params = from_json_string( fh.read() )
        # BUG FIX: 'param_dict' may be missing; fall back to {} instead of
        # handing callers None (which would crash on params.get()).
        datasource_params = json_params.get( 'param_dict' ) or {}
    except Exception:
        # Not parseable as JSON: fall back to the legacy tab-separated form.
        json_params = None
        with open( filename, 'r' ) as fh:
            for line in fh:
                fields = line.strip().split( '\t' )
                if len( fields ) >= 2:  # skip malformed lines, as before
                    datasource_params[ fields[0] ] = fields[1]
    if erase_file:
        # Open for writing and immediately close: truncates the file.
        open( filename, 'w' ).close()
    return json_params, datasource_params
48
def load_file(files):
    """Download the Xena dataset named in the Galaxy parameter file.

    files['genomic'] initially holds the parameter file written by the
    Galaxy data-source machinery; its contents are replaced in place with
    the downloaded data.  Any failure is reported via stop_err().
    """
    filename = files['genomic']
    job_params, params = load_input_parameters( filename, False )
    URL = params.get( 'URL', None )  # exactly 'URL' indicates a single dataset download
    URL_method = params.get( 'URL_method', None )
    socket.setdefaulttimeout( 600 )  # generous timeout: remote hubs can be slow
    try:
        if not URL_method or URL_method == 'get':
            page = urllib.urlopen( URL )
        elif URL_method == 'post':
            page = urllib.urlopen( URL, urllib.urlencode( params ) )
    except Exception as e:  # 'as' form replaces the Python 2-only 'except X, e'
        stop_err( 'The remote data source application may be off line, please try again later. Error: %s' % str( e ) )
    if max_file_size:
        # BUG FIX: max_file_size may arrive from sys.argv as a string; in
        # Python 2 'int > str' is always False, so the limit was silently
        # never enforced (and the '%d' format below would raise).  Compare
        # as ints; a limit of 0 means "no limit", matching prior behavior.
        size_limit = int( max_file_size )
        file_size = int( page.info().get( 'Content-Length', 0 ) )
        if size_limit and file_size > size_limit:
            stop_err( 'The size of the data (%d bytes) you have requested exceeds the maximum allowed (%d bytes) on this server.' % ( file_size, size_limit ) )
    try:
        # Stream the response into the parameter file, overwriting it.
        cur_filename, is_multi_byte = sniff.stream_to_open_named_file( page, os.open( filename, os.O_WRONLY | os.O_CREAT ), filename, source_encoding=get_charset_from_http_headers( page.headers ) )
    except Exception as e:
        stop_err( 'Unable to fetch %s:\n%s' % ( URL, e ) )
70
load_file(files)

# The download is a tarball holding the data matrices plus one '*.json'
# metadata file per matrix describing its type.
# NOTE(security): members are extracted as-is; a malicious archive with
# absolute or '../' paths could write outside the working directory.
tar = tarfile.open(files['genomic'])
try:
    metafiles = [name for name in tar.getnames() if name.endswith('.json')]
    tar.extractall()
finally:
    tar.close()  # BUG FIX: the archive handle was never closed

# Pair each dataset's declared type with its base name (metadata filename
# minus the '.json' suffix).
withtype = [(file_type(meta), meta[0:-len(".json")]) for meta in metafiles]

# For each required kind, take the first dataset whose type matches and
# rename it to the output path Galaxy expects.  The builtin next() replaces
# the Python 2-only generator .next() method; StopIteration still escapes a
# list comprehension, so the except clause below continues to work.
try:
    renames = [(next(n for (t, n) in withtype if re.search(pat, t)), name)
               for (name, pat) in filemap]
except StopIteration:
    stop_err( 'Missing required file type in tarball' )
for (frm, to) in renames:
    os.rename(frm, files[to])