annotate ucsc_xena_download.py @ 0:8bb037f88ed2

Uploaded
author melissacline
date Tue, 13 Jan 2015 23:37:23 -0500
parents
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
8bb037f88ed2 Uploaded
melissacline
parents:
diff changeset
1 #!/usr/bin/env python
8bb037f88ed2 Uploaded
melissacline
parents:
diff changeset
2 import socket, urllib, sys, os
8bb037f88ed2 Uploaded
melissacline
parents:
diff changeset
3 from galaxy import eggs #eggs needs to be imported so that galaxy.util can find docutils egg...
8bb037f88ed2 Uploaded
melissacline
parents:
diff changeset
4 from galaxy.util.json import from_json_string, to_json_string
8bb037f88ed2 Uploaded
melissacline
parents:
diff changeset
5 from galaxy.util import get_charset_from_http_headers
8bb037f88ed2 Uploaded
melissacline
parents:
diff changeset
6 import galaxy.model # need to import model before sniff to resolve a circular import dependency
8bb037f88ed2 Uploaded
melissacline
parents:
diff changeset
7 from galaxy.datatypes import sniff
8bb037f88ed2 Uploaded
melissacline
parents:
diff changeset
8 import tarfile
8bb037f88ed2 Uploaded
melissacline
parents:
diff changeset
9 import re
8bb037f88ed2 Uploaded
melissacline
parents:
diff changeset
10
8bb037f88ed2 Uploaded
melissacline
parents:
diff changeset
# Each entry pairs an output slot name (a key of `files`) with a regular
# expression that is matched against the "type" field read from a dataset's
# .json metadata file inside the downloaded tarball.
filemap = [
    ('genomic', r'genomic(Segment|Matrix)$'),
    ('clinical', r'clinicalMatrix$'),
]

# Output paths supplied on the command line, keyed by slot name.  The
# 'genomic' path is also the file the request parameters are read from and
# the tarball is downloaded to (see load_file).
files = {
    'genomic': sys.argv[1],
    'clinical': sys.argv[2],
}

# Maximum accepted download size in bytes.  The command line delivers a
# string; it has to be an int because it is compared against the integer
# Content-Length and formatted with %d.  A value of 0 (or anything that is
# not a number) disables the size check.
try:
    max_file_size = int(sys.argv[3])
except ValueError:
    max_file_size = 0
8bb037f88ed2 Uploaded
melissacline
parents:
diff changeset
22
8bb037f88ed2 Uploaded
melissacline
parents:
diff changeset
def file_type(file):
    """Return the value stored under the "type" key of a JSON file.

    `file` is the path of the JSON document to read.  A KeyError is raised
    when the parsed document has no "type" entry.
    """
    with open(file) as meta_handle:
        raw_json = meta_handle.read()
    metadata = from_json_string(raw_json)
    return metadata['type']
8bb037f88ed2 Uploaded
melissacline
parents:
diff changeset
26
8bb037f88ed2 Uploaded
melissacline
parents:
diff changeset
def stop_err( msg ):
    """Write `msg` to standard error and terminate the process.

    The process exits with status 1 so that callers inspecting the exit
    code see the failure; a bare sys.exit() would report success (0) even
    though an error message was just printed.
    """
    sys.stderr.write( msg )
    sys.exit( 1 )
8bb037f88ed2 Uploaded
melissacline
parents:
diff changeset
30
8bb037f88ed2 Uploaded
melissacline
parents:
diff changeset
def load_input_parameters( filename, erase_file = True ):
    """Read the data-source request parameters stored in `filename`.

    The file is first parsed as JSON; the parameters are then taken from
    its 'param_dict' entry (an empty dict when that entry is absent).  If
    that fails for any reason the file is re-read as tab-separated
    "key<TAB>value" lines; lines without a second field are ignored.

    When `erase_file` is true the file is truncated to zero length after
    it has been read.

    Returns a tuple (json_params, datasource_params); json_params is None
    when the tab-separated fallback was used.
    """
    datasource_params = {}
    try:
        with open( filename, 'r' ) as handle:
            json_params = from_json_string( handle.read() )
        datasource_params = json_params.get( 'param_dict', {} )
    except Exception:
        # Deliberate best-effort: anything that prevents the JSON route from
        # working sends us to the tab-separated format.  Exception (rather than
        # a bare except) lets SystemExit / KeyboardInterrupt pass through.
        json_params = None
        with open( filename, 'r' ) as handle:
            for line in handle:
                fields = line.strip().split( '\t' )
                if len( fields ) < 2:
                    continue
                datasource_params[ fields[0] ] = fields[1]
    if erase_file:
        open( filename, 'w' ).close() #open file for writing, then close, removes params from file
    return json_params, datasource_params
8bb037f88ed2 Uploaded
melissacline
parents:
diff changeset
48
8bb037f88ed2 Uploaded
melissacline
parents:
diff changeset
def load_file(files):
    """Fetch the remote dataset and store it at files['genomic'].

    The request parameters ('URL' and optional 'URL_method') are read from
    the file at files['genomic'] without erasing it; the response is then
    written over that same path.  Every failure is reported through
    stop_err, which terminates the process.
    """
    filename = files['genomic']
    job_params, params = load_input_parameters( filename, False )
    params = params or {}  # guard against a null/absent parameter dict
    URL = params.get( 'URL', None ) #using exactly URL indicates that only one dataset is being downloaded
    URL_method = params.get( 'URL_method', None )
    if URL_method and URL_method not in ( 'get', 'post' ):
        # Previously fell through with `page` unbound and crashed with a NameError.
        stop_err( 'Unsupported URL_method: %s' % URL_method )
    socket.setdefaulttimeout( 600 )
    try:
        if URL_method == 'post':
            page = urllib.urlopen( URL, urllib.urlencode( params ) )
        else:
            page = urllib.urlopen( URL )
    except Exception as e:
        stop_err( 'The remote data source application may be off line, please try again later. Error: %s' % str( e ) )
    # max_file_size may still be the raw command-line string; normalise it so
    # the comparison and the %d formatting below operate on integers.
    try:
        size_limit = int( max_file_size )
    except ( TypeError, ValueError ):
        size_limit = 0
    if size_limit:
        file_size = int( page.info().get( 'Content-Length', 0 ) )
        if file_size > size_limit:
            stop_err( 'The size of the data (%d bytes) you have requested exceeds the maximum allowed (%d bytes) on this server.' % ( file_size, size_limit ) )
    try:
        # The parameter file was not erased above, so truncate on open;
        # otherwise a response shorter than the old contents would leave
        # stale parameter bytes at the end of the download.
        # NOTE(review): assumes stream_to_open_named_file writes `page` to the
        # descriptor and closes it -- confirm against galaxy.datatypes.sniff.
        out_fd = os.open( filename, os.O_WRONLY | os.O_CREAT | os.O_TRUNC )
        sniff.stream_to_open_named_file( page, out_fd, filename, source_encoding=get_charset_from_http_headers( page.headers ) )
    except Exception as e:
        stop_err( 'Unable to fetch %s:\n%s' % ( URL, e ) )
8bb037f88ed2 Uploaded
melissacline
parents:
diff changeset
70
8bb037f88ed2 Uploaded
melissacline
parents:
diff changeset
# Script driver: download the tarball, unpack it into the current working
# directory, then move the dataset of each required type onto its output path.
load_file(files)

try:
    tar = tarfile.open(files['genomic'])
except tarfile.TarError as e:
    stop_err( 'Downloaded file is not a readable tar archive: %s' % e )

try:
    # The archive comes from a remote server, so do not trust member paths:
    # refuse anything that would land outside the working directory, and only
    # extract regular files and directories (links/devices are skipped).
    base_dir = os.path.realpath(os.getcwd())
    base_prefix = os.path.join(base_dir, '')
    members = []
    for member in tar.getmembers():
        target = os.path.realpath(os.path.join(base_dir, member.name))
        if target != base_dir and not target.startswith(base_prefix):
            stop_err( 'Refusing to extract unsafe path from tarball: %s' % member.name )
        if member.isfile() or member.isdir():
            members.append(member)
    metafiles = [m.name for m in members if m.isfile() and m.name.endswith('.json')]
    tar.extractall(members=members)
finally:
    tar.close()

# (dataset type, data file path) pairs; the data file is the metadata file's
# name with the ".json" suffix removed.
withtype = [(file_type(meta), meta[0:-len(".json")]) for meta in metafiles]

# For every required slot pick the first extracted dataset whose type matches.
renames = []
for (name, pat) in filemap:
    candidates = [n for (t, n) in withtype if re.search(pat, t)]
    if not candidates:
        stop_err( 'Missing required file type in tarball' )
    renames.append((candidates[0], name))

for (frm, to) in renames:
    os.rename(frm, files[to])