annotate macs_wrapper.py @ 2:2d26a3639989 draft default tip

Uploaded
author jbrayet
date Mon, 18 Jan 2016 09:30:56 -0500
parents 14ffe4d75720
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
14ffe4d75720 Uploaded
jbrayet
parents:
diff changeset
1 #!/usr/bin/env python
14ffe4d75720 Uploaded
jbrayet
parents:
diff changeset
2
14ffe4d75720 Uploaded
jbrayet
parents:
diff changeset
3 import sys, subprocess, tempfile, shutil, glob, os, os.path, gzip
14ffe4d75720 Uploaded
jbrayet
parents:
diff changeset
4 #from galaxy import eggs
14ffe4d75720 Uploaded
jbrayet
parents:
diff changeset
5 #import pkg_resources
14ffe4d75720 Uploaded
jbrayet
parents:
diff changeset
6 #pkg_resources.require( "simplejson" )
14ffe4d75720 Uploaded
jbrayet
parents:
diff changeset
7 import simplejson
14ffe4d75720 Uploaded
jbrayet
parents:
diff changeset
8
14ffe4d75720 Uploaded
jbrayet
parents:
diff changeset
9 CHUNK_SIZE = 1024
14ffe4d75720 Uploaded
jbrayet
parents:
diff changeset
10
14ffe4d75720 Uploaded
jbrayet
parents:
diff changeset
11 def gunzip_cat_glob_path( glob_path, target_filename, delete = False ):
14ffe4d75720 Uploaded
jbrayet
parents:
diff changeset
12 out = open( target_filename, 'wb' )
14ffe4d75720 Uploaded
jbrayet
parents:
diff changeset
13 for filename in glob.glob( glob_path ):
14ffe4d75720 Uploaded
jbrayet
parents:
diff changeset
14 fh = gzip.open( filename, 'rb' )
14ffe4d75720 Uploaded
jbrayet
parents:
diff changeset
15 while True:
14ffe4d75720 Uploaded
jbrayet
parents:
diff changeset
16 data = fh.read( CHUNK_SIZE )
14ffe4d75720 Uploaded
jbrayet
parents:
diff changeset
17 if data:
14ffe4d75720 Uploaded
jbrayet
parents:
diff changeset
18 out.write( data )
14ffe4d75720 Uploaded
jbrayet
parents:
diff changeset
19 else:
14ffe4d75720 Uploaded
jbrayet
parents:
diff changeset
20 break
14ffe4d75720 Uploaded
jbrayet
parents:
diff changeset
21 fh.close()
14ffe4d75720 Uploaded
jbrayet
parents:
diff changeset
22 if delete:
14ffe4d75720 Uploaded
jbrayet
parents:
diff changeset
23 os.unlink( filename )
14ffe4d75720 Uploaded
jbrayet
parents:
diff changeset
24 out.close()
14ffe4d75720 Uploaded
jbrayet
parents:
diff changeset
25
14ffe4d75720 Uploaded
jbrayet
parents:
diff changeset
26 def xls_to_interval( xls_file, interval_file, header = None ):
14ffe4d75720 Uploaded
jbrayet
parents:
diff changeset
27 out = open( interval_file, 'wb' )
14ffe4d75720 Uploaded
jbrayet
parents:
diff changeset
28 if header:
14ffe4d75720 Uploaded
jbrayet
parents:
diff changeset
29 out.write( '#%s\n' % header )
14ffe4d75720 Uploaded
jbrayet
parents:
diff changeset
30 wrote_header = False
14ffe4d75720 Uploaded
jbrayet
parents:
diff changeset
31 #From macs readme: Coordinates in XLS is 1-based which is different with BED format.
14ffe4d75720 Uploaded
jbrayet
parents:
diff changeset
32 for line in open( xls_file ):
14ffe4d75720 Uploaded
jbrayet
parents:
diff changeset
33 #keep all existing comment lines
14ffe4d75720 Uploaded
jbrayet
parents:
diff changeset
34 if line.startswith( '#' ):
14ffe4d75720 Uploaded
jbrayet
parents:
diff changeset
35 out.write( line )
14ffe4d75720 Uploaded
jbrayet
parents:
diff changeset
36 elif not wrote_header:
14ffe4d75720 Uploaded
jbrayet
parents:
diff changeset
37 out.write( '#%s' % line )
14ffe4d75720 Uploaded
jbrayet
parents:
diff changeset
38 wrote_header = True
14ffe4d75720 Uploaded
jbrayet
parents:
diff changeset
39 else:
14ffe4d75720 Uploaded
jbrayet
parents:
diff changeset
40 fields = line.split( '\t' )
14ffe4d75720 Uploaded
jbrayet
parents:
diff changeset
41 #if len( fields ) > 1:
14ffe4d75720 Uploaded
jbrayet
parents:
diff changeset
42 #fields[1] = str( int( fields[1] ) - 1 )
14ffe4d75720 Uploaded
jbrayet
parents:
diff changeset
43 out.write( '\t'.join( fields ) )
14ffe4d75720 Uploaded
jbrayet
parents:
diff changeset
44 out.close()
14ffe4d75720 Uploaded
jbrayet
parents:
diff changeset
45
14ffe4d75720 Uploaded
jbrayet
parents:
diff changeset
46 def main():
14ffe4d75720 Uploaded
jbrayet
parents:
diff changeset
47 options = simplejson.load( open( sys.argv[1] ) )
14ffe4d75720 Uploaded
jbrayet
parents:
diff changeset
48 output_bed = sys.argv[2]
14ffe4d75720 Uploaded
jbrayet
parents:
diff changeset
49 output_extra_html = sys.argv[3]
14ffe4d75720 Uploaded
jbrayet
parents:
diff changeset
50 output_extra_path = sys.argv[4]
14ffe4d75720 Uploaded
jbrayet
parents:
diff changeset
51
14ffe4d75720 Uploaded
jbrayet
parents:
diff changeset
52 experiment_name = '_'.join( options['experiment_name'].split() ) #save experiment name here, it will be used by macs for filenames (gzip of wig files will fail with spaces - macs doesn't properly escape them)..need to replace all whitespace, split makes this easier
14ffe4d75720 Uploaded
jbrayet
parents:
diff changeset
53 cmdline = "export PYTHONPATH=/usr/bin/macs/MACS_1.4.2/lib/python2.6/site-packages;/usr/bin/macs/MACS_1.4.2/bin/macs14 -t %s" % ",".join( options['input_chipseq'] )
14ffe4d75720 Uploaded
jbrayet
parents:
diff changeset
54 if options['input_control']:
14ffe4d75720 Uploaded
jbrayet
parents:
diff changeset
55 cmdline = "%s -c %s" % ( cmdline, ",".join( options['input_control'] ) )
14ffe4d75720 Uploaded
jbrayet
parents:
diff changeset
56 cmdline = "%s --format='%s' --name='%s' --gsize='%s' --tsize='%s' --bw='%s' --pvalue='%s' --mfold='%s' %s %s" % ( cmdline, options['format'], experiment_name, options['gsize'], options['tsize'], options['bw'], options['pvalue'], options['mfold'], options['nolambda'], options['futurefdr'] )
14ffe4d75720 Uploaded
jbrayet
parents:
diff changeset
57 if 'wig' in options:
14ffe4d75720 Uploaded
jbrayet
parents:
diff changeset
58 #wigextend = int( options['wig']['wigextend'] )
14ffe4d75720 Uploaded
jbrayet
parents:
diff changeset
59 #if wigextend >= 0:
14ffe4d75720 Uploaded
jbrayet
parents:
diff changeset
60 #wigextend = "--wigextend='%s'" % wigextend
14ffe4d75720 Uploaded
jbrayet
parents:
diff changeset
61 #else:
14ffe4d75720 Uploaded
jbrayet
parents:
diff changeset
62 #wigextend = ''
14ffe4d75720 Uploaded
jbrayet
parents:
diff changeset
63 #cmdline = "%s --wig %s --space='%s'" % ( cmdline, wigextend, options['wig']['space'] )
14ffe4d75720 Uploaded
jbrayet
parents:
diff changeset
64 cmdline = "%s --wig --space='%s'" % ( cmdline, options['wig']['space'] )
14ffe4d75720 Uploaded
jbrayet
parents:
diff changeset
65 if 'nomodel' in options:
14ffe4d75720 Uploaded
jbrayet
parents:
diff changeset
66 cmdline = "%s --nomodel --shiftsize='%s'" % ( cmdline, options['nomodel'] )
14ffe4d75720 Uploaded
jbrayet
parents:
diff changeset
67 if 'diag' in options:
14ffe4d75720 Uploaded
jbrayet
parents:
diff changeset
68 cmdline = "%s --diag --fe-min='%s' --fe-max='%s' --fe-step='%s'" % ( cmdline, options['diag']['fe-min'], options['diag']['fe-max'], options['diag']['fe-step'] )
14ffe4d75720 Uploaded
jbrayet
parents:
diff changeset
69
14ffe4d75720 Uploaded
jbrayet
parents:
diff changeset
70 tmp_dir = tempfile.mkdtemp() #macs makes very messy output, need to contain it into a temp dir, then provide to user
14ffe4d75720 Uploaded
jbrayet
parents:
diff changeset
71 stderr_name = tempfile.NamedTemporaryFile().name # redirect stderr here, macs provides lots of info via stderr, make it into a report
14ffe4d75720 Uploaded
jbrayet
parents:
diff changeset
72 proc = subprocess.Popen( args=cmdline, shell=True, cwd=tmp_dir, stderr=open( stderr_name, 'wb' ) )
14ffe4d75720 Uploaded
jbrayet
parents:
diff changeset
73 proc.wait()
14ffe4d75720 Uploaded
jbrayet
parents:
diff changeset
74 #We don't want to set tool run to error state if only warnings or info, e.g. mfold could be decreased to improve model, but let user view macs log
14ffe4d75720 Uploaded
jbrayet
parents:
diff changeset
75 #Do not terminate if error code, allow dataset (e.g. log) creation and cleanup
14ffe4d75720 Uploaded
jbrayet
parents:
diff changeset
76 if proc.returncode:
14ffe4d75720 Uploaded
jbrayet
parents:
diff changeset
77 stderr_f = open( stderr_name )
14ffe4d75720 Uploaded
jbrayet
parents:
diff changeset
78 while True:
14ffe4d75720 Uploaded
jbrayet
parents:
diff changeset
79 chunk = stderr_f.read( CHUNK_SIZE )
14ffe4d75720 Uploaded
jbrayet
parents:
diff changeset
80 if not chunk:
14ffe4d75720 Uploaded
jbrayet
parents:
diff changeset
81 stderr_f.close()
14ffe4d75720 Uploaded
jbrayet
parents:
diff changeset
82 break
14ffe4d75720 Uploaded
jbrayet
parents:
diff changeset
83 sys.stderr.write( chunk )
14ffe4d75720 Uploaded
jbrayet
parents:
diff changeset
84
14ffe4d75720 Uploaded
jbrayet
parents:
diff changeset
85 #run R to create pdf from model script
14ffe4d75720 Uploaded
jbrayet
parents:
diff changeset
86 if os.path.exists( os.path.join( tmp_dir, "%s_model.r" % experiment_name ) ):
14ffe4d75720 Uploaded
jbrayet
parents:
diff changeset
87 cmdline = 'R --vanilla --slave < "%s_model.r" > "%s_model.r.log"' % ( experiment_name, experiment_name )
14ffe4d75720 Uploaded
jbrayet
parents:
diff changeset
88 proc = subprocess.Popen( args=cmdline, shell=True, cwd=tmp_dir )
14ffe4d75720 Uploaded
jbrayet
parents:
diff changeset
89 proc.wait()
14ffe4d75720 Uploaded
jbrayet
parents:
diff changeset
90
14ffe4d75720 Uploaded
jbrayet
parents:
diff changeset
91
14ffe4d75720 Uploaded
jbrayet
parents:
diff changeset
92 #move bed out to proper output file
14ffe4d75720 Uploaded
jbrayet
parents:
diff changeset
93 created_bed_name = os.path.join( tmp_dir, "%s_peaks.bed" % experiment_name )
14ffe4d75720 Uploaded
jbrayet
parents:
diff changeset
94 if os.path.exists( created_bed_name ):
14ffe4d75720 Uploaded
jbrayet
parents:
diff changeset
95 shutil.move( created_bed_name, output_bed )
14ffe4d75720 Uploaded
jbrayet
parents:
diff changeset
96
14ffe4d75720 Uploaded
jbrayet
parents:
diff changeset
97 #parse xls files to interval files as needed
14ffe4d75720 Uploaded
jbrayet
parents:
diff changeset
98 if options['xls_to_interval']:
14ffe4d75720 Uploaded
jbrayet
parents:
diff changeset
99 create_peak_xls_file = os.path.join( tmp_dir, '%s_peaks.xls' % experiment_name )
14ffe4d75720 Uploaded
jbrayet
parents:
diff changeset
100 if os.path.exists( create_peak_xls_file ):
14ffe4d75720 Uploaded
jbrayet
parents:
diff changeset
101 xls_to_interval( create_peak_xls_file, options['xls_to_interval']['peaks_file'], header = 'peaks file' )
14ffe4d75720 Uploaded
jbrayet
parents:
diff changeset
102 create_peak_xls_file = os.path.join( tmp_dir, '%s_negative_peaks.xls' % experiment_name )
14ffe4d75720 Uploaded
jbrayet
parents:
diff changeset
103 if os.path.exists( create_peak_xls_file ):
14ffe4d75720 Uploaded
jbrayet
parents:
diff changeset
104 xls_to_interval( create_peak_xls_file, options['xls_to_interval']['negative_peaks_file'], header = 'negative peaks file' )
14ffe4d75720 Uploaded
jbrayet
parents:
diff changeset
105
14ffe4d75720 Uploaded
jbrayet
parents:
diff changeset
106 #merge and move wig files as needed, delete gz'd files and remove emptied dirs
14ffe4d75720 Uploaded
jbrayet
parents:
diff changeset
107 if 'wig' in options:
14ffe4d75720 Uploaded
jbrayet
parents:
diff changeset
108 wig_base_dir = os.path.join( tmp_dir, "%s_MACS_wiggle" % experiment_name )
14ffe4d75720 Uploaded
jbrayet
parents:
diff changeset
109 if os.path.exists( wig_base_dir ):
14ffe4d75720 Uploaded
jbrayet
parents:
diff changeset
110 #treatment
14ffe4d75720 Uploaded
jbrayet
parents:
diff changeset
111 treatment_dir = os.path.join( wig_base_dir, "treat" )
14ffe4d75720 Uploaded
jbrayet
parents:
diff changeset
112 if os.path.exists( treatment_dir ):
14ffe4d75720 Uploaded
jbrayet
parents:
diff changeset
113 gunzip_cat_glob_path( os.path.join( treatment_dir, "*.wig.gz" ), options['wig']['output_treatment_file'], delete = True )
14ffe4d75720 Uploaded
jbrayet
parents:
diff changeset
114 os.rmdir( treatment_dir )
14ffe4d75720 Uploaded
jbrayet
parents:
diff changeset
115 #control
14ffe4d75720 Uploaded
jbrayet
parents:
diff changeset
116 if options['input_control']:
14ffe4d75720 Uploaded
jbrayet
parents:
diff changeset
117 control_dir = os.path.join( wig_base_dir, "control" )
14ffe4d75720 Uploaded
jbrayet
parents:
diff changeset
118 if os.path.exists( control_dir ):
14ffe4d75720 Uploaded
jbrayet
parents:
diff changeset
119 gunzip_cat_glob_path( os.path.join( control_dir, "*.wig.gz" ), options['wig']['output_control_file'], delete = True )
14ffe4d75720 Uploaded
jbrayet
parents:
diff changeset
120 os.rmdir( control_dir )
14ffe4d75720 Uploaded
jbrayet
parents:
diff changeset
121 os.rmdir( wig_base_dir )
14ffe4d75720 Uploaded
jbrayet
parents:
diff changeset
122
14ffe4d75720 Uploaded
jbrayet
parents:
diff changeset
123 #move all remaining files to extra files path of html file output to allow user download
14ffe4d75720 Uploaded
jbrayet
parents:
diff changeset
124 out_html = open( output_extra_html, 'wb' )
14ffe4d75720 Uploaded
jbrayet
parents:
diff changeset
125 out_html.write( '<html><head><title>Additional output created by MACS (%s)</title></head><body><h3>Additional Files:</h3><p><ul>\n' % experiment_name )
14ffe4d75720 Uploaded
jbrayet
parents:
diff changeset
126 os.mkdir( output_extra_path )
14ffe4d75720 Uploaded
jbrayet
parents:
diff changeset
127 for filename in sorted( os.listdir( tmp_dir ) ):
14ffe4d75720 Uploaded
jbrayet
parents:
diff changeset
128 shutil.move( os.path.join( tmp_dir, filename ), os.path.join( output_extra_path, filename ) )
14ffe4d75720 Uploaded
jbrayet
parents:
diff changeset
129 out_html.write( '<li><a href="%s">%s</a></li>\n' % ( filename, filename ) )
14ffe4d75720 Uploaded
jbrayet
parents:
diff changeset
130 out_html.write( '</ul></p>\n' )
14ffe4d75720 Uploaded
jbrayet
parents:
diff changeset
131 out_html.write( '<h3>Messages from MACS:</h3>\n<p><pre>%s</pre></p>\n' % open( stderr_name, 'rb' ).read() )
14ffe4d75720 Uploaded
jbrayet
parents:
diff changeset
132 out_html.write( '</body></html>\n' )
14ffe4d75720 Uploaded
jbrayet
parents:
diff changeset
133 out_html.close()
14ffe4d75720 Uploaded
jbrayet
parents:
diff changeset
134
14ffe4d75720 Uploaded
jbrayet
parents:
diff changeset
135 os.unlink( stderr_name )
14ffe4d75720 Uploaded
jbrayet
parents:
diff changeset
136 os.rmdir( tmp_dir )
14ffe4d75720 Uploaded
jbrayet
parents:
diff changeset
137
14ffe4d75720 Uploaded
jbrayet
parents:
diff changeset
138 if __name__ == "__main__": main()