annotate macs2_wrapper.py @ 0:6bc303d12c70 draft default tip

planemo upload
author eduardo
date Mon, 20 Feb 2017 17:23:19 -0500
parents
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
6bc303d12c70 planemo upload
eduardo
parents:
diff changeset
1 # macs2 python wrapper
6bc303d12c70 planemo upload
eduardo
parents:
diff changeset
2 # based on http://toolshed.g2.bx.psu.edu/view/modencode-dcc/macs2
6bc303d12c70 planemo upload
eduardo
parents:
diff changeset
3
6bc303d12c70 planemo upload
eduardo
parents:
diff changeset
4 import sys, subprocess, tempfile, shutil, glob, os, os.path, gzip
6bc303d12c70 planemo upload
eduardo
parents:
diff changeset
5 from galaxy import eggs
6bc303d12c70 planemo upload
eduardo
parents:
diff changeset
6 import json
6bc303d12c70 planemo upload
eduardo
parents:
diff changeset
7
6bc303d12c70 planemo upload
eduardo
parents:
diff changeset
8 CHUNK_SIZE = 1024
6bc303d12c70 planemo upload
eduardo
parents:
diff changeset
9
6bc303d12c70 planemo upload
eduardo
parents:
diff changeset
10 #==========================================================================================
6bc303d12c70 planemo upload
eduardo
parents:
diff changeset
11 #functions
6bc303d12c70 planemo upload
eduardo
parents:
diff changeset
12 #==========================================================================================
6bc303d12c70 planemo upload
eduardo
parents:
diff changeset
def gunzip_cat_glob_path( glob_path, target_filename, delete = False ):
    """Decompress every gzip file matching a glob and concatenate the results.

    glob_path: glob pattern selecting the gzip-compressed input files.
    target_filename: path of the file receiving the concatenated,
        decompressed bytes; it is created (or truncated) even when the
        pattern matches nothing.
    delete: when true, each input file is removed once it has been copied.

    Inputs are processed in sorted order so that the concatenation is
    reproducible; glob.glob itself makes no promise about ordering.
    """
    with open( target_filename, 'wb' ) as out:
        for filename in sorted( glob.glob( glob_path ) ):
            # The context manager releases the gzip handle even when
            # decompression fails part-way through a file.
            with gzip.open( filename, 'rb' ) as fh:
                shutil.copyfileobj( fh, out )
            if delete:
                os.unlink( filename )
6bc303d12c70 planemo upload
eduardo
parents:
diff changeset
27
6bc303d12c70 planemo upload
eduardo
parents:
diff changeset
def xls_to_interval( xls_file, interval_file, header = None ):
    """Rewrite a MACS peaks .xls table as an interval file.

    xls_file: path of the tab-separated table to read.
    interval_file: path of the file to write (created or truncated).
    header: optional text emitted as a leading '#' comment line.

    Comment lines and blank lines are copied through unchanged.  The first
    remaining line is taken to be the column header: it is written commented
    out and also echoed to stdout.  On every later line the second field is
    decremented by one, because (per the MACS readme) coordinates in the XLS
    are 1-based, unlike BED.

    Raises ValueError when the second field of a data line is not an integer.
    """
    # Text mode on both sides: the function only ever handles str lines.
    with open( interval_file, 'w' ) as out, open( xls_file ) as xls:
        if header:
            out.write( '#%s\n' % header )
        wrote_header = False
        for line in xls:
            # macs2 emits an extra empty line; keep it along with all comments.
            if line.startswith( ( '#', '\n' ) ):
                out.write( line )
            elif not wrote_header:
                out.write( '#%s' % line )
                # Same bytes the former `print line` statement produced.
                sys.stdout.write( line + '\n' )
                wrote_header = True
            else:
                fields = line.split( '\t' )
                if len( fields ) > 1:
                    fields[1] = str( int( fields[1] ) - 1 )
                out.write( '\t'.join( fields ) )
6bc303d12c70 planemo upload
eduardo
parents:
diff changeset
51
6bc303d12c70 planemo upload
eduardo
parents:
diff changeset
52 #==========================================================================================
6bc303d12c70 planemo upload
eduardo
parents:
diff changeset
53 #main
6bc303d12c70 planemo upload
eduardo
parents:
diff changeset
54 #==========================================================================================
6bc303d12c70 planemo upload
eduardo
parents:
diff changeset
# Keys of the outputs JSON that the callpeak command requires.
_CALLPEAK_OUTPUT_KEYS = (
    'output_bed_file',
    'output_extra_file',
    'output_extra_file_path',
    'output_peaks_file',
    'output_narrowpeaks_file',
    'output_xls_to_interval_peaks_file',
    'output_xls_to_interval_negative_peaks_file',
)


def _build_macs2_cmdline( options, experiment_name ):
    """Return the macs2 shell command line described by *options*.

    NOTE(review): the command is a shell string built by interpolating option
    values, some of them unquoted.  This relies on the options file holding
    trusted, shell-safe values -- confirm against the caller before exposing
    it to arbitrary input.
    """
    #default inputs that are in every major command
    cmdline = "macs2 %s -t %s" % ( options['command'], ",".join( options['input_chipseq'] ) )
    if options['input_control']:
        cmdline = "%s -c %s" % ( cmdline, ",".join( options['input_control'] ) )

    if options['command'] == "callpeak":
        # The significance cutoff is either a p-value or a q-value (p-value
        # wins when both are present); every other argument is shared.
        for cutoff in ( 'pvalue', 'qvalue' ):
            if cutoff in options:
                cmdline = "%s --format='%s' --name='%s' --gsize='%s' --bw='%s' --%s='%s' --mfold %s %s %s %s" % ( cmdline, options['format'], experiment_name, options['gsize'], options['bw'], cutoff, options[cutoff], options['mfoldlo'], options['mfoldhi'], options['nolambda'], options['bdg'] )
                break

        if 'broad_cutoff' in options:
            cmdline += " --broad --broad-cutoff=%s" % options['broad_cutoff']

        if 'nomodel' in options:
            cmdline = "%s --nomodel --shiftsize='%s'" % ( cmdline, options['nomodel'] )

    if options['command'] == "bdgcmp":
        cmdline = "%s -m %s -p %s -o bdgcmp_out.bdg" % ( cmdline, options['m'], options['pseudocount'] )

    return cmdline


def _collect_callpeak_outputs( options, paths, experiment_name, tmp_dir, stderr_name ):
    """Move the files a callpeak run left in *tmp_dir* to their output paths.

    paths maps each name in _CALLPEAK_OUTPUT_KEYS to a destination path.
    stderr_name is the file holding the messages macs2 wrote to stderr; its
    content is embedded in the HTML report.
    """
    #run R to create pdf from model script
    if os.path.exists( os.path.join( tmp_dir, "%s_model.r" % experiment_name ) ):
        r_cmdline = 'R --vanilla --slave < "%s_model.r" > "%s_model.r.log"' % ( experiment_name, experiment_name )
        subprocess.Popen( args=r_cmdline, shell=True, cwd=tmp_dir ).wait()

    #move bed out to proper output file
    created_bed_name = os.path.join( tmp_dir, "%s_peaks.bed" % experiment_name )
    if os.path.exists( created_bed_name ):
        shutil.move( created_bed_name, paths['output_bed_file'] )

    #peaks xls file: copied, not moved, because it is parsed again below and
    #is also offered with the extra files
    created_peak_xls_file = os.path.join( tmp_dir, "%s_peaks.xls" % experiment_name )
    if os.path.exists( created_peak_xls_file ):
        shutil.copyfile( created_peak_xls_file, paths['output_peaks_file'] )

    #peaks.encodepeaks (narrowpeaks) file
    created_narrowpeak_file = os.path.join( tmp_dir, "%s_peaks.encodePeak" % experiment_name )
    if os.path.exists( created_narrowpeak_file ):
        shutil.move( created_narrowpeak_file, paths['output_narrowpeaks_file'] )

    #parse xls files to interval files as needed
    if options['xls_to_interval'] == "True":
        if os.path.exists( created_peak_xls_file ):
            xls_to_interval( created_peak_xls_file, paths['output_xls_to_interval_peaks_file'], header = 'peaks file' )
        negative_peak_xls_file = os.path.join( tmp_dir, '%s_negative_peaks.xls' % experiment_name )
        if os.path.exists( negative_peak_xls_file ):
            sys.stdout.write( "negative file exists\n" )
            xls_to_interval( negative_peak_xls_file, paths['output_xls_to_interval_negative_peaks_file'], header = 'negative peaks file' )

    #move all remaining files to extra files path of html file output to allow user download
    output_extra_path = paths['output_extra_file_path']
    if not os.path.isdir( output_extra_path ):
        os.makedirs( output_extra_path )
    with open( stderr_name ) as stderr_f:
        macs_messages = stderr_f.read()
    with open( paths['output_extra_file'], 'w' ) as out_html:
        out_html.write( '<html><head><title>Additional output created by MACS (%s)</title></head><body><h3>Additional Files:</h3><p><ul>\n' % experiment_name )
        for filename in sorted( os.listdir( tmp_dir ) ):
            shutil.move( os.path.join( tmp_dir, filename ), os.path.join( output_extra_path, filename ) )
            out_html.write( '<li><a href="%s">%s</a></li>\n' % ( filename, filename ) )
        out_html.write( '</ul></p>\n' )
        out_html.write( '<h3>Messages from MACS:</h3>\n<p><pre>%s</pre></p>\n' % macs_messages )
        out_html.write( '</body></html>\n' )


def main():
    """Run macs2 as described by two JSON files and collect its outputs.

    sys.argv[1] names the options file (command, inputs, parameters) and
    sys.argv[2] names the outputs file (destination path for each result).
    macs2 runs inside a private temporary directory; whatever it produces is
    then moved to the requested destinations.  A non-zero macs2 exit status
    does not abort the run: its stderr is echoed to this process's stderr and
    output collection continues so that the log is still delivered.
    """
    #take in options file and output file names
    with open( sys.argv[1] ) as options_fh:
        options = json.load( options_fh )
    with open( sys.argv[2] ) as outputs_fh:
        outputs = json.load( outputs_fh )

    command = options['command']
    #save experiment name here, it will be used by macs for some file names
    experiment_name = '_'.join( options['experiment_name'].split() )

    # Resolve destinations first so that a malformed outputs file fails
    # before the (potentially long) macs2 run starts.
    callpeak_paths = None
    output_bdgcmp = None
    if command == "callpeak":
        callpeak_paths = dict( ( key, outputs[key] ) for key in _CALLPEAK_OUTPUT_KEYS )
    if command == "bdgcmp":
        output_bdgcmp = outputs['output_bdgcmp_file']

    cmdline = _build_macs2_cmdline( options, experiment_name )

    #macs makes very messy output, need to contain it into a temp dir, then provide to user
    tmp_dir = tempfile.mkdtemp()
    # mkstemp keeps the log file in existence (and ours) for the whole run;
    # it lives outside tmp_dir so it is not swept up with the macs2 outputs.
    stderr_fd, stderr_name = tempfile.mkstemp()
    try:
        #macs provides lots of info via stderr, redirect it to make it into a report
        with os.fdopen( stderr_fd, 'wb' ) as stderr_fh:
            returncode = subprocess.Popen( args=cmdline, shell=True, cwd=tmp_dir, stderr=stderr_fh ).wait()

        #We don't want to set tool run to error state if only warnings or info,
        #so the log is echoed only when macs2 reports failure.
        #Do not terminate if error code, allow dataset (e.g. log) creation and cleanup
        if returncode:
            with open( stderr_name ) as stderr_f:
                shutil.copyfileobj( stderr_f, sys.stderr, CHUNK_SIZE )

        if command == "callpeak":
            _collect_callpeak_outputs( options, callpeak_paths, experiment_name, tmp_dir, stderr_name )

        if command == "bdgcmp":
            created_bdgcmp_file = os.path.join( tmp_dir, "bdgcmp_out.bdg" )
            if os.path.exists( created_bdgcmp_file ):
                shutil.move( created_bdgcmp_file, output_bdgcmp )
    finally:
        #cleanup, also on failure and also when macs2 left unexpected files behind
        if os.path.exists( stderr_name ):
            os.unlink( stderr_name )
        shutil.rmtree( tmp_dir, ignore_errors=True )


if __name__ == "__main__":
    main()