Mercurial > repos > pjbriggs > amplicon_analysis_pipeline
comparison amplicon_analysis_pipeline.py @ 42:098ad1dd7760 draft
planemo upload for repository https://github.com/pjbriggs/Amplicon_analysis-galaxy commit 10be6f00106e853a6720e4052871d9d84e027137
author | pjbriggs |
---|---|
date | Thu, 05 Dec 2019 11:48:01 +0000 |
parents | |
children | 1bc0077b2c91 |
comparison
equal
deleted
inserted
replaced
41:7b9786a43a16 | 42:098ad1dd7760 |
---|---|
1 #!/usr/bin/env python | |
2 # | |
3 # Wrapper script to run Amplicon_analysis_pipeline.sh | |
4 # from Galaxy tool | |
5 | |
6 import sys | |
7 import os | |
8 import argparse | |
9 import subprocess | |
10 import glob | |
11 | |
class PipelineCmd(object):
    """Accumulate a command line as a list of string arguments.

    The command is stored in the 'cmd' attribute as a list suitable
    for passing directly to subprocess.
    """
    def __init__(self,cmd):
        # Start the argument list with the executable name
        self.cmd = [str(cmd)]
    def add_args(self,*args):
        """Append each supplied argument (stringified) to the command."""
        self.cmd.extend(str(a) for a in args)
    def __repr__(self):
        # Space-separated rendering of the full command line
        return ' '.join(map(str,self.cmd))
20 | |
def ahref(target,name=None,type=None):
    """Return an HTML anchor element linking to 'target'.

    Arguments:
      target: URL/path for the link's href attribute
      name: optional link text (defaults to the basename of 'target')
      type: optional MIME type for the anchor's type attribute
    """
    link_text = os.path.basename(target) if name is None else name
    # Build the opening tag, adding the type attribute only if given
    opening = "<a href='%s'" % target
    if type is not None:
        opening = opening + " type='%s'" % type
    return "%s>%s</a>" % (opening,link_text)
29 | |
def check_errors():
    """Scan the pipeline log files for known error signatures.

    Reads 'Amplicon_analysis_pipeline.log' and 'pipeline.log' from the
    current directory and reports any recognised failure mode to the
    tool log via print_error.
    """
    # Known errors reported in Amplicon_analysis_pipeline.log
    with open('Amplicon_analysis_pipeline.log','r') as fp:
        main_log = fp.read()
    if "Names in the first column of Metatable.txt and in the second column of Final_name.txt do not match" in main_log:
        print_error("""*** Sample IDs don't match dataset names ***

The sample IDs (first column of the Metatable file) don't match the
supplied sample names for the input Fastq pairs.
""")
    # Known errors reported in the captured pipeline stdout/stderr
    with open('pipeline.log','r') as fp:
        captured_log = fp.read()
    if "Errors and/or warnings detected in mapping file" in captured_log:
        # Echo the Metatable log file to the tool log
        with open("Metatable_log/Metatable.log","r") as metatable_log:
            print_error("""*** Error in Metatable mapping file ***

%s""" % metatable_log.read())
    elif "No header line was found in mapping file" in captured_log:
        # Report error to the tool log
        print_error("""*** No header in Metatable mapping file ***

Check you've specified the correct file as the input Metatable""")
54 | |
def print_error(message):
    """Write 'message' to stderr inside a box of asterisks.

    The box width adapts to the longest line of the (possibly
    multi-line) message.
    """
    lines = message.split('\n')
    # Box is 4 characters wider than the longest line ("* " + " *")
    width = max(len(line) for line in lines) + 4
    border = '*'*width
    sys.stderr.write("\n%s\n" % border)
    for line in lines:
        padding = ' '*(width-len(line)-4)
        sys.stderr.write("* %s%s *\n" % (line,padding))
    sys.stderr.write("%s\n\n" % border)
61 | |
def clean_up_name(sample):
    """Return 'sample' cleaned of file extensions and Illumina suffixes.

    Strips everything after the first '.' (extensions), then removes a
    trailing "_001" component and/or a trailing "_L<number>" lane
    component (i.e. a trailing "_L[0-9]+_001" as a whole).

    Arguments:
      sample: sample/Fastq name to clean up

    Returns:
      Cleaned-up name (may be empty if the whole name was suffix).
    """
    # Drop extensions: keep only the part before the first '.'
    sample_name = sample.split('.')[0]
    split_name = sample_name.split('_')
    # Drop a trailing "001" read-group component; guard against the
    # list emptying (e.g. input "001.fastq" used to raise IndexError)
    if split_name and split_name[-1] == "001":
        split_name = split_name[:-1]
    # Drop a trailing lane component of the form "L<number>"
    if split_name and split_name[-1].startswith('L'):
        try:
            int(split_name[-1][1:])
            split_name = split_name[:-1]
        except ValueError:
            # Not L-followed-by-digits (e.g. "Lab"): keep it
            pass
    return '_'.join(split_name)
76 | |
def list_outputs(filen=None):
    """Write a numbered listing of the RESULTS directory tree.

    Walks the 'RESULTS' directory under the current working directory
    and writes one line per directory ("-- N: path") and per file
    ("---- N: path"), with paths relative to RESULTS.

    Arguments:
      filen: optional file name to write the listing to; if None the
        listing is written to stdout.
    """
    if filen is not None:
        fp = open(filen,'w')
    else:
        fp = sys.stdout
    results_dir = os.path.abspath("RESULTS")
    fp.write("Listing contents of output dir %s:\n" % results_dir)
    ix = 0
    for d,dirs,files in os.walk(results_dir):
        ix += 1
        fp.write("-- %d: %s\n" % (ix,
                                  os.path.relpath(d,results_dir)))
        for f in files:
            ix += 1
            # Join the bare filename with its containing directory so
            # the reported path is correct relative to RESULTS (a bare
            # 'f' would be resolved against the CWD by relpath)
            fp.write("---- %d: %s\n" % (ix,
                                        os.path.relpath(
                                            os.path.join(d,f),
                                            results_dir)))
    # Close output file (but never stdout)
    if filen is not None:
        fp.close()
99 | |
if __name__ == "__main__":
    # Process the command line
    # NB prints use single-argument function-call form throughout for
    # consistency (behaviour-identical under Python 2)
    print("Amplicon analysis: starting")
    p = argparse.ArgumentParser()
    p.add_argument("metatable",
                   metavar="METATABLE_FILE",
                   help="Metatable.txt file")
    p.add_argument("fastq_pairs",
                   metavar="SAMPLE_NAME FQ_R1 FQ_R2",
                   nargs="+",
                   default=list(),
                   help="Triplets of SAMPLE_NAME followed by "
                   "a R1/R2 FASTQ file pair")
    p.add_argument("-g",dest="forward_pcr_primer")
    p.add_argument("-G",dest="reverse_pcr_primer")
    p.add_argument("-q",dest="trimming_threshold")
    p.add_argument("-O",dest="minimum_overlap")
    p.add_argument("-L",dest="minimum_length")
    p.add_argument("-l",dest="sliding_window_length")
    p.add_argument("-P",dest="pipeline",
                   choices=["Vsearch","DADA2"],
                   type=str,
                   default="Vsearch")
    p.add_argument("-S",dest="use_silva",action="store_true")
    p.add_argument("-H",dest="use_homd",action="store_true")
    p.add_argument("-r",dest="reference_data_path")
    p.add_argument("-c",dest="categories_file")
    args = p.parse_args()

    # Build the environment for running the pipeline: the pipeline
    # script expects Metatable.txt etc in the current directory
    print("Amplicon analysis: building the environment")
    metatable_file = os.path.abspath(args.metatable)
    os.symlink(metatable_file,"Metatable.txt")
    print("-- made symlink to Metatable.txt")

    # Link to Categories.txt file (if provided)
    if args.categories_file is not None:
        categories_file = os.path.abspath(args.categories_file)
        os.symlink(categories_file,"Categories.txt")
        print("-- made symlink to Categories.txt")

    # Link to FASTQs and construct Final_name.txt file
    sample_names = []
    print("-- making Final_name.txt")
    with open("Final_name.txt",'w') as final_name:
        # Consume fastq_pairs three at a time:
        # SAMPLE_NAME, R1 file, R2 file
        fastqs = iter(args.fastq_pairs)
        for sample_name,fqr1,fqr2 in zip(fastqs,fastqs,fastqs):
            sample_name = clean_up_name(sample_name)
            print(" %s" % sample_name)
            r1 = "%s_R1_.fastq" % sample_name
            r2 = "%s_R2_.fastq" % sample_name
            os.symlink(fqr1,r1)
            os.symlink(fqr2,r2)
            final_name.write("%s\n" % '\t'.join((r1,sample_name)))
            final_name.write("%s\n" % '\t'.join((r2,sample_name)))
            sample_names.append(sample_name)

    # Reference database: Silva and HOMD are explicit opt-ins,
    # otherwise fall back to the default ("gg")
    if args.use_silva:
        ref_database = "silva"
    elif args.use_homd:
        ref_database = "homd"
    else:
        ref_database = "gg"

    # Construct the pipeline command, forwarding only the options
    # that were actually supplied
    print("Amplicon analysis: constructing pipeline command")
    pipeline = PipelineCmd("Amplicon_analysis_pipeline.sh")
    if args.forward_pcr_primer:
        pipeline.add_args("-g",args.forward_pcr_primer)
    if args.reverse_pcr_primer:
        pipeline.add_args("-G",args.reverse_pcr_primer)
    if args.trimming_threshold:
        pipeline.add_args("-q",args.trimming_threshold)
    if args.minimum_overlap:
        pipeline.add_args("-O",args.minimum_overlap)
    if args.minimum_length:
        pipeline.add_args("-L",args.minimum_length)
    if args.sliding_window_length:
        pipeline.add_args("-l",args.sliding_window_length)
    if args.reference_data_path:
        pipeline.add_args("-r",args.reference_data_path)
    pipeline.add_args("-P",args.pipeline)
    if ref_database == "silva":
        pipeline.add_args("-S")
    elif ref_database == "homd":
        pipeline.add_args("-H")

    # Echo the pipeline command to stdout
    print("Running %s" % pipeline)

    # Run the pipeline, capturing stdout and stderr to pipeline.log
    with open("pipeline.log","w") as pipeline_out:
        try:
            subprocess.check_call(pipeline.cmd,
                                  stdout=pipeline_out,
                                  stderr=subprocess.STDOUT)
            exit_code = 0
            print("Pipeline completed ok")
        except subprocess.CalledProcessError as ex:
            # Non-zero exit status
            sys.stderr.write("Pipeline failed: exit code %s\n" %
                             ex.returncode)
            exit_code = ex.returncode
        except Exception as ex:
            # Some other problem (e.g. script not on PATH)
            sys.stderr.write("Unexpected error: %s\n" % str(ex))
            exit_code = 1

    # Write out the list of outputs
    outputs_file = "Pipeline_outputs.txt"
    list_outputs(outputs_file)

    # Check for log file
    log_file = "Amplicon_analysis_pipeline.log"
    if os.path.exists(log_file):
        print("Found log file: %s" % log_file)
        if exit_code == 0:
            # Create an HTML file to link to log files etc
            # NB the paths to the files should be correct once
            # copied by Galaxy on job completion
            with open("pipeline_outputs.html","w") as html_out:
                html_out.write("""<html>
<head>
<title>Amplicon analysis pipeline: log files</title>
</head>
<body>
<h1>Amplicon analysis pipeline: log files</h1>
<ul>
""")
                html_out.write(
                    "<li>%s</li>\n" %
                    ahref("Amplicon_analysis_pipeline.log",
                          type="text/plain"))
                html_out.write(
                    "<li>%s</li>\n" %
                    ahref("pipeline.log",type="text/plain"))
                html_out.write(
                    "<li>%s</li>\n" %
                    ahref("Pipeline_outputs.txt",
                          type="text/plain"))
                html_out.write(
                    "<li>%s</li>\n" %
                    ahref("Metatable.html"))
                html_out.write("""</ul>
</body>
</html>
""")
        else:
            # Check for known error messages
            check_errors()
            # Write pipeline stdout to tool stderr
            sys.stderr.write("\nOutput from pipeline:\n")
            with open("pipeline.log",'r') as log:
                sys.stderr.write("%s" % log.read())
            # Write log file contents to tool log
            print("\nAmplicon_analysis_pipeline.log:")
            with open(log_file,'r') as log:
                print("%s" % log.read())
    else:
        sys.stderr.write("ERROR missing log file \"%s\"\n" %
                         log_file)

    # Handle FastQC boxplots: collate the per-base quality plots for
    # every sample into a single HTML report
    print("Amplicon analysis: collating per base quality boxplots")
    with open("fastqc_quality_boxplots.html","w") as quality_boxplots:
        # PHRED value for trimming (default 20 unless -q was supplied)
        phred_score = 20
        if args.trimming_threshold is not None:
            phred_score = args.trimming_threshold
        # Write header for HTML output file
        quality_boxplots.write("""<html>
<head>
<title>Amplicon analysis pipeline: Per-base Quality Boxplots (FastQC)</title>
</head>
<body>
<h1>Amplicon analysis pipeline: Per-base Quality Boxplots (FastQC)</h1>
""")
        # Look for raw and trimmed FastQC output for each sample
        for sample_name in sample_names:
            fastqc_dir = os.path.join(sample_name,"FastQC")
            quality_boxplots.write("<h2>%s</h2>" % sample_name)
            # NOTE(review): "cutdapt_sickle" looks like a typo for
            # "cutadapt_sickle" but must match the directory name the
            # pipeline actually creates -- confirm before changing
            for d in ("Raw","cutdapt_sickle/Q%s" % phred_score):
                quality_boxplots.write("<h3>%s</h3>" % d)
                fastqc_html_files = glob.glob(
                    os.path.join(fastqc_dir,d,"*_fastqc.html"))
                if not fastqc_html_files:
                    quality_boxplots.write("<p>No FastQC outputs found</p>")
                    continue
                # Pull out the per-base quality boxplots by scanning
                # each FastQC report for the plot's <img> element
                for f in fastqc_html_files:
                    boxplot = None
                    with open(f) as fp:
                        for line in fp.read().split(">"):
                            try:
                                line.index("alt=\"Per base quality graph\"")
                                boxplot = line + ">"
                                break
                            except ValueError:
                                pass
                    if boxplot is None:
                        boxplot = "Missing plot"
                    quality_boxplots.write("<h4>%s</h4><p>%s</p>" %
                                           (os.path.basename(f),
                                            boxplot))
        quality_boxplots.write("""</body>
</html>
""")

    # Handle DADA2 error rate plot PDFs: index them in an HTML page
    if args.pipeline == "DADA2":
        print("Amplicon analysis: collecting error rate plots")
        error_rate_plots_dir = os.path.abspath(
            os.path.join("DADA2_OTU_tables",
                         "Error_rate_plots"))
        error_rate_plot_pdfs = [os.path.basename(pdf)
                                for pdf in
                                sorted(glob.glob(
                                    os.path.join(error_rate_plots_dir,"*.pdf")))]
        with open("error_rate_plots.html","w") as error_rate_plots_out:
            error_rate_plots_out.write("""<html>
<head>
<title>Amplicon analysis pipeline: DADA2 Error Rate Plots</title>
</head>
<body>
<h1>Amplicon analysis pipeline: DADA2 Error Rate Plots</h1>
""")
            error_rate_plots_out.write("<ul>\n")
            for pdf in error_rate_plot_pdfs:
                error_rate_plots_out.write("<li>%s</li>\n" % ahref(pdf))
            error_rate_plots_out.write("</ul>\n")
            error_rate_plots_out.write("""</body>
</html>
""")

    # Handle additional output when categories file was supplied
    if args.categories_file is not None:
        # Alpha diversity boxplots: index the PDFs in an HTML page
        print("Amplicon analysis: indexing alpha diversity boxplots")
        boxplots_dir = os.path.abspath(
            os.path.join("RESULTS",
                         "%s_%s" % (args.pipeline,
                                    ref_database),
                         "Alpha_diversity",
                         "Alpha_diversity_boxplot",
                         "Categories_shannon"))
        print("Amplicon analysis: gathering PDFs from %s" % boxplots_dir)
        boxplot_pdfs = [os.path.basename(pdf)
                        for pdf in
                        sorted(glob.glob(
                            os.path.join(boxplots_dir,"*.pdf")))]
        with open("alpha_diversity_boxplots.html","w") as boxplots_out:
            boxplots_out.write("""<html>
<head>
<title>Amplicon analysis pipeline: Alpha Diversity Boxplots (Shannon)</title>
</head>
<body>
<h1>Amplicon analysis pipeline: Alpha Diversity Boxplots (Shannon)</h1>
""")
            boxplots_out.write("<ul>\n")
            for pdf in boxplot_pdfs:
                boxplots_out.write("<li>%s</li>\n" % ahref(pdf))
            boxplots_out.write("</ul>\n")
            boxplots_out.write("""</body>
</html>
""")

    # Finish, propagating the pipeline's exit code to Galaxy
    print("Amplicon analysis: finishing, exit code: %s" % exit_code)
    sys.exit(exit_code)