41
|
1 #!/usr/bin/env python
|
|
2 #
|
|
3 # Wrapper script to run Amplicon_analysis_pipeline.sh
|
|
4 # from Galaxy tool
|
|
5
|
|
6 import sys
|
|
7 import os
|
|
8 import argparse
|
|
9 import subprocess
|
|
10 import glob
|
|
11
|
|
class PipelineCmd(object):
    """Accumulate the command line for running the pipeline script.

    The command is held as a list of strings in the public ``cmd``
    attribute, suitable for passing straight to ``subprocess``.
    """
    def __init__(self,cmd):
        # Seed the command line with the executable name
        self.cmd = [str(cmd)]
    def add_args(self,*args):
        """Append each supplied argument to the command, stringified."""
        self.cmd.extend(str(arg) for arg in args)
    def __repr__(self):
        # Render the command as it would appear on a shell command line
        return ' '.join(str(arg) for arg in self.cmd)
|
|
20
|
|
def ahref(target,name=None,type=None):
    """Return an HTML anchor element linking to *target*.

    Arguments:
      target (str): the value for the href attribute
      name (str): optional link text (defaults to the basename of
        *target*)
      type (str): optional MIME type for the anchor's type attribute
    """
    # Default the visible link text to the file's basename
    label = os.path.basename(target) if name is None else name
    anchor = "<a href='%s'" % target
    if type is not None:
        anchor += " type='%s'" % type
    return anchor + ">%s</a>" % label
|
|
29
|
|
def check_errors():
    """Scan the pipeline log files for known error signatures.

    Reads 'Amplicon_analysis_pipeline.log' and 'pipeline.log' from the
    working directory and, for each recognised failure mode, writes a
    boxed explanatory message to stderr via print_error. Both files
    are expected to exist when this is called.
    """
    # Errors recorded in Amplicon_analysis_pipeline.log
    with open('Amplicon_analysis_pipeline.log','r') as fp:
        contents = fp.read()
    if "Names in the first column of Metatable.txt and in the second column of Final_name.txt do not match" in contents:
        print_error("""*** Sample IDs don't match dataset names ***

The sample IDs (first column of the Metatable file) don't match the
supplied sample names for the input Fastq pairs.
""")
    # Errors in the captured pipeline stdout/stderr
    with open('pipeline.log','r') as fp:
        contents = fp.read()
    if "Errors and/or warnings detected in mapping file" in contents:
        # Echo the Metatable log contents so the user can see
        # exactly what the mapping-file validation complained about
        with open("Metatable_log/Metatable.log","r") as metatable_log:
            print_error("""*** Error in Metatable mapping file ***

%s""" % metatable_log.read())
    elif "No header line was found in mapping file" in contents:
        # Report error to the tool log
        print_error("""*** No header in Metatable mapping file ***

Check you've specified the correct file as the input Metatable""")
|
|
54
|
|
def print_error(message):
    """Write *message* to stderr inside a box of asterisks.

    Each line of the (possibly multi-line) message is padded so the
    right-hand border lines up with the longest line.
    """
    lines = message.split('\n')
    # Box is 4 chars wider than the longest line ("* " ... " *")
    width = max(len(line) for line in lines) + 4
    border = '*' * width
    sys.stderr.write("\n%s\n" % border)
    for line in lines:
        padding = ' ' * (width - len(line) - 4)
        sys.stderr.write("* %s%s *\n" % (line, padding))
    sys.stderr.write("%s\n\n" % border)
|
|
61
|
|
def clean_up_name(sample):
    """Return a cleaned-up sample name for a Fastq file name.

    Drops everything from the first '.' onwards (i.e. the file
    extensions) and then removes a trailing "_001" and/or a trailing
    lane component of the form "_L<number>".
    """
    # Strip extensions: keep only the part before the first dot
    parts = sample.split('.')[0].split('_')
    if parts[-1] == "001":
        parts = parts[:-1]
    last = parts[-1]
    if last.startswith('L'):
        try:
            int(last[1:])
        except ValueError:
            # Not a lane number (e.g. "Lane") so leave it in place
            pass
        else:
            parts = parts[:-1]
    return '_'.join(parts)
|
|
76
|
|
def list_outputs(filen=None):
    """Write a recursive listing of the RESULTS directory contents.

    Arguments:
      filen (str): optional; if supplied then the listing is written
        to this file, otherwise it goes to stdout.

    Paths are reported relative to the top-level RESULTS directory
    (resolved against the current working directory).
    """
    if filen is not None:
        fp = open(filen,'w')
    else:
        fp = sys.stdout
    results_dir = os.path.abspath("RESULTS")
    fp.write("Listing contents of output dir %s:\n" % results_dir)
    ix = 0
    for d,dirs,files in os.walk(results_dir):
        ix += 1
        fp.write("-- %d: %s\n" % (ix,
                                  os.path.relpath(d,results_dir)))
        for f in files:
            ix += 1
            # NB 'f' is a bare filename so it must be joined to the
            # containing directory before taking the relative path
            # (previously relpath was applied to 'f' directly, which
            # produced bogus '../..'-style paths)
            fp.write("---- %d: %s\n" % (ix,
                                        os.path.relpath(os.path.join(d,f),
                                                        results_dir)))
    # Close output file (but never stdout)
    if filen is not None:
        fp.close()
|
|
99
|
|
if __name__ == "__main__":
    # Main driver: set up the working directory, run the pipeline
    # script, then collate logs and HTML reports for Galaxy.
    # NB all prints use the single-argument call form, which behaves
    # identically under Python 2 and Python 3.

    # Command line
    print("Amplicon analysis: starting")
    p = argparse.ArgumentParser()
    p.add_argument("metatable",
                   metavar="METATABLE_FILE",
                   help="Metatable.txt file")
    p.add_argument("fastq_pairs",
                   metavar="SAMPLE_NAME FQ_R1 FQ_R2",
                   nargs="+",
                   default=list(),
                   help="Triplets of SAMPLE_NAME followed by "
                   "a R1/R2 FASTQ file pair")
    p.add_argument("-g",dest="forward_pcr_primer")
    p.add_argument("-G",dest="reverse_pcr_primer")
    p.add_argument("-q",dest="trimming_threshold")
    p.add_argument("-O",dest="minimum_overlap")
    p.add_argument("-L",dest="minimum_length")
    p.add_argument("-l",dest="sliding_window_length")
    p.add_argument("-P",dest="pipeline",
                   choices=["Vsearch","DADA2"],
                   type=str,
                   default="Vsearch")
    p.add_argument("-S",dest="use_silva",action="store_true")
    p.add_argument("-H",dest="use_homd",action="store_true")
    p.add_argument("-r",dest="reference_data_path")
    p.add_argument("-c",dest="categories_file")
    args = p.parse_args()

    # Build the environment for running the pipeline
    print("Amplicon analysis: building the environment")
    metatable_file = os.path.abspath(args.metatable)
    os.symlink(metatable_file,"Metatable.txt")
    print("-- made symlink to Metatable.txt")

    # Link to Categories.txt file (if provided)
    if args.categories_file is not None:
        categories_file = os.path.abspath(args.categories_file)
        os.symlink(categories_file,"Categories.txt")
        print("-- made symlink to Categories.txt")

    # Link to FASTQs and construct Final_name.txt file
    sample_names = []
    print("-- making Final_name.txt")
    with open("Final_name.txt",'w') as final_name:
        # Consume the positional args three at a time:
        # SAMPLE_NAME FQ_R1 FQ_R2
        fastqs = iter(args.fastq_pairs)
        for sample_name,fqr1,fqr2 in zip(fastqs,fastqs,fastqs):
            sample_name = clean_up_name(sample_name)
            print(" %s" % sample_name)
            r1 = "%s_R1_.fastq" % sample_name
            r2 = "%s_R2_.fastq" % sample_name
            os.symlink(fqr1,r1)
            os.symlink(fqr2,r2)
            final_name.write("%s\n" % '\t'.join((r1,sample_name)))
            final_name.write("%s\n" % '\t'.join((r2,sample_name)))
            sample_names.append(sample_name)

    # Reference database: -S (Silva) takes precedence over -H (HOMD);
    # default is GreenGenes
    if args.use_silva:
        ref_database = "silva"
    elif args.use_homd:
        ref_database = "homd"
    else:
        ref_database = "gg"

    # Construct the pipeline command
    print("Amplicon analysis: constructing pipeline command")
    pipeline = PipelineCmd("Amplicon_analysis_pipeline.sh")
    if args.forward_pcr_primer:
        pipeline.add_args("-g",args.forward_pcr_primer)
    if args.reverse_pcr_primer:
        pipeline.add_args("-G",args.reverse_pcr_primer)
    if args.trimming_threshold:
        pipeline.add_args("-q",args.trimming_threshold)
    if args.minimum_overlap:
        pipeline.add_args("-O",args.minimum_overlap)
    if args.minimum_length:
        pipeline.add_args("-L",args.minimum_length)
    if args.sliding_window_length:
        pipeline.add_args("-l",args.sliding_window_length)
    if args.reference_data_path:
        pipeline.add_args("-r",args.reference_data_path)
    pipeline.add_args("-P",args.pipeline)
    if ref_database == "silva":
        pipeline.add_args("-S")
    elif ref_database == "homd":
        pipeline.add_args("-H")

    # Echo the pipeline command to stdout
    print("Running %s" % pipeline)

    # Run the pipeline, capturing combined stdout/stderr to pipeline.log
    with open("pipeline.log","w") as pipeline_out:
        try:
            subprocess.check_call(pipeline.cmd,
                                  stdout=pipeline_out,
                                  stderr=subprocess.STDOUT)
            exit_code = 0
            print("Pipeline completed ok")
        except subprocess.CalledProcessError as ex:
            # Non-zero exit status
            sys.stderr.write("Pipeline failed: exit code %s\n" %
                             ex.returncode)
            exit_code = ex.returncode
        except Exception as ex:
            # Some other problem (e.g. script not on PATH)
            sys.stderr.write("Unexpected error: %s\n" % str(ex))
            exit_code = 1

    # Write out the list of outputs
    outputs_file = "Pipeline_outputs.txt"
    list_outputs(outputs_file)

    # Check for log file
    log_file = "Amplicon_analysis_pipeline.log"
    if os.path.exists(log_file):
        print("Found log file: %s" % log_file)
        if exit_code == 0:
            # Create an HTML file to link to log files etc
            # NB the paths to the files should be correct once
            # copied by Galaxy on job completion
            with open("pipeline_outputs.html","w") as html_out:
                # NB closing tags fixed here: the header previously
                # ended with a second '<head>' and the list with a
                # second '<ul>', producing malformed HTML
                html_out.write("""<html>
<head>
<title>Amplicon analysis pipeline: log files</title>
</head>
<body>
<h1>Amplicon analysis pipeline: log files</h1>
<ul>
""")
                html_out.write(
                    "<li>%s</li>\n" %
                    ahref("Amplicon_analysis_pipeline.log",
                          type="text/plain"))
                html_out.write(
                    "<li>%s</li>\n" %
                    ahref("pipeline.log",type="text/plain"))
                html_out.write(
                    "<li>%s</li>\n" %
                    ahref("Pipeline_outputs.txt",
                          type="text/plain"))
                html_out.write(
                    "<li>%s</li>\n" %
                    ahref("Metatable.html"))
                html_out.write("""</ul>
</body>
</html>
""")
        else:
            # Check for known error messages
            check_errors()
            # Write pipeline stdout to tool stderr
            sys.stderr.write("\nOutput from pipeline:\n")
            with open("pipeline.log",'r') as log:
                sys.stderr.write("%s" % log.read())
            # Write log file contents to tool log
            print("\nAmplicon_analysis_pipeline.log:")
            with open(log_file,'r') as log:
                print("%s" % log.read())
    else:
        sys.stderr.write("ERROR missing log file \"%s\"\n" %
                         log_file)

    # Handle FastQC boxplots
    print("Amplicon analysis: collating per base quality boxplots")
    with open("fastqc_quality_boxplots.html","w") as quality_boxplots:
        # PHRED value for trimming
        phred_score = 20
        if args.trimming_threshold is not None:
            phred_score = args.trimming_threshold
        # Write header for HTML output file
        quality_boxplots.write("""<html>
<head>
<title>Amplicon analysis pipeline: Per-base Quality Boxplots (FastQC)</title>
</head>
<body>
<h1>Amplicon analysis pipeline: Per-base Quality Boxplots (FastQC)</h1>
""")
        # Look for raw and trimmed FastQC output for each sample
        for sample_name in sample_names:
            fastqc_dir = os.path.join(sample_name,"FastQC")
            quality_boxplots.write("<h2>%s</h2>" % sample_name)
            # NOTE(review): "cutdapt_sickle" looks like a typo for
            # "cutadapt_sickle" but presumably matches the directory
            # name the pipeline script actually creates — confirm
            # against Amplicon_analysis_pipeline.sh before changing
            for d in ("Raw","cutdapt_sickle/Q%s" % phred_score):
                quality_boxplots.write("<h3>%s</h3>" % d)
                fastqc_html_files = glob.glob(
                    os.path.join(fastqc_dir,d,"*_fastqc.html"))
                if not fastqc_html_files:
                    quality_boxplots.write("<p>No FastQC outputs found</p>")
                    continue
                # Pull out the per-base quality boxplots
                for f in fastqc_html_files:
                    boxplot = None
                    with open(f) as fp:
                        # Crude scan of the FastQC report: find the
                        # element tagged with the per-base quality alt
                        # text and re-attach the '>' lost in the split
                        for line in fp.read().split(">"):
                            try:
                                line.index("alt=\"Per base quality graph\"")
                                boxplot = line + ">"
                                break
                            except ValueError:
                                pass
                    if boxplot is None:
                        boxplot = "Missing plot"
                    quality_boxplots.write("<h4>%s</h4><p>%s</p>" %
                                           (os.path.basename(f),
                                            boxplot))
        quality_boxplots.write("""</body>
</html>
""")

    # Handle DADA2 error rate plot PDFs
    if args.pipeline == "DADA2":
        print("Amplicon analysis: collecting error rate plots")
        error_rate_plots_dir = os.path.abspath(
            os.path.join("DADA2_OTU_tables",
                         "Error_rate_plots"))
        error_rate_plot_pdfs = [os.path.basename(pdf)
                                for pdf in
                                sorted(glob.glob(
                                    os.path.join(error_rate_plots_dir,"*.pdf")))]
        with open("error_rate_plots.html","w") as error_rate_plots_out:
            error_rate_plots_out.write("""<html>
<head>
<title>Amplicon analysis pipeline: DADA2 Error Rate Plots</title>
</head>
<body>
<h1>Amplicon analysis pipeline: DADA2 Error Rate Plots</h1>
""")
            error_rate_plots_out.write("<ul>\n")
            for pdf in error_rate_plot_pdfs:
                error_rate_plots_out.write("<li>%s</li>\n" % ahref(pdf))
            # Close the list (was a second opening '<ul>')
            error_rate_plots_out.write("</ul>\n")
            error_rate_plots_out.write("""</body>
</html>
""")

    # Handle additional output when categories file was supplied
    if args.categories_file is not None:
        # Alpha diversity boxplots
        print("Amplicon analysis: indexing alpha diversity boxplots")
        boxplots_dir = os.path.abspath(
            os.path.join("RESULTS",
                         "%s_%s" % (args.pipeline,
                                    ref_database),
                         "Alpha_diversity",
                         "Alpha_diversity_boxplot",
                         "Categories_shannon"))
        print("Amplicon analysis: gathering PDFs from %s" % boxplots_dir)
        boxplot_pdfs = [os.path.basename(pdf)
                        for pdf in
                        sorted(glob.glob(
                            os.path.join(boxplots_dir,"*.pdf")))]
        with open("alpha_diversity_boxplots.html","w") as boxplots_out:
            boxplots_out.write("""<html>
<head>
<title>Amplicon analysis pipeline: Alpha Diversity Boxplots (Shannon)</title>
</head>
<body>
<h1>Amplicon analysis pipeline: Alpha Diversity Boxplots (Shannon)</h1>
""")
            boxplots_out.write("<ul>\n")
            for pdf in boxplot_pdfs:
                boxplots_out.write("<li>%s</li>\n" % ahref(pdf))
            # Close the list (was a second opening '<ul>')
            boxplots_out.write("</ul>\n")
            boxplots_out.write("""</body>
</html>
""")

    # Finish
    print("Amplicon analysis: finishing, exit code: %s" % exit_code)
    sys.exit(exit_code)
|