annotate fastqc_checker.py @ 0:d8d131d08779 draft default tip

Initial upload.
author hackdna
date Tue, 21 May 2013 11:48:53 -0400
parents
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
d8d131d08779 Initial upload.
hackdna
parents:
diff changeset
1 #!/usr/bin/env python
d8d131d08779 Initial upload.
hackdna
parents:
diff changeset
2
d8d131d08779 Initial upload.
hackdna
parents:
diff changeset
3 '''
d8d131d08779 Initial upload.
hackdna
parents:
diff changeset
4 FastQC checker for Galaxy biomedical data analysis platform
d8d131d08779 Initial upload.
hackdna
parents:
diff changeset
5
d8d131d08779 Initial upload.
hackdna
parents:
diff changeset
6 @author: Ilya Sytchev
d8d131d08779 Initial upload.
hackdna
parents:
diff changeset
7
d8d131d08779 Initial upload.
hackdna
parents:
diff changeset
8 Input: one or more files in fastq format
d8d131d08779 Initial upload.
hackdna
parents:
diff changeset
9 Output: sequencing quality report in text format
d8d131d08779 Initial upload.
hackdna
parents:
diff changeset
10
d8d131d08779 Initial upload.
hackdna
parents:
diff changeset
11 Requires FastQC 0.10.0 (http://www.bioinformatics.bbsrc.ac.uk/projects/fastqc/)
d8d131d08779 Initial upload.
hackdna
parents:
diff changeset
12
d8d131d08779 Initial upload.
hackdna
parents:
diff changeset
13 Partially based on:
d8d131d08779 Initial upload.
hackdna
parents:
diff changeset
14 fastqcwrapper (http://toolshed.g2.bx.psu.edu/repos/jjohnson/fastqc)
d8d131d08779 Initial upload.
hackdna
parents:
diff changeset
15 rgFastQC (https://bitbucket.org/galaxy/galaxy-dist/src/tip/tools/rgenetics/rgFastQC.py)
d8d131d08779 Initial upload.
hackdna
parents:
diff changeset
16
d8d131d08779 Initial upload.
hackdna
parents:
diff changeset
17 Tested with Python 2.6.1 and 2.7.2 on Mac OS 10.6.8
d8d131d08779 Initial upload.
hackdna
parents:
diff changeset
18 '''
d8d131d08779 Initial upload.
hackdna
parents:
diff changeset
19
d8d131d08779 Initial upload.
hackdna
parents:
diff changeset
20 import sys, os, optparse, tempfile, shutil, subprocess
d8d131d08779 Initial upload.
hackdna
parents:
diff changeset
21
d8d131d08779 Initial upload.
hackdna
parents:
diff changeset
def stop_err(msg, returncode=1):
    """Report a fatal error and terminate the script.

    msg is written to standard error exactly as given (no newline is
    appended), then the interpreter exits with status returncode.
    """
    error_stream = sys.stderr
    error_stream.write(msg)
    # sys.exit() is implemented by raising SystemExit; raise it directly.
    raise SystemExit(returncode)
d8d131d08779 Initial upload.
hackdna
parents:
diff changeset
25
d8d131d08779 Initial upload.
hackdna
parents:
diff changeset
def _run_fastqc(executable, outdir, infiles):
    """Run FastQC on infiles, letting it write its reports into outdir.

    FastQC's console output is redirected into scratch files inside outdir
    so it does not leak into this script's own stdout/stderr. Exits through
    stop_err() if FastQC cannot be started or returns a non-zero status.
    """
    # a list is more secure than a string for the subprocess call
    cmd = [executable, '-o', outdir]
    cmd.extend(infiles)

    # Keep the temporary file objects open and hand them to the child
    # directly, instead of creating a name, discarding it and re-opening it.
    stdout_log = tempfile.NamedTemporaryFile(dir=outdir, suffix='.out')
    try:
        stderr_log = tempfile.NamedTemporaryFile(dir=outdir, suffix='.err')
        try:
            subprocess.check_call(cmd, stderr=stderr_log, stdout=stdout_log)
        except subprocess.CalledProcessError as e:
            stop_err("Error executing FastQC\n", e.returncode)
        except OSError as e:
            # e.g. the file exists but is not executable
            stop_err("Cannot execute FastQC at %s: %s\n" % (executable, e))
        finally:
            stderr_log.close()
    finally:
        stdout_log.close()


def _summary_location(report_dir, infile):
    """Return (dataset name, expected summary.txt path) for one input file."""
    filename = os.path.basename(infile)
    (datasetname, extension) = os.path.splitext(filename)
    # FastQC removes the .fastq extension from input file names before using
    # them to create output file and dir names; other names are used whole.
    if extension == '.fastq':
        report_name = datasetname + '_fastqc'
    else:
        report_name = filename + '_fastqc'
    return datasetname, os.path.join(report_dir, report_name, 'summary.txt')


def _write_report(outfile_name, report_dir, infiles):
    """Collect every FastQC summary.txt for infiles into one text report.

    For each input file the first two tab-separated columns of its summary
    (result and test name) are copied; a note is written when the summary
    file is missing.
    """
    with open(outfile_name, 'w') as outfile:
        for f in infiles:
            datasetname, summaryfilename = _summary_location(report_dir, f)
            outfile.write("%s results:\n" % datasetname)
            if os.path.isfile(summaryfilename):
                with open(summaryfilename, 'r') as summaryfile:
                    for line in summaryfile:
                        fields = line.rstrip('\r\n').split('\t')
                        if len(fields) < 2:
                            # blank or malformed line: nothing to report
                            continue
                        outfile.write(fields[0] + '\t' + fields[1] + '\n')
            else:
                outfile.write("FastQC summary report was not found at %s.\n" % summaryfilename)
            outfile.write("\n")


def __main__():
    """Run FastQC on the given fastq files and write a combined text report.

    Command line (read from sys.argv by optparse):
        -e / --executable   location of the FastQC program (required)
        -o / --output       location of the report file to write (required)
        positional args     one or more existing fastq files

    Invalid arguments end the script through OptionParser.error() (exit
    status 2). A FastQC failure ends it through stop_err() with FastQC's
    return code. The temporary directory used for FastQC's output is removed
    in every case.
    """
    usage = "Usage: %prog -e fastqc_executable -o output_file fastq_file [fastq_file ... ]"
    version = "%prog 1.0.0"
    op = optparse.OptionParser(usage=usage, version=version)
    op.add_option('-e', '--executable', dest="executable", help="location of the FastQC program")
    op.add_option('-o', '--output', dest="outfile", help="location of the output file")
    (options, infiles) = op.parse_args()

    # check if location of the FastQC program was provided
    if options.executable is None:
        op.error("Missing location of FastQC")

    # check if FastQC program exists at the provided location
    if not os.path.isfile(options.executable):
        op.error("Cannot find FastQC at %s" % options.executable)

    # check if any input files were provided; optparse hands back an empty
    # list (never None) when there are no positional arguments
    if not infiles:
        op.error("Missing input files")

    # check if all input files exist
    for f in infiles:
        if not os.path.isfile(f):
            op.error("Cannot find input file %s" % f)

    # check if output file was provided
    if options.outfile is None:
        op.error("Missing output file name")

    tmpdir = tempfile.mkdtemp()  # temp dir for FastQC output
    try:
        _run_fastqc(options.executable, tmpdir, infiles)
        _write_report(options.outfile, tmpdir, infiles)
    finally:
        # Best-effort clean up, also on failure paths; ignore_errors keeps us
        # from failing on stale nfs handles.
        shutil.rmtree(tmpdir, ignore_errors=True)
d8d131d08779 Initial upload.
hackdna
parents:
diff changeset
108
d8d131d08779 Initial upload.
hackdna
parents:
diff changeset
# Run the checker only when this file is executed as a script, not on import.
if __name__ == '__main__':
    __main__()