comparison fastqc_checker.py @ 0:d8d131d08779 draft default tip

Initial upload.
author hackdna
date Tue, 21 May 2013 11:48:53 -0400
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:d8d131d08779
1 #!/usr/bin/env python
2
3 '''
4 FastQC checker for Galaxy biomedical data analysis platform
5
6 @author: Ilya Sytchev
7
8 Input: one or more files in fastq format
9 Output: sequencing quality report in text format
10
11 Requires FastQC 0.10.0 (http://www.bioinformatics.bbsrc.ac.uk/projects/fastqc/)
12
13 Partially based on:
14 fastqcwrapper (http://toolshed.g2.bx.psu.edu/repos/jjohnson/fastqc)
15 rgFastQC (https://bitbucket.org/galaxy/galaxy-dist/src/tip/tools/rgenetics/rgFastQC.py)
16
17 Tested with Python 2.6.1 and 2.7.2 on Mac OS 10.6.8
18 '''
19
20 import sys, os, optparse, tempfile, shutil, subprocess
21
22 def stop_err(msg, returncode=1):
23 sys.stderr.write(msg)
24 sys.exit(returncode)
25
26 def __main__():
27 usage = "Usage: %prog -e fastqc_executable -o output_file fastq_file [fastq_file ... ]"
28 version = "%prog 1.0.0"
29 op = optparse.OptionParser(usage=usage, version=version)
30 op.add_option('-e', '--executable', dest="executable", help="location of the FastQC program")
31 op.add_option('-o', '--output', dest="outfile", help="location of the output file")
32 (options, infiles) = op.parse_args()
33
34 # check if location of the FastQC program was provided
35 if options.executable == None:
36 op.error("Missing location of FastQC")
37
38 # check if FastQC program exists at the provided location
39 if not os.path.isfile(options.executable):
40 op.error("Cannot find FastQC at %s" % options.executable)
41
42 # check if any input files were provided
43 if infiles == None:
44 op.error("Missing input files")
45
46 # check if all input files exist
47 for f in infiles:
48 if not os.path.isfile(f):
49 op.error("Cannot find input file %s" % f)
50
51 # check if output file was provided
52 if options.outfile == None:
53 op.error("Missing output file name")
54
55 # assemble FastQC command line
56 cmd = [] # list is more secure than string for subprocess call
57 cmd.append(options.executable)
58 tmpdir = tempfile.mkdtemp() # create temp dir for FastQC output
59 cmd.extend(['-o', tmpdir])
60 cmd.extend(infiles)
61
62 # prepare files for FastQC stdout and stderr
63 tmp_stderr_name = tempfile.NamedTemporaryFile(dir=tmpdir, suffix='.err').name
64 tmp_stderr = open(tmp_stderr_name, 'w')
65 tmp_stdout_name = tempfile.NamedTemporaryFile(dir=tmpdir, suffix='.out').name
66 tmp_stdout = open(tmp_stdout_name, 'w')
67 # run FastQC
68 try:
69 subprocess.check_call(cmd, stderr=tmp_stderr.fileno(), stdout=tmp_stdout.fileno())
70 except subprocess.CalledProcessError as e:
71 stop_err("Error executing FastQC\n", e.returncode)
72 finally:
73 tmp_stderr.close()
74 tmp_stdout.close()
75
76 outfile = open(options.outfile, 'w')
77
78 # parse all summary.txt files produced by FastQC and write results into the output file
79 for f in infiles:
80 filename = os.path.basename(f)
81 (datasetname, extension) = os.path.splitext(filename)
82 # Need to account for FastQC removing .fastq extension from input file names before using them to create output file and dir names
83 # Alternative solution is to iterate over report directories instead of input file names
84 if extension == '.fastq':
85 summaryfilename = os.path.join(tmpdir, datasetname + '_fastqc', 'summary.txt')
86 else:
87 summaryfilename = os.path.join(tmpdir, filename + '_fastqc', 'summary.txt')
88 outfile.write("%s results:\n" % datasetname)
89 # if summary file exists, process and add results to the output file
90 if os.path.isfile(summaryfilename):
91 summaryfile = open(summaryfilename, 'r')
92 for line in summaryfile:
93 (result, test) = line.split('\t')[:2]
94 outfile.write(result + '\t' + test + '\n')
95 summaryfile.close()
96 else:
97 outfile.write("FastQC summary report was not found at %s.\n" % summaryfilename)
98 outfile.write("\n")
99
100 outfile.close()
101
102 # clean up temp dir, put in a try block so we don't fail on stale nfs handles
103 try:
104 if os.path.exists(tmpdir):
105 shutil.rmtree(tmpdir)
106 except:
107 pass
108
109 if __name__ == '__main__': __main__()