Mercurial > repos > hackdna > fastqc
view fastqc_checker.py @ 0:d8d131d08779 draft default tip
Initial upload.
author | hackdna |
---|---|
date | Tue, 21 May 2013 11:48:53 -0400 |
parents | |
children |
line wrap: on
line source
#!/usr/bin/env python ''' FastQC checker for Galaxy biomedical data analysis platform @author: Ilya Sytchev Input: one or more files in fastq format Output: sequencing quality report in text format Requires FastQC 0.10.0 (http://www.bioinformatics.bbsrc.ac.uk/projects/fastqc/) Partially based on: fastqcwrapper (http://toolshed.g2.bx.psu.edu/repos/jjohnson/fastqc) rgFastQC (https://bitbucket.org/galaxy/galaxy-dist/src/tip/tools/rgenetics/rgFastQC.py) Tested with Python 2.6.1 and 2.7.2 on Mac OS 10.6.8 ''' import sys, os, optparse, tempfile, shutil, subprocess def stop_err(msg, returncode=1): sys.stderr.write(msg) sys.exit(returncode) def __main__(): usage = "Usage: %prog -e fastqc_executable -o output_file fastq_file [fastq_file ... ]" version = "%prog 1.0.0" op = optparse.OptionParser(usage=usage, version=version) op.add_option('-e', '--executable', dest="executable", help="location of the FastQC program") op.add_option('-o', '--output', dest="outfile", help="location of the output file") (options, infiles) = op.parse_args() # check if location of the FastQC program was provided if options.executable == None: op.error("Missing location of FastQC") # check if FastQC program exists at the provided location if not os.path.isfile(options.executable): op.error("Cannot find FastQC at %s" % options.executable) # check if any input files were provided if infiles == None: op.error("Missing input files") # check if all input files exist for f in infiles: if not os.path.isfile(f): op.error("Cannot find input file %s" % f) # check if output file was provided if options.outfile == None: op.error("Missing output file name") # assemble FastQC command line cmd = [] # list is more secure than string for subprocess call cmd.append(options.executable) tmpdir = tempfile.mkdtemp() # create temp dir for FastQC output cmd.extend(['-o', tmpdir]) cmd.extend(infiles) # prepare files for FastQC stdout and stderr tmp_stderr_name = tempfile.NamedTemporaryFile(dir=tmpdir, suffix='.err').name tmp_stderr = open(tmp_stderr_name, 'w') tmp_stdout_name = tempfile.NamedTemporaryFile(dir=tmpdir, suffix='.out').name tmp_stdout = open(tmp_stdout_name, 'w') # run FastQC try: subprocess.check_call(cmd, stderr=tmp_stderr.fileno(), stdout=tmp_stdout.fileno()) except subprocess.CalledProcessError as e: stop_err("Error executing FastQC\n", e.returncode) finally: tmp_stderr.close() tmp_stdout.close() outfile = open(options.outfile, 'w') # parse all summary.txt files produced by FastQC and write results into the output file for f in infiles: filename = os.path.basename(f) (datasetname, extension) = os.path.splitext(filename) # Need to account for FastQC removing .fastq extension from input file names before using them to create output file and dir names # Alternative solution is to iterate over report directories instead of input file names if extension == '.fastq': summaryfilename = os.path.join(tmpdir, datasetname + '_fastqc', 'summary.txt') else: summaryfilename = os.path.join(tmpdir, filename + '_fastqc', 'summary.txt') outfile.write("%s results:\n" % datasetname) # if summary file exists, process and add results to the output file if os.path.isfile(summaryfilename): summaryfile = open(summaryfilename, 'r') for line in summaryfile: (result, test) = line.split('\t')[:2] outfile.write(result + '\t' + test + '\n') summaryfile.close() else: outfile.write("FastQC summary report was not found at %s.\n" % summaryfilename) outfile.write("\n") outfile.close() # clean up temp dir, put in a try block so we don't fail on stale nfs handles try: if os.path.exists(tmpdir): shutil.rmtree(tmpdir) except: pass if __name__ == '__main__': __main__()