Mercurial > repos > hackdna > fastqc
changeset 0:d8d131d08779 draft default tip
Initial upload.
author | hackdna |
---|---|
date | Tue, 21 May 2013 11:48:53 -0400 |
parents | |
children | |
files | fastqc_checker.py fastqc_checker.xml |
diffstat | 2 files changed, 166 insertions(+), 0 deletions(-) [+] |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/fastqc_checker.py Tue May 21 11:48:53 2013 -0400 @@ -0,0 +1,109 @@ +#!/usr/bin/env python + +''' +FastQC checker for Galaxy biomedical data analysis platform + +@author: Ilya Sytchev + +Input: one or more files in fastq format +Output: sequencing quality report in text format + +Requires FastQC 0.10.0 (http://www.bioinformatics.bbsrc.ac.uk/projects/fastqc/) + +Partially based on: +fastqcwrapper (http://toolshed.g2.bx.psu.edu/repos/jjohnson/fastqc) +rgFastQC (https://bitbucket.org/galaxy/galaxy-dist/src/tip/tools/rgenetics/rgFastQC.py) + +Tested with Python 2.6.1 and 2.7.2 on Mac OS 10.6.8 +''' + +import sys, os, optparse, tempfile, shutil, subprocess + +def stop_err(msg, returncode=1): + sys.stderr.write(msg) + sys.exit(returncode) + +def __main__(): + usage = "Usage: %prog -e fastqc_executable -o output_file fastq_file [fastq_file ... ]" + version = "%prog 1.0.0" + op = optparse.OptionParser(usage=usage, version=version) + op.add_option('-e', '--executable', dest="executable", help="location of the FastQC program") + op.add_option('-o', '--output', dest="outfile", help="location of the output file") + (options, infiles) = op.parse_args() + + # check if location of the FastQC program was provided + if options.executable == None: + op.error("Missing location of FastQC") + + # check if FastQC program exists at the provided location + if not os.path.isfile(options.executable): + op.error("Cannot find FastQC at %s" % options.executable) + + # check if any input files were provided + if infiles == None: + op.error("Missing input files") + + # check if all input files exist + for f in infiles: + if not os.path.isfile(f): + op.error("Cannot find input file %s" % f) + + # check if output file was provided + if options.outfile == None: + op.error("Missing output file name") + + # assemble FastQC command line + cmd = [] # list is more secure than string for subprocess call + cmd.append(options.executable) + tmpdir = tempfile.mkdtemp() # create temp dir for FastQC output + cmd.extend(['-o', tmpdir]) + cmd.extend(infiles) + + # prepare files for FastQC stdout and stderr + tmp_stderr_name = tempfile.NamedTemporaryFile(dir=tmpdir, suffix='.err').name + tmp_stderr = open(tmp_stderr_name, 'w') + tmp_stdout_name = tempfile.NamedTemporaryFile(dir=tmpdir, suffix='.out').name + tmp_stdout = open(tmp_stdout_name, 'w') + # run FastQC + try: + subprocess.check_call(cmd, stderr=tmp_stderr.fileno(), stdout=tmp_stdout.fileno()) + except subprocess.CalledProcessError as e: + stop_err("Error executing FastQC\n", e.returncode) + finally: + tmp_stderr.close() + tmp_stdout.close() + + outfile = open(options.outfile, 'w') + + # parse all summary.txt files produced by FastQC and write results into the output file + for f in infiles: + filename = os.path.basename(f) + (datasetname, extension) = os.path.splitext(filename) + # Need to account for FastQC removing .fastq extension from input file names before using them to create output file and dir names + # Alternative solution is to iterate over report directories instead of input file names + if extension == '.fastq': + summaryfilename = os.path.join(tmpdir, datasetname + '_fastqc', 'summary.txt') + else: + summaryfilename = os.path.join(tmpdir, filename + '_fastqc', 'summary.txt') + outfile.write("%s results:\n" % datasetname) + # if summary file exists, process and add results to the output file + if os.path.isfile(summaryfilename): + summaryfile = open(summaryfilename, 'r') + for line in summaryfile: + (result, test) = line.split('\t')[:2] + outfile.write(result + '\t' + test + '\n') + summaryfile.close() + else: + outfile.write("FastQC summary report was not found at %s.\n" % summaryfilename) + outfile.write("\n") + + outfile.close() + + # clean up temp dir, put in a try block so we don't fail on stale nfs handles + try: + if os.path.exists(tmpdir): + shutil.rmtree(tmpdir) + except: + pass + +if __name__ == '__main__': __main__()
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/fastqc_checker.xml Tue May 21 11:48:53 2013 -0400 @@ -0,0 +1,57 @@ +<tool id="fastqc_checker_1" name="FastQC checker" version="1.0.0"> + <description>for quality control of high throughput sequence data</description> + + <command interpreter="python"> + fastqc_checker.py -e ${GALAXY_DATA_INDEX_DIR}/shared/jars/FastQC/fastqc -o $output $input + </command> + + <inputs> + <param format="fastq" name="input" type="data" label="Source files" multiple="true"/> + </inputs> + <outputs> + <data format="txt" name="output"/> + </outputs> + + <help> + +.. class:: infomark + +**Purpose** + +FastQC aims to provide a simple way to do some quality control checks on raw +sequence data coming from high throughput sequencing pipelines. +It provides a modular set of analyses which you can use to give a quick +impression of whether your data has any problems of +which you should be aware before doing any further analysis. + +**FastQC documentation** + +This is a Galaxy interface to the external package FastQC_. +Specific documentation on FastQC can be found on that site. + + .. _FastQC: http://www.bioinformatics.bbsrc.ac.uk/projects/fastqc/ + +----- + +.. class:: infomark + +**Inputs and outputs** + +This wrapper will accept one or more fastq files. + +The tool produces a single output file that contains summary of all the results, including the following: + +- Basic Statistics +- Per base sequence quality +- Per sequence quality scores +- Per base sequence content +- Per base GC content +- Per sequence GC content +- Per base N content +- Sequence Length Distribution +- Sequence Duplication Levels +- Overrepresented sequences +- Kmer Content + + </help> +</tool>