changeset 0:d8d131d08779 draft default tip

Initial upload.
author hackdna
date Tue, 21 May 2013 11:48:53 -0400
parents
children
files fastqc_checker.py fastqc_checker.xml
diffstat 2 files changed, 166 insertions(+), 0 deletions(-) [+]
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/fastqc_checker.py	Tue May 21 11:48:53 2013 -0400
@@ -0,0 +1,109 @@
+#!/usr/bin/env python
+
+'''
+FastQC checker for the Galaxy biomedical data analysis platform
+
+@author: Ilya Sytchev
+
+Input: one or more files in fastq format
+Output: sequencing quality report in text format
+
+Requires FastQC 0.10.0 (http://www.bioinformatics.bbsrc.ac.uk/projects/fastqc/)
+
+Partially based on:
+fastqcwrapper (http://toolshed.g2.bx.psu.edu/repos/jjohnson/fastqc)
+rgFastQC (https://bitbucket.org/galaxy/galaxy-dist/src/tip/tools/rgenetics/rgFastQC.py)
+
+Tested with Python 2.6.1 and 2.7.2 on Mac OS 10.6.8
+'''
+
+import sys, os, optparse, tempfile, shutil, subprocess
+
+def stop_err(msg, returncode=1):
+    sys.stderr.write(msg)    # msg is emitted verbatim; callers supply their own newline
+    raise SystemExit(returncode)    # the same exception sys.exit() raises, so 'finally' blocks still run
+
+def __main__():
+    '''Validate the command line, run FastQC and merge its summaries into one report.
+
+    FastQC writes into a private temporary directory; the first two columns of
+    every per-input summary.txt found there are copied to the -o output file.
+    Exits via optparse on bad arguments and via stop_err() when FastQC cannot
+    be started or returns a non-zero status.
+    '''
+    usage = "Usage: %prog -e fastqc_executable -o output_file fastq_file [fastq_file ... ]"
+    version = "%prog 1.0.0"
+    op = optparse.OptionParser(usage=usage, version=version)
+    op.add_option('-e', '--executable', dest="executable", help="location of the FastQC program")
+    op.add_option('-o', '--output', dest="outfile", help="location of the output file")
+    (options, infiles) = op.parse_args()
+
+    # check if location of the FastQC program was provided
+    if options.executable is None:
+        op.error("Missing location of FastQC")
+
+    # check if FastQC program exists at the provided location
+    if not os.path.isfile(options.executable):
+        op.error("Cannot find FastQC at %s" % options.executable)
+
+    # optparse returns an empty list (never None) when no input files are given
+    if not infiles:
+        op.error("Missing input files")
+
+    # check if all input files exist
+    for f in infiles:
+        if not os.path.isfile(f):
+            op.error("Cannot find input file %s" % f)
+
+    # check if output file was provided
+    if options.outfile is None:
+        op.error("Missing output file name")
+
+    tmpdir = tempfile.mkdtemp()    # create temp dir for FastQC output
+    try:
+        # list is more secure than string for subprocess call
+        cmd = [options.executable, '-o', tmpdir] + infiles
+
+        # capture FastQC stdout and stderr; delete=False keeps the very files we
+        # opened instead of re-opening a name whose file was already removed
+        tmp_stderr = tempfile.NamedTemporaryFile(dir=tmpdir, suffix='.err', delete=False)
+        tmp_stdout = tempfile.NamedTemporaryFile(dir=tmpdir, suffix='.out', delete=False)
+        try:
+            subprocess.check_call(cmd, stderr=tmp_stderr.fileno(), stdout=tmp_stdout.fileno())
+        except subprocess.CalledProcessError as e:
+            stop_err("Error executing FastQC\n", e.returncode)
+        except OSError as e:
+            # the executable could not be started at all (e.g. not executable)
+            stop_err("Error executing FastQC: %s\n" % e)
+        finally:
+            tmp_stderr.close()
+            tmp_stdout.close()
+
+        # parse all summary.txt files produced by FastQC and write results into the output file
+        with open(options.outfile, 'w') as outfile:
+            for f in infiles:
+                filename = os.path.basename(f)
+                (datasetname, extension) = os.path.splitext(filename)
+                # FastQC removes the .fastq extension from input file names before
+                # using them to create output file and dir names
+                if extension == '.fastq':
+                    reportdir = datasetname + '_fastqc'
+                else:
+                    reportdir = filename + '_fastqc'
+                summaryfilename = os.path.join(tmpdir, reportdir, 'summary.txt')
+                outfile.write("%s results:\n" % datasetname)
+                # if summary file exists, process and add results to the output file
+                if os.path.isfile(summaryfilename):
+                    with open(summaryfilename, 'r') as summaryfile:
+                        for line in summaryfile:
+                            fields = line.rstrip('\r\n').split('\t')
+                            if len(fields) >= 2:    # skip blank or malformed lines
+                                outfile.write(fields[0] + '\t' + fields[1] + '\n')
+                else:
+                    outfile.write("FastQC summary report was not found at %s.\n" % summaryfilename)
+                outfile.write("\n")
+    finally:
+        # best-effort cleanup (e.g. stale nfs handles); now also runs on error paths
+        shutil.rmtree(tmpdir, ignore_errors=True)
+
+if __name__ == '__main__': __main__()    # run only when executed as a script, not on import
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/fastqc_checker.xml	Tue May 21 11:48:53 2013 -0400
@@ -0,0 +1,57 @@
+<tool id="fastqc_checker_1" name="FastQC checker" version="1.0.0">
+	<description>for quality control of high throughput sequence data</description>
+
+	<command interpreter="python">
+		fastqc_checker.py -e ${GALAXY_DATA_INDEX_DIR}/shared/jars/FastQC/fastqc -o $output $input
+	</command>
+
+	<inputs>
+		<param format="fastq" name="input" type="data" label="Source files" multiple="true"/>
+	</inputs>
+	<outputs>
+		<data format="txt" name="output"/>
+	</outputs>
+	
+	<help>
+
+.. class:: infomark
+
+**Purpose**
+
+FastQC aims to provide a simple way to do some quality control checks on raw
+sequence data coming from high throughput sequencing pipelines. 
+It provides a modular set of analyses which you can use to give a quick
+impression of whether your data has any problems of 
+which you should be aware before doing any further analysis.
+
+**FastQC documentation**
+
+This is a Galaxy interface to the external package FastQC_.
+Specific documentation on FastQC can be found on that site.
+
+ .. _FastQC: http://www.bioinformatics.bbsrc.ac.uk/projects/fastqc/
+
+-----
+
+.. class:: infomark
+
+**Inputs and outputs**
+
+This wrapper will accept one or more fastq files.
+
+The tool produces a single output file that contains a summary of all the results, including the following:
+
+- Basic Statistics
+- Per base sequence quality
+- Per sequence quality scores
+- Per base sequence content
+- Per base GC content
+- Per sequence GC content
+- Per base N content
+- Sequence Length Distribution
+- Sequence Duplication Levels
+- Overrepresented sequences
+- Kmer Content
+
+	</help>
+</tool>