Mercurial > repos > peterjc > seq_filter_by_mapping

--- a/tools/seq_filter_by_mapping/README.rst	Wed Nov 19 10:00:34 2014 -0500
+++ b/tools/seq_filter_by_mapping/README.rst	Tue Jan 27 08:29:54 2015 -0500
@@ -61,6 +61,7 @@
 Version Changes
 ------- ----------------------------------------------------------------------
 v0.0.1  - Initial version.
+v0.0.2  - Fixed some error messages.
 ======= ======================================================================


@@ -76,7 +77,7 @@
 For making the "Galaxy Tool Shed" http://toolshed.g2.bx.psu.edu/ tarball use
 the following command from the Galaxy root folder::

-    $ tar -czf seq_filter_by_mapping.tar.gz tools/seq_filter_by_mapping/README.rst tools/seq_filter_by_mapping/seq_filter_by_mapping.* tools/seq_filter_by_mapping/tool_dependencies.xml test-data/SRR639755_mito_pairs.fastq.gz test-data/SRR639755_sample_by_coord.sam test-data/SRR639755_sample_strict.fastq test-data/SRR639755_sample_lax.fastq
+    $ tar -czf seq_filter_by_mapping.tar.gz tools/seq_filter_by_mapping/README.rst tools/seq_filter_by_mapping/seq_filter_by_mapping.py tools/seq_filter_by_mapping/seq_filter_by_mapping.xml tools/seq_filter_by_mapping/tool_dependencies.xml test-data/SRR639755_mito_pairs.fastq.gz test-data/SRR639755_sample_by_coord.sam test-data/SRR639755_sample_strict.fastq test-data/SRR639755_sample_lax.fastq

 Check this worked::
--- a/tools/seq_filter_by_mapping/seq_filter_by_mapping.py	Wed Nov 19 10:00:34 2014 -0500
+++ b/tools/seq_filter_by_mapping/seq_filter_by_mapping.py	Tue Jan 27 08:29:54 2015 -0500
@@ -24,7 +24,7 @@
 import subprocess
 from optparse import OptionParser

-def stop_err(msg, err=1):
+def sys_exit(msg, err=1):
     sys.stderr.write(msg.rstrip() + "\n")
     sys.exit(err)

@@ -64,7 +64,7 @@
 options, args = parser.parse_args()

 if options.version:
-    print "v0.0.1"
+    print "v0.0.2"
     sys.exit(0)

 in_file = options.input
@@ -74,18 +74,18 @@
 pair_mode = options.pair_mode

 if in_file is None or not os.path.isfile(in_file):
-    stop_err("Missing input file: %r" % in_file)
+    sys_exit("Missing input file: %r" % in_file)
 if out_positive_file is None and out_negative_file is None:
-    stop_err("Neither output file requested")
+    sys_exit("Neither output file requested")
 if seq_format is None:
-    stop_err("Missing sequence format")
+    sys_exit("Missing sequence format")
 if pair_mode not in ["lax", "strict"]:
-    stop_err("Pair mode argument should be 'lax' or 'strict', not %r" % logic)
+    sys_exit("Pair mode argument should be 'lax' or 'strict', not %r" % pair_mode)
 for mapping in args:
     if not os.path.isfile(mapping):
-        stop_err("Mapping file %r not found")
+        sys_exit("Mapping file %r not found" % mapping)
 if not args:
-    stop_err("At least one SAM/BAM mapping file is required")
+    sys_exit("At least one SAM/BAM mapping file is required")


 #Cope with three widely used suffix naming convensions,
@@ -187,8 +187,9 @@
         stdout, stderr = child.communicate()
         assert child.returncode is not None
         if child.returncode:
-            msg = "Error %i from 'samtools view %s'\n%s" % (filename, stderr)
-            stop_err(msg.strip(), child.returncode)
+            msg = "Error %i from 'samtools view %s'\n%s" % (child.returncode,
+                                                            filename, stderr)
+            sys_exit(msg.strip(), child.returncode)
     else:
         handle.close()

@@ -317,7 +318,7 @@
     try:
         from Bio.SeqIO.SffIO import SffIterator, SffWriter
     except ImportError:
-        stop_err("SFF filtering requires Biopython 1.54 or later")
+        sys_exit("SFF filtering requires Biopython 1.54 or later")

     try:
         from Bio.SeqIO.SffIO import ReadRocheXmlManifest
@@ -366,4 +367,4 @@
     fastq_filter(in_file, out_positive_file, out_negative_file, ids)
     # This does not currently track the counts
 else:
-    stop_err("Unsupported file type %r" % seq_format)
+    sys_exit("Unsupported file type %r" % seq_format)
--- a/tools/seq_filter_by_mapping/seq_filter_by_mapping.py~	Wed Nov 19 10:00:34 2014 -0500
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,307 +0,0 @@
-#!/usr/bin/env python
-"""Filter a FASTA, FASTQ or SSF file with SAM/BAM mapping information.
-
-When filtering an SFF file, any Roche XML manifest in the input file is
-preserved in both output files.
-
-This tool is a short Python script which requires Biopython 1.54 or later
-for SFF file support. If you use this tool in scientific work leading to a
-publication, please cite the Biopython application note:
-
-Cock et al 2009. Biopython: freely available Python tools for computational
-molecular biology and bioinformatics. Bioinformatics 25(11) 1422-3.
-http://dx.doi.org/10.1093/bioinformatics/btp163 pmid:19304878.
-
-This script is copyright 2014 by Peter Cock, The James Hutton Institute
-(formerly the Scottish Crop Research Institute, SCRI), UK. All rights reserved.
-See accompanying text file for licence details (MIT license).
-
-Use -v or --version to get the version, -h or --help for help.
-"""
-import os
-import sys
-import re
-from optparse import OptionParser
-
-def stop_err(msg, err=1):
-    sys.stderr.write(msg.rstrip() + "\n")
-    sys.exit(err)
-
-#Parse Command Line
-usage = """Use as follows:
-
-$ python seq_filter_by_mapping.py [options] mapping.sam/bam [more mappings]
-
-e.g. Positive matches using column one from a single BAM file:
-
-$ seq_filter_by_mapping.py -i my_seqs.fastq -f fastq -p matches.fastq mapping.bam
-
-Multiple SAM/BAM mapping files may be given.
-"""
-parser = OptionParser(usage=usage)
-parser.add_option('-i', '--input', dest='input',
-                  default=None, help='Input sequences filename',
-                  metavar="FILE")
-parser.add_option('-f', '--format', dest='format',
-                  default=None,
-                  help='Input sequence format (e.g. fasta, fastq, sff)')
-parser.add_option('-p', '--positive', dest='output_positive',
-                  default=None,
-                  help='Output filename for mapping reads',
-                  metavar="FILE")
-parser.add_option('-n', '--negative', dest='output_negative',
-                  default=None,
-                  help='Output filename for non-mapping reads',
-                  metavar="FILE")
-parser.add_option("-m", "--pair-mode", dest="pair_mode",
-                  default="lax",
-                  help="How to treat paired reads (lax or strict)")
-parser.add_option("-v", "--version", dest="version",
-                  default=False, action="store_true",
-                  help="Show version and quit")
-
-options, args = parser.parse_args()
-
-if options.version:
-    print "v0.0.1"
-    sys.exit(0)
-
-in_file = options.input
-seq_format = options.format
-out_positive_file = options.output_positive
-out_negative_file = options.output_negative
-pair_mode = options.pair_mode
-
-if in_file is None or not os.path.isfile(in_file):
-    stop_err("Missing input file: %r" % in_file)
-if out_positive_file is None and out_negative_file is None:
-    stop_err("Neither output file requested")
-if seq_format is None:
-    stop_err("Missing sequence format")
-if pair_mode not in ["lax", "strict"]:
-    stop_err("Pair mode argument should be 'lax' or 'strict', not %r" % logic)
-for mapping in args:
-    if not os.path.isfile(mapping):
-        stop_err("Mapping file %r not found")
-if not args:
-    stop_err("At least one SAM/BAM mapping file is required")
-
-
-#Cope with three widely used suffix naming convensions,
-#Illumina: /1 or /2
-#Forward/revered: .f or .r
-#Sanger, e.g. .p1k and .q1k
-#See http://staden.sourceforge.net/manual/pregap4_unix_50.html
-#re_f = re.compile(r"(/1|\.f|\.[sfp]\d\w*)$")
-#re_r = re.compile(r"(/2|\.r|\.[rq]\d\w*)$")
-re_suffix = re.compile(r"(/1|\.f|\.[sfp]\d\w*|/2|\.r|\.[rq]\d\w*)$")
-assert re_suffix.search("demo.f")
-assert re_suffix.search("demo.s1")
-assert re_suffix.search("demo.f1k")
-assert re_suffix.search("demo.p1")
-assert re_suffix.search("demo.p1k")
-assert re_suffix.search("demo.p1lk")
-assert re_suffix.search("demo/2")
-assert re_suffix.search("demo.r")
-assert re_suffix.search("demo.q1")
-assert re_suffix.search("demo.q1lk")
-
-def clean_name(name):
-    """Remove suffix."""
-    match = re_suffix.search(name)
-    if match:
-        # Use the fact this is a suffix, and regular expression will be
-        # anchored to the end of the name:
-        return name[:match.start()]
-    else:
-        # Nothing to do
-        return name
-assert clean_name("foo/1") == "foo"
-assert clean_name("foo/2") == "foo"
-assert clean_name("bar.f") == "bar"
-assert clean_name("bar.r") == "bar"
-assert clean_name("baz.p1") == "baz"
-assert clean_name("baz.q2") == "baz"
-
-mapped_chars = { '>' :'__gt__',
-                 '<' :'__lt__',
-                 "'" :'__sq__',
-                 '"' :'__dq__',
-                 '[' :'__ob__',
-                 ']' :'__cb__',
-                 '{' :'__oc__',
-                 '}' :'__cc__',
-                 '@' : '__at__',
-                 '\n' : '__cn__',
-                 '\r' : '__cr__',
-                 '\t' : '__tc__',
-                 '#' : '__pd__'
-                 }
-
-# Read mapping file(s) and record all mapped identifiers
-ids = set()
-for mapping in args:
-    handle = open(mapping)
-    # Hope this is SAM format...
-    for line in handle:
-        # Ignore header lines
-        if line[0] != "@":
-            qname, flag, rest = line.split("\t", 2)
-            flag = int(flag)
-            # TODO - pair mode
-            if flag & 0x4:
-                ids.add(qname)
-# TODO - If want to support naive paired mode, have to record
-# more than just qname (need /1 or /2 indicator)
-print("Loaded %i mapped IDs" % (len(ids)))
-
-
-def crude_fasta_iterator(handle):
-    """Yields tuples, record ID and the full record as a string."""
-    while True:
-        line = handle.readline()
-        if line == "":
-            return # Premature end of file, or just empty?
-        if line[0] == ">":
-            break
-
-    no_id_warned = False
-    while True:
-        if line[0] != ">":
-            raise ValueError(
-                "Records in Fasta files should start with '>' character")
-        try:
-            id = line[1:].split(None, 1)[0]
-        except IndexError:
-            if not no_id_warned:
-                sys.stderr.write("WARNING - Malformed FASTA entry with no identifier\n")
-                no_id_warned = True
-            id = None
-        lines = [line]
-        line = handle.readline()
-        while True:
-            if not line:
-                break
-            if line[0] == ">":
-                break
-            lines.append(line)
-            line = handle.readline()
-        yield id, "".join(lines)
-        if not line:
-            return # StopIteration
-
-
-def fasta_filter(in_file, pos_file, neg_file, wanted):
-    """FASTA filter producing 60 character line wrapped outout."""
-    pos_count = neg_count = 0
-    #Galaxy now requires Python 2.5+ so can use with statements,
-    with open(in_file) as in_handle:
-        #Doing the if statement outside the loop for speed
-        #(with the downside of three very similar loops).
-        if pos_file is not None and neg_file is not None:
-            print "Generating two FASTA files"
-            with open(pos_file, "w") as pos_handle:
-                with open(neg_file, "w") as neg_handle:
-                    for identifier, record in crude_fasta_iterator(in_handle):
-                        if clean_name(identifier) in wanted:
-                            pos_handle.write(record)
-                            pos_count += 1
-                        else:
-                            neg_handle.write(record)
-                            neg_count += 1
-        elif pos_file is not None:
-            print "Generating matching FASTA file"
-            with open(pos_file, "w") as pos_handle:
-                for identifier, record in crude_fasta_iterator(in_handle):
-                    if clean_name(identifier) in wanted:
-                        pos_handle.write(record)
-                        pos_count += 1
-                    else:
-                        neg_count += 1
-        else:
-            print "Generating non-matching FASTA file"
-            assert neg_file is not None
-            with open(neg_file, "w") as neg_handle:
-                for identifier, record in crude_fasta_iterator(in_handle):
-                    if clean_name(identifier) in wanted:
-                        pos_count += 1
-                    else:
-                        neg_handle.write(record)
-                        neg_count += 1
-    return pos_count, neg_count
-
-
-if seq_format.lower()=="sff":
-    #Now write filtered SFF file based on IDs from BLAST file
-    try:
-        from Bio.SeqIO.SffIO import SffIterator, SffWriter
-    except ImportError:
-        stop_err("SFF filtering requires Biopython 1.54 or later")
-
-    try:
-        from Bio.SeqIO.SffIO import ReadRocheXmlManifest
-    except ImportError:
-        #Prior to Biopython 1.56 this was a private function
-        from Bio.SeqIO.SffIO import _sff_read_roche_index_xml as ReadRocheXmlManifest
-    in_handle = open(in_file, "rb") #must be binary mode!
-    try:
-        manifest = ReadRocheXmlManifest(in_handle)
-    except ValueError:
-        manifest = None
-    #This makes two passes though the SFF file with isn't so efficient,
-    #but this makes the code simple.
-    pos_count = neg_count = 0
-    if out_positive_file is not None:
-        out_handle = open(out_positive_file, "wb")
-        writer = SffWriter(out_handle, xml=manifest)
-        in_handle.seek(0) #start again after getting manifest
-        pos_count = writer.write_file(rec for rec in SffIterator(in_handle) if clean_name(rec.id) in ids)
-        out_handle.close()
-    if out_negative_file is not None:
-        out_handle = open(out_negative_file, "wb")
-        writer = SffWriter(out_handle, xml=manifest)
-        in_handle.seek(0) #start again
-        neg_count = writer.write_file(rec for rec in SffIterator(in_handle) if clean_name(rec.id) not in ids)
-        out_handle.close()
-    #And we're done
-    in_handle.close()
-    #At the time of writing, Galaxy doesn't show SFF file read counts,
-    #so it is useful to put them in stdout and thus shown in job info.
-    print "%i with and %i without specified IDs" % (pos_count, neg_count)
-elif seq_format.lower()=="fasta":
-    #Write filtered FASTA file based on IDs from tabular file
-    pos_count, neg_count = fasta_filter(in_file, out_positive_file, out_negative_file, ids)
-    print "%i with and %i without specified IDs" % (pos_count, neg_count)
-elif seq_format.lower().startswith("fastq"):
-    #Write filtered FASTQ file based on IDs from mapping file
-    from Bio.SeqIO.QualityIO import FastqGeneralIterator
-    handle = open(in_file, "r")
-    if out_positive_file is not None and out_negative_file is not None:
-        print "Generating two FASTQ files"
-        positive_handle = open(out_positive_file, "w")
-        negative_handle = open(out_negative_file, "w")
-        for title, seq, qual in FastqGeneralIterator(handle):
-            print("%s --> %s" % (title, clean_name(title.split(None, 1)[0])))
-            if clean_name(title.split(None, 1)[0]) in ids:
-                positive_handle.write("@%s\n%s\n+\n%s\n" % (title, seq, qual))
-            else:
-                negative_handle.write("@%s\n%s\n+\n%s\n" % (title, seq, qual))
-        positive_handle.close()
-        negative_handle.close()
-    elif out_positive_file is not None:
-        print "Generating matching FASTQ file"
-        positive_handle = open(out_positive_file, "w")
-        for title, seq, qual in FastqGeneralIterator(handle):
-            if clean_name(title.split(None, 1)[0]) in ids:
-                positive_handle.write("@%s\n%s\n+\n%s\n" % (title, seq, qual))
-        positive_handle.close()
-    elif out_negative_file is not None:
-        print "Generating non-matching FASTQ file"
-        negative_handle = open(out_negative_file, "w")
-        for title, seq, qual in FastqGeneralIterator(handle):
-            if clean_name(title.split(None, 1)[0]) not in ids:
-                negative_handle.write("@%s\n%s\n+\n%s\n" % (title, seq, qual))
-        negative_handle.close()
-    handle.close()
-else:
-    stop_err("Unsupported file type %r" % seq_format)
--- a/tools/seq_filter_by_mapping/seq_filter_by_mapping.xml	Wed Nov 19 10:00:34 2014 -0500
+++ b/tools/seq_filter_by_mapping/seq_filter_by_mapping.xml	Tue Jan 27 08:29:54 2015 -0500
@@ -1,4 +1,4 @@
-<tool id="seq_filter_by_mapping" name="Filter sequences by mapping" version="0.0.1">
+<tool id="seq_filter_by_mapping" name="Filter sequences by mapping" version="0.0.2">
     <description>from SAM/BAM file</description>
     <requirements>
         <requirement type="package" version="1.64">biopython</requirement>
--- a/tools/seq_filter_by_mapping/seq_filter_by_mapping.xml~	Wed Nov 19 10:00:34 2014 -0500
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,122 +0,0 @@
-<tool id="seq_filter_by_mapping" name="Filter sequences by mapping" version="0.0.1">
-    <description>from SAM/BAM file</description>
-    <requirements>
-        <requirement type="package" version="1.64">biopython</requirement>
-        <requirement type="python-module">Bio</requirement>
-    </requirements>
-    <version_command interpreter="python">seq_filter_by_mapping.py --version</version_command>
-    <command interpreter="python">
-seq_filter_by_mapping.py -i "$input_file" -f "$input_file.ext" -m $pair_mode
-#if $output_choice_cond.output_choice=="both"
- -p $output_pos -n $output_neg
-#elif $output_choice_cond.output_choice=="pos"
- -p $output_pos
-#elif $output_choice_cond.output_choice=="neg"
- -n $output_neg
-#end if
-    </command>
-    <stdio>
-        <!-- Anything other than zero is an error -->
-        <exit_code range="1:" />
-        <exit_code range=":-1" />
-    </stdio>
-    <inputs>
-        <param name="input_file" type="data" format="fasta,fastq,sff" label="Sequence file to be filtered" help="FASTA, FASTQ, or SFF format." />
-	<param name="mapping_file" type="data" format="sam,bam" label="SAM/BAM mapping of those sequences" help="SAM or BAM format." />
-        <conditional name="output_choice_cond">
-            <param name="output_choice" type="select" label="Output mapped reads, unmapped reads, or both?">
-                <option value="both">Both mapped and unmapped reads, as two files</option>
-                <option value="pos">Just mapped reads, as a single file</option>
-                <option value="neg">Just unmapped reads, as a single file</option>
-            </param>
-            <!-- Seems need these dummy entries here, compare this to indels/indel_sam2interval.xml -->
-            <when value="both" />
-            <when value="pos" />
-            <when value="neg" />
-        </conditional>
-	<param name="pair_mode" type="select" label="Paired read treatment">
-            <option value="strict">Treat as a pair, require both reads to be mapped</option>
-            <option value="lax" selected="true">Treat as a pair, allow either read to be mapped</option>
-	    <option value="strict">Treat as a pair, require both reads to be mapped</option>
-	    <!-- The following would actually be more work as have to store qname/1 and qname/2 separately for filter...
-            <option value="solo">Treat independently (will split partners when only one maps)</option>
-	    -->
-        </param>
-    </inputs>
-    <outputs>
-        <data name="output_pos" format="input" metadata_source="input_file" label="With matched ID">
-            <filter>output_choice_cond["output_choice"] != "neg"</filter>
-        </data>
-        <data name="output_neg" format="input" metadata_source="input_file" label="Without matched ID">
-            <filter>output_choice_cond["output_choice"] != "pos"</filter>
-        </data>
-    </outputs>
-    <tests>
-        <test>
-            <param name="input_file" value="k12_ten_proteins.fasta" ftype="fasta" />
-            <param name="input_tabular" value="k12_hypothetical.tabular" ftype="tabular" />
-            <param name="output_choice" value="pos" />
-            <output name="output_pos" file="k12_hypothetical.fasta" ftype="fasta" />
-        </test>
-        <test>
-            <param name="input_file" value="k12_ten_proteins.fasta" ftype="fasta" />
-            <param name="input_tabular" value="k12_hypothetical.tabular" ftype="tabular" />
-            <output name="output_pos" file="k12_hypothetical.fasta" ftype="fasta" />
-        </test>
-        <test>
-            <param name="input_file" value="k12_ten_proteins.fasta" ftype="fasta" />
-            <output name="output_pos" file="k12_hypothetical.fasta" ftype="fasta" />
-        </test>
-        <test>
-            <param name="input_file" value="sanger-pairs-mixed.fastq" ftype="fastq" />
-            <output name="output_pos" file="sanger-sample.fastq" ftype="fastq" />
-        </test>
-        <test>
-            <param name="input_file" value="sanger-pairs-mixed.fastq" ftype="fastq" />
-            <output name="output_pos" file="sanger-pairs-mixed.fastq" ftype="fastq" />
-	    <output name="output_neg" file="empty_file.dat" ftype="fastq" />
-        </test>
-        <test>
-            <param name="input_file" value="sanger-pairs-mixed.fastq" ftype="fastq" />
-            <output name="output_pos" file="empty_file.dat" ftype="fastq" />
-            <output name="output_neg" file="sanger-pairs-mixed.fastq" ftype="fastq" />
-        </test>
-    </tests>
-    <help>
-**What it does**
-
-By default it divides a FASTA, FASTQ or Standard Flowgram Format (SFF) file in
-two, those sequences (or read pairs) which do or don't map in the provided
-SAM/BAM file. You can opt to have a single output file of just the mapping reads,
-or just the non-mapping ones.
-
-**Example Usage**
-
-You might wish to perform a contamination screan by mapping your reads against
-known contaminant reference sequences, then use this tool to select only the
-unmapped reads for further analysis (e.g. *de novo* assembly).
-
-Similarly you might wish to map your reads against a known bacterial reference,
-then take the non-mapping sequences forward for analysis if looking for novel
-plasmids.
-
-
-**References**
-
-If you use this Galaxy tool in work leading to a scientific publication please
-cite:
-
-Peter J.A. Cock (2014), Galaxy tool for filtering reads by mapping
-http://toolshed.g2.bx.psu.edu/view/peterjc/seq_filter_by_mapping
-
-This tool uses Biopython to read and write SFF files, so you may also wish to
-cite the Biopython application note (and Galaxy too of course):
-
-Cock et al (2009). Biopython: freely available Python tools for computational
-molecular biology and bioinformatics. Bioinformatics 25(11) 1422-3.
-http://dx.doi.org/10.1093/bioinformatics/btp163 pmid:19304878.
-
-This tool is available to install into other Galaxy Instances via the Galaxy
-Tool Shed at http://toolshed.g2.bx.psu.edu/view/peterjc/seq_filter_by_mapping
-    </help>
-</tool>