changeset 10:e504a2e18cba draft

Uploaded v0.0.5g, renamed folder and README file
author peterjc
date Thu, 26 Sep 2013 10:04:13 -0400
parents 7a12915b511f
children 308ec20faefb
files tools/filters/get_orfs_or_cdss.py tools/filters/get_orfs_or_cdss.rst tools/filters/get_orfs_or_cdss.xml tools/filters/repository_dependencies.xml tools/get_orfs_or_cdss/README.rst tools/get_orfs_or_cdss/get_orfs_or_cdss.py tools/get_orfs_or_cdss/get_orfs_or_cdss.xml tools/get_orfs_or_cdss/repository_dependencies.xml
diffstat 8 files changed, 527 insertions(+), 527 deletions(-) [+]
line wrap: on
line diff
--- a/tools/filters/get_orfs_or_cdss.py	Tue Sep 17 11:48:29 2013 -0400
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,223 +0,0 @@
-#!/usr/bin/env python
-"""Find ORFs in a nucleotide sequence file.
-
-get_orfs_or_cdss.py $input_fasta $input_format $table $ftype $ends $mode $min_len $strand $out_nuc_file $out_prot_file
-
-Takes ten command line options, input sequence filename, format, genetic
-code, CDS vs ORF, end type (open, closed), selection mode (all, top, one),
-minimum length (in amino acids), strand (both, forward, reverse), output
-nucleotide filename, and output protein filename.
-
-This tool is a short Python script which requires Biopython. If you use
-this tool in scientific work leading to a publication, please cite the
-Biopython application note:
-
-Cock et al 2009. Biopython: freely available Python tools for computational
-molecular biology and bioinformatics. Bioinformatics 25(11) 1422-3.
-http://dx.doi.org/10.1093/bioinformatics/btp163 pmid:19304878.
-
-This script is copyright 2011-2013 by Peter Cock, The James Hutton Institute
-(formerly SCRI), Dundee, UK. All rights reserved.
-
-See accompanying text file for licence details (MIT/BSD style).
-
-This is version 0.0.3 of the script.
-"""
-import sys
-import re
-
-if "-v" in sys.argv or "--version" in sys.argv:
-    print "v0.0.3"
-    sys.exit(0)
-
-def stop_err(msg, err=1):
-    sys.stderr.write(msg.rstrip() + "\n")
-    sys.exit(err)
-
-try:
-    from Bio.Seq import Seq, reverse_complement, translate
-    from Bio.SeqRecord import SeqRecord
-    from Bio import SeqIO
-    from Bio.Data import CodonTable
-except ImportError:
-    stop_err("Missing Biopython library")
-
-#Parse Command Line
-try:
-    input_file, seq_format, table, ftype, ends, mode, min_len, strand, out_nuc_file, out_prot_file = sys.argv[1:]
-except ValueError:
-    stop_err("Expected ten arguments, got %i:\n%s" % (len(sys.argv)-1, " ".join(sys.argv)))
-
-try:
-    table = int(table)
-except ValueError:
-    stop_err("Expected integer for genetic code table, got %s" % table)
-
-try:
-    table_obj = CodonTable.ambiguous_generic_by_id[table]
-except KeyError:
-    stop_err("Unknown codon table %i" % table)
-
-if ftype not in ["CDS", "ORF"]:
-    stop_err("Expected CDS or ORF, got %s" % ftype)
-
-if ends not in ["open", "closed"]:
-    stop_err("Expected open or closed for end treatment, got %s" % ends)
-
-try:
-    min_len = int(min_len)
-except ValueError:
-    stop_err("Expected integer for min_len, got %s" % min_len)
-
-if seq_format.lower()=="sff":
-    seq_format = "sff-trim"
-elif seq_format.lower()=="fasta":
-    seq_format = "fasta"
-elif seq_format.lower().startswith("fastq"):
-    seq_format = "fastq"
-else:
-    stop_err("Unsupported file type %r" % seq_format)
-
-print "Genetic code table %i" % table
-print "Minimum length %i aa" % min_len
-#print "Taking %s ORF(s) from %s strand(s)" % (mode, strand)
-
-starts = sorted(table_obj.start_codons)
-assert "NNN" not in starts
-re_starts = re.compile("|".join(starts))
-
-stops = sorted(table_obj.stop_codons)
-assert "NNN" not in stops
-re_stops = re.compile("|".join(stops))
-
-def start_chop_and_trans(s, strict=True):
-    """Returns offset, trimmed nuc, protein."""
-    if strict:
-        assert s[-3:] in stops, s
-    assert len(s) % 3 == 0
-    for match in re_starts.finditer(s):
-        #Must check the start is in frame
-        start = match.start()
-        if start % 3 == 0:
-            n = s[start:]
-            assert len(n) % 3 == 0, "%s is len %i" % (n, len(n))
-            if strict:
-                t = translate(n, table, cds=True)
-            else:
-                #Use when missing stop codon,
-                t = "M" + translate(n[3:], table, to_stop=True)
-            return start, n, t
-    return None, None, None
-
-def break_up_frame(s):
-    """Returns offset, nuc, protein."""
-    start = 0
-    for match in re_stops.finditer(s):
-        index = match.start() + 3
-        if index % 3 != 0:
-            continue
-        n = s[start:index]
-        if ftype=="CDS":
-            offset, n, t = start_chop_and_trans(n)
-        else:
-            offset = 0
-            t = translate(n, table, to_stop=True)
-        if n and len(t) >= min_len:
-            yield start + offset, n, t
-        start = index
-    if ends == "open":
-        #No stop codon, Biopython's strict CDS translate will fail
-        n = s[start:]
-        #Ensure we have whole codons
-        #TODO - Try appending N instead?
-        #TODO - Do the next four lines more elegantly
-        if len(n) % 3:
-            n = n[:-1]
-        if len(n) % 3:
-            n = n[:-1]
-        if ftype=="CDS":
-            offset, n, t = start_chop_and_trans(n, strict=False)
-        else:
-            offset = 0
-            t = translate(n, table, to_stop=True)
-        if n and len(t) >= min_len:
-            yield start + offset, n, t
-                        
-
-def get_all_peptides(nuc_seq):
-    """Returns start, end, strand, nucleotides, protein.
-
-    Co-ordinates are Python style zero-based.
-    """
-    #TODO - Refactor to use a generator function (in start order)
-    #rather than making a list and sorting?
-    answer = []
-    full_len = len(nuc_seq)
-    if strand != "reverse":
-        for frame in range(0,3):
-            for offset, n, t in break_up_frame(nuc_seq[frame:]):
-                start = frame + offset #zero based
-                answer.append((start, start + len(n), +1, n, t))
-    if strand != "forward":
-        rc = reverse_complement(nuc_seq)
-        for frame in range(0,3) :
-            for offset, n, t in break_up_frame(rc[frame:]):
-                start = full_len - frame - offset #zero based
-                answer.append((start - len(n), start, -1, n ,t))
-    answer.sort()
-    return answer
-
-def get_top_peptides(nuc_seq):
-    """Returns all peptides of max length."""
-    values = list(get_all_peptides(nuc_seq))
-    if not values:
-        raise StopIteration
-    max_len = max(len(x[-1]) for x in values)
-    for x in values:
-        if len(x[-1]) == max_len:
-            yield x
-
-def get_one_peptide(nuc_seq):
-    """Returns first (left most) peptide with max length."""
-    values = list(get_top_peptides(nuc_seq))
-    if not values:
-        raise StopIteration
-    yield values[0]
-
-if mode == "all":
-    get_peptides = get_all_peptides
-elif mode == "top":
-    get_peptides = get_top_peptides
-elif mode == "one":
-    get_peptides = get_one_peptide
-
-in_count = 0
-out_count = 0
-if out_nuc_file == "-":
-    out_nuc = sys.stdout
-else:
-    out_nuc = open(out_nuc_file, "w")
-if out_prot_file == "-":
-    out_prot = sys.stdout
-else:
-    out_prot = open(out_prot_file, "w")
-for record in SeqIO.parse(input_file, seq_format):
-    for i, (f_start, f_end, f_strand, n, t) in enumerate(get_peptides(str(record.seq).upper())):
-        out_count += 1
-        if f_strand == +1:
-            loc = "%i..%i" % (f_start+1, f_end)
-        else:
-            loc = "complement(%i..%i)" % (f_start+1, f_end)
-        descr = "length %i aa, %i bp, from %s of %s" \
-                % (len(t), len(n), loc, record.description)
-        r = SeqRecord(Seq(n), id = record.id + "|%s%i" % (ftype, i+1), name = "", description= descr)
-        t = SeqRecord(Seq(t), id = record.id + "|%s%i" % (ftype, i+1), name = "", description= descr)
-        SeqIO.write(r, out_nuc, "fasta")
-        SeqIO.write(t, out_prot, "fasta")
-    in_count += 1
-if out_nuc is not sys.stdout:
-    out_nuc.close()
-if out_prot is not sys.stdout:
-    out_prot.close()
-
-print "Found %i %ss in %i sequences" % (out_count, ftype, in_count)
--- a/tools/filters/get_orfs_or_cdss.rst	Tue Sep 17 11:48:29 2013 -0400
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,125 +0,0 @@
-Galaxy tool to find ORFs or simple CDSs
-=======================================
-
-This tool is copyright 2011-2013 by Peter Cock, The James Hutton Institute
-(formerly SCRI, Scottish Crop Research Institute), UK. All rights reserved.
-See the licence text below (MIT licence).
-
-This tool is a short Python script (using Biopython library functions)
-to search nucleotide sequences for open reading frames (ORFs) or coding
-sequences (CDSs) where the first potential start codon is used. See the
-help text in the XML file for more information.
-
-This tool is available from the Galaxy Tool Shed at:
-
-* http://toolshed.g2.bx.psu.edu/view/peterjc/get_orfs_or_cdss
-
-See also the EMBOSS tool ``getorf`` which offers similar functionality and
-has also been wrapped for use within Galaxy.
-
-
-Automated Installation
-======================
-
-This should be straightforward using the Galaxy Tool Shed, which should be
-able to automatically install the dependency on Biopython, and then install
-this tool and run its unit tests.
-
-
-Manual Installation
-===================
-
-There are just two files to install to use this tool from within Galaxy:
-
-* get_orfs_or_cdss.py (the Python script)
-* get_orfs_or_cdss.xml (the Galaxy tool definition)
-
-If you are installing this manually (rather than via the Tool Shed), the
-suggested location is in the Galaxy folder tools/filters next to the tool
-for calling sff_extract.py for converting SFF to FASTQ or FASTA + QUAL.
-You will also need to modify the tools_conf.xml file to tell Galaxy to offer the
-tool. One suggested location is in the filters section. Simply add the line::
-
-    <tool file="filters/get_orfs_or_cdss.xml" />
-
-You will also need to install Biopython 1.54 or later. If you want to run
-the unit tests, include this line in tools_conf.xml.sample and the sample
-FASTA files under the test-data directory. Then::
-
-    ./run_functional_tests.sh -id get_orfs_or_cdss
-
-That's it.
-
-
-History
-=======
-
-======= ======================================================================
-Version Changes
-------- ----------------------------------------------------------------------
-v0.0.1   - Initial version.
-v0.0.2   - Correct labelling issue on reverse strand.
-         - Use the new <stdio> settings in the XML wrappers to catch errors
-v0.0.3   - Include unit tests.
-         - Record Python script version when run from Galaxy.
-v0.0.4   - Link to Tool Shed added to help text and this documentation.
-v0.0.5   - Automated intallation of the Biopython dependency.
-         - Use reStructuredText for this README file.
-         - Adopt standard MIT License.
-         - Updated citation information (Cock et al. 2013).
-======= ======================================================================
-
-
-Developers
-==========
-
-This script and related tools were initially developed on the following hg branch:
-http://bitbucket.org/peterjc/galaxy-central/src/tools
-
-Development has now moved to a dedicated GitHub repository:
-https://github.com/peterjc/pico_galaxy/tree/master/tools
-
-For making the "Galaxy Tool Shed" http://toolshed.g2.bx.psu.edu/ tarball use
-the following command from the Galaxy root folder::
-
-    $ tar -czf get_orfs_or_cdss.tar.gz tools/filters/get_orfs_or_cdss.* tools/filters/repository_dependencies.xml test-data/get_orf_input*.fasta test-data/Ssuis.fasta
-
-Check this worked::
-
-    $ tar -tzf get_orfs_or_cdss.tar.gz
-    filter/get_orfs_or_cdss.py
-    filter/get_orfs_or_cdss.rst
-    filter/get_orfs_or_cdss.xml
-    tools/filters/repository_dependencies.xml
-    test-data/get_orf_input.fasta
-    test-data/get_orf_input.Suis_ORF.nuc.fasta
-    test-data/get_orf_input.Suis_ORF.prot.fasta
-    test-data/get_orf_input.t11_nuc_out.fasta
-    test-data/get_orf_input.t11_open_nuc_out.fasta
-    test-data/get_orf_input.t11_open_prot_out.fasta
-    test-data/get_orf_input.t11_prot_out.fasta
-    test-data/get_orf_input.t1_nuc_out.fasta
-    test-data/get_orf_input.t1_prot_out.fasta
-    test-data/Ssuis.fasta
-
-
-Licence (MIT)
-=============
-
-Permission is hereby granted, free of charge, to any person obtaining a copy
-of this software and associated documentation files (the "Software"), to deal
-in the Software without restriction, including without limitation the rights
-to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-copies of the Software, and to permit persons to whom the Software is
-furnished to do so, subject to the following conditions:
-
-The above copyright notice and this permission notice shall be included in
-all copies or substantial portions of the Software.
-
-THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
-THE SOFTWARE.
--- a/tools/filters/get_orfs_or_cdss.xml	Tue Sep 17 11:48:29 2013 -0400
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,173 +0,0 @@
-<tool id="get_orfs_or_cdss" name="Get open reading frames (ORFs) or coding sequences (CDSs)" version="0.0.5">
-	<description>e.g. to get peptides from ESTs</description>
-	<version_command interpreter="python">get_orfs_or_cdss.py --version</version_command>
-	<command interpreter="python">
-get_orfs_or_cdss.py $input_file $input_file.ext $table $ftype $ends $mode $min_len $strand $out_nuc_file $out_prot_file
-	</command>
-	<stdio>
-		<!-- Anything other than zero is an error -->
-		<exit_code range="1:" />
-		<exit_code range=":-1" />
-	</stdio>
-	<inputs>
-		<param name="input_file" type="data" format="fasta,fastq,sff" label="Sequence file (nucleotides)" help="FASTA, FASTQ, or SFF format." />
-		<param name="table" type="select" label="Genetic code" help="Tables from the NCBI, these determine the start and stop codons">
-			<option value="1">1. Standard</option>
-			<option value="2">2. Vertebrate Mitochondrial</option>
-			<option value="3">3. Yeast Mitochondrial</option>
-			<option value="4">4. Mold, Protozoan, Coelenterate Mitochondrial and Mycoplasma/Spiroplasma</option>
-			<option value="5">5. Invertebrate Mitochondrial</option>
-			<option value="6">6. Ciliate Macronuclear and Dasycladacean</option>
-			<option value="9">9. Echinoderm Mitochondrial</option>
-			<option value="10">10. Euplotid Nuclear</option>
-			<option value="11">11. Bacterial</option>
-			<option value="12">12. Alternative Yeast Nuclear</option>
-			<option value="13">13. Ascidian Mitochondrial</option>
-			<option value="14">14. Flatworm Mitochondrial</option>
-			<option value="15">15. Blepharisma Macronuclear</option>
-			<option value="16">16. Chlorophycean Mitochondrial</option>
-			<option value="21">21. Trematode Mitochondrial</option>
-			<option value="22">22. Scenedesmus obliquus</option>
-			<option value="23">23. Thraustochytrium Mitochondrial</option>
-		</param>
-		<param name="ftype" type="select" value="True" label="Look for ORFs or CDSs">
-                        <option value="ORF">Look for ORFs (check for stop codons only, ignore start codons)</option>
-                        <option value="CDS">Look for CDSs (with start and stop codons)</option>
-		</param>
-                <param name="ends" type="select" value="open" label="Sequence end treatment">
-			<option value="open">Open ended (will allow missing start/stop codons at the ends)</option>
-                        <option value="closed">Complete (will check for start/stop codons at the ends)</option>
-                        <!-- TODO? Circular, for using this on finished bacteria etc -->
-                </param>
-
-		<param name="mode" type="select" label="Selection criteria" help="Suppose a sequence has ORFs/CDSs of lengths 100, 102 and 102 -- which should be taken? These options would return 3, 2 or 1 ORF.">
-                    <option value="all">All ORFs/CDSs from each sequence</option>
-                    <option value="top">All ORFs/CDSs from each sequence with the maximum length</option>
-                    <option value="one">First ORF/CDS from each sequence with the maximum length</option>
-		</param>
-                <param name="min_len" type="integer" size="5" value="30" label="Minimum length ORF/CDS (in amino acids, e.g. 30 aa = 90 bp plus any stop codon)">
-                </param>
-                <param name="strand" type="select" label="Strand to search" help="Use the forward only option if your sequence directionality is known (e.g. from poly-A tails, or strand specific RNA sequencing.">
-                    <option value="both">Search both the forward and reverse strand</option>
-                    <option value="forward">Only search the forward strand</option>
-                    <option value="reverse">Only search the reverse strand</option>
-                </param>
-	</inputs>
-	<outputs>
-		<data name="out_nuc_file" format="fasta" label="${ftype.value}s (nucleotides)" />
-		<data name="out_prot_file" format="fasta" label="${ftype.value}s (amino acids)" />
-	</outputs>
-	<tests>
-                <test>
-                        <param name="input_file" value="get_orf_input.fasta" />
-                        <param name="table" value="1" />
-                        <param name="ftype" value="CDS" />
-                        <param name="ends" value="open" />
-                        <param name="mode" value="all" />
-                        <param name="min_len" value="10" />
-                        <param name="strand" value="forward" />
-                        <output name="out_nuc_file" file="get_orf_input.t1_nuc_out.fasta" />
-                        <output name="out_prot_file" file="get_orf_input.t1_prot_out.fasta" />
-                </test>
-		<test>
-			<param name="input_file" value="get_orf_input.fasta" />
-			<param name="table" value="11" />
-			<param name="ftype" value="CDS" />
-			<param name="ends" value="closed" />
-			<param name="mode" value="all" />
-			<param name="min_len" value="10" />
-			<param name="strand" value="forward" />
-			<output name="out_nuc_file" file="get_orf_input.t11_nuc_out.fasta" />
-			<output	name="out_prot_file" file="get_orf_input.t11_prot_out.fasta" />
-		</test>
-		<test>
-                        <param name="input_file" value="get_orf_input.fasta" />
-                        <param name="table" value="11" />
-                        <param name="ftype" value="CDS" />
-                        <param name="ends" value="open" />
-                        <param name="mode" value="all" />
-                        <param name="min_len" value="10" />
-                        <param name="strand" value="forward" />
-                        <output name="out_nuc_file" file="get_orf_input.t11_open_nuc_out.fasta" />
-                        <output name="out_prot_file" file="get_orf_input.t11_open_prot_out.fasta" />
-		</test>
-                <test>
-			<param name="input_file" value="Ssuis.fasta" />
-			<param name="table" value="11" />
-			<param name="ftype" value="ORF" />
-			<param name="ends" value="open" />
-			<param name="mode" value="all" />
-			<param name="min_len" value="100" />
-			<param name="strand" value="both" />
-			<output name="out_nuc_file" file="get_orf_input.Suis_ORF.nuc.fasta" />
-			<output name="out_prot_file" file="get_orf_input.Suis_ORF.prot.fasta" />
-		</test>
-	</tests>
-	<requirements>
-		<requirement type="python-module">Bio</requirement>
-	</requirements>
-	<help>
-
-**What it does**
-
-Takes an input file of nucleotide sequences (typically FASTA, but also FASTQ
-and Standard Flowgram Format (SFF) are supported), and searches each sequence
-for open reading frames (ORFs) or potential coding sequences (CDSs) of the
-given minimum length. These are returned as FASTA files of nucleotides and
-protein sequences.
-
-You can choose to have all the ORFs/CDSs above the minimum length for each
-sequence (similar to the EMBOSS getorf tool), those with the longest length
-equal, or the first ORF/CDS with the longest length (in the special case
-where a sequence encodes two or more long ORFs/CDSs of the same length). The
-last option is a reasonable choice when the input sequences represent EST or
-mRNA sequences, where only one ORF/CDS is expected.
-
-Note that if no ORFs/CDSs in a sequence match the criteria, there will be no
-output for that sequence.
-
-Also note that the ORFs/CDSs are assigned modified identifiers to distinguish
-them from the original full length sequences, by appending a suffix.
-
-The start and stop codons are taken from the `NCBI Genetic Codes
-&lt;http://www.ncbi.nlm.nih.gov/Taxonomy/Utils/wprintgc.cgi&gt;`_.
-When searching for ORFs, the sequences will run from stop codon to stop
-codon, and any start codons are ignored. When searching for CDSs, the first
-potential start codon will be used, giving the longest possible CDS within
-each ORF, and thus the longest possible protein sequence. This is useful
-for things like BLAST or domain searching, but since this may not be the
-correct start codon may not be appropriate for signal peptide detection
-etc.
-
-**Example Usage**
-
-Given some EST sequences (Sanger capillary reads) assembled into unigenes,
-or a transcriptome assembly from some RNA-Seq, each of your nucleotide
-sequences should (barring sequencing, assembly errors, frame-shifts etc)
-encode one protein as a single ORF/CDS, which you wish to extract (and
-perhaps translate into amino acids).
-
-If your RNS-Seq data was strand specific, and assembled taking this into
-account, you should only search for ORFs/CDSs on the forward strand.
-
-**Citation**
-
-If you use this Galaxy tool in work leading to a scientific publication please
-cite the following paper:
-
-Peter J.A. Cock, Björn A. Grüning, Konrad Paszkiewicz and Leighton Pritchard (2013).
-Galaxy tools and workflows for sequence analysis with applications
-in molecular plant pathology. PeerJ 1:e167
-http://dx.doi.org/10.7717/peerj.167
-
-This tool uses Biopython, so you may also wish to cite the Biopython
-application note (and Galaxy too of course):
-
-Cock et al (2009). Biopython: freely available Python tools for computational
-molecular biology and bioinformatics. Bioinformatics 25(11) 1422-3.
-http://dx.doi.org/10.1093/bioinformatics/btp163 pmid:19304878.
-
-This tool is available to install into other Galaxy Instances via the Galaxy
-Tool Shed at http://toolshed.g2.bx.psu.edu/view/peterjc/get_orfs_or_cdss
-	</help>
-</tool>
--- a/tools/filters/repository_dependencies.xml	Tue Sep 17 11:48:29 2013 -0400
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,6 +0,0 @@
-<?xml version="1.0"?>
-<repositories description="This requires Biopython as a dependency.">
-<!-- Leave out the tool shed and revision to get the current
-     tool shed and latest revision at the time of upload -->
-<repository changeset_revision="2f6c871cfa35" name="package_biopython_1_61" owner="biopython" toolshed="http://testtoolshed.g2.bx.psu.edu" />
-</repositories>
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/get_orfs_or_cdss/README.rst	Thu Sep 26 10:04:13 2013 -0400
@@ -0,0 +1,125 @@
+Galaxy tool to find ORFs or simple CDSs
+=======================================
+
+This tool is copyright 2011-2013 by Peter Cock, The James Hutton Institute
+(formerly SCRI, Scottish Crop Research Institute), UK. All rights reserved.
+See the licence text below (MIT licence).
+
+This tool is a short Python script (using Biopython library functions)
+to search nucleotide sequences for open reading frames (ORFs) or coding
+sequences (CDSs) where the first potential start codon is used. See the
+help text in the XML file for more information.
+
+This tool is available from the Galaxy Tool Shed at:
+
+* http://toolshed.g2.bx.psu.edu/view/peterjc/get_orfs_or_cdss
+
+See also the EMBOSS tool ``getorf`` which offers similar functionality and
+has also been wrapped for use within Galaxy.
+
+
+Automated Installation
+======================
+
+This should be straightforward using the Galaxy Tool Shed, which should be
+able to automatically install the dependency on Biopython, and then install
+this tool and run its unit tests.
+
+
+Manual Installation
+===================
+
+There are just two files to install to use this tool from within Galaxy:
+
+* get_orfs_or_cdss.py (the Python script)
+* get_orfs_or_cdss.xml (the Galaxy tool definition)
+
+The suggested location is in a dedicated tools/get_orfs_or_cdss folder.
+
+You will also need to modify the tools_conf.xml file to tell Galaxy to offer the
+tool. One suggested location is in the filters section. Simply add the line::
+
+    <tool file="get_orfs_or_cdss/get_orfs_or_cdss.xml" />
+
+You will also need to install Biopython 1.54 or later. If you want to run
+the unit tests, include this line in tools_conf.xml.sample and the sample
+FASTA files under the test-data directory. Then::
+
+    ./run_functional_tests.sh -id get_orfs_or_cdss
+
+That's it.
+
+
+History
+=======
+
+======= ======================================================================
+Version Changes
+------- ----------------------------------------------------------------------
+v0.0.1  - Initial version.
+v0.0.2  - Correct labelling issue on reverse strand.
+        - Use the new <stdio> settings in the XML wrappers to catch errors
+v0.0.3  - Include unit tests.
+        - Record Python script version when run from Galaxy.
+v0.0.4  - Link to Tool Shed added to help text and this documentation.
+v0.0.5  - Automated intallation of the Biopython dependency.
+        - Use reStructuredText for this README file.
+        - Adopt standard MIT License.
+        - Updated citation information (Cock et al. 2013).
+        - Renamed folder and adopted README.rst naming.
+======= ======================================================================
+
+
+Developers
+==========
+
+This script and related tools were initially developed on the following hg branch:
+http://bitbucket.org/peterjc/galaxy-central/src/tools
+
+Development has now moved to a dedicated GitHub repository:
+https://github.com/peterjc/pico_galaxy/tree/master/tools
+
+For making the "Galaxy Tool Shed" http://toolshed.g2.bx.psu.edu/ tarball use
+the following command from the Galaxy root folder::
+
+    $ tar -czf get_orfs_or_cdss.tar.gz tools/get_orfs_or_cdss/README.rst tools/get_orfs_or_cdss/get_orfs_or_cdss.* tools/get_orfs_or_cdss/repository_dependencies.xml test-data/get_orf_input*.fasta test-data/Ssuis.fasta
+
+Check this worked::
+
+    $ tar -tzf get_orfs_or_cdss.tar.gz
+    tools/get_orfs_or_cdss/README.rst
+    tools/get_orfs_or_cdss/get_orfs_or_cdss.py
+    tools/get_orfs_or_cdss/get_orfs_or_cdss.xml
+    tools/get_orfs_or_cdss/repository_dependencies.xml
+    test-data/get_orf_input.fasta
+    test-data/get_orf_input.Suis_ORF.nuc.fasta
+    test-data/get_orf_input.Suis_ORF.prot.fasta
+    test-data/get_orf_input.t11_nuc_out.fasta
+    test-data/get_orf_input.t11_open_nuc_out.fasta
+    test-data/get_orf_input.t11_open_prot_out.fasta
+    test-data/get_orf_input.t11_prot_out.fasta
+    test-data/get_orf_input.t1_nuc_out.fasta
+    test-data/get_orf_input.t1_prot_out.fasta
+    test-data/Ssuis.fasta
+
+
+Licence (MIT)
+=============
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/get_orfs_or_cdss/get_orfs_or_cdss.py	Thu Sep 26 10:04:13 2013 -0400
@@ -0,0 +1,223 @@
+#!/usr/bin/env python
+"""Find ORFs in a nucleotide sequence file.
+
+get_orfs_or_cdss.py $input_fasta $input_format $table $ftype $ends $mode $min_len $strand $out_nuc_file $out_prot_file
+
+Takes ten command line options, input sequence filename, format, genetic
+code, CDS vs ORF, end type (open, closed), selection mode (all, top, one),
+minimum length (in amino acids), strand (both, forward, reverse), output
+nucleotide filename, and output protein filename.
+
+This tool is a short Python script which requires Biopython. If you use
+this tool in scientific work leading to a publication, please cite the
+Biopython application note:
+
+Cock et al 2009. Biopython: freely available Python tools for computational
+molecular biology and bioinformatics. Bioinformatics 25(11) 1422-3.
+http://dx.doi.org/10.1093/bioinformatics/btp163 pmid:19304878.
+
+This script is copyright 2011-2013 by Peter Cock, The James Hutton Institute
+(formerly SCRI), Dundee, UK. All rights reserved.
+
+See accompanying text file for licence details (MIT/BSD style).
+
+This is version 0.0.3 of the script.
+"""
+import sys
+import re
+
+if "-v" in sys.argv or "--version" in sys.argv:
+    print "v0.0.3"
+    sys.exit(0)
+
+def stop_err(msg, err=1):
+    sys.stderr.write(msg.rstrip() + "\n")
+    sys.exit(err)
+
+try:
+    from Bio.Seq import Seq, reverse_complement, translate
+    from Bio.SeqRecord import SeqRecord
+    from Bio import SeqIO
+    from Bio.Data import CodonTable
+except ImportError:
+    stop_err("Missing Biopython library")
+
+#Parse Command Line
+try:
+    input_file, seq_format, table, ftype, ends, mode, min_len, strand, out_nuc_file, out_prot_file = sys.argv[1:]
+except ValueError:
+    stop_err("Expected ten arguments, got %i:\n%s" % (len(sys.argv)-1, " ".join(sys.argv)))
+
+try:
+    table = int(table)
+except ValueError:
+    stop_err("Expected integer for genetic code table, got %s" % table)
+
+try:
+    table_obj = CodonTable.ambiguous_generic_by_id[table]
+except KeyError:
+    stop_err("Unknown codon table %i" % table)
+
+if ftype not in ["CDS", "ORF"]:
+    stop_err("Expected CDS or ORF, got %s" % ftype)
+
+if ends not in ["open", "closed"]:
+    stop_err("Expected open or closed for end treatment, got %s" % ends)
+
+try:
+    min_len = int(min_len)
+except ValueError:
+    stop_err("Expected integer for min_len, got %s" % min_len)
+
+if seq_format.lower()=="sff":
+    seq_format = "sff-trim"
+elif seq_format.lower()=="fasta":
+    seq_format = "fasta"
+elif seq_format.lower().startswith("fastq"):
+    seq_format = "fastq"
+else:
+    stop_err("Unsupported file type %r" % seq_format)
+
+print "Genetic code table %i" % table
+print "Minimum length %i aa" % min_len
+#print "Taking %s ORF(s) from %s strand(s)" % (mode, strand)
+
+starts = sorted(table_obj.start_codons)
+assert "NNN" not in starts
+re_starts = re.compile("|".join(starts))
+
+stops = sorted(table_obj.stop_codons)
+assert "NNN" not in stops
+re_stops = re.compile("|".join(stops))
+
+def start_chop_and_trans(s, strict=True):
+    """Returns offset, trimmed nuc, protein."""
+    if strict:
+        assert s[-3:] in stops, s
+    assert len(s) % 3 == 0
+    for match in re_starts.finditer(s):
+        #Must check the start is in frame
+        start = match.start()
+        if start % 3 == 0:
+            n = s[start:]
+            assert len(n) % 3 == 0, "%s is len %i" % (n, len(n))
+            if strict:
+                t = translate(n, table, cds=True)
+            else:
+                #Use when missing stop codon,
+                t = "M" + translate(n[3:], table, to_stop=True)
+            return start, n, t
+    return None, None, None
+
+def break_up_frame(s):
+    """Returns offset, nuc, protein."""
+    start = 0
+    for match in re_stops.finditer(s):
+        index = match.start() + 3
+        if index % 3 != 0:
+            continue
+        n = s[start:index]
+        if ftype=="CDS":
+            offset, n, t = start_chop_and_trans(n)
+        else:
+            offset = 0
+            t = translate(n, table, to_stop=True)
+        if n and len(t) >= min_len:
+            yield start + offset, n, t
+        start = index
+    if ends == "open":
+        #No stop codon, Biopython's strict CDS translate will fail
+        n = s[start:]
+        #Ensure we have whole codons
+        #TODO - Try appending N instead?
+        #TODO - Do the next four lines more elegantly
+        if len(n) % 3:
+            n = n[:-1]
+        if len(n) % 3:
+            n = n[:-1]
+        if ftype=="CDS":
+            offset, n, t = start_chop_and_trans(n, strict=False)
+        else:
+            offset = 0
+            t = translate(n, table, to_stop=True)
+        if n and len(t) >= min_len:
+            yield start + offset, n, t
+                        
+
+def get_all_peptides(nuc_seq):
+    """Returns start, end, strand, nucleotides, protein.
+
+    Co-ordinates are Python style zero-based.
+    """
+    #TODO - Refactor to use a generator function (in start order)
+    #rather than making a list and sorting?
+    answer = []
+    full_len = len(nuc_seq)
+    if strand != "reverse":
+        for frame in range(0,3):
+            for offset, n, t in break_up_frame(nuc_seq[frame:]):
+                start = frame + offset #zero based
+                answer.append((start, start + len(n), +1, n, t))
+    if strand != "forward":
+        rc = reverse_complement(nuc_seq)
+        for frame in range(0,3) :
+            for offset, n, t in break_up_frame(rc[frame:]):
+                start = full_len - frame - offset #zero based
+                answer.append((start - len(n), start, -1, n ,t))
+    answer.sort()
+    return answer
+
+def get_top_peptides(nuc_seq):
+    """Returns all peptides of max length."""
+    values = list(get_all_peptides(nuc_seq))
+    if not values:
+        raise StopIteration
+    max_len = max(len(x[-1]) for x in values)
+    for x in values:
+        if len(x[-1]) == max_len:
+            yield x
+
+def get_one_peptide(nuc_seq):
+    """Returns first (left most) peptide with max length."""
+    values = list(get_top_peptides(nuc_seq))
+    if not values:
+        raise StopIteration
+    yield values[0]
+
+if mode == "all":
+    get_peptides = get_all_peptides
+elif mode == "top":
+    get_peptides = get_top_peptides
+elif mode == "one":
+    get_peptides = get_one_peptide
+
+in_count = 0
+out_count = 0
+if out_nuc_file == "-":
+    out_nuc = sys.stdout
+else:
+    out_nuc = open(out_nuc_file, "w")
+if out_prot_file == "-":
+    out_prot = sys.stdout
+else:
+    out_prot = open(out_prot_file, "w")
+for record in SeqIO.parse(input_file, seq_format):
+    for i, (f_start, f_end, f_strand, n, t) in enumerate(get_peptides(str(record.seq).upper())):
+        out_count += 1
+        if f_strand == +1:
+            loc = "%i..%i" % (f_start+1, f_end)
+        else:
+            loc = "complement(%i..%i)" % (f_start+1, f_end)
+        descr = "length %i aa, %i bp, from %s of %s" \
+                % (len(t), len(n), loc, record.description)
+        r = SeqRecord(Seq(n), id = record.id + "|%s%i" % (ftype, i+1), name = "", description= descr)
+        t = SeqRecord(Seq(t), id = record.id + "|%s%i" % (ftype, i+1), name = "", description= descr)
+        SeqIO.write(r, out_nuc, "fasta")
+        SeqIO.write(t, out_prot, "fasta")
+    in_count += 1
+if out_nuc is not sys.stdout:
+    out_nuc.close()
+if out_prot is not sys.stdout:
+    out_prot.close()
+
+print "Found %i %ss in %i sequences" % (out_count, ftype, in_count)
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/get_orfs_or_cdss/get_orfs_or_cdss.xml	Thu Sep 26 10:04:13 2013 -0400
@@ -0,0 +1,173 @@
+<tool id="get_orfs_or_cdss" name="Get open reading frames (ORFs) or coding sequences (CDSs)" version="0.0.5">
+	<description>e.g. to get peptides from ESTs</description>
+	<version_command interpreter="python">get_orfs_or_cdss.py --version</version_command>
+	<command interpreter="python">
+get_orfs_or_cdss.py $input_file $input_file.ext $table $ftype $ends $mode $min_len $strand $out_nuc_file $out_prot_file
+	</command>
+	<stdio>
+		<!-- Anything other than zero is an error -->
+		<exit_code range="1:" />
+		<exit_code range=":-1" />
+	</stdio>
+	<inputs>
+		<param name="input_file" type="data" format="fasta,fastq,sff" label="Sequence file (nucleotides)" help="FASTA, FASTQ, or SFF format." />
+		<param name="table" type="select" label="Genetic code" help="Tables from the NCBI, these determine the start and stop codons">
+			<option value="1">1. Standard</option>
+			<option value="2">2. Vertebrate Mitochondrial</option>
+			<option value="3">3. Yeast Mitochondrial</option>
+			<option value="4">4. Mold, Protozoan, Coelenterate Mitochondrial and Mycoplasma/Spiroplasma</option>
+			<option value="5">5. Invertebrate Mitochondrial</option>
+			<option value="6">6. Ciliate Macronuclear and Dasycladacean</option>
+			<option value="9">9. Echinoderm Mitochondrial</option>
+			<option value="10">10. Euplotid Nuclear</option>
+			<option value="11">11. Bacterial</option>
+			<option value="12">12. Alternative Yeast Nuclear</option>
+			<option value="13">13. Ascidian Mitochondrial</option>
+			<option value="14">14. Flatworm Mitochondrial</option>
+			<option value="15">15. Blepharisma Macronuclear</option>
+			<option value="16">16. Chlorophycean Mitochondrial</option>
+			<option value="21">21. Trematode Mitochondrial</option>
+			<option value="22">22. Scenedesmus obliquus</option>
+			<option value="23">23. Thraustochytrium Mitochondrial</option>
+		</param>
+		<param name="ftype" type="select" value="True" label="Look for ORFs or CDSs">
+                        <option value="ORF">Look for ORFs (check for stop codons only, ignore start codons)</option>
+                        <option value="CDS">Look for CDSs (with start and stop codons)</option>
+		</param>
+                <param name="ends" type="select" value="open" label="Sequence end treatment">
+			<option value="open">Open ended (will allow missing start/stop codons at the ends)</option>
+                        <option value="closed">Complete (will check for start/stop codons at the ends)</option>
+                        <!-- TODO? Circular, for using this on finished bacteria etc -->
+                </param>
+
+		<param name="mode" type="select" label="Selection criteria" help="Suppose a sequence has ORFs/CDSs of lengths 100, 102 and 102 -- which should be taken? These options would return 3, 2 or 1 ORF.">
+                    <option value="all">All ORFs/CDSs from each sequence</option>
+                    <option value="top">All ORFs/CDSs from each sequence with the maximum length</option>
+                    <option value="one">First ORF/CDS from each sequence with the maximum length</option>
+		</param>
+                <param name="min_len" type="integer" size="5" value="30" label="Minimum length ORF/CDS (in amino acids, e.g. 30 aa = 90 bp plus any stop codon)">
+                </param>
+                <param name="strand" type="select" label="Strand to search" help="Use the forward only option if your sequence directionality is known (e.g. from poly-A tails, or strand specific RNA sequencing.">
+                    <option value="both">Search both the forward and reverse strand</option>
+                    <option value="forward">Only search the forward strand</option>
+                    <option value="reverse">Only search the reverse strand</option>
+                </param>
+	</inputs>
+	<outputs>
+		<data name="out_nuc_file" format="fasta" label="${ftype.value}s (nucleotides)" />
+		<data name="out_prot_file" format="fasta" label="${ftype.value}s (amino acids)" />
+	</outputs>
+	<tests>
+                <test>
+                        <param name="input_file" value="get_orf_input.fasta" />
+                        <param name="table" value="1" />
+                        <param name="ftype" value="CDS" />
+                        <param name="ends" value="open" />
+                        <param name="mode" value="all" />
+                        <param name="min_len" value="10" />
+                        <param name="strand" value="forward" />
+                        <output name="out_nuc_file" file="get_orf_input.t1_nuc_out.fasta" />
+                        <output name="out_prot_file" file="get_orf_input.t1_prot_out.fasta" />
+                </test>
+		<test>
+			<param name="input_file" value="get_orf_input.fasta" />
+			<param name="table" value="11" />
+			<param name="ftype" value="CDS" />
+			<param name="ends" value="closed" />
+			<param name="mode" value="all" />
+			<param name="min_len" value="10" />
+			<param name="strand" value="forward" />
+			<output name="out_nuc_file" file="get_orf_input.t11_nuc_out.fasta" />
+			<output	name="out_prot_file" file="get_orf_input.t11_prot_out.fasta" />
+		</test>
+		<test>
+                        <param name="input_file" value="get_orf_input.fasta" />
+                        <param name="table" value="11" />
+                        <param name="ftype" value="CDS" />
+                        <param name="ends" value="open" />
+                        <param name="mode" value="all" />
+                        <param name="min_len" value="10" />
+                        <param name="strand" value="forward" />
+                        <output name="out_nuc_file" file="get_orf_input.t11_open_nuc_out.fasta" />
+                        <output name="out_prot_file" file="get_orf_input.t11_open_prot_out.fasta" />
+		</test>
+                <test>
+			<param name="input_file" value="Ssuis.fasta" />
+			<param name="table" value="11" />
+			<param name="ftype" value="ORF" />
+			<param name="ends" value="open" />
+			<param name="mode" value="all" />
+			<param name="min_len" value="100" />
+			<param name="strand" value="both" />
+			<output name="out_nuc_file" file="get_orf_input.Suis_ORF.nuc.fasta" />
+			<output name="out_prot_file" file="get_orf_input.Suis_ORF.prot.fasta" />
+		</test>
+	</tests>
+	<requirements>
+		<requirement type="python-module">Bio</requirement>
+	</requirements>
+	<help>
+
+**What it does**
+
+Takes an input file of nucleotide sequences (typically FASTA, but also FASTQ
+and Standard Flowgram Format (SFF) are supported), and searches each sequence
+for open reading frames (ORFs) or potential coding sequences (CDSs) of the
+given minimum length. These are returned as FASTA files of nucleotides and
+protein sequences.
+
+You can choose to have all the ORFs/CDSs above the minimum length for each
+sequence (similar to the EMBOSS getorf tool), those with the longest length
+equal, or the first ORF/CDS with the longest length (in the special case
+where a sequence encodes two or more long ORFs/CDSs of the same length). The
+last option is a reasonable choice when the input sequences represent EST or
+mRNA sequences, where only one ORF/CDS is expected.
+
+Note that if no ORFs/CDSs in a sequence match the criteria, there will be no
+output for that sequence.
+
+Also note that the ORFs/CDSs are assigned modified identifiers to distinguish
+them from the original full length sequences, by appending a suffix.
+
+The start and stop codons are taken from the `NCBI Genetic Codes
+&lt;http://www.ncbi.nlm.nih.gov/Taxonomy/Utils/wprintgc.cgi&gt;`_.
+When searching for ORFs, the sequences will run from stop codon to stop
+codon, and any start codons are ignored. When searching for CDSs, the first
+potential start codon will be used, giving the longest possible CDS within
+each ORF, and thus the longest possible protein sequence. This is useful
+for things like BLAST or domain searching, but since this may not be the
+correct start codon may not be appropriate for signal peptide detection
+etc.
+
+**Example Usage**
+
+Given some EST sequences (Sanger capillary reads) assembled into unigenes,
+or a transcriptome assembly from some RNA-Seq, each of your nucleotide
+sequences should (barring sequencing, assembly errors, frame-shifts etc)
+encode one protein as a single ORF/CDS, which you wish to extract (and
+perhaps translate into amino acids).
+
+If your RNS-Seq data was strand specific, and assembled taking this into
+account, you should only search for ORFs/CDSs on the forward strand.
+
+**Citation**
+
+If you use this Galaxy tool in work leading to a scientific publication please
+cite the following paper:
+
+Peter J.A. Cock, Björn A. Grüning, Konrad Paszkiewicz and Leighton Pritchard (2013).
+Galaxy tools and workflows for sequence analysis with applications
+in molecular plant pathology. PeerJ 1:e167
+http://dx.doi.org/10.7717/peerj.167
+
+This tool uses Biopython, so you may also wish to cite the Biopython
+application note (and Galaxy too of course):
+
+Cock et al (2009). Biopython: freely available Python tools for computational
+molecular biology and bioinformatics. Bioinformatics 25(11) 1422-3.
+http://dx.doi.org/10.1093/bioinformatics/btp163 pmid:19304878.
+
+This tool is available to install into other Galaxy Instances via the Galaxy
+Tool Shed at http://toolshed.g2.bx.psu.edu/view/peterjc/get_orfs_or_cdss
+	</help>
+</tool>
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/get_orfs_or_cdss/repository_dependencies.xml	Thu Sep 26 10:04:13 2013 -0400
@@ -0,0 +1,6 @@
+<?xml version="1.0"?>
+<repositories description="This requires Biopython as a dependency.">
+<!-- Leave out the tool shed and revision to get the current
+     tool shed and latest revision at the time of upload -->
+<repository changeset_revision="2f6c871cfa35" name="package_biopython_1_61" owner="biopython" toolshed="http://testtoolshed.g2.bx.psu.edu" />
+</repositories>