Mercurial > repos > peterjc > seq_filter_by_id

--- a/tools/filters/seq_filter_by_id.py	Fri Apr 12 06:29:52 2013 -0400
+++ b/tools/filters/seq_filter_by_id.py	Mon Apr 15 12:11:16 2013 -0400
@@ -29,7 +29,7 @@
 (formerly the Scottish Crop Research Institute, SCRI), UK. All rights reserved.
 See accompanying text file for licence details (MIT/BSD style).

-This is version 0.0.3 of the script, use -v or --version to get the version.
+This is version 0.0.4 of the script, use -v or --version to get the version.
 """
 import sys

@@ -38,18 +38,20 @@
     sys.exit(err)

 if "-v" in sys.argv or "--version" in sys.argv:
-    print "v0.0.3"
+    print "v0.0.4"
     sys.exit(0)

 #Parse Command Line
 try:
     tabular_file, cols_arg, in_file, seq_format, out_positive_file, out_negative_file = sys.argv[1:]
 except ValueError:
-    stop_err("Expected six arguments, got %i:\n%s" % (len(sys.argv)-1, " ".join(sys.argv)))
+    stop_err("Expected six arguments (tab file, columns, in seq, seq format, out pos, out neg), got %i:\n%s" % (len(sys.argv)-1, " ".join(sys.argv)))
 try:
     columns = [int(arg)-1 for arg in cols_arg.split(",")]
 except ValueError:
     stop_err("Expected list of columns (comma separated integers), got %s" % cols_arg)
+if min(columns) < 0:
+    stop_err("Expect one-based column numbers (not zero-based counting), got %s" % cols_arg)

 if out_positive_file == "-" and out_negative_file == "-":
     stop_err("Neither output file requested")
@@ -78,12 +80,80 @@
 handle.close()


+def crude_fasta_iterator(handle):
+    """Yields tuples, record ID and the full record as a string."""
+    while True:
+        line = handle.readline()
+        if line == "":
+            return # Premature end of file, or just empty?
+        if line[0] == ">":
+            break
+
+    while True:
+        if line[0] != ">":
+            raise ValueError(
+                "Records in Fasta files should start with '>' character")
+        id = line[1:].split(None, 1)[0]
+        lines = [line]
+        line = handle.readline()
+        while True:
+            if not line:
+                break
+            if line[0] == ">":
+                break
+            lines.append(line)
+            line = handle.readline()
+        yield id, "".join(lines)
+        if not line:
+            return # StopIteration
+
+
+def fasta_filter(in_file, pos_file, neg_file, wanted):
+    """FASTA filter producing 60 character line wrapped outout."""
+    pos_count = neg_count = 0
+    #Galaxy now requires Python 2.5+ so can use with statements,
+    with open(in_file) as in_handle:
+        #Doing the if statement outside the loop for speed
+        #(with the downside of three very similar loops).
+        if pos_file != "-" and neg_file != "-":
+            print "Generating two FASTA files"
+            with open(pos_file, "w") as pos_handle:
+                with open(neg_file, "w") as neg_handle:
+                    for identifier, record in crude_fasta_iterator(in_handle):
+                        if identifier in wanted:
+                            pos_handle.write(record)
+                            pos_count += 1
+                        else:
+                            neg_handle.write(record)
+                            neg_count += 1
+        elif pos_file != "-":
+            print "Generating matching FASTA file"
+            with open(pos_file, "w") as pos_handle:
+                for identifier, record in crude_fasta_iterator(in_handle):
+                    if identifier in wanted:
+                        pos_handle.write(record)
+                        pos_count += 1
+                    else:
+                        neg_count += 1
+        else:
+            print "Generating non-matching FASTA file"
+            assert neg_file != "-"
+            with open(neg_file, "w") as neg_handle:
+                for identifier, record in crude_fasta_iterator(in_handle):
+                    if identifier in wanted:
+                        pos_count += 1
+                    else:
+                        neg_handle.write(record)
+                        neg_count += 1
+    return pos_count, neg_count
+
+
 if seq_format.lower()=="sff":
     #Now write filtered SFF file based on IDs from BLAST file
     try:
         from Bio.SeqIO.SffIO import SffIterator, SffWriter
     except ImportError:
-        stop_err("Requires Biopython 1.54 or later")
+        stop_err("SFF filtering requires Biopython 1.54 or later")

     try:
         from Bio.SeqIO.SffIO import ReadRocheXmlManifest
@@ -97,6 +167,7 @@
         manifest = None
     #This makes two passes though the SFF file with isn't so efficient,
     #but this makes the code simple.
+    pos_count = neg_count = 0
     if out_positive_file != "-":
         out_handle = open(out_positive_file, "wb")
         writer = SffWriter(out_handle, xml=manifest)
@@ -113,45 +184,11 @@
     in_handle.close()
     #At the time of writing, Galaxy doesn't show SFF file read counts,
     #so it is useful to put them in stdout and thus shown in job info.
-    if out_positive_file != "-" and out_negative_file != "-":
-        print "%i with and %i without specified IDs" % (pos_count, neg_count)
-    elif out_positive_file != "-":
-        print "%i with specified IDs" % pos_count
-    elif out_negative_file != "-":
-        print "%i without specified IDs" % neg_count
+    print "%i with and %i without specified IDs" % (pos_count, neg_count)
 elif seq_format.lower()=="fasta":
     #Write filtered FASTA file based on IDs from tabular file
-    from galaxy_utils.sequence.fasta import fastaReader, fastaWriter
-    reader = fastaReader(open(in_file, "rU"))
-    if out_positive_file != "-" and out_negative_file != "-":
-        print "Generating two FASTA files"
-        positive_writer = fastaWriter(open(out_positive_file, "w"))
-        negative_writer = fastaWriter(open(out_negative_file, "w"))
-        for record in reader:
-            #The [1:] is because the fastaReader leaves the > on the identifier.
-            if record.identifier and record.identifier.split()[0][1:] in ids:
-                positive_writer.write(record)
-            else:
-                negative_writer.write(record)
-        positive_writer.close()
-        negative_writer.close()
-    elif out_positive_file != "-":
-        print "Generating matching FASTA file"
-        positive_writer = fastaWriter(open(out_positive_file, "w"))
-        for record in reader:
-            #The [1:] is because the fastaReader leaves the > on the identifier.
-            if record.identifier and record.identifier.split()[0][1:] in ids:
-                positive_writer.write(record)
-        positive_writer.close()
-    elif out_negative_file != "-":
-        print "Generating non-matching FASTA file"
-        negative_writer = fastaWriter(open(out_negative_file, "w"))
-        for record in reader:
-            #The [1:] is because the fastaReader leaves the > on the identifier.
-            if not record.identifier or record.identifier.split()[0][1:] not in ids:
-                negative_writer.write(record)
-        negative_writer.close()
-    reader.close()
+    pos_count, neg_count = fasta_filter(in_file, out_positive_file, out_negative_file, ids)
+    print "%i with and %i without specified IDs" % (pos_count, neg_count)
 elif seq_format.lower().startswith("fastq"):
     #Write filtered FASTQ file based on IDs from tabular file
     from galaxy_utils.sequence.fastq import fastqReader, fastqWriter
--- a/tools/filters/seq_filter_by_id.txt	Fri Apr 12 06:29:52 2013 -0400
+++ b/tools/filters/seq_filter_by_id.txt	Mon Apr 15 12:11:16 2013 -0400
@@ -1,7 +1,7 @@
 Galaxy tool to filter FASTA, FASTQ or SFF sequences by ID
 =========================================================

-This tool is copyright 2010-2011 by Peter Cock, The James Hutton Institute
+This tool is copyright 2010-2013 by Peter Cock, The James Hutton Institute
 (formerly SCRI, Scottish Crop Research Institute), UK. All rights reserved.
 See the licence text below.

@@ -36,6 +36,7 @@

 v0.0.1 - Initial version, combining three separate scripts for each file format.
 v0.0.4 - Record script version when run from Galaxy.
+       - Faster FASTA code which preserves the original line wrapping.
        - Basic unit test included.
--- a/tools/filters/seq_filter_by_id.xml	Fri Apr 12 06:29:52 2013 -0400
+++ b/tools/filters/seq_filter_by_id.xml	Mon Apr 15 12:11:16 2013 -0400
@@ -56,11 +56,13 @@
 		</data>
 	</outputs>
 	<tests>
-		<param name="input_file" value="test-data/k12_ten_proteins.fasta" ftype="fasta" />
-		<param name="input_tabular" value="k12_hypothetical.tabular" ftype="tabular" />
-		<param name="columns" value="1" />
-		<param name="output_choice_cond" value="pos" />
-		<output name="output_pos" file="test-data/k12_hypothetical.fasta" ftype="fasta" />
+		<test>
+			<param name="input_file" value="k12_ten_proteins.fasta" ftype="fasta" />
+			<param name="input_tabular" value="k12_hypothetical.tabular" ftype="tabular" />
+			<param name="columns" value="1" />
+			<param name="output_choice" value="pos" />
+			<output name="output_pos" file="k12_hypothetical.fasta" ftype="fasta" />
+		</test>
 	</tests>
 	<requirements>
 		<requirement type="python-module">Bio</requirement>