Mercurial > repos > peterjc > sample_seqs

--- a/tools/sample_seqs/sample_seqs.py	Tue Feb 18 13:06:14 2014 -0500
+++ b/tools/sample_seqs/sample_seqs.py	Thu Mar 27 09:39:49 2014 -0400
@@ -23,16 +23,19 @@
     sys.exit(err)

 if "-v" in sys.argv or "--version" in sys.argv:
-    print "v0.1.0"
+    print("v0.1.0")
     sys.exit(0)

 #Parse Command Line
 if len(sys.argv) < 5:
     stop_err("Requires at least four arguments: seq_format, in_file, out_file, mode, ...")
 seq_format, in_file, out_file, mode = sys.argv[1:5]
+if in_file != "/dev/stdin" and not os.path.isfile(in_file):
+    stop_err("Missing input file %r" % in_file)
+
 if mode == "everyNth":
     if len(sys.argv) != 6:
-        stop_err("If using everyNth, just need argument N")
+        stop_err("If using everyNth, just need argument N (integer, at least 2)")
     try:
         N = int(sys.argv[5])
     except:
@@ -40,25 +43,41 @@
     if N < 2:
         stop_err("Bad N argument %r" % sys.argv[5])
     if (N % 10) == 1:
-        print("Sampling every %ist sequence" % N)
+        sys.stderr.write("Sampling every %ist sequence\n" % N)
     elif (N % 10) == 2:
-        print("Sampling every %ind sequence" % N)
+        sys.stderr.write("Sampling every %ind sequence\n" % N)
     elif (N % 10) == 3:
-        print("Sampling every %ird sequence" % N)
+        sys.stderr.write("Sampling every %ird sequence\n" % N)
     else:
-        print("Sampling every %ith sequence" % N)
+        sys.stderr.write("Sampling every %ith sequence\n" % N)
+    def sampler(iterator):
+        global N
+        count = 0
+        for record in iterator:
+            count += 1
+            if count % N == 1:
+                yield record
+elif mode == "percentage":
+    if len(sys.argv) != 6:
+        stop_err("If using percentage, just need percentage argument (float, range 0 to 100)")
+    try:
+        percent = float(sys.argv[5]) / 100.0
+    except:
+        stop_err("Bad percent argument %r" % sys.argv[5])
+    if percent <= 0.0 or 1.0 <= percent:
+        stop_err("Bad percent argument %r" % sys.argv[5])
+    sys.stderr.write("Sampling %0.3f%% of sequences\n" % (100.0 * percent))
+    def sampler(iterator):
+        global percent
+        count = 0
+        taken = 0
+        for record in iterator:
+            count += 1
+            if percent * count > taken:
+                taken += 1
+                yield record
 else:
     stop_err("Unsupported mode %r" % mode)
-if not os.path.isfile(in_file):
-    stop_err("Missing input file %r" % in_file)
-
-
-def pick_every_N(iterator, N):
-    count = 0
-    for record in iterator:
-        count += 1
-        if count % N == 1:
-            yield record

 def raw_fasta_iterator(handle):
     """Yields raw FASTA records as multi-line strings."""
@@ -94,24 +113,24 @@
         if not line:
             return # StopIteration

-def fasta_filter_every_N(in_file, out_file, N):
+def fasta_filter(in_file, out_file, iterator_filter):
     count = 0
     #Galaxy now requires Python 2.5+ so can use with statements,
     with open(in_file) as in_handle:
         with open(out_file, "w") as pos_handle:
-            for record in pick_every_N(raw_fasta_iterator(in_handle), N):
+            for record in iterator_filter(raw_fasta_iterator(in_handle)):
                 count += 1
                 pos_handle.write(record)
     return count

 try:
     from galaxy_utils.sequence.fastq import fastqReader, fastqWriter
-    def fastq_filter_every_N(in_file, out_file, N):
+    def fastq_filter(in_file, out_file, iterator_filter):
         count = 0
         #from galaxy_utils.sequence.fastq import fastqReader, fastqWriter
         reader = fastqReader(open(in_file, "rU"))
         writer = fastqWriter(open(out_file, "w"))
-        for record in pick_every_N(reader, N):
+        for record in iterator_filter(reader):
             count += 1
             writer.write(record)
         writer.close()
@@ -119,16 +138,16 @@
         return count
 except ImportError:
     from Bio.SeqIO.QualityIO import FastqGeneralIterator
-    def fastq_filter_every_N(in_file, out_file, N):
+    def fastq_filter(in_file, out_file, iterator_filter):
         count = 0
         with open(in_file) as in_handle:
             with open(out_file, "w") as pos_handle:
-                for title, seq, qual in pick_every_N(FastqGeneralIterator(in_handle), N):
+                for title, seq, qual in iterator_filter(FastqGeneralIterator(in_handle)):
                     count += 1
                     pos_handle.write("@%s\n%s\n+\n%s\n" % (title, seq, qual))
         return count

-def sff_filter_every_N(in_file, out_file, N):
+def sff_filter(in_file, out_file, iterator_filter):
     count = 0
     try:
         from Bio.SeqIO.SffIO import SffIterator, SffWriter
@@ -148,17 +167,17 @@
         with open(out_file, "wb") as out_handle:
             writer = SffWriter(out_handle, xml=manifest)
             in_handle.seek(0) #start again after getting manifest
-            count = writer.write_file(pick_every_N(SffIterator(in_handle), N))
+            count = writer.write_file(iterator_filter(SffIterator(in_handle)))
             #count = writer.write_file(SffIterator(in_handle))
     return count

 if seq_format.lower()=="sff":
-    count = sff_filter_every_N(in_file, out_file, N)
+    count = sff_filter(in_file, out_file, sampler)
 elif seq_format.lower()=="fasta":
-    count = fasta_filter_every_N(in_file, out_file, N)
+    count = fasta_filter(in_file, out_file, sampler)
 elif seq_format.lower().startswith("fastq"):
-    count = fastq_filter_every_N(in_file, out_file, N)
+    count = fastq_filter(in_file, out_file, sampler)
 else:
     stop_err("Unsupported file type %r" % seq_format)

-print("Sampled %i records" % count)
+sys.stderr.write("Sampled %i records\n" % count)
--- a/tools/sample_seqs/sample_seqs.xml	Tue Feb 18 13:06:14 2014 -0500
+++ b/tools/sample_seqs/sample_seqs.xml	Thu Mar 27 09:39:49 2014 -0400
@@ -6,7 +6,14 @@
     </requirements>
     <version_command interpreter="python">sample_seqs.py --version</version_command>
     <command interpreter="python">
+#if str($sampling.type) == "everyNth":
 sample_seqs.py "$input_file.ext" "$input_file" "$output_file" "${sampling.type}" "${sampling.every_n}"
+#elif str($sampling.type) == "percentage":
+sample_seqs.py "$input_file.ext" "$input_file" "$output_file" "${sampling.type}" "${sampling.percent}"
+#else:
+##Should give an error about invalid sampling type:
+sample_seqs.py "$input_file.ext" "$input_file" "$output_file" "${sampling.type}"
+#end if
     </command>
     <stdio>
         <!-- Anything other than zero is an error -->
@@ -16,36 +23,58 @@
     <inputs>
         <param name="input_file" type="data" format="fasta,fastq,sff" label="Sequence file" help="FASTA, FASTQ, or SFF format." />
         <conditional name="sampling">
-            <param name="type" type="select" label="Sequence end treatment">
-                <option value="everyNth">Take every n-th sequence (e.g. every second sequence) ended (will allow missing start/stop codons at the ends)</option>
+            <param name="type" type="select" label="Sub-sampling approach">
+                <option value="everyNth">Take every N-th sequence (e.g. every fifth sequence)</option>
+                <option value="percentage">Take some percentage of the sequences (e.g. 20% will take every fifth sequence)</option>
                 <!-- TODO - target coverage etc -->
             </param>
             <when value="everyNth">
-                <param name="every_n" value="5" type="integer" min="2" help="e.g. 5 with take every 5th sequence (taking 20% of the sequences)" />
+                <param name="every_n" value="5" type="integer" min="2" label="N" help="At least 2, e.g. 5 will take every 5th sequence (taking 20% of the sequences)" />
+            </when>
+            <when value="percentage">
+                <param name="percent" value="20.0" type="float" min="0" max="100" label="Percentage" help="Between 0 and 100, e.g. 20% will take every 5th sequence" />
             </when>
         </conditional>
     </inputs>
     <outputs>
-        <data name="output_file" format="fasta" label="${input_file.name} (sub-sampled)" />
+        <data name="output_file" format="input" metadata_source="input_file" label="${input_file.name} (sub-sampled)"/>
     </outputs>
     <tests>
         <test>
             <param name="input_file" value="get_orf_input.Suis_ORF.prot.fasta" />
             <param name="type" value="everyNth" />
             <param name="every_n" value="100" />
-            <output name="out_prot_file" file="get_orf_input.Suis_ORF.prot.sample_N100.fasta" />
+            <output name="output_file" file="get_orf_input.Suis_ORF.prot.sample_N100.fasta" />
         </test>
         <test>
             <param name="input_file" value="ecoli.fastq" />
             <param name="type" value="everyNth" />
             <param name="every_n" value="100" />
-            <output name="out_prot_file" file="ecoli.sample_N100.fastq" />
+            <output name="output_file" file="ecoli.sample_N100.fastq" />
         </test>
         <test>
             <param name="input_file" value="MID4_GLZRM4E04_rnd30_frclip.sff" ftype="sff" />
             <param name="type" value="everyNth" />
             <param name="every_n" value="5" />
-            <output name="out_prot_file" file="MID4_GLZRM4E04_rnd30_frclip.sample_N5.sff" ftype="sff"/>
+            <output name="output_file" file="MID4_GLZRM4E04_rnd30_frclip.sample_N5.sff" ftype="sff"/>
+        </test>
+        <test>
+            <param name="input_file" value="get_orf_input.Suis_ORF.prot.fasta" />
+            <param name="type" value="percentage" />
+            <param name="percent" value="1.0" />
+            <output name="output_file" file="get_orf_input.Suis_ORF.prot.sample_N100.fasta" />
+        </test>
+        <test>
+            <param name="input_file" value="ecoli.fastq" />
+            <param name="type" value="percentage" />
+            <param name="percent" value="1.0" />
+            <output name="output_file" file="ecoli.sample_N100.fastq" />
+        </test>
+        <test>
+            <param name="input_file" value="MID4_GLZRM4E04_rnd30_frclip.sff" ftype="sff" />
+            <param name="type" value="percentage" />
+            <param name="percent" value="20.0" />
+            <output name="output_file" file="MID4_GLZRM4E04_rnd30_frclip.sample_N5.sff" ftype="sff"/>
         </test>
     </tests>
     <help>
@@ -56,7 +85,10 @@
 file sub-sampling from this (in the same format).

 Several sampling modes are supported, all designed to be non-random. This
-allows reproducibility, and also works on paired sequence files.
+allows reproducibility, and also works on paired sequence files. Also
+note that by sampling uniformly through the file, this avoids any bias
+should reads in any part of the file are of lesser quality (e.g. one part
+of the slide).

 The simplest mode is to take every N-th sequence, for example taking
 every 2nd sequence would sample half the file - while taking every 5th
@@ -77,10 +109,9 @@
 This tool uses Biopython, so if you use this Galaxy tool in work leading to a
 scientific publication please cite the following paper:

-Peter J.A. Cock, Björn A. Grüning, Konrad Paszkiewicz and Leighton Pritchard (2013).
-Galaxy tools and workflows for sequence analysis with applications
-in molecular plant pathology. PeerJ 1:e167
-http://dx.doi.org/10.7717/peerj.167
+Cock et al (2009). Biopython: freely available Python tools for computational
+molecular biology and bioinformatics. Bioinformatics 25(11) 1422-3.
+http://dx.doi.org/10.1093/bioinformatics/btp163 pmid:19304878.

 This tool is available to install into other Galaxy Instances via the Galaxy
 Tool Shed at http://toolshed.g2.bx.psu.edu/view/peterjc/sample_seqs