# HG changeset patch
# User peterjc
# Date 1395927589 14400
# Node ID 369515d7738b152bfc5a0b755f035961044dd3fa
# Parent 4c5b37848acbe6b979651b751c8c513a3372fdca
Uploaded v0.0.1 preview 2, correct typo in test
diff -r 4c5b37848acb -r 369515d7738b tools/sample_seqs/sample_seqs.py
--- a/tools/sample_seqs/sample_seqs.py Tue Feb 18 13:06:14 2014 -0500
+++ b/tools/sample_seqs/sample_seqs.py Thu Mar 27 09:39:49 2014 -0400
@@ -23,16 +23,19 @@
sys.exit(err)
if "-v" in sys.argv or "--version" in sys.argv:
- print "v0.1.0"
+ print("v0.1.0")
sys.exit(0)
#Parse Command Line
if len(sys.argv) < 5:
stop_err("Requires at least four arguments: seq_format, in_file, out_file, mode, ...")
seq_format, in_file, out_file, mode = sys.argv[1:5]
+if in_file != "/dev/stdin" and not os.path.isfile(in_file):
+ stop_err("Missing input file %r" % in_file)
+
if mode == "everyNth":
if len(sys.argv) != 6:
- stop_err("If using everyNth, just need argument N")
+ stop_err("If using everyNth, just need argument N (integer, at least 2)")
try:
N = int(sys.argv[5])
except:
@@ -40,25 +43,41 @@
if N < 2:
stop_err("Bad N argument %r" % sys.argv[5])
if (N % 10) == 1:
- print("Sampling every %ist sequence" % N)
+ sys.stderr.write("Sampling every %ist sequence\n" % N)
elif (N % 10) == 2:
- print("Sampling every %ind sequence" % N)
+ sys.stderr.write("Sampling every %ind sequence\n" % N)
elif (N % 10) == 3:
- print("Sampling every %ird sequence" % N)
+ sys.stderr.write("Sampling every %ird sequence\n" % N)
else:
- print("Sampling every %ith sequence" % N)
+ sys.stderr.write("Sampling every %ith sequence\n" % N)
+ def sampler(iterator):
+ global N
+ count = 0
+ for record in iterator:
+ count += 1
+ if count % N == 1:
+ yield record
+elif mode == "percentage":
+ if len(sys.argv) != 6:
+ stop_err("If using percentage, just need percentage argument (float, range 0 to 100)")
+ try:
+ percent = float(sys.argv[5]) / 100.0
+ except:
+ stop_err("Bad percent argument %r" % sys.argv[5])
+ if percent <= 0.0 or 1.0 <= percent:
+ stop_err("Bad percent argument %r" % sys.argv[5])
+ sys.stderr.write("Sampling %0.3f%% of sequences\n" % (100.0 * percent))
+ def sampler(iterator):
+ global percent
+ count = 0
+ taken = 0
+ for record in iterator:
+ count += 1
+ if percent * count > taken:
+ taken += 1
+ yield record
else:
stop_err("Unsupported mode %r" % mode)
-if not os.path.isfile(in_file):
- stop_err("Missing input file %r" % in_file)
-
-
-def pick_every_N(iterator, N):
- count = 0
- for record in iterator:
- count += 1
- if count % N == 1:
- yield record
def raw_fasta_iterator(handle):
"""Yields raw FASTA records as multi-line strings."""
@@ -94,24 +113,24 @@
if not line:
return # StopIteration
-def fasta_filter_every_N(in_file, out_file, N):
+def fasta_filter(in_file, out_file, iterator_filter):
count = 0
#Galaxy now requires Python 2.5+ so can use with statements,
with open(in_file) as in_handle:
with open(out_file, "w") as pos_handle:
- for record in pick_every_N(raw_fasta_iterator(in_handle), N):
+ for record in iterator_filter(raw_fasta_iterator(in_handle)):
count += 1
pos_handle.write(record)
return count
try:
from galaxy_utils.sequence.fastq import fastqReader, fastqWriter
- def fastq_filter_every_N(in_file, out_file, N):
+ def fastq_filter(in_file, out_file, iterator_filter):
count = 0
#from galaxy_utils.sequence.fastq import fastqReader, fastqWriter
reader = fastqReader(open(in_file, "rU"))
writer = fastqWriter(open(out_file, "w"))
- for record in pick_every_N(reader, N):
+ for record in iterator_filter(reader):
count += 1
writer.write(record)
writer.close()
@@ -119,16 +138,16 @@
return count
except ImportError:
from Bio.SeqIO.QualityIO import FastqGeneralIterator
- def fastq_filter_every_N(in_file, out_file, N):
+ def fastq_filter(in_file, out_file, iterator_filter):
count = 0
with open(in_file) as in_handle:
with open(out_file, "w") as pos_handle:
- for title, seq, qual in pick_every_N(FastqGeneralIterator(in_handle), N):
+ for title, seq, qual in iterator_filter(FastqGeneralIterator(in_handle)):
count += 1
pos_handle.write("@%s\n%s\n+\n%s\n" % (title, seq, qual))
return count
-def sff_filter_every_N(in_file, out_file, N):
+def sff_filter(in_file, out_file, iterator_filter):
count = 0
try:
from Bio.SeqIO.SffIO import SffIterator, SffWriter
@@ -148,17 +167,17 @@
with open(out_file, "wb") as out_handle:
writer = SffWriter(out_handle, xml=manifest)
in_handle.seek(0) #start again after getting manifest
- count = writer.write_file(pick_every_N(SffIterator(in_handle), N))
+ count = writer.write_file(iterator_filter(SffIterator(in_handle)))
#count = writer.write_file(SffIterator(in_handle))
return count
if seq_format.lower()=="sff":
- count = sff_filter_every_N(in_file, out_file, N)
+ count = sff_filter(in_file, out_file, sampler)
elif seq_format.lower()=="fasta":
- count = fasta_filter_every_N(in_file, out_file, N)
+ count = fasta_filter(in_file, out_file, sampler)
elif seq_format.lower().startswith("fastq"):
- count = fastq_filter_every_N(in_file, out_file, N)
+ count = fastq_filter(in_file, out_file, sampler)
else:
stop_err("Unsupported file type %r" % seq_format)
-print("Sampled %i records" % count)
+sys.stderr.write("Sampled %i records\n" % count)
diff -r 4c5b37848acb -r 369515d7738b tools/sample_seqs/sample_seqs.xml
--- a/tools/sample_seqs/sample_seqs.xml Tue Feb 18 13:06:14 2014 -0500
+++ b/tools/sample_seqs/sample_seqs.xml Thu Mar 27 09:39:49 2014 -0400
@@ -6,7 +6,14 @@
sample_seqs.py --version
+#if str($sampling.type) == "everyNth":
sample_seqs.py "$input_file.ext" "$input_file" "$output_file" "${sampling.type}" "${sampling.every_n}"
+#elif str($sampling.type) == "percentage":
+sample_seqs.py "$input_file.ext" "$input_file" "$output_file" "${sampling.type}" "${sampling.percent}"
+#else:
+##Should give an error about invalid sampling type:
+sample_seqs.py "$input_file.ext" "$input_file" "$output_file" "${sampling.type}"
+#end if
@@ -16,36 +23,58 @@
-
-
+
+
+
-
+
+
+
+
-
+
-
+
-
+
-
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
@@ -56,7 +85,10 @@
file sub-sampling from this (in the same format).
Several sampling modes are supported, all designed to be non-random. This
-allows reproducibility, and also works on paired sequence files.
+allows reproducibility, and also works on paired sequence files. Also
+note that by sampling uniformly through the file, this avoids any bias
+should reads in any part of the file be of lesser quality (e.g. one part
+of the slide).
The simplest mode is to take every N-th sequence, for example taking
every 2nd sequence would sample half the file - while taking every 5th
@@ -77,10 +109,9 @@
This tool uses Biopython, so if you use this Galaxy tool in work leading to a
scientific publication please cite the following paper:
-Peter J.A. Cock, Björn A. Grüning, Konrad Paszkiewicz and Leighton Pritchard (2013).
-Galaxy tools and workflows for sequence analysis with applications
-in molecular plant pathology. PeerJ 1:e167
-http://dx.doi.org/10.7717/peerj.167
+Cock et al (2009). Biopython: freely available Python tools for computational
+molecular biology and bioinformatics. Bioinformatics 25(11) 1422-3.
+http://dx.doi.org/10.1093/bioinformatics/btp163 pmid:19304878.
This tool is available to install into other Galaxy Instances via the Galaxy
Tool Shed at http://toolshed.g2.bx.psu.edu/view/peterjc/sample_seqs