Mercurial > repos > peterjc > sample_seqs
changeset 1:369515d7738b draft
Uploaded v0.0.1 preview 2, correct typo in test
author | peterjc |
---|---|
date | Thu, 27 Mar 2014 09:39:49 -0400 |
parents | 4c5b37848acb |
children | 219924bd7e3e |
files | tools/sample_seqs/sample_seqs.py tools/sample_seqs/sample_seqs.xml |
diffstat | 2 files changed, 90 insertions(+), 40 deletions(-) [+] |
line wrap: on
line diff
--- a/tools/sample_seqs/sample_seqs.py Tue Feb 18 13:06:14 2014 -0500 +++ b/tools/sample_seqs/sample_seqs.py Thu Mar 27 09:39:49 2014 -0400 @@ -23,16 +23,19 @@ sys.exit(err) if "-v" in sys.argv or "--version" in sys.argv: - print "v0.1.0" + print("v0.1.0") sys.exit(0) #Parse Command Line if len(sys.argv) < 5: stop_err("Requires at least four arguments: seq_format, in_file, out_file, mode, ...") seq_format, in_file, out_file, mode = sys.argv[1:5] +if in_file != "/dev/stdin" and not os.path.isfile(in_file): + stop_err("Missing input file %r" % in_file) + if mode == "everyNth": if len(sys.argv) != 6: - stop_err("If using everyNth, just need argument N") + stop_err("If using everyNth, just need argument N (integer, at least 2)") try: N = int(sys.argv[5]) except: @@ -40,25 +43,41 @@ if N < 2: stop_err("Bad N argument %r" % sys.argv[5]) if (N % 10) == 1: - print("Sampling every %ist sequence" % N) + sys.stderr.write("Sampling every %ist sequence\n" % N) elif (N % 10) == 2: - print("Sampling every %ind sequence" % N) + sys.stderr.write("Sampling every %ind sequence\n" % N) elif (N % 10) == 3: - print("Sampling every %ird sequence" % N) + sys.stderr.write("Sampling every %ird sequence\n" % N) else: - print("Sampling every %ith sequence" % N) + sys.stderr.write("Sampling every %ith sequence\n" % N) + def sampler(iterator): + global N + count = 0 + for record in iterator: + count += 1 + if count % N == 1: + yield record +elif mode == "percentage": + if len(sys.argv) != 6: + stop_err("If using percentage, just need percentage argument (float, range 0 to 100)") + try: + percent = float(sys.argv[5]) / 100.0 + except: + stop_err("Bad percent argument %r" % sys.argv[5]) + if percent <= 0.0 or 1.0 <= percent: + stop_err("Bad percent argument %r" % sys.argv[5]) + sys.stderr.write("Sampling %0.3f%% of sequences\n" % (100.0 * percent)) + def sampler(iterator): + global percent + count = 0 + taken = 0 + for record in iterator: + count += 1 + if percent * count > taken: + taken += 1 + yield record else: stop_err("Unsupported mode %r" % mode) -if not os.path.isfile(in_file): - stop_err("Missing input file %r" % in_file) - - -def pick_every_N(iterator, N): - count = 0 - for record in iterator: - count += 1 - if count % N == 1: - yield record def raw_fasta_iterator(handle): """Yields raw FASTA records as multi-line strings.""" @@ -94,24 +113,24 @@ if not line: return # StopIteration -def fasta_filter_every_N(in_file, out_file, N): +def fasta_filter(in_file, out_file, iterator_filter): count = 0 #Galaxy now requires Python 2.5+ so can use with statements, with open(in_file) as in_handle: with open(out_file, "w") as pos_handle: - for record in pick_every_N(raw_fasta_iterator(in_handle), N): + for record in iterator_filter(raw_fasta_iterator(in_handle)): count += 1 pos_handle.write(record) return count try: from galaxy_utils.sequence.fastq import fastqReader, fastqWriter - def fastq_filter_every_N(in_file, out_file, N): + def fastq_filter(in_file, out_file, iterator_filter): count = 0 #from galaxy_utils.sequence.fastq import fastqReader, fastqWriter reader = fastqReader(open(in_file, "rU")) writer = fastqWriter(open(out_file, "w")) - for record in pick_every_N(reader, N): + for record in iterator_filter(reader): count += 1 writer.write(record) writer.close() @@ -119,16 +138,16 @@ return count except ImportError: from Bio.SeqIO.QualityIO import FastqGeneralIterator - def fastq_filter_every_N(in_file, out_file, N): + def fastq_filter(in_file, out_file, iterator_filter): count = 0 with open(in_file) as in_handle: with open(out_file, "w") as pos_handle: - for title, seq, qual in pick_every_N(FastqGeneralIterator(in_handle), N): + for title, seq, qual in iterator_filter(FastqGeneralIterator(in_handle)): count += 1 pos_handle.write("@%s\n%s\n+\n%s\n" % (title, seq, qual)) return count -def sff_filter_every_N(in_file, out_file, N): +def sff_filter(in_file, out_file, iterator_filter): count = 0 try: from Bio.SeqIO.SffIO import SffIterator, SffWriter @@ -148,17 +167,17 @@ with open(out_file, "wb") as out_handle: writer = SffWriter(out_handle, xml=manifest) in_handle.seek(0) #start again after getting manifest - count = writer.write_file(pick_every_N(SffIterator(in_handle), N)) + count = writer.write_file(iterator_filter(SffIterator(in_handle))) #count = writer.write_file(SffIterator(in_handle)) return count if seq_format.lower()=="sff": - count = sff_filter_every_N(in_file, out_file, N) + count = sff_filter(in_file, out_file, sampler) elif seq_format.lower()=="fasta": - count = fasta_filter_every_N(in_file, out_file, N) + count = fasta_filter(in_file, out_file, sampler) elif seq_format.lower().startswith("fastq"): - count = fastq_filter_every_N(in_file, out_file, N) + count = fastq_filter(in_file, out_file, sampler) else: stop_err("Unsupported file type %r" % seq_format) -print("Sampled %i records" % count) +sys.stderr.write("Sampled %i records\n" % count)
--- a/tools/sample_seqs/sample_seqs.xml Tue Feb 18 13:06:14 2014 -0500 +++ b/tools/sample_seqs/sample_seqs.xml Thu Mar 27 09:39:49 2014 -0400 @@ -6,7 +6,14 @@ </requirements> <version_command interpreter="python">sample_seqs.py --version</version_command> <command interpreter="python"> +#if str($sampling.type) == "everyNth": sample_seqs.py "$input_file.ext" "$input_file" "$output_file" "${sampling.type}" "${sampling.every_n}" +#elif str($sampling.type) == "percentage": +sample_seqs.py "$input_file.ext" "$input_file" "$output_file" "${sampling.type}" "${sampling.percent}" +#else: +##Should give an error about invalid sampling type: +sample_seqs.py "$input_file.ext" "$input_file" "$output_file" "${sampling.type}" +#end if </command> <stdio> <!-- Anything other than zero is an error --> @@ -16,36 +23,58 @@ <inputs> <param name="input_file" type="data" format="fasta,fastq,sff" label="Sequence file" help="FASTA, FASTQ, or SFF format." /> <conditional name="sampling"> - <param name="type" type="select" label="Sequence end treatment"> - <option value="everyNth">Take every n-th sequence (e.g. every second sequence) ended (will allow missing start/stop codons at the ends)</option> + <param name="type" type="select" label="Sub-sampling approach"> + <option value="everyNth">Take every N-th sequence (e.g. every fifth sequence)</option> + <option value="percentage">Take some percentage of the sequences (e.g. 20% will take every fifth sequence)</option> <!-- TODO - target coverage etc --> </param> <when value="everyNth"> - <param name="every_n" value="5" type="integer" min="2" help="e.g. 5 with take every 5th sequence (taking 20% of the sequences)" /> + <param name="every_n" value="5" type="integer" min="2" label="N" help="At least 2, e.g. 5 will take every 5th sequence (taking 20% of the sequences)" /> + </when> + <when value="percentage"> + <param name="percent" value="20.0" type="float" min="0" max="100" label="Percentage" help="Between 0 and 100, e.g. 20% will take every 5th sequence" /> </when> </conditional> </inputs> <outputs> - <data name="output_file" format="fasta" label="${input_file.name} (sub-sampled)" /> + <data name="output_file" format="input" metadata_source="input_file" label="${input_file.name} (sub-sampled)"/> </outputs> <tests> <test> <param name="input_file" value="get_orf_input.Suis_ORF.prot.fasta" /> <param name="type" value="everyNth" /> <param name="every_n" value="100" /> - <output name="out_prot_file" file="get_orf_input.Suis_ORF.prot.sample_N100.fasta" /> + <output name="output_file" file="get_orf_input.Suis_ORF.prot.sample_N100.fasta" /> </test> <test> <param name="input_file" value="ecoli.fastq" /> <param name="type" value="everyNth" /> <param name="every_n" value="100" /> - <output name="out_prot_file" file="ecoli.sample_N100.fastq" /> + <output name="output_file" file="ecoli.sample_N100.fastq" /> </test> <test> <param name="input_file" value="MID4_GLZRM4E04_rnd30_frclip.sff" ftype="sff" /> <param name="type" value="everyNth" /> <param name="every_n" value="5" /> - <output name="out_prot_file" file="MID4_GLZRM4E04_rnd30_frclip.sample_N5.sff" ftype="sff"/> + <output name="output_file" file="MID4_GLZRM4E04_rnd30_frclip.sample_N5.sff" ftype="sff"/> + </test> + <test> + <param name="input_file" value="get_orf_input.Suis_ORF.prot.fasta" /> + <param name="type" value="percentage" /> + <param name="percent" value="1.0" /> + <output name="output_file" file="get_orf_input.Suis_ORF.prot.sample_N100.fasta" /> + </test> + <test> + <param name="input_file" value="ecoli.fastq" /> + <param name="type" value="percentage" /> + <param name="percent" value="1.0" /> + <output name="output_file" file="ecoli.sample_N100.fastq" /> + </test> + <test> + <param name="input_file" value="MID4_GLZRM4E04_rnd30_frclip.sff" ftype="sff" /> + <param name="type" value="percentage" /> + <param name="percent" value="20.0" /> + <output name="output_file" file="MID4_GLZRM4E04_rnd30_frclip.sample_N5.sff" ftype="sff"/> </test> </tests> <help> @@ -56,7 +85,10 @@ file sub-sampling from this (in the same format). Several sampling modes are supported, all designed to be non-random. This -allows reproducibility, and also works on paired sequence files. +allows reproducibility, and also works on paired sequence files. Also +note that by sampling uniformly through the file, this avoids any bias +should reads in any part of the file are of lesser quality (e.g. one part +of the slide). The simplest mode is to take every N-th sequence, for example taking every 2nd sequence would sample half the file - while taking every 5th @@ -77,10 +109,9 @@ This tool uses Biopython, so if you use this Galaxy tool in work leading to a scientific publication please cite the following paper: -Peter J.A. Cock, Björn A. Grüning, Konrad Paszkiewicz and Leighton Pritchard (2013). -Galaxy tools and workflows for sequence analysis with applications -in molecular plant pathology. PeerJ 1:e167 -http://dx.doi.org/10.7717/peerj.167 +Cock et al (2009). Biopython: freely available Python tools for computational +molecular biology and bioinformatics. Bioinformatics 25(11) 1422-3. +http://dx.doi.org/10.1093/bioinformatics/btp163 pmid:19304878. This tool is available to install into other Galaxy Instances via the Galaxy Tool Shed at http://toolshed.g2.bx.psu.edu/view/peterjc/sample_seqs