changeset 0:b029a652edcd draft

Imported from capsule None
author devteam
date Mon, 19 May 2014 10:59:27 -0400
parents
children 4da47b319c82
files fasta_filter_by_length.py fasta_filter_by_length.xml test-data/4.fasta test-data/454.fasta test-data/fasta_tool_filter_length_1.out
diffstat 5 files changed, 219 insertions(+), 0 deletions(-) [+]
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/fasta_filter_by_length.py	Mon May 19 10:59:27 2014 -0400
@@ -0,0 +1,52 @@
+#!/usr/bin/env python
+"""
+Input: fasta, minimal length, maximal length
+Output: fasta
+Return sequences whose lengths are within the range.
+"""
+
+import sys, os
+
+assert sys.version_info[:2] >= ( 2, 4 )
+
+def stop_err( msg ):
+    sys.stderr.write( msg )
+    sys.exit()
+
+def __main__():
+    input_filename = sys.argv[1]
+    try:
+        min_length = int( sys.argv[2] )
+    except:
+        stop_err( "Minimal length of the return sequence requires a numerical value." )
+    try:
+        max_length = int( sys.argv[3] )
+    except:
+        stop_err( "Maximum length of the return sequence requires a numerical value." )
+    output_filename = sys.argv[4]
+    output_handle = open( output_filename, 'w' )
+    tmp_size = 0 #-1
+    tmp_buf = ''
+    at_least_one = 0
+    for line in file(input_filename):
+        if not line or line.startswith('#'):
+            continue
+        if line[0] == '>':
+            if min_length <= tmp_size <= max_length or (min_length <= tmp_size and max_length == 0):
+                output_handle.write(tmp_buf)
+                at_least_one = 1
+            tmp_buf = line
+            tmp_size = 0                                                       
+        else:
+            if max_length == 0 or tmp_size < max_length:
+                tmp_size += len(line.rstrip('\r\n'))
+                tmp_buf += line
+    # final flush of buffer
+    if min_length <= tmp_size <= max_length or (min_length <= tmp_size and max_length == 0):
+        output_handle.write(tmp_buf.rstrip('\r\n'))
+        at_least_one = 1
+    output_handle.close()
+    if at_least_one == 0:
+        print "There is no sequence that falls within your range."
+
+if __name__ == "__main__" : __main__()
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/fasta_filter_by_length.xml	Mon May 19 10:59:27 2014 -0400
@@ -0,0 +1,56 @@
+<tool id="fasta_filter_by_length" name="Filter sequences by length" version="1.1">
+	<description></description>
+	<command interpreter="python">fasta_filter_by_length.py $input $min_length $max_length $output </command>
+	<inputs>
+		<param name="input" type="data" format="fasta" label="Fasta file"/>
+		<param name="min_length" type="integer" size="15" value="0" label="Minimal length" />
+		<param name="max_length" type="integer" size="15" value="0" label="Maximum length" help="Setting to '0' will return all sequences longer than the 'Minimal length'"/> 
+	</inputs>
+	<outputs>
+		<data name="output" format="fasta"/>
+	</outputs>
+	<tests>
+		<test>
+			<param name="input" value="454.fasta" />
+			<param name="min_length" value="10" />
+			<param name="max_length" value="0" />
+			<output name="output" file="fasta_tool_filter_length_1.out" />
+		</test>
+	</tests>
+	<help>
+
+.. class:: infomark
+
+**TIP**. To return sequences longer than a certain length, set *Minimal length* to desired value and leave *Maximum length* set to '0'.
+
+-----
+
+**What it does**
+	
+Outputs sequences between *Minimal length* and *Maximum length*.
+ 
+-----
+
+**Example**
+
+Suppose you have the following FASTA formatted sequences::
+
+	&gt;seq1
+	TCATTTAATGAC
+	&gt;seq2
+	ATGGC
+	&gt;seq3
+	TCACATGATGCCG
+	&gt;seq4
+	ATGGAAGC
+
+Setting the **Minimal length** to **10**, and the **Maximum length** to **0** will return all sequences longer than 10 bp::
+
+ 	&gt;seq1
+	TCATTTAATGAC
+	&gt;seq3
+	TCACATGATGCCG
+
+
+	</help>
+</tool>
\ No newline at end of file
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/4.fasta	Mon May 19 10:59:27 2014 -0400
@@ -0,0 +1,7 @@
+>EYKX4VC01B65GS length=54 xy=0784_1754 region=1 run=R_2007_11_07_16_15_57_
+CCGGTATCCGGGTGCCGTGATGAGCGCCACCGGAACGAATTCGACTATGCCGAA
+>EYKX4VC01BNCSP length=187 xy=0558_3831 region=1 run=R_2007_11_07_16_15_57_
+CTTACCGGTCACCACCGTGCCTTCAGGATTGATCGCCAGATCGGTCGGTGCGTCAGGCGG
+GGTGACATCGCCCACCACGGTACTCACTGGCTGGCTCTGGTTCCCGGCGGCATCGGAGGC
+CACCACGTTGAGGGTATTCCCCTCGGTTTGTGGCTCGGTGAGAACCACGTTGTAGTCGCC
+ATTGGTC
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/454.fasta	Mon May 19 10:59:27 2014 -0400
@@ -0,0 +1,52 @@
+>EYKX4VC01B65GS length=54 xy=0784_1754 region=1 run=R_2007_11_07_16_15_57_
+CCGGTATCCGGGTGCCGTGATGAGCGCCACCGGAACGAATTCGACTATGCCGAA
+>EYKX4VC01BNCSP length=187 xy=0558_3831 region=1 run=R_2007_11_07_16_15_57_
+CTTACCGGTCACCACCGTGCCTTCAGGATTGATCGCCAGATCGGTCGGTGCGTCAGGCGG
+GGTGACATCGCCCACCACGGTACTCACTGGCTGGCTCTGGTTCCCGGCGGCATCGGAGGC
+CACCACGTTGAGGGTATTCCCCTCGGTTTGTGGCTCGGTGAGAACCACGTTGTAGTCGCC
+ATTGGTC
+>EYKX4VC01CD9FT length=115 xy=0865_1719 region=1 run=R_2007_11_07_16_15_57_
+GGGGGCTTTGGCCTGTCGTCCGGCACCTCGCAAGAGCTACAGCAGGCGCGGCTGGCGATC
+ATCGGCGGCACGCCGGCCTATATGTCGCCGGAACACACCACCCGCACCCAACGCG
+>EYKX4VC01B8FW0 length=95 xy=0799_0514 region=1 run=R_2007_11_07_16_15_57_
+TAAATTTCAAGGAATGCAAATCAGGGTCGTGTGTTTAGACTTCGGCTTTAGAGACCTGAA
+TACGTCAAAAACATAACTTCATGATATCTTGCAGT
+>EYKX4VC01BCGYW length=115 xy=0434_3926 region=1 run=R_2007_11_07_16_15_57_
+GGCCAGCCGGGACAGCGTTGTTGGGCTGCATGGCGACGAGCTAAAAGTCGCCATCACCGC
+CCCGCCGGTTGATGGGCAGGCTAATGCCCATCTGGTAAAAACTTTCTCGCCAAAC
+>EYKX4VC01AZXC6 length=116 xy=0292_0280 region=1 run=R_2007_11_07_16_15_57_
+GGGGGCGTTTGGCCTGTCGTCCGGCACCTCGCAAGAGCTACAGCAGGCGCGGCTGGCGAT
+CATCGGCGGCACGCCGGCCTATATGTCGCCGGAACACACCACCCGCACCCAACGCG
+>EYKX4VC01CATH5 length=82 xy=0826_0843 region=1 run=R_2007_11_07_16_15_57_
+CGAAATTGCACATTCTCGGCCATATCTCTGGACCTACATGACCGATTTGATCATCTTCGA
+ACTTAGCCTTCCTTTNTTAACG
+>EYKX4VC01BCEIV length=47 xy=0434_0757 region=1 run=R_2007_11_07_16_15_57_
+TGACGTCGTGCCGAGCTACGACAATGCCGACATGGTGATCGTTAACA
+>EYKX4VC01BWERM length=83 xy=0662_0304 region=1 run=R_2007_11_07_16_15_57_
+CGGTCGGCCTCACCATGGAGAAGATCCCGCCCCGGCCGAGGTCATGGTGGATCTCGGCCA
+GGGCGTGCTGATGAAGTTCAAAT
+>EYKX4VC01BT2O7 length=69 xy=0635_1945 region=1 run=R_2007_11_07_16_15_57_
+AGCGTTTCTCCAGCCGGTCGGCTACGCCGTTTGCCCCTGAAAGACGCTGTTCAGACCGAA
+CGCGGTAAA
+>EYKX4VC01BO0UO length=222 xy=0577_3838 region=1 run=R_2007_11_07_16_15_57_
+AGACCTGGGACAGCGGCGGGCTGCTGAAGCCGCAGGCGATAGAGGACAAACTGCAGTACC
+GCTTCTGGCTGCACTATGCCGAAGGCTCGCTGATGCCGCTGCTGTTAATGAAGCTGGTGT
+TCGCCAGCCTGGGTAAACCCCCTGTGCCCTTTGGCGTCCGCTCGCTGGGCGCCCTGCTGG
+GCAAGGGCATTCAGAAAGCGTGGCTGGATCCCCAGCTGGCCA
+>EYKX4VC01CBCPK length=83 xy=0832_1158 region=1 run=R_2007_11_07_16_15_57_
+CGGTCGGCCTCACCATGGAGAAGATCCCGCCCCGGCCGAGGTCATGGTGGATCTCGGCCA
+GGGCGTGCTGATGAAGTTCAAAT
+>EYKX4VC01B474S length=54 xy=0762_2010 region=1 run=R_2007_11_07_16_15_57_
+AGCAGTTTTCCAGCGCTTTCGAAGAGCGCTGGCGCGCGCGGGCTTCCAGCATAT
+>EYKX4VC01BB4QL length=57 xy=0431_0363 region=1 run=R_2007_11_07_16_15_57_
+GGGGAGGAGCTAATAATATGCTCTTGGGGAGGAGCTAATTATATGCTCTTGGGGAGG
+>EYKX4VC01BJ37M length=64 xy=0522_0192 region=1 run=R_2007_11_07_16_15_57_
+TCGAGTATGTATCAAGGACTACATACAAATTTGCCAAAAGAGATTATGCACTATCCCGAC
+TTCC
+>EYKX4VC01BV9R8 length=54 xy=0660_2038 region=1 run=R_2007_11_07_16_15_57_
+AAAACTCGGAGAAACTATTCAGCAGCACTGCGTTTCGCTGAATTTTAGACCGTT
+>EYKX4VC01CEPP8 length=60 xy=0870_2350 region=1 run=R_2007_11_07_16_15_57_
+CTGGGTGGGTGCACTACAGGAACGTCATTTGTTCAATCCTCACGTTGTTGTTAGTGTCAG
+>EYKX4VC01BTLME length=78 xy=0630_0292 region=1 run=R_2007_11_07_16_15_57_
+TTATCCACACGCTGTCCGGATCCAGCGCCAGGCGCCGACGCTGGACTTCCGCCGCCTGCG
+CCCAGTTGCCCTGACTTC
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/fasta_tool_filter_length_1.out	Mon May 19 10:59:27 2014 -0400
@@ -0,0 +1,52 @@
+>EYKX4VC01B65GS length=54 xy=0784_1754 region=1 run=R_2007_11_07_16_15_57_
+CCGGTATCCGGGTGCCGTGATGAGCGCCACCGGAACGAATTCGACTATGCCGAA
+>EYKX4VC01BNCSP length=187 xy=0558_3831 region=1 run=R_2007_11_07_16_15_57_
+CTTACCGGTCACCACCGTGCCTTCAGGATTGATCGCCAGATCGGTCGGTGCGTCAGGCGG
+GGTGACATCGCCCACCACGGTACTCACTGGCTGGCTCTGGTTCCCGGCGGCATCGGAGGC
+CACCACGTTGAGGGTATTCCCCTCGGTTTGTGGCTCGGTGAGAACCACGTTGTAGTCGCC
+ATTGGTC
+>EYKX4VC01CD9FT length=115 xy=0865_1719 region=1 run=R_2007_11_07_16_15_57_
+GGGGGCTTTGGCCTGTCGTCCGGCACCTCGCAAGAGCTACAGCAGGCGCGGCTGGCGATC
+ATCGGCGGCACGCCGGCCTATATGTCGCCGGAACACACCACCCGCACCCAACGCG
+>EYKX4VC01B8FW0 length=95 xy=0799_0514 region=1 run=R_2007_11_07_16_15_57_
+TAAATTTCAAGGAATGCAAATCAGGGTCGTGTGTTTAGACTTCGGCTTTAGAGACCTGAA
+TACGTCAAAAACATAACTTCATGATATCTTGCAGT
+>EYKX4VC01BCGYW length=115 xy=0434_3926 region=1 run=R_2007_11_07_16_15_57_
+GGCCAGCCGGGACAGCGTTGTTGGGCTGCATGGCGACGAGCTAAAAGTCGCCATCACCGC
+CCCGCCGGTTGATGGGCAGGCTAATGCCCATCTGGTAAAAACTTTCTCGCCAAAC
+>EYKX4VC01AZXC6 length=116 xy=0292_0280 region=1 run=R_2007_11_07_16_15_57_
+GGGGGCGTTTGGCCTGTCGTCCGGCACCTCGCAAGAGCTACAGCAGGCGCGGCTGGCGAT
+CATCGGCGGCACGCCGGCCTATATGTCGCCGGAACACACCACCCGCACCCAACGCG
+>EYKX4VC01CATH5 length=82 xy=0826_0843 region=1 run=R_2007_11_07_16_15_57_
+CGAAATTGCACATTCTCGGCCATATCTCTGGACCTACATGACCGATTTGATCATCTTCGA
+ACTTAGCCTTCCTTTNTTAACG
+>EYKX4VC01BCEIV length=47 xy=0434_0757 region=1 run=R_2007_11_07_16_15_57_
+TGACGTCGTGCCGAGCTACGACAATGCCGACATGGTGATCGTTAACA
+>EYKX4VC01BWERM length=83 xy=0662_0304 region=1 run=R_2007_11_07_16_15_57_
+CGGTCGGCCTCACCATGGAGAAGATCCCGCCCCGGCCGAGGTCATGGTGGATCTCGGCCA
+GGGCGTGCTGATGAAGTTCAAAT
+>EYKX4VC01BT2O7 length=69 xy=0635_1945 region=1 run=R_2007_11_07_16_15_57_
+AGCGTTTCTCCAGCCGGTCGGCTACGCCGTTTGCCCCTGAAAGACGCTGTTCAGACCGAA
+CGCGGTAAA
+>EYKX4VC01BO0UO length=222 xy=0577_3838 region=1 run=R_2007_11_07_16_15_57_
+AGACCTGGGACAGCGGCGGGCTGCTGAAGCCGCAGGCGATAGAGGACAAACTGCAGTACC
+GCTTCTGGCTGCACTATGCCGAAGGCTCGCTGATGCCGCTGCTGTTAATGAAGCTGGTGT
+TCGCCAGCCTGGGTAAACCCCCTGTGCCCTTTGGCGTCCGCTCGCTGGGCGCCCTGCTGG
+GCAAGGGCATTCAGAAAGCGTGGCTGGATCCCCAGCTGGCCA
+>EYKX4VC01CBCPK length=83 xy=0832_1158 region=1 run=R_2007_11_07_16_15_57_
+CGGTCGGCCTCACCATGGAGAAGATCCCGCCCCGGCCGAGGTCATGGTGGATCTCGGCCA
+GGGCGTGCTGATGAAGTTCAAAT
+>EYKX4VC01B474S length=54 xy=0762_2010 region=1 run=R_2007_11_07_16_15_57_
+AGCAGTTTTCCAGCGCTTTCGAAGAGCGCTGGCGCGCGCGGGCTTCCAGCATAT
+>EYKX4VC01BB4QL length=57 xy=0431_0363 region=1 run=R_2007_11_07_16_15_57_
+GGGGAGGAGCTAATAATATGCTCTTGGGGAGGAGCTAATTATATGCTCTTGGGGAGG
+>EYKX4VC01BJ37M length=64 xy=0522_0192 region=1 run=R_2007_11_07_16_15_57_
+TCGAGTATGTATCAAGGACTACATACAAATTTGCCAAAAGAGATTATGCACTATCCCGAC
+TTCC
+>EYKX4VC01BV9R8 length=54 xy=0660_2038 region=1 run=R_2007_11_07_16_15_57_
+AAAACTCGGAGAAACTATTCAGCAGCACTGCGTTTCGCTGAATTTTAGACCGTT
+>EYKX4VC01CEPP8 length=60 xy=0870_2350 region=1 run=R_2007_11_07_16_15_57_
+CTGGGTGGGTGCACTACAGGAACGTCATTTGTTCAATCCTCACGTTGTTGTTAGTGTCAG
+>EYKX4VC01BTLME length=78 xy=0630_0292 region=1 run=R_2007_11_07_16_15_57_
+TTATCCACACGCTGTCCGGATCCAGCGCCAGGCGCCGACGCTGGACTTCCGCCGCCTGCG
+CCCAGTTGCCCTGACTTC
\ No newline at end of file