Mercurial > repos > devteam > fasta_filter_by_length
changeset 0:b029a652edcd draft
Imported from capsule None
author | devteam |
---|---|
date | Mon, 19 May 2014 10:59:27 -0400 |
parents | |
children | 4da47b319c82 |
files | fasta_filter_by_length.py fasta_filter_by_length.xml test-data/4.fasta test-data/454.fasta test-data/fasta_tool_filter_length_1.out |
diffstat | 5 files changed, 219 insertions(+), 0 deletions(-) [+] |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/fasta_filter_by_length.py Mon May 19 10:59:27 2014 -0400 @@ -0,0 +1,52 @@ +#!/usr/bin/env python +""" +Input: fasta, minimal length, maximal length +Output: fasta +Return sequences whose lengths are within the range. +""" + +import sys, os + +assert sys.version_info[:2] >= ( 2, 4 ) + +def stop_err( msg ): + sys.stderr.write( msg ) + sys.exit() + +def __main__(): + input_filename = sys.argv[1] + try: + min_length = int( sys.argv[2] ) + except: + stop_err( "Minimal length of the return sequence requires a numerical value." ) + try: + max_length = int( sys.argv[3] ) + except: + stop_err( "Maximum length of the return sequence requires a numerical value." ) + output_filename = sys.argv[4] + output_handle = open( output_filename, 'w' ) + tmp_size = 0 #-1 + tmp_buf = '' + at_least_one = 0 + for line in file(input_filename): + if not line or line.startswith('#'): + continue + if line[0] == '>': + if min_length <= tmp_size <= max_length or (min_length <= tmp_size and max_length == 0): + output_handle.write(tmp_buf) + at_least_one = 1 + tmp_buf = line + tmp_size = 0 + else: + if max_length == 0 or tmp_size < max_length: + tmp_size += len(line.rstrip('\r\n')) + tmp_buf += line + # final flush of buffer + if min_length <= tmp_size <= max_length or (min_length <= tmp_size and max_length == 0): + output_handle.write(tmp_buf.rstrip('\r\n')) + at_least_one = 1 + output_handle.close() + if at_least_one == 0: + print "There is no sequence that falls within your range." + +if __name__ == "__main__" : __main__()
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/fasta_filter_by_length.xml Mon May 19 10:59:27 2014 -0400 @@ -0,0 +1,56 @@ +<tool id="fasta_filter_by_length" name="Filter sequences by length" version="1.1"> + <description></description> + <command interpreter="python">fasta_filter_by_length.py $input $min_length $max_length $output </command> + <inputs> + <param name="input" type="data" format="fasta" label="Fasta file"/> + <param name="min_length" type="integer" size="15" value="0" label="Minimal length" /> + <param name="max_length" type="integer" size="15" value="0" label="Maximum length" help="Setting to '0' will return all sequences longer than the 'Minimal length'"/> + </inputs> + <outputs> + <data name="output" format="fasta"/> + </outputs> + <tests> + <test> + <param name="input" value="454.fasta" /> + <param name="min_length" value="10" /> + <param name="max_length" value="0" /> + <output name="output" file="fasta_tool_filter_length_1.out" /> + </test> + </tests> + <help> + +.. class:: infomark + +**TIP**. To return sequences longer than a certain length, set *Minimal length* to desired value and leave *Maximum length* set to '0'. + +----- + +**What it does** + +Outputs sequences between *Minimal length* and *Maximum length*. + +----- + +**Example** + +Suppose you have the following FASTA formatted sequences:: + + >seq1 + TCATTTAATGAC + >seq2 + ATGGC + >seq3 + TCACATGATGCCG + >seq4 + ATGGAAGC + +Setting the **Minimal length** to **10**, and the **Maximum length** to **0** will return all sequences longer than 10 bp:: + + >seq1 + TCATTTAATGAC + >seq3 + TCACATGATGCCG + + + </help> +</tool> \ No newline at end of file
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/4.fasta Mon May 19 10:59:27 2014 -0400 @@ -0,0 +1,7 @@ +>EYKX4VC01B65GS length=54 xy=0784_1754 region=1 run=R_2007_11_07_16_15_57_ +CCGGTATCCGGGTGCCGTGATGAGCGCCACCGGAACGAATTCGACTATGCCGAA +>EYKX4VC01BNCSP length=187 xy=0558_3831 region=1 run=R_2007_11_07_16_15_57_ +CTTACCGGTCACCACCGTGCCTTCAGGATTGATCGCCAGATCGGTCGGTGCGTCAGGCGG +GGTGACATCGCCCACCACGGTACTCACTGGCTGGCTCTGGTTCCCGGCGGCATCGGAGGC +CACCACGTTGAGGGTATTCCCCTCGGTTTGTGGCTCGGTGAGAACCACGTTGTAGTCGCC +ATTGGTC
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/454.fasta Mon May 19 10:59:27 2014 -0400 @@ -0,0 +1,52 @@ +>EYKX4VC01B65GS length=54 xy=0784_1754 region=1 run=R_2007_11_07_16_15_57_ +CCGGTATCCGGGTGCCGTGATGAGCGCCACCGGAACGAATTCGACTATGCCGAA +>EYKX4VC01BNCSP length=187 xy=0558_3831 region=1 run=R_2007_11_07_16_15_57_ +CTTACCGGTCACCACCGTGCCTTCAGGATTGATCGCCAGATCGGTCGGTGCGTCAGGCGG +GGTGACATCGCCCACCACGGTACTCACTGGCTGGCTCTGGTTCCCGGCGGCATCGGAGGC +CACCACGTTGAGGGTATTCCCCTCGGTTTGTGGCTCGGTGAGAACCACGTTGTAGTCGCC +ATTGGTC +>EYKX4VC01CD9FT length=115 xy=0865_1719 region=1 run=R_2007_11_07_16_15_57_ +GGGGGCTTTGGCCTGTCGTCCGGCACCTCGCAAGAGCTACAGCAGGCGCGGCTGGCGATC +ATCGGCGGCACGCCGGCCTATATGTCGCCGGAACACACCACCCGCACCCAACGCG +>EYKX4VC01B8FW0 length=95 xy=0799_0514 region=1 run=R_2007_11_07_16_15_57_ +TAAATTTCAAGGAATGCAAATCAGGGTCGTGTGTTTAGACTTCGGCTTTAGAGACCTGAA +TACGTCAAAAACATAACTTCATGATATCTTGCAGT +>EYKX4VC01BCGYW length=115 xy=0434_3926 region=1 run=R_2007_11_07_16_15_57_ +GGCCAGCCGGGACAGCGTTGTTGGGCTGCATGGCGACGAGCTAAAAGTCGCCATCACCGC +CCCGCCGGTTGATGGGCAGGCTAATGCCCATCTGGTAAAAACTTTCTCGCCAAAC +>EYKX4VC01AZXC6 length=116 xy=0292_0280 region=1 run=R_2007_11_07_16_15_57_ +GGGGGCGTTTGGCCTGTCGTCCGGCACCTCGCAAGAGCTACAGCAGGCGCGGCTGGCGAT +CATCGGCGGCACGCCGGCCTATATGTCGCCGGAACACACCACCCGCACCCAACGCG +>EYKX4VC01CATH5 length=82 xy=0826_0843 region=1 run=R_2007_11_07_16_15_57_ +CGAAATTGCACATTCTCGGCCATATCTCTGGACCTACATGACCGATTTGATCATCTTCGA +ACTTAGCCTTCCTTTNTTAACG +>EYKX4VC01BCEIV length=47 xy=0434_0757 region=1 run=R_2007_11_07_16_15_57_ +TGACGTCGTGCCGAGCTACGACAATGCCGACATGGTGATCGTTAACA +>EYKX4VC01BWERM length=83 xy=0662_0304 region=1 run=R_2007_11_07_16_15_57_ +CGGTCGGCCTCACCATGGAGAAGATCCCGCCCCGGCCGAGGTCATGGTGGATCTCGGCCA +GGGCGTGCTGATGAAGTTCAAAT +>EYKX4VC01BT2O7 length=69 xy=0635_1945 region=1 run=R_2007_11_07_16_15_57_ +AGCGTTTCTCCAGCCGGTCGGCTACGCCGTTTGCCCCTGAAAGACGCTGTTCAGACCGAA +CGCGGTAAA +>EYKX4VC01BO0UO length=222 xy=0577_3838 region=1 run=R_2007_11_07_16_15_57_ +AGACCTGGGACAGCGGCGGGCTGCTGAAGCCGCAGGCGATAGAGGACAAACTGCAGTACC +GCTTCTGGCTGCACTATGCCGAAGGCTCGCTGATGCCGCTGCTGTTAATGAAGCTGGTGT +TCGCCAGCCTGGGTAAACCCCCTGTGCCCTTTGGCGTCCGCTCGCTGGGCGCCCTGCTGG +GCAAGGGCATTCAGAAAGCGTGGCTGGATCCCCAGCTGGCCA +>EYKX4VC01CBCPK length=83 xy=0832_1158 region=1 run=R_2007_11_07_16_15_57_ +CGGTCGGCCTCACCATGGAGAAGATCCCGCCCCGGCCGAGGTCATGGTGGATCTCGGCCA +GGGCGTGCTGATGAAGTTCAAAT +>EYKX4VC01B474S length=54 xy=0762_2010 region=1 run=R_2007_11_07_16_15_57_ +AGCAGTTTTCCAGCGCTTTCGAAGAGCGCTGGCGCGCGCGGGCTTCCAGCATAT +>EYKX4VC01BB4QL length=57 xy=0431_0363 region=1 run=R_2007_11_07_16_15_57_ +GGGGAGGAGCTAATAATATGCTCTTGGGGAGGAGCTAATTATATGCTCTTGGGGAGG +>EYKX4VC01BJ37M length=64 xy=0522_0192 region=1 run=R_2007_11_07_16_15_57_ +TCGAGTATGTATCAAGGACTACATACAAATTTGCCAAAAGAGATTATGCACTATCCCGAC +TTCC +>EYKX4VC01BV9R8 length=54 xy=0660_2038 region=1 run=R_2007_11_07_16_15_57_ +AAAACTCGGAGAAACTATTCAGCAGCACTGCGTTTCGCTGAATTTTAGACCGTT +>EYKX4VC01CEPP8 length=60 xy=0870_2350 region=1 run=R_2007_11_07_16_15_57_ +CTGGGTGGGTGCACTACAGGAACGTCATTTGTTCAATCCTCACGTTGTTGTTAGTGTCAG +>EYKX4VC01BTLME length=78 xy=0630_0292 region=1 run=R_2007_11_07_16_15_57_ +TTATCCACACGCTGTCCGGATCCAGCGCCAGGCGCCGACGCTGGACTTCCGCCGCCTGCG +CCCAGTTGCCCTGACTTC
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/fasta_tool_filter_length_1.out Mon May 19 10:59:27 2014 -0400 @@ -0,0 +1,52 @@ +>EYKX4VC01B65GS length=54 xy=0784_1754 region=1 run=R_2007_11_07_16_15_57_ +CCGGTATCCGGGTGCCGTGATGAGCGCCACCGGAACGAATTCGACTATGCCGAA +>EYKX4VC01BNCSP length=187 xy=0558_3831 region=1 run=R_2007_11_07_16_15_57_ +CTTACCGGTCACCACCGTGCCTTCAGGATTGATCGCCAGATCGGTCGGTGCGTCAGGCGG +GGTGACATCGCCCACCACGGTACTCACTGGCTGGCTCTGGTTCCCGGCGGCATCGGAGGC +CACCACGTTGAGGGTATTCCCCTCGGTTTGTGGCTCGGTGAGAACCACGTTGTAGTCGCC +ATTGGTC +>EYKX4VC01CD9FT length=115 xy=0865_1719 region=1 run=R_2007_11_07_16_15_57_ +GGGGGCTTTGGCCTGTCGTCCGGCACCTCGCAAGAGCTACAGCAGGCGCGGCTGGCGATC +ATCGGCGGCACGCCGGCCTATATGTCGCCGGAACACACCACCCGCACCCAACGCG +>EYKX4VC01B8FW0 length=95 xy=0799_0514 region=1 run=R_2007_11_07_16_15_57_ +TAAATTTCAAGGAATGCAAATCAGGGTCGTGTGTTTAGACTTCGGCTTTAGAGACCTGAA +TACGTCAAAAACATAACTTCATGATATCTTGCAGT +>EYKX4VC01BCGYW length=115 xy=0434_3926 region=1 run=R_2007_11_07_16_15_57_ +GGCCAGCCGGGACAGCGTTGTTGGGCTGCATGGCGACGAGCTAAAAGTCGCCATCACCGC +CCCGCCGGTTGATGGGCAGGCTAATGCCCATCTGGTAAAAACTTTCTCGCCAAAC +>EYKX4VC01AZXC6 length=116 xy=0292_0280 region=1 run=R_2007_11_07_16_15_57_ +GGGGGCGTTTGGCCTGTCGTCCGGCACCTCGCAAGAGCTACAGCAGGCGCGGCTGGCGAT +CATCGGCGGCACGCCGGCCTATATGTCGCCGGAACACACCACCCGCACCCAACGCG +>EYKX4VC01CATH5 length=82 xy=0826_0843 region=1 run=R_2007_11_07_16_15_57_ +CGAAATTGCACATTCTCGGCCATATCTCTGGACCTACATGACCGATTTGATCATCTTCGA +ACTTAGCCTTCCTTTNTTAACG +>EYKX4VC01BCEIV length=47 xy=0434_0757 region=1 run=R_2007_11_07_16_15_57_ +TGACGTCGTGCCGAGCTACGACAATGCCGACATGGTGATCGTTAACA +>EYKX4VC01BWERM length=83 xy=0662_0304 region=1 run=R_2007_11_07_16_15_57_ +CGGTCGGCCTCACCATGGAGAAGATCCCGCCCCGGCCGAGGTCATGGTGGATCTCGGCCA +GGGCGTGCTGATGAAGTTCAAAT +>EYKX4VC01BT2O7 length=69 xy=0635_1945 region=1 run=R_2007_11_07_16_15_57_ +AGCGTTTCTCCAGCCGGTCGGCTACGCCGTTTGCCCCTGAAAGACGCTGTTCAGACCGAA +CGCGGTAAA +>EYKX4VC01BO0UO length=222 xy=0577_3838 region=1 run=R_2007_11_07_16_15_57_ +AGACCTGGGACAGCGGCGGGCTGCTGAAGCCGCAGGCGATAGAGGACAAACTGCAGTACC +GCTTCTGGCTGCACTATGCCGAAGGCTCGCTGATGCCGCTGCTGTTAATGAAGCTGGTGT +TCGCCAGCCTGGGTAAACCCCCTGTGCCCTTTGGCGTCCGCTCGCTGGGCGCCCTGCTGG +GCAAGGGCATTCAGAAAGCGTGGCTGGATCCCCAGCTGGCCA +>EYKX4VC01CBCPK length=83 xy=0832_1158 region=1 run=R_2007_11_07_16_15_57_ +CGGTCGGCCTCACCATGGAGAAGATCCCGCCCCGGCCGAGGTCATGGTGGATCTCGGCCA +GGGCGTGCTGATGAAGTTCAAAT +>EYKX4VC01B474S length=54 xy=0762_2010 region=1 run=R_2007_11_07_16_15_57_ +AGCAGTTTTCCAGCGCTTTCGAAGAGCGCTGGCGCGCGCGGGCTTCCAGCATAT +>EYKX4VC01BB4QL length=57 xy=0431_0363 region=1 run=R_2007_11_07_16_15_57_ +GGGGAGGAGCTAATAATATGCTCTTGGGGAGGAGCTAATTATATGCTCTTGGGGAGG +>EYKX4VC01BJ37M length=64 xy=0522_0192 region=1 run=R_2007_11_07_16_15_57_ +TCGAGTATGTATCAAGGACTACATACAAATTTGCCAAAAGAGATTATGCACTATCCCGAC +TTCC +>EYKX4VC01BV9R8 length=54 xy=0660_2038 region=1 run=R_2007_11_07_16_15_57_ +AAAACTCGGAGAAACTATTCAGCAGCACTGCGTTTCGCTGAATTTTAGACCGTT +>EYKX4VC01CEPP8 length=60 xy=0870_2350 region=1 run=R_2007_11_07_16_15_57_ +CTGGGTGGGTGCACTACAGGAACGTCATTTGTTCAATCCTCACGTTGTTGTTAGTGTCAG +>EYKX4VC01BTLME length=78 xy=0630_0292 region=1 run=R_2007_11_07_16_15_57_ +TTATCCACACGCTGTCCGGATCCAGCGCCAGGCGCCGACGCTGGACTTCCGCCGCCTGCG +CCCAGTTGCCCTGACTTC \ No newline at end of file