Mercurial > repos > devteam > fasta_compute_length

--- a/fasta_compute_length.py	Wed Sep 11 09:41:36 2019 -0400
+++ b/fasta_compute_length.py	Sun Mar 01 12:22:12 2020 +0000
@@ -6,4 +6,5 @@
 import sys
 from utils.fasta_to_len import compute_fasta_length

-compute_fasta_length( sys.argv[1], sys.argv[2], sys.argv[3], sys.argv[4] == 'id_only' )
+
+compute_fasta_length(sys.argv[1], sys.argv[2], sys.argv[3], sys.argv[4] == 'id_only')
--- a/fasta_compute_length.xml	Wed Sep 11 09:41:36 2019 -0400
+++ b/fasta_compute_length.xml	Sun Mar 01 12:22:12 2020 +0000
@@ -1,11 +1,13 @@
-<?xml version="1.0"?>
-<tool id="fasta_compute_length" name="Compute sequence length" version="1.0.2">
+<tool id="fasta_compute_length" name="Compute sequence length" version="1.0.2" profile="16.04">
     <description></description>
+    <requirements>
+        <requirement type="package" version="3.7">python</requirement>
+    </requirements>
     <command>
     #if $ref.ref_source == 'dbkey':
         cp '${ref.index.fields.len_path}' '$output'
     #else:
-        python $__tool_directory__/fasta_compute_length.py
+        python '$__tool_directory__/fasta_compute_length.py'
           #if $ref.ref_source == 'history':
             '$input'
           #else:
@@ -85,7 +87,7 @@
             <output name="output" file="merged.tab" />
         </test>
     </tests>
-    <help>
+    <help><![CDATA[

 **What it does**

@@ -97,7 +99,7 @@

 Suppose you have the following FASTA formatted sequences from a Roche (454) FLX sequencing run::

-    &gt;EYKX4VC02EQLO5 length=108 xy=1826_0455 region=2 run=R_2007_11_07_16_15_57_
+    >EYKX4VC02EQLO5 length=108 xy=1826_0455 region=2 run=R_2007_11_07_16_15_57_
     TCCGCGCCGAGCATGCCCATCTTGGATTCCGGCGCGATGACCATCGCCCGCTCCACCACG
     TTCGGCCGGCCCTTCTCGTCGAGGAATGACACCAGCGCTTCGCCCACG
-    &gt;EYKX4VC02D4GS2 length=60 xy=1573_3972 region=2 run=R_2007_11_07_16_15_57_
+    >EYKX4VC02D4GS2 length=60 xy=1573_3972 region=2 run=R_2007_11_07_16_15_57_
@@ -110,10 +112,10 @@

 However, if your IDs are not all the same length, you may wish to just keep the fasta ID, and not the description::

-    &gt;EYKX4VC02EQLO5 length=108 xy=1826_0455 region=2 run=R_2007_11_07_16_15_57_
+    >EYKX4VC02EQLO5 length=108 xy=1826_0455 region=2 run=R_2007_11_07_16_15_57_
     TCCGCGCCGAGCATGCCCATCTTGGATTCCGGCGCGATGACCATCGCCCGCTCCACCACG
     TTCGGCCGGCCCTTCTCGTCGAGGAATGACACCAGCGCTTCGCCCACG
-    &gt;EYKX4VC length=60 xy=1573_3972 region=2 run=R_2007_11_07_16_15_57_
+    >EYKX4VC length=60 xy=1573_3972 region=2 run=R_2007_11_07_16_15_57_
     AATAAAACTAAATCAGCAAAGACTGGCAAATACTCACAGGCTTATACAATACAAATGTAAfa

 Running this tool with **Strip fasta description from header** set to **True** and **How many characters to keep?** set to **0** will produce::
@@ -122,7 +124,7 @@
     EYKX4VC     60


-    </help>
+    ]]></help>
     <citations>
         <citation type="doi">10.1093/bioinformatics/btq281</citation>
     </citations>
--- a/utils/fasta_to_len.py	Wed Sep 11 09:41:36 2019 -0400
+++ b/utils/fasta_to_len.py	Sun Mar 01 12:22:12 2020 +0000
@@ -5,16 +5,13 @@
 Return titles with lengths of corresponding seq
 """

-import sys, os
+import sys

-assert sys.version_info[:2] >= ( 2, 4 )
+assert sys.version_info[:2] >= (2, 4)

-def compute_fasta_length( fasta_file, out_file, keep_first_char, keep_first_word=False ):

-    infile = fasta_file
-    out = open( out_file, 'w')
-    keep_first_char = int( keep_first_char )
-
+def compute_fasta_length(fasta_file, out_file, keep_first_char, keep_first_word=False):
+    keep_first_char = int(keep_first_char)
     fasta_title = ''
     seq_len = 0

@@ -25,28 +22,28 @@
         keep_first_char += 1

     first_entry = True
-
-    for line in open( infile ):
-        line = line.strip()
-        if not line or line.startswith( '#' ):
-            continue
-        if line[0] == '>':
-            if first_entry == False:
-                if keep_first_word:
-                    fasta_title = fasta_title.split()[0]
-                out.write( "%s\t%d\n" % ( fasta_title[ 1:keep_first_char ], seq_len ) )
+    with open(fasta_file) as in_fh, open(out_file, 'w') as out_fh:
+        for line in in_fh:
+            line = line.strip()
+            if not line or line.startswith('#'):
+                continue
+            if line[0] == '>':
+                if first_entry is False:
+                    if keep_first_word:
+                        fasta_title = fasta_title.split()[0]
+                    out_fh.write("%s\t%d\n" % (fasta_title[1:keep_first_char], seq_len))
+                else:
+                    first_entry = False
+                fasta_title = line
+                seq_len = 0
             else:
-                first_entry = False
-            fasta_title = line
-            seq_len = 0
-        else:
-            seq_len += len(line)
+                seq_len += len(line)

-    # last fasta-entry
-    if keep_first_word:
-        fasta_title = fasta_title.split()[0]
-    out.write( "%s\t%d\n" % ( fasta_title[ 1:keep_first_char ], seq_len ) )
-    out.close()
+        # last fasta-entry
+        if keep_first_word:
+            fasta_title = fasta_title.split()[0]
+        out_fh.write("%s\t%d\n" % (fasta_title[1:keep_first_char], seq_len))

-if __name__ == "__main__" :
-    compute_fasta_length( sys.argv[1], sys.argv[2], sys.argv[3], True )
\ No newline at end of file
+
+if __name__ == "__main__":
+    compute_fasta_length(sys.argv[1], sys.argv[2], sys.argv[3], True)