view utils/fasta_to_len.py @ 4:79cd53e23207 draft

"planemo upload for repository https://github.com/galaxyproject/tools-devteam/tree/master/tools/fasta_compute_length commit cd1ed08574b749eee2a3f6e6151dbb0c8ca15bbf"
author devteam
date Sun, 01 Mar 2020 12:22:12 +0000
parents 86f00ebe7be7
children a51da10f8caf
line wrap: on
line source

#!/usr/bin/env python
"""
Input: FASTA file, int (number of title characters to keep; 0 keeps the whole title)
Output: tabular file
Write one line per sequence: its title and the length of the sequence.
"""

import sys

# Import-time guard on the interpreter version.
# NOTE(review): compute_fasta_length below uses a `with` statement with two
# context managers, which needs Python 2.7+; the 2.4 floor looks stale - confirm.
assert sys.version_info[:2] >= (2, 4)


def compute_fasta_length(fasta_file, out_file, keep_first_char, keep_first_word=False):
    """Write a tab-separated table of FASTA titles and sequence lengths.

    Each line of ``fasta_file`` is stripped of surrounding whitespace; blank
    lines and lines starting with ``#`` are ignored.  A line starting with
    ``>`` begins a new record, and every other line adds its length to the
    current record.  One ``title<TAB>length`` line is written to ``out_file``
    per record, in input order.

    Sequence lines that appear before the first ``>`` header do not belong to
    any record and are not reported.  If the input holds no header at all
    (for example an empty file), ``out_file`` is created empty.

    :param fasta_file: path of the FASTA file to read.
    :param out_file: path of the tabular file to write (overwritten).
    :param keep_first_char: number of title characters to keep, as an int or
        anything ``int()`` accepts; ``0`` keeps the whole title.
    :param keep_first_word: if true, cut the title at its first whitespace
        before applying ``keep_first_char``.
    :raises ValueError: if ``keep_first_char`` cannot be converted to int.
    """
    keep_first_char = int(keep_first_char)

    # Titles are sliced as title[1:title_end] because index 0 is the '>'
    # marker, so keeping N characters means stopping at index N + 1.
    # None means "no limit".
    if keep_first_char == 0:
        title_end = None
    else:
        title_end = keep_first_char + 1

    # None until the first '>' header has been seen, so that header-less
    # input never produces a record.
    fasta_title = None
    seq_len = 0

    with open(fasta_file) as in_fh, open(out_file, 'w') as out_fh:
        for line in in_fh:
            line = line.strip()
            if not line or line.startswith('#'):
                continue
            if line.startswith('>'):
                # A new header closes the previous record, if there is one.
                if fasta_title is not None:
                    out_fh.write(_format_fasta_record(fasta_title, seq_len, title_end, keep_first_word))
                fasta_title = line
                seq_len = 0
            else:
                seq_len += len(line)

        # The last record has no following header to trigger its output.
        if fasta_title is not None:
            out_fh.write(_format_fasta_record(fasta_title, seq_len, title_end, keep_first_word))


def _format_fasta_record(title_line, seq_len, title_end, keep_first_word):
    """Return the ``title<TAB>length`` output line for one FASTA record."""
    if keep_first_word:
        # title_line always starts with '>', so split() yields at least one item.
        title_line = title_line.split()[0]
    return "%s\t%d\n" % (title_line[1:title_end], seq_len)


if __name__ == "__main__":
    # Usage: fasta_to_len.py <input fasta> <output tabular> <title chars to keep>
    # When run as a script, titles are always cut at their first word.
    cli_args = sys.argv
    compute_fasta_length(cli_args[1], cli_args[2], cli_args[3], keep_first_word=True)