view tabular_to_fasta.py @ 1:7f7a1bea4653 draft default tip

"planemo upload for repository https://github.com/galaxyproject/tools-devteam/tree/master/tools/tabular_to_fasta commit 01140c0ac1a926856c55853a0028e5d44935d9e6"
author devteam
date Wed, 05 Feb 2020 15:51:26 +0000
parents db61adc12770
children
line wrap: on
line source

#!/usr/bin/env python
"""
Input: fasta, minimal length, maximal length
Output: fasta
Return sequences whose lengths are within the range.
"""
import os
import sys


def stop_err(msg):
    """Abort the program, reporting ``msg`` as the exit status/message."""
    raise SystemExit(msg)


def _column_index(column):
    """Convert a 1-based column number to a 0-based list index.

    Raises ValueError when ``column`` is not an integer or is smaller than 1,
    so that a value such as "0" cannot turn into a negative index that would
    silently select a column counted from the end of the row.
    """
    index = int(column) - 1
    if index < 0:
        raise ValueError("column numbers start at 1: %r" % (column,))
    return index


def __main__():
    """Convert a tab-delimited file into FASTA.

    Command-line arguments (read from ``sys.argv``):
        1. path of the tabular input file
        2. comma-separated, 1-based title column number(s)
        3. 1-based sequence column number
        4. path of the FASTA output file

    For every non-blank line that does not start with "#", the title columns
    are joined with "_" to form the header (a leading ">" on the first title
    field is dropped so the header does not start with ">>") and the sequence
    column is written on the following line.

    Lines that have too few fields for the requested columns are skipped; a
    summary naming the first such line is printed to standard output.
    Missing or invalid column arguments terminate the program via
    ``stop_err``.
    """
    infile = sys.argv[1]
    title_col = sys.argv[2]
    seq_col = sys.argv[3]
    outfile = sys.argv[4]

    if title_col in (None, 'None') or seq_col in (None, 'None'):
        stop_err("Columns not specified.")

    # Validate the column arguments once, before any file is touched, rather
    # than re-parsing them for every input line.
    try:
        seq_index = _column_index(seq_col)
    except ValueError:
        stop_err("Invalid Sequence Column: %s." % str(seq_col))
    try:
        title_indexes = [_column_index(col) for col in title_col.split(',')]
    except ValueError:
        stop_err("Invalid Title Column: %s." % str(title_col))

    skipped_lines = 0
    first_invalid_line = 0
    invalid_line = ""

    # The output is opened first (as before) so it is created even when the
    # input cannot be read; both handles are closed on exit from the block.
    with open(outfile, 'w') as out, open(infile) as source:
        for line_number, line in enumerate(source, 1):
            line = line.rstrip('\r\n')
            if not line or line.startswith('#'):
                continue
            fields = line.split('\t')
            try:
                fasta_title = [fields[k] for k in title_indexes]
                fasta_seq = fields[seq_index]
            except IndexError:
                # The row has fewer fields than the requested columns.
                skipped_lines += 1
                if not invalid_line:
                    first_invalid_line = line_number
                    invalid_line = line
                continue
            if fasta_title[0].startswith(">"):
                fasta_title[0] = fasta_title[0][1:]
            print(">%s\n%s" % ("_".join(fasta_title), fasta_seq), file=out)

    if skipped_lines > 0:
        print('Data issue: skipped %d blank or invalid lines starting at #%d: "%s"' % (skipped_lines, first_invalid_line, invalid_line))


# Run the conversion only when this file is executed as a script, so that
# importing the module does not read sys.argv or touch any files.
if __name__ == "__main__":
    __main__()