view bin/metadata_from_seqnames.py @ 2:7eaf6f9abd28 draft default tip

planemo upload commit a3f181f5f126803c654b3a66dd4e83a48f7e203b-dirty
author bcclaywell
date Mon, 12 Oct 2015 17:57:38 -0400
parents d67268158946
children
line wrap: on
line source

#!/usr/bin/env python
"""Little script for parsing metadata out of sequence names given regular expressions. Supports parsing out
deme information and date information."""

import argparse
import csv
import re
from Bio import SeqIO


def get_args():
    """Define the command-line interface and parse the process arguments.

    Positional arguments are ``inseqs`` (the input FASTA file) and
    ``output`` (opened for writing by ``argparse.FileType``). The
    ``--deme-regex`` option is mandatory and ``--time-regex`` is optional;
    both are turned into compiled patterns by ``re.compile`` during parsing.

    Returns:
        The ``argparse.Namespace`` produced by ``parse_args()``.
    """
    cli = argparse.ArgumentParser(description=__doc__)

    cli.add_argument('inseqs', help="Input sequences in fasta format")
    cli.add_argument(
        '-d', '--deme-regex',
        type=re.compile,
        required=True,
        help="Regular expression with which to parse deme information",
    )
    cli.add_argument(
        '-t', '--time-regex',
        type=re.compile,
        help="Regular expression with which to parse date information",
    )
    cli.add_argument('output', type=argparse.FileType('w'))

    namespace = cli.parse_args()
    return namespace


def _extract_first_group(regex, seqname, label):
    """Return the first capture group of ``regex`` applied to ``seqname``.

    Args:
        regex: Compiled regular expression; it is applied with ``match``,
            so it must match from the start of ``seqname``.
        seqname: Sequence name to parse.
        label: Kind of metadata being parsed ('deme' or 'date'); only used
            in the error message.

    Raises:
        Exception: If the pattern does not match or has no capture group.
    """
    try:
        return regex.match(seqname).groups()[0]
    except (AttributeError, IndexError):
        # AttributeError: match() returned None (no match).
        # IndexError: the pattern matched but defines no capture group.
        raise Exception(
            "There was a problem parsing %s information for sequence %s. Try again."
            % (label, seqname))


def main():
    """Write a CSV of metadata parsed from the ids of the input FASTA records.

    The output has a ``sequence`` and a ``deme`` column, plus a ``date``
    column when ``--time-regex`` was given. Each value is the first capture
    group of the corresponding regular expression matched against the
    record id.

    Raises:
        Exception: If a regular expression fails to yield a first capture
            group for some sequence id.
    """
    args = get_args()

    header = ['sequence', 'deme']
    if args.time_regex:
        header.append('date')

    # The output handle was opened by argparse; make sure it is closed even
    # when a sequence name cannot be parsed.
    try:
        outwriter = csv.DictWriter(args.output, header)
        outwriter.writeheader()
        for seqrec in SeqIO.parse(args.inseqs, 'fasta'):
            seqname = seqrec.id
            rowdict = dict(
                sequence=seqname,
                deme=_extract_first_group(args.deme_regex, seqname, 'deme'))
            if args.time_regex:
                rowdict['date'] = _extract_first_group(
                    args.time_regex, seqname, 'date')
            outwriter.writerow(rowdict)
    finally:
        args.output.close()


# Run the conversion only when executed as a script, not when imported.
if __name__ == '__main__':
    main()