comparison bin/metadata_from_seqnames.py @ 0:d67268158946 draft

planemo upload commit a3f181f5f126803c654b3a66dd4e83a48f7e203b
author bcclaywell
date Mon, 12 Oct 2015 17:43:33 -0400
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:d67268158946
1 #!/usr/bin/env python
2 """Little script for parsing metadata out of sequence names given regular expressions. Supports parsing out
3 deme information and date information."""
4
5 import argparse
6 import csv
7 import re
8 from Bio import SeqIO
9
10
def _compile_regex(pattern):
    """Compile ``pattern`` for use as an argparse ``type=`` converter.

    Raises:
        argparse.ArgumentTypeError: if ``pattern`` is not a valid regular
            expression.
    """
    try:
        return re.compile(pattern)
    except re.error as err:
        # argparse only turns ArgumentTypeError/TypeError/ValueError into a
        # usage error; a bare re.error would escape as a traceback.
        raise argparse.ArgumentTypeError(
            "invalid regular expression %r: %s" % (pattern, err))


def get_args():
    """Parse the command line and return the resulting namespace.

    Returns:
        argparse.Namespace with:
            inseqs: the input sequence file argument, as given.
            deme_regex: compiled pattern (required, ``-d/--deme-regex``).
            time_regex: compiled pattern, or None when ``-t/--time-regex``
                is not given.
            output: file object opened for writing by ``argparse.FileType``.
    """
    parser = argparse.ArgumentParser(description=__doc__)
    parser.add_argument('inseqs', help="Input sequences in fasta format")
    parser.add_argument('-d', '--deme-regex', required=True, type=_compile_regex,
                        help="Regular expression with which to parse deme information")
    parser.add_argument('-t', '--time-regex', type=_compile_regex,
                        help="Regular expression with which to parse date information")
    parser.add_argument('output', type=argparse.FileType('w'),
                        help="Output CSV file")
    return parser.parse_args()
20
21
def _parse_field(regex, seqname, field):
    """Return the first capture group of ``regex`` matched against ``seqname``.

    Args:
        regex: compiled regular expression; applied with ``match``, so it is
            anchored at the start of ``seqname``.
        seqname: sequence name to parse.
        field: label used in the error message (e.g. 'deme' or 'date').

    Raises:
        ValueError: if ``regex`` does not match ``seqname`` or defines no
            capture group.
    """
    match = regex.match(seqname)
    groups = match.groups() if match is not None else ()
    if not groups:
        # Call-style raise is valid on both Python 2 and Python 3.
        raise ValueError(
            "There was a problem parsing %s information for sequence %s. Try again."
            % (field, seqname))
    return groups[0]


def main():
    """Write one CSV row per input sequence with its parsed metadata.

    Reads the records of ``args.inseqs`` as fasta, takes each record's ``id``
    as the sequence name, and writes the columns 'sequence' and 'deme', plus
    'date' when a time regex was supplied.

    Raises:
        ValueError: if the deme regex (or the time regex, when given) cannot
            be parsed out of a sequence name.
    """
    args = get_args()
    # The context manager closes the output even when a sequence name fails
    # to parse part-way through the input.
    with args.output as outfile:
        seqreader = SeqIO.parse(args.inseqs, 'fasta')

        header = ['sequence', 'deme']
        if args.time_regex:
            header.append('date')

        outwriter = csv.DictWriter(outfile, header)
        outwriter.writeheader()
        for seqrec in seqreader:
            seqname = seqrec.id
            rowdict = dict(sequence=seqname,
                           deme=_parse_field(args.deme_regex, seqname, 'deme'))
            if args.time_regex:
                rowdict['date'] = _parse_field(args.time_regex, seqname, 'date')
            outwriter.writerow(rowdict)
47
48
# Run the conversion only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
51
52