Mercurial > repos > bcclaywell > argo_navis
annotate bin/metadata_from_seqnames.py @ 0:d67268158946 draft
planemo upload commit a3f181f5f126803c654b3a66dd4e83a48f7e203b
author | bcclaywell |
---|---|
date | Mon, 12 Oct 2015 17:43:33 -0400 |
parents | |
children |
rev | line source |
---|---|
0
d67268158946
planemo upload commit a3f181f5f126803c654b3a66dd4e83a48f7e203b
bcclaywell
parents:
diff
changeset
|
1 #!/usr/bin/env python |
d67268158946
planemo upload commit a3f181f5f126803c654b3a66dd4e83a48f7e203b
bcclaywell
parents:
diff
changeset
|
2 """Little script for parsing metadata out of sequence names given regular expressions. Supports parsing out |
d67268158946
planemo upload commit a3f181f5f126803c654b3a66dd4e83a48f7e203b
bcclaywell
parents:
diff
changeset
|
3 deme information and date information.""" |
d67268158946
planemo upload commit a3f181f5f126803c654b3a66dd4e83a48f7e203b
bcclaywell
parents:
diff
changeset
|
4 |
d67268158946
planemo upload commit a3f181f5f126803c654b3a66dd4e83a48f7e203b
bcclaywell
parents:
diff
changeset
|
5 import argparse |
d67268158946
planemo upload commit a3f181f5f126803c654b3a66dd4e83a48f7e203b
bcclaywell
parents:
diff
changeset
|
6 import csv |
d67268158946
planemo upload commit a3f181f5f126803c654b3a66dd4e83a48f7e203b
bcclaywell
parents:
diff
changeset
|
7 import re |
d67268158946
planemo upload commit a3f181f5f126803c654b3a66dd4e83a48f7e203b
bcclaywell
parents:
diff
changeset
|
8 from Bio import SeqIO |
d67268158946
planemo upload commit a3f181f5f126803c654b3a66dd4e83a48f7e203b
bcclaywell
parents:
diff
changeset
|
9 |
d67268158946
planemo upload commit a3f181f5f126803c654b3a66dd4e83a48f7e203b
bcclaywell
parents:
diff
changeset
|
10 |
d67268158946
planemo upload commit a3f181f5f126803c654b3a66dd4e83a48f7e203b
bcclaywell
parents:
diff
changeset
|
def _compile_regex(pattern):
    """Compile ``pattern`` for use as an argparse ``type=`` converter.

    argparse only turns ``ArgumentTypeError``, ``TypeError`` and ``ValueError``
    into a usage message; ``re.error`` is none of those, so a malformed
    pattern passed straight to ``re.compile`` would surface as a traceback.

    Raises:
        argparse.ArgumentTypeError: if ``pattern`` is not a valid regex.
    """
    try:
        return re.compile(pattern)
    except re.error as err:
        raise argparse.ArgumentTypeError(
            "invalid regular expression %r: %s" % (pattern, err))


def get_args(argv=None):
    """Parse the command line options of this script.

    Args:
        argv: optional list of argument strings; when ``None`` (the default)
            argparse reads ``sys.argv[1:]``.

    Returns:
        The ``argparse.Namespace`` with ``inseqs`` (path string),
        ``deme_regex`` (compiled pattern), ``time_regex`` (compiled pattern
        or ``None`` when not given) and ``output`` (file opened for writing).
    """
    parser = argparse.ArgumentParser(description=__doc__)
    parser.add_argument('inseqs', help="Input sequences in fasta format")
    parser.add_argument('-d', '--deme-regex', required=True, type=_compile_regex,
                        help="Regular expression with which to parse deme information")
    parser.add_argument('-t', '--time-regex', type=_compile_regex,
                        help="Regular expression with which to parse date information")
    parser.add_argument('output', type=argparse.FileType('w'),
                        help="Output CSV file for the parsed metadata ('-' for stdout)")
    return parser.parse_args(argv)
d67268158946
planemo upload commit a3f181f5f126803c654b3a66dd4e83a48f7e203b
bcclaywell
parents:
diff
changeset
|
20 |
d67268158946
planemo upload commit a3f181f5f126803c654b3a66dd4e83a48f7e203b
bcclaywell
parents:
diff
changeset
|
21 |
d67268158946
planemo upload commit a3f181f5f126803c654b3a66dd4e83a48f7e203b
bcclaywell
parents:
diff
changeset
|
def _extract_first_group(regex, seqname, field):
    """Return the first capture group of ``regex`` matched against ``seqname``.

    ``regex.match`` is used, so the pattern must match from the start of the
    sequence name.

    Raises:
        ValueError: if the pattern does not match (``match`` returns ``None``)
            or the pattern defines no capture group.
    """
    try:
        return regex.match(seqname).groups()[0]
    except (AttributeError, IndexError):
        # Call form of raise: the Python 2 "raise Exception, msg" statement
        # is a syntax error on Python 3.
        raise ValueError(
            "There was a problem parsing %s information for sequence %s. Try again."
            % (field, seqname))


def main():
    """Write one CSV row of parsed metadata per input fasta sequence.

    The output has the columns ``sequence`` and ``deme``, plus ``date`` when
    a time regex was supplied on the command line.

    Raises:
        ValueError: if the deme (or date) regex cannot be applied to a
            sequence name.
    """
    args = get_args()

    header = ['sequence', 'deme']
    if args.time_regex:
        header.append('date')

    # Close the output even when a sequence name fails to parse, so the
    # handle opened by argparse is never leaked.
    try:
        seqreader = SeqIO.parse(args.inseqs, 'fasta')
        outwriter = csv.DictWriter(args.output, header)
        outwriter.writeheader()
        for seqrec in seqreader:
            seqname = seqrec.id
            rowdict = dict(
                sequence=seqname,
                deme=_extract_first_group(args.deme_regex, seqname, 'deme'))
            if args.time_regex:
                rowdict['date'] = _extract_first_group(args.time_regex, seqname, 'date')
            outwriter.writerow(rowdict)
    finally:
        args.output.close()
d67268158946
planemo upload commit a3f181f5f126803c654b3a66dd4e83a48f7e203b
bcclaywell
parents:
diff
changeset
|
47 |
d67268158946
planemo upload commit a3f181f5f126803c654b3a66dd4e83a48f7e203b
bcclaywell
parents:
diff
changeset
|
48 |
d67268158946
planemo upload commit a3f181f5f126803c654b3a66dd4e83a48f7e203b
bcclaywell
parents:
diff
changeset
|
# Run the command line tool only when executed as a script, not on import.
if __name__ == "__main__":
    main()
d67268158946
planemo upload commit a3f181f5f126803c654b3a66dd4e83a48f7e203b
bcclaywell
parents:
diff
changeset
|
51 |
d67268158946
planemo upload commit a3f181f5f126803c654b3a66dd4e83a48f7e203b
bcclaywell
parents:
diff
changeset
|
52 |