annotate extract_features.py @ 4:c243e17fb224 draft

Uploaded
author bgruening
date Fri, 07 Jun 2013 02:30:21 -0400
parents 702d9e042295
children bcfe8e0731f8
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
1
3c5116448979 Uploaded
bgruening
parents:
diff changeset
1 #!/usr/bin/env python
3c5116448979 Uploaded
bgruening
parents:
diff changeset
2
3c5116448979 Uploaded
bgruening
parents:
diff changeset
3 import os
3c5116448979 Uploaded
bgruening
parents:
diff changeset
4 import sys
3c5116448979 Uploaded
bgruening
parents:
diff changeset
5 import argparse
3c5116448979 Uploaded
bgruening
parents:
diff changeset
6 import textwrap
3c5116448979 Uploaded
bgruening
parents:
diff changeset
7
3c5116448979 Uploaded
bgruening
parents:
diff changeset
def _write_fasta(handle, name, sequence):
    """Write one FASTA record to *handle*, wrapping the sequence at 80 columns."""
    handle.write('>%s\n%s\n' % (name, '\n'.join(textwrap.wrap(sequence, 80))))


def main(args, stream=None):
    """
    Extract the protein and coding sequences from an augustus gff/gtf file.

    Augustus stores both sequences as comment lines, wrapped over several
    lines and enclosed in square brackets. Example input:

        # start gene g1
        HS04636 AUGUSTUS stop_codon 6901 6903 . + 0 Parent=g1.t1
        # coding sequence = [atggctagc...
        # ...ttgtaa]
        # protein sequence = [MLARALLLCAVLALSHTANPCCSHPCQNRGVCMSVGFDQYKCDCTRTGFYGENCS
        # THFKGFWNVVNNIPFLRNAIMSYVLTSRSHLIDSPPTYNADYGYKSWEAFSNLSYYTRALPPVPDD
        # PDPELIKTVTINASSSRSGLDDINPTVLLKERSTEL]
        # end gene g1

    Every completed sequence is written as a FASTA record whose header is
    the name taken from the most recent '# start gene' line.

    Parameters:
        args   -- object with the attributes ``protein`` and ``codingseq``;
                  each is the path of the FASTA file to write, or a false
                  value to skip that sequence type.
        stream -- iterable of input lines; defaults to ``sys.stdin``.
    """
    if stream is None:
        stream = sys.stdin

    # (comment prefix that opens a sequence, file handle receiving it)
    targets = []
    try:
        if args.protein:
            targets.append(('protein sequence = [', open(args.protein, 'w')))
        if args.codingseq:
            targets.append(('coding sequence = [', open(args.codingseq, 'w')))

        # Used when a sequence shows up before any 'start gene' line, so the
        # record is still written instead of failing on an unbound name.
        gene_name = 'unknown'
        current = None   # handle of the sequence being assembled, if any
        chunks = []      # pieces of that sequence collected so far

        for line in stream:
            # protein- and coding-sequence are stored as comments
            if not line.startswith('#'):
                continue
            # drop the leading '# ' and surrounding whitespace
            line = line[2:].strip()

            if line.startswith('start gene'):
                gene_name = line[11:].strip()
                continue

            # A start marker opens a new sequence; the text after the marker
            # is its first chunk and is handled exactly once below.
            for prefix, handle in targets:
                if line.startswith(prefix):
                    current = handle
                    chunks = []
                    line = line[len(prefix):]
                    break

            if current is None:
                # unrelated comment line
                continue

            if line.endswith(']'):
                # closing bracket: the sequence is complete (this also covers
                # sequences that start and end on the same line)
                chunks.append(line[:-1])
                _write_fasta(current, gene_name, ''.join(chunks))
                current = None
                chunks = []
            else:
                chunks.append(line)
    finally:
        # close whatever was opened, even if parsing or opening failed
        for _, handle in targets:
            handle.close()
3c5116448979 Uploaded
bgruening
parents:
diff changeset
83
3c5116448979 Uploaded
bgruening
parents:
diff changeset
if __name__ == '__main__':
    # Command-line entry point: collect the two optional output paths and
    # hand the parsed options to main().
    cli = argparse.ArgumentParser()
    cli.add_argument('-p', '--protein', help='Path to the protein file.')
    cli.add_argument('-c', '--codingseq', help='Path to the coding file.')

    main(cli.parse_args())
3c5116448979 Uploaded
bgruening
parents:
diff changeset
91