Mercurial > repos > bgruening > augustus_training
comparison extract_features.py @ 4:da01a05d91c5 draft
"planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/augustus commit bba7f5df059fcbeb06e89cf689e9a04d4f22cb76"
field | value |
---|---|
author | iuc |
date | Thu, 15 Jul 2021 17:15:16 +0000 |
parents | 101933e63fa8 |
children | |
comparison
legend: equal, deleted, inserted, replaced
3:7d3fa213c3d3 | 4:da01a05d91c5 |
---|---|
3 import argparse | 3 import argparse |
4 import sys | 4 import sys |
5 import textwrap | 5 import textwrap |
6 | 6 |
7 | 7 |
8 def main( args ): | 8 def main(args): |
9 """ | 9 """ |
10 Extract the protein and coding section from an augustus gff, gtf file | 10 Extract the protein and coding section from an augustus gff, gtf file |
11 Example file: | 11 Example file: |
12 HS04636 AUGUSTUS stop_codon 6901 6903 . + 0 Parent=g1.t1 | 12 HS04636 AUGUSTUS stop_codon 6901 6903 . + 0 Parent=g1.t1 |
13 HS04636 AUGUSTUS transcription_end_site 8857 8857 . + . Parent=g1.t1 | 13 HS04636 AUGUSTUS transcription_end_site 8857 8857 . + . Parent=g1.t1 |
14 # protein sequence = [MLARALLLCAVLALSHTANPCCSHPCQNRGVCMSVGFDQYKCDCTRTGFYGENCSTPEFLTRIKLFLKPTPNTVHYIL | 14 # protein sequence = [MLARALLLCAVLALSHTANPCCSHPCQNRGVCMSVGFDQYKCDCTRTGFYGENCSTPEFLTRIKLFLKPTPNTVHYIL |
15 # THFKGFWNVVNNIPFLRNAIMSYVLTSRSHLIDSPPTYNADYGYKSWEAFSNLSYYTRALPPVPDDCPTPLGVKGKKQLPDSNEIVEKLLLRRKFIPD | 15 # THFKGFWNVVNNIPFLRNAIMSYVLTSRSHLIDSPPTYNADYGYKSWEAFSNLSYYTRALPPVPDDCPTPLGVKGKKQLPDSNEIVEKLLLRRKFIPD |
16 # PQGSNMMFAFFAQHFTHQFFKTDHKRGPAFTNGLGHGVDLNHIYGETLARQRKLRLFKDGKMKYQIIDGEMYPPTVKDTQAEMIYPPQVPEHLRFAVG | 16 # PQGSNMMFAFFAQHFTHQFFKTDHKRGPAFTNGLGHGVDLNHIYGETLARQRKLRLFKDGKMKYQIIDGEMYPPTVKDTQAEMIYPPQVPEHLRFAVG |
17 # QEVFGLVPGLMMYATIWLREHNRVCDVLKQEHPEWGDEQLFQTSRLILIGETIKIVIEDYVQHLSGYHFKLKFDPELLFNKQFQYQNRIAAEFNTLYH | 17 # QEVFGLVPGLMMYATIWLREHNRVCDVLKQEHPEWGDEQLFQTSRLILIGETIKIVIEDYVQHLSGYHFKLKFDPELLFNKQFQYQNRIAAEFNTLYH |
18 # WHPLLPDTFQIHDQKYNYQQFIYNNSILLEHGITQFVESFTRQIAGRVAGGRNVPPAVQKVSQASIDQSRQMKYQSFNEYRKRFMLKPYESFEELTGE | 18 # WHPLLPDTFQIHDQKYNYQQFIYNNSILLEHGITQFVESFTRQIAGRVAGGRNVPPAVQKVSQASIDQSRQMKYQSFNEYRKRFMLKPYESFEELTGE |
19 # KEMSAELEALYGDIDAVELYPALLVEKPRPDAIFGETMVEVGAPFSLKGLMGNVICSPAYWKPSTFGGEVGFQIINTASIQSLICNNVKGCPFTSFSV | 19 # KEMSAELEALYGDIDAVELYPALLVEKPRPDAIFGETMVEVGAPFSLKGLMGNVICSPAYWKPSTFGGEVGFQIINTASIQSLICNNVKGCPFTSFSV |
20 # PDPELIKTVTINASSSRSGLDDINPTVLLKERSTEL] | 20 # PDPELIKTVTINASSSRSGLDDINPTVLLKERSTEL] |
21 # end gene g1 | 21 # end gene g1 |
22 ### | 22 ### |
23 # | 23 # |
24 # ----- prediction on sequence number 2 (length = 2344, name = HS08198) ----- | 24 # ----- prediction on sequence number 2 (length = 2344, name = HS08198) ----- |
25 # | 25 # |
26 # Predicted genes for sequence number 2 on both strands | 26 # Predicted genes for sequence number 2 on both strands |
27 # start gene g2 | 27 # start gene g2 |
28 HS08198 AUGUSTUS gene 86 2344 1 + . ID=g2 | 28 HS08198 AUGUSTUS gene 86 2344 1 + . ID=g2 |
29 HS08198 AUGUSTUS transcript 86 2344 . + . ID=g2.t1;Parent=g2 | 29 HS08198 AUGUSTUS transcript 86 2344 . + . ID=g2.t1;Parent=g2 |
30 HS08198 AUGUSTUS transcription_start_site 86 86 . + . Parent=g2.t1 | 30 HS08198 AUGUSTUS transcription_start_site 86 86 . + . Parent=g2.t1 |
31 HS08198 AUGUSTUS exon 86 582 . + . Parent=g2.t1 | 31 HS08198 AUGUSTUS exon 86 582 . + . Parent=g2.t1 |
32 HS08198 AUGUSTUS start_codon 445 447 . + 0 Parent=g2.t1 | 32 HS08198 AUGUSTUS start_codon 445 447 . + 0 Parent=g2.t1 |
33 """ | 33 """ |
34 protein_seq = '' | 34 protein_seq = "" |
35 coding_seq = '' | 35 coding_seq = "" |
36 if args.protein: | 36 if args.protein: |
37 po = open( args.protein, 'w+' ) | 37 po = open(args.protein, "w+") |
38 if args.codingseq: | 38 if args.codingseq: |
39 co = open( args.codingseq, 'w+' ) | 39 co = open(args.codingseq, "w+") |
40 | 40 |
41 for line in sys.stdin: | 41 for line in sys.stdin: |
42 # protein- and coding-sequence are stored as comments | 42 # protein- and coding-sequence are stored as comments |
43 if line.startswith('#'): | 43 if line.startswith("#"): |
44 line = line[2:].strip() | 44 line = line[2:].strip() |
45 if line.startswith('start gene'): | 45 if line.startswith("start gene"): |
46 gene_name = line[11:].strip() | 46 gene_name = line[11:].strip() |
47 | 47 |
48 if protein_seq: | 48 if protein_seq: |
49 if line.endswith(']'): | 49 if line.endswith("]"): |
50 protein_seq += line[:-1] | 50 protein_seq += line[:-1] |
51 po.write( '>%s\n%s\n' % (gene_name, '\n'.join( textwrap.wrap( protein_seq, 80 ) ) ) ) | 51 po.write( |
52 protein_seq = '' | 52 ">%s\n%s\n" |
53 % (gene_name, "\n".join(textwrap.wrap(protein_seq, 80))) | |
54 ) | |
55 protein_seq = "" | |
53 else: | 56 else: |
54 protein_seq += line | 57 protein_seq += line |
55 | 58 |
56 if coding_seq: | 59 if coding_seq: |
57 if line.endswith(']'): | 60 if line.endswith("]"): |
58 coding_seq += line[:-1] | 61 coding_seq += line[:-1] |
59 co.write( '>%s\n%s\n' % (gene_name, '\n'.join( textwrap.wrap( coding_seq, 80 ) ) ) ) | 62 co.write( |
60 coding_seq = '' | 63 ">%s\n%s\n" |
64 % (gene_name, "\n".join(textwrap.wrap(coding_seq, 80))) | |
65 ) | |
66 coding_seq = "" | |
61 else: | 67 else: |
62 coding_seq += line | 68 coding_seq += line |
63 | 69 |
64 if args.protein and line.startswith('protein sequence = ['): | 70 if args.protein and line.startswith("protein sequence = ["): |
65 if line.endswith(']'): | 71 if line.endswith("]"): |
66 protein_seq = line[20:-1] | 72 protein_seq = line[20:-1] |
67 po.write( '>%s\n%s\n' % (gene_name, '\n'.join( textwrap.wrap( protein_seq, 80 ) ) ) ) | 73 po.write( |
68 protein_seq = '' | 74 ">%s\n%s\n" |
75 % (gene_name, "\n".join(textwrap.wrap(protein_seq, 80))) | |
76 ) | |
77 protein_seq = "" | |
69 else: | 78 else: |
70 line = line[20:] | 79 line = line[20:] |
71 protein_seq = line | 80 protein_seq = line |
72 | 81 |
73 if args.codingseq and line.startswith('coding sequence = ['): | 82 if args.codingseq and line.startswith("coding sequence = ["): |
74 if line.endswith(']'): | 83 if line.endswith("]"): |
75 coding_seq = line[19:-1] | 84 coding_seq = line[19:-1] |
76 co.write( '>%s\n%s\n' % (gene_name, '\n'.join( textwrap.wrap( coding_seq, 80 ) ) ) ) | 85 co.write( |
77 coding_seq = '' | 86 ">%s\n%s\n" |
87 % (gene_name, "\n".join(textwrap.wrap(coding_seq, 80))) | |
88 ) | |
89 coding_seq = "" | |
78 else: | 90 else: |
79 line = line[19:] | 91 line = line[19:] |
80 coding_seq = line | 92 coding_seq = line |
81 | 93 |
82 if args.codingseq: | 94 if args.codingseq: |
83 co.close() | 95 co.close() |
84 if args.protein: | 96 if args.protein: |
85 po.close() | 97 po.close() |
86 | 98 |
87 | 99 |
88 if __name__ == '__main__': | 100 if __name__ == "__main__": |
89 parser = argparse.ArgumentParser() | 101 parser = argparse.ArgumentParser() |
90 parser.add_argument('-p', '--protein', help='Path to the protein file.') | 102 parser.add_argument("-p", "--protein", help="Path to the protein file.") |
91 parser.add_argument('-c', '--codingseq', help='Path to the coding file.') | 103 parser.add_argument("-c", "--codingseq", help="Path to the coding file.") |
92 | 104 |
93 args = parser.parse_args() | 105 args = parser.parse_args() |
94 main( args ) | 106 main(args) |