Mercurial > repos > eric-rasche > apollo
comparison create_features_from_gff3.py @ 7:f9a6e151b3b4 draft
planemo upload for repository https://github.com/TAMU-CPT/galaxy-webapollo commit 52b9e5bf6a6efb09a5cb845ee48703651c644174
author | eric-rasche |
---|---|
date | Tue, 27 Jun 2017 04:05:17 -0400 |
parents | 7610987e0c48 |
children |
comparison
equal
deleted
inserted
replaced
6:8f76685cdfc8 | 7:f9a6e151b3b4 |
---|---|
1 #!/usr/bin/env python | 1 #!/usr/bin/env python |
2 from builtins import str | |
2 import sys | 3 import sys |
3 import json | |
4 import time | 4 import time |
5 import argparse | 5 import argparse |
6 from webapollo import WebApolloInstance, featuresToFeatureSchema | 6 from webapollo import WebApolloInstance, featuresToFeatureSchema |
7 from webapollo import WAAuth, OrgOrGuess, GuessOrg, AssertUser | 7 from webapollo import WAAuth, OrgOrGuess, GuessOrg, AssertUser, retry |
8 from BCBio import GFF | 8 from BCBio import GFF |
9 import logging | 9 import logging |
10 logging.basicConfig(level=logging.INFO) | 10 logging.basicConfig(level=logging.INFO) |
11 log = logging.getLogger(__name__) | 11 log = logging.getLogger(__name__) |
12 | 12 |
13 | 13 |
14 if __name__ == '__main__': | 14 if __name__ == '__main__': |
15 parser = argparse.ArgumentParser(description='Sample script to add an attribute to a feature via web services') | 15 parser = argparse.ArgumentParser(description='Sample script to add an attribute to a feature via web services') |
16 WAAuth(parser) | 16 WAAuth(parser) |
17 parser.add_argument('email', help='User Email') | 17 parser.add_argument('email', help='User Email') |
18 parser.add_argument('--source', help='URL where the input dataset can be found.') | |
18 OrgOrGuess(parser) | 19 OrgOrGuess(parser) |
19 | 20 |
20 parser.add_argument('gff3', type=argparse.FileType('r'), help='GFF3 file') | 21 parser.add_argument('gff3', type=argparse.FileType('r'), help='GFF3 file') |
21 args = parser.parse_args() | 22 args = parser.parse_args() |
22 | 23 |
35 bad_quals = ['date_creation', 'source', 'owner', 'date_last_modified', 'Name', 'ID'] | 36 bad_quals = ['date_creation', 'source', 'owner', 'date_last_modified', 'Name', 'ID'] |
36 | 37 |
37 sys.stdout.write('# ') | 38 sys.stdout.write('# ') |
38 sys.stdout.write('\t'.join(['Feature ID', 'Apollo ID', 'Success', 'Messages'])) | 39 sys.stdout.write('\t'.join(['Feature ID', 'Apollo ID', 'Success', 'Messages'])) |
39 sys.stdout.write('\n') | 40 sys.stdout.write('\n') |
40 | |
41 # print(wa.annotations.getFeatures()) | 41 # print(wa.annotations.getFeatures()) |
42 for rec in GFF.parse(args.gff3): | 42 for rec in GFF.parse(args.gff3): |
43 wa.annotations.setSequence(rec.id, org['id']) | 43 wa.annotations.setSequence(rec.id, org['id']) |
44 for feature in rec.features: | 44 for feature in rec.features: |
45 # We can only handle genes right now | 45 # We can only handle genes right now |
46 if feature.type != 'gene': | 46 if feature.type not in ('gene', 'terminator'): |
47 continue | 47 continue |
48 # Convert the feature into a presentation that Apollo will accept | 48 # Convert the feature into a presentation that Apollo will accept |
49 featureData = featuresToFeatureSchema([feature]) | 49 featureData = featuresToFeatureSchema([feature]) |
50 | 50 if 'children' in featureData[0] and any([child['type']['name'] == 'tRNA' for child in featureData[0]['children']]): |
51 try: | |
52 # We're experiencing a (transient?) problem where gene_001 to | 51 # We're experiencing a (transient?) problem where gene_001 to |
53 # gene_025 will be rejected. Thus, hardcode to a known working | 52 # gene_025 will be rejected. Thus, hardcode to a known working |
54 # gene name and update later. | 53 # gene name and update later. |
55 featureData[0]['name'] = 'gene_000' | 54 |
56 # Extract CDS feature from the feature data, this will be used | 55 featureData[0]['name'] = 'tRNA_000' |
57 # to set the CDS location correctly (apollo currently screwing | 56 tRNA_sf = [child for child in feature.sub_features if child.type == 'tRNA'][0] |
58 # this up (2.0.6)) | 57 tRNA_type = 'tRNA-' + tRNA_sf.qualifiers.get('Codon', ["Unk"])[0] |
59 CDS = featureData[0]['children'][0]['children'] | 58 |
60 CDS = [x for x in CDS if x['type']['name'] == 'CDS'][0]['location'] | 59 if 'Name' in feature.qualifiers: |
61 # Create the new feature | 60 if feature.qualifiers['Name'][0].startswith('tRNA-'): |
61 tRNA_type = feature.qualifiers['Name'][0] | |
62 | |
62 newfeature = wa.annotations.addFeature(featureData, trustme=True) | 63 newfeature = wa.annotations.addFeature(featureData, trustme=True) |
63 # Extract the UUIDs that apollo returns to us | |
64 mrna_id = newfeature['features'][0]['uniquename'] | |
65 gene_id = newfeature['features'][0]['parent_id'] | |
66 # Sleep to give it time to actually persist the feature. Apollo | |
67 # is terrible about writing + immediately reading back written | |
68 # data. | |
69 time.sleep(1) | |
70 # Correct the translation start, but with strand specific log | |
71 if CDS['strand'] == 1: | |
72 wa.annotations.setTranslationStart(mrna_id, min(CDS['fmin'], CDS['fmax'])) | |
73 else: | |
74 wa.annotations.setTranslationStart(mrna_id, max(CDS['fmin'], CDS['fmax']) - 1) | |
75 | 64 |
76 # Finally we set the name, this should be correct. | 65 def func0(): |
77 wa.annotations.setName(mrna_id, feature.qualifiers.get('product', ["Unknown"])[0]) | 66 wa.annotations.setName( |
78 wa.annotations.setName(gene_id, feature.qualifiers.get('product', ["Unknown"])[0]) | 67 newfeature['features'][0]['uniquename'], |
68 tRNA_type, | |
69 ) | |
70 retry(func0) | |
79 | 71 |
80 for (k, v) in feature.qualifiers.items(): | 72 if args.source: |
81 if k not in bad_quals: | 73 gene_id = newfeature['features'][0]['parent_id'] |
82 # set qualifier | 74 |
83 pass | 75 def setSource(): |
76 wa.annotations.addAttributes(gene_id, {'DatasetSource': [args.source]}) | |
77 retry(setSource) | |
84 | 78 |
85 sys.stdout.write('\t'.join([ | 79 sys.stdout.write('\t'.join([ |
86 feature.id, | 80 feature.id, |
87 gene_id, | 81 newfeature['features'][0]['uniquename'], |
88 'success', | 82 'success', |
89 "Dropped qualifiers: %s" % (json.dumps({k: v for (k, v) in feature.qualifiers.items() if k not in bad_quals})), | |
90 ])) | 83 ])) |
91 except Exception as e: | 84 elif featureData[0]['type']['name'] == 'terminator': |
85 # We're experiencing a (transient?) problem where gene_001 to | |
86 # gene_025 will be rejected. Thus, hardcode to a known working | |
87 # gene name and update later. | |
88 featureData[0]['name'] = 'terminator_000' | |
89 newfeature = wa.annotations.addFeature(featureData, trustme=True) | |
90 | |
91 def func0(): | |
92 wa.annotations.setName( | |
93 newfeature['features'][0]['uniquename'], | |
94 'terminator' | |
95 ) | |
96 | |
97 retry(func0) | |
98 | |
99 if args.source: | |
100 gene_id = newfeature['features'][0]['parent_id'] | |
101 | |
102 def setSource(): | |
103 wa.annotations.addAttributes(gene_id, {'DatasetSource': [args.source]}) | |
104 retry(setSource) | |
105 | |
92 sys.stdout.write('\t'.join([ | 106 sys.stdout.write('\t'.join([ |
93 feature.id, | 107 feature.id, |
94 '', | 108 newfeature['features'][0]['uniquename'], |
95 'ERROR', | 109 'success', |
96 str(e) | |
97 ])) | 110 ])) |
111 else: | |
112 try: | |
113 # We're experiencing a (transient?) problem where gene_001 to | |
114 # gene_025 will be rejected. Thus, hardcode to a known working | |
115 # gene name and update later. | |
116 featureData[0]['name'] = 'gene_000' | |
117 # Extract CDS feature from the feature data, this will be used | |
118 # to set the CDS location correctly (apollo currently screwing | |
119 # this up (2.0.6)) | |
120 CDS = featureData[0]['children'][0]['children'] | |
121 CDS = [x for x in CDS if x['type']['name'] == 'CDS'][0]['location'] | |
122 # Create the new feature | |
123 newfeature = wa.annotations.addFeature(featureData, trustme=True) | |
124 # Extract the UUIDs that apollo returns to us | |
125 mrna_id = newfeature['features'][0]['uniquename'] | |
126 gene_id = newfeature['features'][0]['parent_id'] | |
127 # Sleep to give it time to actually persist the feature. Apollo | |
128 # is terrible about writing + immediately reading back written | |
129 # data. | |
130 time.sleep(1) | |
131 # Correct the translation start, but with strand specific log | |
132 if CDS['strand'] == 1: | |
133 wa.annotations.setTranslationStart(mrna_id, min(CDS['fmin'], CDS['fmax'])) | |
134 else: | |
135 wa.annotations.setTranslationStart(mrna_id, max(CDS['fmin'], CDS['fmax']) - 1) | |
98 | 136 |
137 # Finally we set the name, this should be correct. | |
138 time.sleep(0.5) | |
139 wa.annotations.setName(mrna_id, feature.qualifiers.get('product', feature.qualifiers.get('Name', ["Unknown"]))[0]) | |
140 time.sleep(0.5) | |
141 | |
142 def func(): | |
143 wa.annotations.setName(gene_id, feature.qualifiers.get('product', feature.qualifiers.get('Name', ["Unknown"]))[0]) | |
144 retry(func) | |
145 | |
146 if args.source: | |
147 gene_id = newfeature['features'][0]['parent_id'] | |
148 | |
149 def setSource(): | |
150 wa.annotations.addAttributes(gene_id, {'DatasetSource': [args.source]}) | |
151 retry(setSource) | |
152 extra_attr = {} | |
153 for (key, values) in feature.qualifiers.items(): | |
154 if key in bad_quals: | |
155 continue | |
156 | |
157 if key == 'Note': | |
158 def func2(): | |
159 wa.annotations.addComments(gene_id, values) | |
160 retry(func2) | |
161 else: | |
162 extra_attr[key] = values | |
163 | |
164 def func3(): | |
165 wa.annotations.addAttributes(gene_id, extra_attr) | |
166 retry(func3) | |
167 | |
168 sys.stdout.write('\t'.join([ | |
169 feature.id, | |
170 gene_id, | |
171 'success', | |
172 ])) | |
173 except Exception as e: | |
174 msg = str(e) | |
175 if '\n' in msg: | |
176 msg = msg[0:msg.index('\n')] | |
177 sys.stdout.write('\t'.join([ | |
178 feature.id, | |
179 '', | |
180 'ERROR', | |
181 msg | |
182 ])) | |
99 sys.stdout.write('\n') | 183 sys.stdout.write('\n') |
184 sys.stdout.flush() |