comparison create_features_from_gff3.py @ 7:f9a6e151b3b4 draft

planemo upload for repository https://github.com/TAMU-CPT/galaxy-webapollo commit 52b9e5bf6a6efb09a5cb845ee48703651c644174
author eric-rasche
date Tue, 27 Jun 2017 04:05:17 -0400
parents 7610987e0c48
children
comparison
equal deleted inserted replaced
6:8f76685cdfc8 7:f9a6e151b3b4
1 #!/usr/bin/env python 1 #!/usr/bin/env python
2 from builtins import str
2 import sys 3 import sys
3 import json
4 import time 4 import time
5 import argparse 5 import argparse
6 from webapollo import WebApolloInstance, featuresToFeatureSchema 6 from webapollo import WebApolloInstance, featuresToFeatureSchema
7 from webapollo import WAAuth, OrgOrGuess, GuessOrg, AssertUser 7 from webapollo import WAAuth, OrgOrGuess, GuessOrg, AssertUser, retry
8 from BCBio import GFF 8 from BCBio import GFF
9 import logging 9 import logging
10 logging.basicConfig(level=logging.INFO) 10 logging.basicConfig(level=logging.INFO)
11 log = logging.getLogger(__name__) 11 log = logging.getLogger(__name__)
12 12
13 13
14 if __name__ == '__main__': 14 if __name__ == '__main__':
15 parser = argparse.ArgumentParser(description='Sample script to add an attribute to a feature via web services') 15 parser = argparse.ArgumentParser(description='Sample script to add an attribute to a feature via web services')
16 WAAuth(parser) 16 WAAuth(parser)
17 parser.add_argument('email', help='User Email') 17 parser.add_argument('email', help='User Email')
18 parser.add_argument('--source', help='URL where the input dataset can be found.')
18 OrgOrGuess(parser) 19 OrgOrGuess(parser)
19 20
20 parser.add_argument('gff3', type=argparse.FileType('r'), help='GFF3 file') 21 parser.add_argument('gff3', type=argparse.FileType('r'), help='GFF3 file')
21 args = parser.parse_args() 22 args = parser.parse_args()
22 23
35 bad_quals = ['date_creation', 'source', 'owner', 'date_last_modified', 'Name', 'ID'] 36 bad_quals = ['date_creation', 'source', 'owner', 'date_last_modified', 'Name', 'ID']
36 37
37 sys.stdout.write('# ') 38 sys.stdout.write('# ')
38 sys.stdout.write('\t'.join(['Feature ID', 'Apollo ID', 'Success', 'Messages'])) 39 sys.stdout.write('\t'.join(['Feature ID', 'Apollo ID', 'Success', 'Messages']))
39 sys.stdout.write('\n') 40 sys.stdout.write('\n')
40
41 # print(wa.annotations.getFeatures()) 41 # print(wa.annotations.getFeatures())
42 for rec in GFF.parse(args.gff3): 42 for rec in GFF.parse(args.gff3):
43 wa.annotations.setSequence(rec.id, org['id']) 43 wa.annotations.setSequence(rec.id, org['id'])
44 for feature in rec.features: 44 for feature in rec.features:
45 # We can only handle genes right now 45 # We can only handle genes right now
46 if feature.type != 'gene': 46 if feature.type not in ('gene', 'terminator'):
47 continue 47 continue
48 # Convert the feature into a presentation that Apollo will accept 48 # Convert the feature into a presentation that Apollo will accept
49 featureData = featuresToFeatureSchema([feature]) 49 featureData = featuresToFeatureSchema([feature])
50 50 if 'children' in featureData[0] and any([child['type']['name'] == 'tRNA' for child in featureData[0]['children']]):
51 try:
52 # We're experiencing a (transient?) problem where gene_001 to 51 # We're experiencing a (transient?) problem where gene_001 to
53 # gene_025 will be rejected. Thus, hardcode to a known working 52 # gene_025 will be rejected. Thus, hardcode to a known working
54 # gene name and update later. 53 # gene name and update later.
55 featureData[0]['name'] = 'gene_000' 54
56 # Extract CDS feature from the feature data, this will be used 55 featureData[0]['name'] = 'tRNA_000'
57 # to set the CDS location correctly (apollo currently screwing 56 tRNA_sf = [child for child in feature.sub_features if child.type == 'tRNA'][0]
58 # this up (2.0.6)) 57 tRNA_type = 'tRNA-' + tRNA_sf.qualifiers.get('Codon', ["Unk"])[0]
59 CDS = featureData[0]['children'][0]['children'] 58
60 CDS = [x for x in CDS if x['type']['name'] == 'CDS'][0]['location'] 59 if 'Name' in feature.qualifiers:
61 # Create the new feature 60 if feature.qualifiers['Name'][0].startswith('tRNA-'):
61 tRNA_type = feature.qualifiers['Name'][0]
62
62 newfeature = wa.annotations.addFeature(featureData, trustme=True) 63 newfeature = wa.annotations.addFeature(featureData, trustme=True)
63 # Extract the UUIDs that apollo returns to us
64 mrna_id = newfeature['features'][0]['uniquename']
65 gene_id = newfeature['features'][0]['parent_id']
66 # Sleep to give it time to actually persist the feature. Apollo
67 # is terrible about writing + immediately reading back written
68 # data.
69 time.sleep(1)
70 # Correct the translation start, but with strand specific log
71 if CDS['strand'] == 1:
72 wa.annotations.setTranslationStart(mrna_id, min(CDS['fmin'], CDS['fmax']))
73 else:
74 wa.annotations.setTranslationStart(mrna_id, max(CDS['fmin'], CDS['fmax']) - 1)
75 64
76 # Finally we set the name, this should be correct. 65 def func0():
77 wa.annotations.setName(mrna_id, feature.qualifiers.get('product', ["Unknown"])[0]) 66 wa.annotations.setName(
78 wa.annotations.setName(gene_id, feature.qualifiers.get('product', ["Unknown"])[0]) 67 newfeature['features'][0]['uniquename'],
68 tRNA_type,
69 )
70 retry(func0)
79 71
80 for (k, v) in feature.qualifiers.items(): 72 if args.source:
81 if k not in bad_quals: 73 gene_id = newfeature['features'][0]['parent_id']
82 # set qualifier 74
83 pass 75 def setSource():
76 wa.annotations.addAttributes(gene_id, {'DatasetSource': [args.source]})
77 retry(setSource)
84 78
85 sys.stdout.write('\t'.join([ 79 sys.stdout.write('\t'.join([
86 feature.id, 80 feature.id,
87 gene_id, 81 newfeature['features'][0]['uniquename'],
88 'success', 82 'success',
89 "Dropped qualifiers: %s" % (json.dumps({k: v for (k, v) in feature.qualifiers.items() if k not in bad_quals})),
90 ])) 83 ]))
91 except Exception as e: 84 elif featureData[0]['type']['name'] == 'terminator':
85 # We're experiencing a (transient?) problem where gene_001 to
86 # gene_025 will be rejected. Thus, hardcode to a known working
87 # gene name and update later.
88 featureData[0]['name'] = 'terminator_000'
89 newfeature = wa.annotations.addFeature(featureData, trustme=True)
90
91 def func0():
92 wa.annotations.setName(
93 newfeature['features'][0]['uniquename'],
94 'terminator'
95 )
96
97 retry(func0)
98
99 if args.source:
100 gene_id = newfeature['features'][0]['parent_id']
101
102 def setSource():
103 wa.annotations.addAttributes(gene_id, {'DatasetSource': [args.source]})
104 retry(setSource)
105
92 sys.stdout.write('\t'.join([ 106 sys.stdout.write('\t'.join([
93 feature.id, 107 feature.id,
94 '', 108 newfeature['features'][0]['uniquename'],
95 'ERROR', 109 'success',
96 str(e)
97 ])) 110 ]))
111 else:
112 try:
113 # We're experiencing a (transient?) problem where gene_001 to
114 # gene_025 will be rejected. Thus, hardcode to a known working
115 # gene name and update later.
116 featureData[0]['name'] = 'gene_000'
117 # Extract CDS feature from the feature data, this will be used
118 # to set the CDS location correctly (apollo currently screwing
119 # this up (2.0.6))
120 CDS = featureData[0]['children'][0]['children']
121 CDS = [x for x in CDS if x['type']['name'] == 'CDS'][0]['location']
122 # Create the new feature
123 newfeature = wa.annotations.addFeature(featureData, trustme=True)
124 # Extract the UUIDs that apollo returns to us
125 mrna_id = newfeature['features'][0]['uniquename']
126 gene_id = newfeature['features'][0]['parent_id']
127 # Sleep to give it time to actually persist the feature. Apollo
128 # is terrible about writing + immediately reading back written
129 # data.
130 time.sleep(1)
131 # Correct the translation start, but with strand specific log
132 if CDS['strand'] == 1:
133 wa.annotations.setTranslationStart(mrna_id, min(CDS['fmin'], CDS['fmax']))
134 else:
135 wa.annotations.setTranslationStart(mrna_id, max(CDS['fmin'], CDS['fmax']) - 1)
98 136
137 # Finally we set the name, this should be correct.
138 time.sleep(0.5)
139 wa.annotations.setName(mrna_id, feature.qualifiers.get('product', feature.qualifiers.get('Name', ["Unknown"]))[0])
140 time.sleep(0.5)
141
142 def func():
143 wa.annotations.setName(gene_id, feature.qualifiers.get('product', feature.qualifiers.get('Name', ["Unknown"]))[0])
144 retry(func)
145
146 if args.source:
147 gene_id = newfeature['features'][0]['parent_id']
148
149 def setSource():
150 wa.annotations.addAttributes(gene_id, {'DatasetSource': [args.source]})
151 retry(setSource)
152 extra_attr = {}
153 for (key, values) in feature.qualifiers.items():
154 if key in bad_quals:
155 continue
156
157 if key == 'Note':
158 def func2():
159 wa.annotations.addComments(gene_id, values)
160 retry(func2)
161 else:
162 extra_attr[key] = values
163
164 def func3():
165 wa.annotations.addAttributes(gene_id, extra_attr)
166 retry(func3)
167
168 sys.stdout.write('\t'.join([
169 feature.id,
170 gene_id,
171 'success',
172 ]))
173 except Exception as e:
174 msg = str(e)
175 if '\n' in msg:
176 msg = msg[0:msg.index('\n')]
177 sys.stdout.write('\t'.join([
178 feature.id,
179 '',
180 'ERROR',
181 msg
182 ]))
99 sys.stdout.write('\n') 183 sys.stdout.write('\n')
184 sys.stdout.flush()