Mercurial > repos > eric-rasche > apollo
diff create_features_from_gff3.py @ 7:f9a6e151b3b4 draft
planemo upload for repository https://github.com/TAMU-CPT/galaxy-webapollo commit 52b9e5bf6a6efb09a5cb845ee48703651c644174
author | eric-rasche |
---|---|
date | Tue, 27 Jun 2017 04:05:17 -0400 |
parents | 7610987e0c48 |
children |
line wrap: on
line diff
--- a/create_features_from_gff3.py Sat Mar 04 18:00:52 2017 -0500 +++ b/create_features_from_gff3.py Tue Jun 27 04:05:17 2017 -0400 @@ -1,10 +1,10 @@ #!/usr/bin/env python +from builtins import str import sys -import json import time import argparse from webapollo import WebApolloInstance, featuresToFeatureSchema -from webapollo import WAAuth, OrgOrGuess, GuessOrg, AssertUser +from webapollo import WAAuth, OrgOrGuess, GuessOrg, AssertUser, retry from BCBio import GFF import logging logging.basicConfig(level=logging.INFO) @@ -15,6 +15,7 @@ parser = argparse.ArgumentParser(description='Sample script to add an attribute to a feature via web services') WAAuth(parser) parser.add_argument('email', help='User Email') + parser.add_argument('--source', help='URL where the input dataset can be found.') OrgOrGuess(parser) parser.add_argument('gff3', type=argparse.FileType('r'), help='GFF3 file') @@ -37,63 +38,147 @@ sys.stdout.write('# ') sys.stdout.write('\t'.join(['Feature ID', 'Apollo ID', 'Success', 'Messages'])) sys.stdout.write('\n') - # print(wa.annotations.getFeatures()) for rec in GFF.parse(args.gff3): wa.annotations.setSequence(rec.id, org['id']) for feature in rec.features: # We can only handle genes right now - if feature.type != 'gene': + if feature.type not in ('gene', 'terminator'): continue # Convert the feature into a presentation that Apollo will accept featureData = featuresToFeatureSchema([feature]) + if 'children' in featureData[0] and any([child['type']['name'] == 'tRNA' for child in featureData[0]['children']]): + # We're experiencing a (transient?) problem where gene_001 to + # gene_025 will be rejected. Thus, hardcode to a known working + # gene name and update later. - try: + featureData[0]['name'] = 'tRNA_000' + tRNA_sf = [child for child in feature.sub_features if child.type == 'tRNA'][0] + tRNA_type = 'tRNA-' + tRNA_sf.qualifiers.get('Codon', ["Unk"])[0] + + if 'Name' in feature.qualifiers: + if feature.qualifiers['Name'][0].startswith('tRNA-'): + tRNA_type = feature.qualifiers['Name'][0] + + newfeature = wa.annotations.addFeature(featureData, trustme=True) + + def func0(): + wa.annotations.setName( + newfeature['features'][0]['uniquename'], + tRNA_type, + ) + retry(func0) + + if args.source: + gene_id = newfeature['features'][0]['parent_id'] + + def setSource(): + wa.annotations.addAttributes(gene_id, {'DatasetSource': [args.source]}) + retry(setSource) + + sys.stdout.write('\t'.join([ + feature.id, + newfeature['features'][0]['uniquename'], + 'success', + ])) + elif featureData[0]['type']['name'] == 'terminator': # We're experiencing a (transient?) problem where gene_001 to # gene_025 will be rejected. Thus, hardcode to a known working # gene name and update later. - featureData[0]['name'] = 'gene_000' - # Extract CDS feature from the feature data, this will be used - # to set the CDS location correctly (apollo currently screwing - # this up (2.0.6)) - CDS = featureData[0]['children'][0]['children'] - CDS = [x for x in CDS if x['type']['name'] == 'CDS'][0]['location'] - # Create the new feature + featureData[0]['name'] = 'terminator_000' newfeature = wa.annotations.addFeature(featureData, trustme=True) - # Extract the UUIDs that apollo returns to us - mrna_id = newfeature['features'][0]['uniquename'] - gene_id = newfeature['features'][0]['parent_id'] - # Sleep to give it time to actually persist the feature. Apollo - # is terrible about writing + immediately reading back written - # data. - time.sleep(1) - # Correct the translation start, but with strand specific log - if CDS['strand'] == 1: - wa.annotations.setTranslationStart(mrna_id, min(CDS['fmin'], CDS['fmax'])) - else: - wa.annotations.setTranslationStart(mrna_id, max(CDS['fmin'], CDS['fmax']) - 1) + + def func0(): + wa.annotations.setName( + newfeature['features'][0]['uniquename'], + 'terminator' + ) - # Finally we set the name, this should be correct. - wa.annotations.setName(mrna_id, feature.qualifiers.get('product', ["Unknown"])[0]) - wa.annotations.setName(gene_id, feature.qualifiers.get('product', ["Unknown"])[0]) + retry(func0) - for (k, v) in feature.qualifiers.items(): - if k not in bad_quals: - # set qualifier - pass + if args.source: + gene_id = newfeature['features'][0]['parent_id'] + + def setSource(): + wa.annotations.addAttributes(gene_id, {'DatasetSource': [args.source]}) + retry(setSource) sys.stdout.write('\t'.join([ feature.id, - gene_id, + newfeature['features'][0]['uniquename'], 'success', - "Dropped qualifiers: %s" % (json.dumps({k: v for (k, v) in feature.qualifiers.items() if k not in bad_quals})), ])) - except Exception as e: - sys.stdout.write('\t'.join([ - feature.id, - '', - 'ERROR', - str(e) - ])) + else: + try: + # We're experiencing a (transient?) problem where gene_001 to + # gene_025 will be rejected. Thus, hardcode to a known working + # gene name and update later. + featureData[0]['name'] = 'gene_000' + # Extract CDS feature from the feature data, this will be used + # to set the CDS location correctly (apollo currently screwing + # this up (2.0.6)) + CDS = featureData[0]['children'][0]['children'] + CDS = [x for x in CDS if x['type']['name'] == 'CDS'][0]['location'] + # Create the new feature + newfeature = wa.annotations.addFeature(featureData, trustme=True) + # Extract the UUIDs that apollo returns to us + mrna_id = newfeature['features'][0]['uniquename'] + gene_id = newfeature['features'][0]['parent_id'] + # Sleep to give it time to actually persist the feature. Apollo + # is terrible about writing + immediately reading back written + # data. + time.sleep(1) + # Correct the translation start, but with strand specific log + if CDS['strand'] == 1: + wa.annotations.setTranslationStart(mrna_id, min(CDS['fmin'], CDS['fmax'])) + else: + wa.annotations.setTranslationStart(mrna_id, max(CDS['fmin'], CDS['fmax']) - 1) + + # Finally we set the name, this should be correct. + time.sleep(0.5) + wa.annotations.setName(mrna_id, feature.qualifiers.get('product', feature.qualifiers.get('Name', ["Unknown"]))[0]) + time.sleep(0.5) + + def func(): + wa.annotations.setName(gene_id, feature.qualifiers.get('product', feature.qualifiers.get('Name', ["Unknown"]))[0]) + retry(func) + if args.source: + gene_id = newfeature['features'][0]['parent_id'] + + def setSource(): + wa.annotations.addAttributes(gene_id, {'DatasetSource': [args.source]}) + retry(setSource) + extra_attr = {} + for (key, values) in feature.qualifiers.items(): + if key in bad_quals: + continue + + if key == 'Note': + def func2(): + wa.annotations.addComments(gene_id, values) + retry(func2) + else: + extra_attr[key] = values + + def func3(): + wa.annotations.addAttributes(gene_id, extra_attr) + retry(func3) + + sys.stdout.write('\t'.join([ + feature.id, + gene_id, + 'success', + ])) + except Exception as e: + msg = str(e) + if '\n' in msg: + msg = msg[0:msg.index('\n')] + sys.stdout.write('\t'.join([ + feature.id, + '', + 'ERROR', + msg + ])) sys.stdout.write('\n') + sys.stdout.flush()