Mercurial > repos > eric-rasche > apollo
view create_features_from_gff3.py @ 8:df7a90763b3c draft default tip
planemo upload for repository https://github.com/TAMU-CPT/galaxy-webapollo commit 37aa8fa02c144220da9a3899644025b8c19697c9
author | eric-rasche |
---|---|
date | Tue, 27 Jun 2017 04:13:27 -0400 |
parents | f9a6e151b3b4 |
children |
line wrap: on
line source
#!/usr/bin/env python from builtins import str import sys import time import argparse from webapollo import WebApolloInstance, featuresToFeatureSchema from webapollo import WAAuth, OrgOrGuess, GuessOrg, AssertUser, retry from BCBio import GFF import logging logging.basicConfig(level=logging.INFO) log = logging.getLogger(__name__) if __name__ == '__main__': parser = argparse.ArgumentParser(description='Sample script to add an attribute to a feature via web services') WAAuth(parser) parser.add_argument('email', help='User Email') parser.add_argument('--source', help='URL where the input dataset can be found.') OrgOrGuess(parser) parser.add_argument('gff3', type=argparse.FileType('r'), help='GFF3 file') args = parser.parse_args() wa = WebApolloInstance(args.apollo, args.username, args.password) # User must have an account gx_user = AssertUser(wa.users.loadUsers(email=args.email)) # Get organism org_cn = GuessOrg(args, wa) if isinstance(org_cn, list): org_cn = org_cn[0] # TODO: Check user perms on org. org = wa.organisms.findOrganismByCn(org_cn) bad_quals = ['date_creation', 'source', 'owner', 'date_last_modified', 'Name', 'ID'] sys.stdout.write('# ') sys.stdout.write('\t'.join(['Feature ID', 'Apollo ID', 'Success', 'Messages'])) sys.stdout.write('\n') # print(wa.annotations.getFeatures()) for rec in GFF.parse(args.gff3): wa.annotations.setSequence(rec.id, org['id']) for feature in rec.features: # We can only handle genes right now if feature.type not in ('gene', 'terminator'): continue # Convert the feature into a presentation that Apollo will accept featureData = featuresToFeatureSchema([feature]) if 'children' in featureData[0] and any([child['type']['name'] == 'tRNA' for child in featureData[0]['children']]): # We're experiencing a (transient?) problem where gene_001 to # gene_025 will be rejected. Thus, hardcode to a known working # gene name and update later. featureData[0]['name'] = 'tRNA_000' tRNA_sf = [child for child in feature.sub_features if child.type == 'tRNA'][0] tRNA_type = 'tRNA-' + tRNA_sf.qualifiers.get('Codon', ["Unk"])[0] if 'Name' in feature.qualifiers: if feature.qualifiers['Name'][0].startswith('tRNA-'): tRNA_type = feature.qualifiers['Name'][0] newfeature = wa.annotations.addFeature(featureData, trustme=True) def func0(): wa.annotations.setName( newfeature['features'][0]['uniquename'], tRNA_type, ) retry(func0) if args.source: gene_id = newfeature['features'][0]['parent_id'] def setSource(): wa.annotations.addAttributes(gene_id, {'DatasetSource': [args.source]}) retry(setSource) sys.stdout.write('\t'.join([ feature.id, newfeature['features'][0]['uniquename'], 'success', ])) elif featureData[0]['type']['name'] == 'terminator': # We're experiencing a (transient?) problem where gene_001 to # gene_025 will be rejected. Thus, hardcode to a known working # gene name and update later. featureData[0]['name'] = 'terminator_000' newfeature = wa.annotations.addFeature(featureData, trustme=True) def func0(): wa.annotations.setName( newfeature['features'][0]['uniquename'], 'terminator' ) retry(func0) if args.source: gene_id = newfeature['features'][0]['parent_id'] def setSource(): wa.annotations.addAttributes(gene_id, {'DatasetSource': [args.source]}) retry(setSource) sys.stdout.write('\t'.join([ feature.id, newfeature['features'][0]['uniquename'], 'success', ])) else: try: # We're experiencing a (transient?) problem where gene_001 to # gene_025 will be rejected. Thus, hardcode to a known working # gene name and update later. featureData[0]['name'] = 'gene_000' # Extract CDS feature from the feature data, this will be used # to set the CDS location correctly (apollo currently screwing # this up (2.0.6)) CDS = featureData[0]['children'][0]['children'] CDS = [x for x in CDS if x['type']['name'] == 'CDS'][0]['location'] # Create the new feature newfeature = wa.annotations.addFeature(featureData, trustme=True) # Extract the UUIDs that apollo returns to us mrna_id = newfeature['features'][0]['uniquename'] gene_id = newfeature['features'][0]['parent_id'] # Sleep to give it time to actually persist the feature. Apollo # is terrible about writing + immediately reading back written # data. time.sleep(1) # Correct the translation start, but with strand specific log if CDS['strand'] == 1: wa.annotations.setTranslationStart(mrna_id, min(CDS['fmin'], CDS['fmax'])) else: wa.annotations.setTranslationStart(mrna_id, max(CDS['fmin'], CDS['fmax']) - 1) # Finally we set the name, this should be correct. time.sleep(0.5) wa.annotations.setName(mrna_id, feature.qualifiers.get('product', feature.qualifiers.get('Name', ["Unknown"]))[0]) time.sleep(0.5) def func(): wa.annotations.setName(gene_id, feature.qualifiers.get('product', feature.qualifiers.get('Name', ["Unknown"]))[0]) retry(func) if args.source: gene_id = newfeature['features'][0]['parent_id'] def setSource(): wa.annotations.addAttributes(gene_id, {'DatasetSource': [args.source]}) retry(setSource) extra_attr = {} for (key, values) in feature.qualifiers.items(): if key in bad_quals: continue if key == 'Note': def func2(): wa.annotations.addComments(gene_id, values) retry(func2) else: extra_attr[key] = values def func3(): wa.annotations.addAttributes(gene_id, extra_attr) retry(func3) sys.stdout.write('\t'.join([ feature.id, gene_id, 'success', ])) except Exception as e: msg = str(e) if '\n' in msg: msg = msg[0:msg.index('\n')] sys.stdout.write('\t'.join([ feature.id, '', 'ERROR', msg ])) sys.stdout.write('\n') sys.stdout.flush()