Mercurial > repos > public-health-bioinformatics > flu_classification_suite
changeset 2:43b6f25c6f4d draft
planemo upload for repository https://github.com/Public-Health-Bioinformatics/flu_classification_suite commit 856d0b7ab7dc801c168fcdf45cfd2e31f062a37e-dirty
author | public-health-bioinformatics |
---|---|
date | Wed, 09 Jan 2019 15:56:52 -0500 |
parents | c20ba006a8ad |
children | 5eba528ed056 |
files | assign_clades.py assign_clades.xml repository_dependencies.xml test-data/clades.csv test-data/output.fa test-data/test_input.fasta |
diffstat | 6 files changed, 9 insertions(+), 189 deletions(-) [+] |
line wrap: on
line diff
--- a/assign_clades.py Wed Jan 09 15:34:48 2019 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,133 +0,0 @@ -#!/usr/bin/env python - -'''Accepts fasta files containing amino acid sequence, reading them in as -amino acid sequence objects. Reads influenza clade defintions (i.e. amino -acids at certain positions) from .csv file into dictionary structure. Searches -each of the amino acid sequence objects for a list of matching clades, assigns -the most 'evolved' (i.e. child as opposed to parent clade) to the sequence. Appends -"_cladename" to the Sequence name and generates a fasta file of original sequences with -modified names.''' - -'''Author: Diane Eisler, Molecular Microbiology & Genomics, BCCDC Public Health Laboratory, Oct 2017''' - -import sys,string,os, time, Bio -from Bio import Seq, SeqIO, SeqUtils, Alphabet, SeqRecord -from Bio.SeqRecord import SeqRecord -from Bio.Alphabet import IUPAC -from Bio.Seq import Seq - -localtime = time.asctime(time.localtime(time.time())) #date and time of analysis -inFileHandle1 = sys.argv[1] #batch fasta file with sequences to be parsed -inFileHandle2 = sys.argv[2] # .csv file containing clade definitions and "depth" -outFileHandle = sys.argv[3] #user-specified name for output file of aa seq's with clade suffixes -outFile= open(outFileHandle,'w') #open a writable, appendable output file -seqList = [] #list of aa sequence objects to parse for clade definitions -cladeList = [] #empty list to hold clade tuples i.e. ("3C.3a", 1 ,{"3":"I", "9":"V"..}) - -'''Searches record for required amino acids at defined positions. If found, assigns -clade name to sequence name by appending underscore and clade name to record id.''' -def call_clade(record): - print "---------------------------------------------------------------------" - print "Parsing %s for matching flu clade definitions..." % (record.id) - matchList = [] #empty list to hold clades that match 100% - #iterate over each tuple in the clade list - for clade in cladeList: - cladeName = clade[0] #temp variable for name - depth = clade[1] #temp variable for depth - sites = clade[2] #temp variable for aa def dictionary - shouldFind = len(sites) #number of sites that should match - found = 0 #a counter to hold matches to antigenic sites - #iterate over each position in sites dictionary - for pos, aa in sites.iteritems(): - #translate pos to corresponding index in target sequence - index = int(pos) - 1 - #if record at index has same amino acid as 'aa', increment 'found' - if record[index] == aa: - found += 1 - if (found == shouldFind): - #add the matching clade tuple to the list of matches - matchList.append(clade) - return matchList - -'''Compares depth level of clades in a list and returns the most granular one''' -def decide_clade_by_depth(matchList): - #empty variable for maximum depth encountered - max_depth = 0 - best_match_name = '' #variable to hold most granular clade - #for each matching clade, check depth of the corresponding tuple - for clade in matchList: - #if the current clade is 'deeper' than the one before it - if clade[1] > max_depth: - #store this depth - max_depth = clade[1] - #store name of the clade - best_match_name = clade[0] - return best_match_name - -'''opens the .csv file of clade definitions and clade "depth" ''' -with open (inFileHandle2, 'r') as clade_file: - #remove whitespace from the end of each line and split elements at commas - for line in clade_file: - #print "Current Line in File:" + line - sites={} #initialize a dictionary for clade - elementList = line.rstrip().split(',') - new_list = [] #start a new list to put non-empty strings into - #remove empty stings in list - for item in elementList: - if item != '': - new_list.append(item) - name = new_list.pop(0) #move 1st element to name field - depth = int(new_list.pop(0)) #move 2nd element to depth field - #read remaining pairs of non-null elements into clade def dictionary - for i in range(0, len(new_list), 2): - #move next 2 items from the list into the dict - pos = new_list[i] - aa = new_list[i + 1] - sites[pos] = aa - #add the clade info as a tuple to the cladeList[] - oneClade =(name, depth, sites) - cladeList.append(oneClade) - print "The List of Clades:" - for clade in cladeList: - print "Clade Name: %s Depth: %i Antigenic Sites: %i" % (clade[0], clade[1], len(clade[2])) - for pos, aa in clade[2].iteritems(): - print "Pos: %s\tAA: %s" % (pos,aa) - -'''opens readable input file of sequences to parse using filename from cmd line, - instantiates as AA Sequence objects, with ppercase sequences''' -with open(inFileHandle1,'r') as inFile: - #read in Sequences from fasta file, uppercase and add to seqList - for record in SeqIO.parse(inFile, "fasta", alphabet=IUPAC.protein): - record = record.upper() - seqList.append(record) #add Seq to list of Sequences - print "\n%i flu HA sequences will be compared to current clade definitions..." % len(seqList) - #parse each target sequence object - for record in seqList: - clade_call = '' #empty variale for final clade call on sequence - matchingCladeList = call_clade(record) #holds matching clade tuples - #if there is more than one clade match - if len(matchingCladeList) > 1: - #choose the most granular clade based on depth - clade_call = decide_clade_by_depth(matchingCladeList) - #if there is only one clade call - elif len(matchingCladeList) > 0: - clade = matchingCladeList[0] #take the first tuple in the list - clade_call = clade[0] #clade name is the first item in the tuple - #empty list return, no matches - else: - clade_call = "No_Match" - print clade_call - seq_name = record.id - mod_name = seq_name + "_" + clade_call - print "New Sequence Name: " + mod_name - record.id = mod_name - - -#output fasta file with clade calls appended to sequence names -SeqIO.write(seqList,outFile,"fasta") - -#print("\n%i Sequences Extracted to Output file: %s" % ((len(extractedSeqList),outFileHandle))) -inFile.close() -clade_file.close() -outFile.close() -
--- a/assign_clades.xml Wed Jan 09 15:34:48 2019 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,29 +0,0 @@ -<tool id="assign_clades" name="Assign Clades" version="0.0.1"> - <requirements> - <requirement type="package" version="1.70">biopython</requirement> - </requirements> - <command detect_errors="exit_code"><![CDATA[ - assign_clades.py - '$input_fasta' - '$clade_definitions' - '$output_file' - ]]></command> - <inputs> - <param name="input_fasta" format="fasta" type="data" /> - <param name="clade_definitions" format="csv" type="data" /> - </inputs> - <outputs> - <data name="output_file" format="fasta"/> - </outputs> - <tests> - <test> - <param name="input_fasta" value="input_fasta.fasta" /> - <param name="clade_definitions" value="clades.csv" /> - <output name="output_file" value="output.fasta" /> - </test> - </tests> - <help><![CDATA[ - ]]></help> - <citations> - </citations> -</tool>
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/repository_dependencies.xml Wed Jan 09 15:56:52 2019 -0500 @@ -0,0 +1,9 @@ +<?xml version="1.0" ?> +<repositories description="This is a metapackage, that will install several other tools."> + <repository changeset_revision="3c4764294d81" name="aggregate_linelisting" owner="public-health-bioinformatics" toolshed="https://testtoolshed.g2.bx.psu.edu"/> + <repository changeset_revision="d3414e1396c4" name="antigenic_site_extraction" owner="public-health-bioinformatics" toolshed="https://testtoolshed.g2.bx.psu.edu"/> + <repository changeset_revision="1dc65ec11a40" name="assign_clades" owner="public-health-bioinformatics" toolshed="https://testtoolshed.g2.bx.psu.edu"/> + <repository changeset_revision="ba7cee75eb68" name="change_fasta_deflines" owner="public-health-bioinformatics" toolshed="https://testtoolshed.g2.bx.psu.edu"/> + <repository changeset_revision="bda72dec1f55" name="linelisting" owner="public-health-bioinformatics" toolshed="https://testtoolshed.g2.bx.psu.edu"/> + <repository changeset_revision="9ccb9f098c9f" name="reformat_usearch_collapsed_fasta" owner="public-health-bioinformatics" toolshed="https://testtoolshed.g2.bx.psu.edu"/> +</repositories> \ No newline at end of file
--- a/test-data/clades.csv Wed Jan 09 15:34:48 2019 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,15 +0,0 @@ -3C.2a,1,3,I,144,S,145,S,159,Y,160,T,225,D,311,H,489,N,,,,,,,,,,,,,, -3C.2a_+_T131K_+_R142K_+_R261Q,2,3,I,131,K,142,K,144,S,145,S,159,Y,160,T,225,D,261,Q,311,H,489,N,,,,,,,, -3C.2a_+_N121K_+_S144K,2,3,I,121,K,144,K,145,S,159,Y,160,T,225,D,311,H,489,N,,,,,,,,,,,, -3C.2a_+_Q197K_+_R261Q,2,3,I,144,S,145,S,159,Y,160,T,197,K,225,D,261,Q,311,H,489,N,,,,,,,,,, -3C.2a_+_N31S_+_D53N_+_R142G_+_S144R_+_N171K_+_I192T_+_Q197H_,2,3,I,31,S,53,N,142,G,144,R,145,S,159,Y,160,T,171,K,192,T,197,H,225,D,311,H,489,N,, -3C.2a1_(w/o_N121K),3,3,I,144,S,145,S,159,Y,160,T,171,K,225,D,311,H,406,V,484,E,489,N,,,,,,,, -3C.2a1_+_N121K,4,3,I,121,K,144,S,145,S,159,Y,160,T,171,K,225,D,311,H,406,V,484,E,489,N,,,,,, -3C.2a1_+_N121K_+_R142G_+_I242V,4,3,I,121,K,142,G,144,S,145,S,159,Y,160,T,171,K,225,D,242,V,311,H,406,V,479,E,484,E,489,N -3C.2a1_+_N121K_+_R142G,4,3,I,121,K,142,G,144,S,145,S,159,Y,160,T,171,K,225,D,311,H,406,V,484,E,489,N,,,, -3C.2a1_+_N121K_+_T135K,4,3,I,121,K,135,K,144,S,145,S,159,Y,160,T,171,K,225,D,311,H,406,V,479,E,484,E,489,N,, -3C.2a1_+_N121K_+_K92R_+_H311Q,4,3,I,92,R,121,K,144,S,145,S,159,Y,160,T,171,K,225,D,311,Q,406,V,484,E,489,N,,,, -3C.2a1_+_N121K_+_I140M,4,3,I,121,K,140,M,144,S,145,S,159,Y,160,T,171,K,225,D,311,H,406,V,479,E,484,E,489,N,, -3C.2a1_+_R142G,4,3,I,142,G,144,S,145,S,159,Y,160,T,171,K,225,D,311,H,406,V,484,E,489,N,,,,,, -3C.2a1_+_S47T_+_G78S,4,3,I,47,T,78,S,144,S,145,S,159,Y,160,T,171,K,225,D,311,H,406,V,484,E,489,N,,,, -3C.3a,1,128,A,138,S,142,G,145,S,159,S,225,D,,,,,,,,,,,,,,,,,,
--- a/test-data/output.fa Wed Jan 09 15:34:48 2019 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,10 +0,0 @@ ->test_3C.3a test -QKIPGNDNSTATLCLGHHAVPNGTIVKTITNDRIEVTNATELVQNSSIGEICDSPHQILD -GENCTLIDALLGDPQCDGFQNKKWDLFVERNKAYSSCYPYDVPDYASLRSLVASSGTLEF -NNESFNWAGVTQNGTSSSCIRGSKSSFFSRLNWLTHLNSKYPALNVTMPNNEQFDKLYIW -GVHHPGTDKDQISLYAQSSGRITVSTKRSQQAVIPNIGSRPRIRDIPSRISIYWTIVKPG -DILLINSTGNLIAPRGYFKIRSGKSSIMRSDAPIGKCKSECITPNGSIPNDKPFQNVNRI -TYGACPRYVKQSTLKLATGMRNVPERQTRGIFGAIAGFIENGWEGMVDGWYGFRHQNSEG -RGQAADLKSTQAAIDQINGKLNRLIGKTNEKFHQIEKEFSEVEGRIQDLEKYVEDTKIDL -WSYNAELLVALENQHTIDLTDSEMNKLFEKTKKQLRENAEDMGNGCFKIYHKCDNACIGS -IRNGTYDHNVYRDEALNNRFQIKGVELKSGYKDW
--- a/test-data/test_input.fasta Wed Jan 09 15:34:48 2019 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,2 +0,0 @@ ->test -QKIPGNDNSTATLCLGHHAVPNGTIVKTITNDRIEVTNATELVQNSSIGEICDSPHQILDGENCTLIDALLGDPQCDGFQNKKWDLFVERNKAYSSCYPYDVPDYASLRSLVASSGTLEFNNESFNWAGVTQNGTSSSCIRGSKSSFFSRLNWLTHLNSKYPALNVTMPNNEQFDKLYIWGVHHPGTDKDQISLYAQSSGRITVSTKRSQQAVIPNIGSRPRIRDIPSRISIYWTIVKPGDILLINSTGNLIAPRGYFKIRSGKSSIMRSDAPIGKCKSECITPNGSIPNDKPFQNVNRITYGACPRYVKQSTLKLATGMRNVPERQTRGIFGAIAGFIENGWEGMVDGWYGFRHQNSEGRGQAADLKSTQAAIDQINGKLNRLIGKTNEKFHQIEKEFSEVEGRIQDLEKYVEDTKIDLWSYNAELLVALENQHTIDLTDSEMNKLFEKTKKQLRENAEDMGNGCFKIYHKCDNACIGSIRNGTYDHNVYRDEALNNRFQIKGVELKSGYKDW