comparison assign_clades.py @ 3:bb1cdfafee59 draft default tip

planemo upload for repository https://github.com/Public-Health-Bioinformatics/flu_classification_suite commit 6f09c69c51ec3d6bd7487f55384b97155355c456
author public-health-bioinformatics
date Mon, 04 Feb 2019 18:34:14 -0500
parents 1f113d9db8ba
children
comparison
equal deleted inserted replaced
2:0d3dad155413 3:bb1cdfafee59
25 cladeList = [] #empty list to hold clade tuples i.e. ("3C.3a", 1 ,{"3":"I", "9":"V"..}) 25 cladeList = [] #empty list to hold clade tuples i.e. ("3C.3a", 1 ,{"3":"I", "9":"V"..})
26 26
27 '''Searches record for required amino acids at defined positions. If found, assigns 27 '''Searches record for required amino acids at defined positions. If found, assigns
28 clade name to sequence name by appending underscore and clade name to record id.''' 28 clade name to sequence name by appending underscore and clade name to record id.'''
29 def call_clade(record): 29 def call_clade(record):
30 print "---------------------------------------------------------------------" 30 print("---------------------------------------------------------------------")
31 print "Parsing %s for matching flu clade definitions..." % (record.id) 31 print("Parsing %s for matching flu clade definitions..." % (record.id))
32 matchList = [] #empty list to hold clades that match 100% 32 matchList = [] #empty list to hold clades that match 100%
33 #iterate over each tuple in the clade list 33 #iterate over each tuple in the clade list
34 for clade in cladeList: 34 for clade in cladeList:
35 cladeName = clade[0] #temp variable for name 35 cladeName = clade[0] #temp variable for name
36 depth = clade[1] #temp variable for depth 36 depth = clade[1] #temp variable for depth
37 sites = clade[2] #temp variable for aa def dictionary 37 sites = clade[2] #temp variable for aa def dictionary
38 shouldFind = len(sites) #number of sites that should match 38 shouldFind = len(sites) #number of sites that should match
39 found = 0 #a counter to hold matches to antigenic sites 39 found = 0 #a counter to hold matches to antigenic sites
40 #iterate over each position in sites dictionary 40 #iterate over each position in sites dictionary
41 for pos, aa in sites.iteritems(): 41 for pos, aa in sites.items():
42 #translate pos to corresponding index in target sequence 42 #translate pos to corresponding index in target sequence
43 index = int(pos) - 1 43 index = int(pos) - 1
44 #if record at index has same amino acid as 'aa', increment 'found' 44 #if record at index has same amino acid as 'aa', increment 'found'
45 if record[index] == aa: 45 if record[index] == aa:
46 found += 1 46 found += 1
85 aa = new_list[i + 1] 85 aa = new_list[i + 1]
86 sites[pos] = aa 86 sites[pos] = aa
87 #add the clade info as a tuple to the cladeList[] 87 #add the clade info as a tuple to the cladeList[]
88 oneClade =(name, depth, sites) 88 oneClade =(name, depth, sites)
89 cladeList.append(oneClade) 89 cladeList.append(oneClade)
90 print "The List of Clades:" 90 print("The List of Clades:")
91 for clade in cladeList: 91 for clade in cladeList:
92 print "Clade Name: %s Depth: %i Antigenic Sites: %i" % (clade[0], clade[1], len(clade[2])) 92 print("Clade Name: %s Depth: %i Antigenic Sites: %i" % (clade[0], clade[1], len(clade[2])))
93 for pos, aa in clade[2].iteritems(): 93 for pos, aa in clade[2].items():
94 print "Pos: %s\tAA: %s" % (pos,aa) 94 print("Pos: %s\tAA: %s" % (pos,aa))
95 95
96 '''opens readable input file of sequences to parse using filename from cmd line, 96 '''opens readable input file of sequences to parse using filename from cmd line,
97 instantiates as AA Sequence objects, with ppercase sequences''' 97 instantiates as AA Sequence objects, with ppercase sequences'''
98 with open(inFileHandle1,'r') as inFile: 98 with open(inFileHandle1,'r') as inFile:
99 #read in Sequences from fasta file, uppercase and add to seqList 99 #read in Sequences from fasta file, uppercase and add to seqList
100 for record in SeqIO.parse(inFile, "fasta", alphabet=IUPAC.protein): 100 for record in SeqIO.parse(inFile, "fasta", alphabet=IUPAC.protein):
101 record = record.upper() 101 record = record.upper()
102 seqList.append(record) #add Seq to list of Sequences 102 seqList.append(record) #add Seq to list of Sequences
103 print "\n%i flu HA sequences will be compared to current clade definitions..." % len(seqList) 103 print("\n%i flu HA sequences will be compared to current clade definitions..." % len(seqList))
104 #parse each target sequence object 104 #parse each target sequence object
105 for record in seqList: 105 for record in seqList:
106 clade_call = '' #empty variale for final clade call on sequence 106 clade_call = '' #empty variale for final clade call on sequence
107 matchingCladeList = call_clade(record) #holds matching clade tuples 107 matchingCladeList = call_clade(record) #holds matching clade tuples
108 #if there is more than one clade match 108 #if there is more than one clade match
114 clade = matchingCladeList[0] #take the first tuple in the list 114 clade = matchingCladeList[0] #take the first tuple in the list
115 clade_call = clade[0] #clade name is the first item in the tuple 115 clade_call = clade[0] #clade name is the first item in the tuple
116 #empty list return, no matches 116 #empty list return, no matches
117 else: 117 else:
118 clade_call = "No_Match" 118 clade_call = "No_Match"
119 print clade_call 119 print(clade_call)
120 seq_name = record.id 120 seq_name = record.id
121 mod_name = seq_name + "_" + clade_call 121 mod_name = seq_name + "_" + clade_call
122 print "New Sequence Name: " + mod_name 122 print("New Sequence Name: " + mod_name)
123 record.id = mod_name 123 record.id = mod_name
124 124
125 125
126 #output fasta file with clade calls appended to sequence names 126 #output fasta file with clade calls appended to sequence names
127 SeqIO.write(seqList,outFile,"fasta") 127 SeqIO.write(seqList,outFile,"fasta")