Mercurial > repos > public-health-bioinformatics > assign_clades
comparison assign_clades.py @ 3:bb1cdfafee59 draft default tip
planemo upload for repository https://github.com/Public-Health-Bioinformatics/flu_classification_suite commit 6f09c69c51ec3d6bd7487f55384b97155355c456
author | public-health-bioinformatics |
---|---|
date | Mon, 04 Feb 2019 18:34:14 -0500 |
parents | 1f113d9db8ba |
children |
comparison
equal
deleted
inserted
replaced
2:0d3dad155413 | 3:bb1cdfafee59 |
---|---|
25 cladeList = [] #empty list to hold clade tuples i.e. ("3C.3a", 1 ,{"3":"I", "9":"V"..}) | 25 cladeList = [] #empty list to hold clade tuples i.e. ("3C.3a", 1 ,{"3":"I", "9":"V"..}) |
26 | 26 |
27 '''Searches record for required amino acids at defined positions. If found, assigns | 27 '''Searches record for required amino acids at defined positions. If found, assigns |
28 clade name to sequence name by appending underscore and clade name to record id.''' | 28 clade name to sequence name by appending underscore and clade name to record id.''' |
29 def call_clade(record): | 29 def call_clade(record): |
30 print "---------------------------------------------------------------------" | 30 print("---------------------------------------------------------------------") |
31 print "Parsing %s for matching flu clade definitions..." % (record.id) | 31 print("Parsing %s for matching flu clade definitions..." % (record.id)) |
32 matchList = [] #empty list to hold clades that match 100% | 32 matchList = [] #empty list to hold clades that match 100% |
33 #iterate over each tuple in the clade list | 33 #iterate over each tuple in the clade list |
34 for clade in cladeList: | 34 for clade in cladeList: |
35 cladeName = clade[0] #temp variable for name | 35 cladeName = clade[0] #temp variable for name |
36 depth = clade[1] #temp variable for depth | 36 depth = clade[1] #temp variable for depth |
37 sites = clade[2] #temp variable for aa def dictionary | 37 sites = clade[2] #temp variable for aa def dictionary |
38 shouldFind = len(sites) #number of sites that should match | 38 shouldFind = len(sites) #number of sites that should match |
39 found = 0 #a counter to hold matches to antigenic sites | 39 found = 0 #a counter to hold matches to antigenic sites |
40 #iterate over each position in sites dictionary | 40 #iterate over each position in sites dictionary |
41 for pos, aa in sites.iteritems(): | 41 for pos, aa in sites.items(): |
42 #translate pos to corresponding index in target sequence | 42 #translate pos to corresponding index in target sequence |
43 index = int(pos) - 1 | 43 index = int(pos) - 1 |
44 #if record at index has same amino acid as 'aa', increment 'found' | 44 #if record at index has same amino acid as 'aa', increment 'found' |
45 if record[index] == aa: | 45 if record[index] == aa: |
46 found += 1 | 46 found += 1 |
85 aa = new_list[i + 1] | 85 aa = new_list[i + 1] |
86 sites[pos] = aa | 86 sites[pos] = aa |
87 #add the clade info as a tuple to the cladeList[] | 87 #add the clade info as a tuple to the cladeList[] |
88 oneClade =(name, depth, sites) | 88 oneClade =(name, depth, sites) |
89 cladeList.append(oneClade) | 89 cladeList.append(oneClade) |
90 print "The List of Clades:" | 90 print("The List of Clades:") |
91 for clade in cladeList: | 91 for clade in cladeList: |
92 print "Clade Name: %s Depth: %i Antigenic Sites: %i" % (clade[0], clade[1], len(clade[2])) | 92 print("Clade Name: %s Depth: %i Antigenic Sites: %i" % (clade[0], clade[1], len(clade[2]))) |
93 for pos, aa in clade[2].iteritems(): | 93 for pos, aa in clade[2].items(): |
94 print "Pos: %s\tAA: %s" % (pos,aa) | 94 print("Pos: %s\tAA: %s" % (pos,aa)) |
95 | 95 |
96 '''opens readable input file of sequences to parse using filename from cmd line, | 96 '''opens readable input file of sequences to parse using filename from cmd line, |
97 instantiates as AA Sequence objects, with ppercase sequences''' | 97 instantiates as AA Sequence objects, with ppercase sequences''' |
98 with open(inFileHandle1,'r') as inFile: | 98 with open(inFileHandle1,'r') as inFile: |
99 #read in Sequences from fasta file, uppercase and add to seqList | 99 #read in Sequences from fasta file, uppercase and add to seqList |
100 for record in SeqIO.parse(inFile, "fasta", alphabet=IUPAC.protein): | 100 for record in SeqIO.parse(inFile, "fasta", alphabet=IUPAC.protein): |
101 record = record.upper() | 101 record = record.upper() |
102 seqList.append(record) #add Seq to list of Sequences | 102 seqList.append(record) #add Seq to list of Sequences |
103 print "\n%i flu HA sequences will be compared to current clade definitions..." % len(seqList) | 103 print("\n%i flu HA sequences will be compared to current clade definitions..." % len(seqList)) |
104 #parse each target sequence object | 104 #parse each target sequence object |
105 for record in seqList: | 105 for record in seqList: |
106 clade_call = '' #empty variale for final clade call on sequence | 106 clade_call = '' #empty variale for final clade call on sequence |
107 matchingCladeList = call_clade(record) #holds matching clade tuples | 107 matchingCladeList = call_clade(record) #holds matching clade tuples |
108 #if there is more than one clade match | 108 #if there is more than one clade match |
114 clade = matchingCladeList[0] #take the first tuple in the list | 114 clade = matchingCladeList[0] #take the first tuple in the list |
115 clade_call = clade[0] #clade name is the first item in the tuple | 115 clade_call = clade[0] #clade name is the first item in the tuple |
116 #empty list return, no matches | 116 #empty list return, no matches |
117 else: | 117 else: |
118 clade_call = "No_Match" | 118 clade_call = "No_Match" |
119 print clade_call | 119 print(clade_call) |
120 seq_name = record.id | 120 seq_name = record.id |
121 mod_name = seq_name + "_" + clade_call | 121 mod_name = seq_name + "_" + clade_call |
122 print "New Sequence Name: " + mod_name | 122 print("New Sequence Name: " + mod_name) |
123 record.id = mod_name | 123 record.id = mod_name |
124 | 124 |
125 | 125 |
126 #output fasta file with clade calls appended to sequence names | 126 #output fasta file with clade calls appended to sequence names |
127 SeqIO.write(seqList,outFile,"fasta") | 127 SeqIO.write(seqList,outFile,"fasta") |