comparison aggregate_linelisting.py @ 3:45a01281f796 draft default tip

planemo upload for repository https://github.com/Public-Health-Bioinformatics/flu_classification_suite commit 6f09c69c51ec3d6bd7487f55384b97155355c456
author public-health-bioinformatics
date Mon, 04 Feb 2019 18:33:05 -0500
parents 91c7d74bc709
children
comparison
equal deleted inserted replaced
2:eb0701da22d1 3:45a01281f796
158 csv_seq = ",".join(sequence) +"," 158 csv_seq = ",".join(sequence) +","
159 comma_sep_output = name_part + N_part + clade_part + col + csv_seq + substitutions_part + percID_part + "\n" 159 comma_sep_output = name_part + N_part + clade_part + col + csv_seq + substitutions_part + percID_part + "\n"
160 #write first member of unique sequence list to csv 160 #write first member of unique sequence list to csv
161 agg_lineListFile.write(comma_sep_output) 161 agg_lineListFile.write(comma_sep_output)
162 #print sequence records in sequevar to console 162 #print sequence records in sequevar to console
163 print "\n\t\t%i SeqRecords matching Sequevar: %s" % (len(listOfSeqs), u) 163 print("\n\t\t%i SeqRecords matching Sequevar: %s" % (len(listOfSeqs), u))
164 164
165 #to uncollapse sequevar group, print each member of the sequevar list to csv output 165 #to uncollapse sequevar group, print each member of the sequevar list to csv output
166 '''for i in range(1,len(listOfSeqs)): 166 '''for i in range(1,len(listOfSeqs)):
167 currentRec = listOfSeqs[i] 167 currentRec = listOfSeqs[i]
168 province = extract_province(currentRec) 168 province = extract_province(currentRec)
212 numPos = len(positions) 212 numPos = len(positions)
213 empty_indicesLine = ',' * numPos 213 empty_indicesLine = ',' * numPos
214 #print column headers for sample sequences 214 #print column headers for sample sequences
215 row4 = "Sequence Name,N,Clade,Extra Substitutions," + empty_indicesLine + "Number of Amino Acid Substitutions in Antigenic Sites,% Identity of Antigenic Site Residues\n" 215 row4 = "Sequence Name,N,Clade,Extra Substitutions," + empty_indicesLine + "Number of Amino Acid Substitutions in Antigenic Sites,% Identity of Antigenic Site Residues\n"
216 agg_lineListFile.write(row4) 216 agg_lineListFile.write(row4)
217 print ("\nREFERENCE ANTIGENIC MAP: '%s' (%i amino acids)" % (record.id, len(record))) 217 print("\nREFERENCE ANTIGENIC MAP: '%s' (%i amino acids)" % (record.id, len(record)))
218 218
219 with open(cladeDefinitionFile,'r') as cladeFile: 219 with open(cladeDefinitionFile,'r') as cladeFile:
220 """Read clade definition file and store clade names in list.""" 220 """Read clade definition file and store clade names in list."""
221 #remove whitespace from the end of each line and split elements at commas 221 #remove whitespace from the end of each line and split elements at commas
222 for line in cladeFile: 222 for line in cladeFile:
230 for record in SeqIO.parse(extrAntigMapFile, "fasta", alphabet=IUPAC.protein): 230 for record in SeqIO.parse(extrAntigMapFile, "fasta", alphabet=IUPAC.protein):
231 record = record.upper() 231 record = record.upper()
232 seqList.append(record) #add Seq to list of Sequences 232 seqList.append(record) #add Seq to list of Sequences
233 233
234 #print number of sequences to be processed as user check 234 #print number of sequences to be processed as user check
235 print "\nCOMPARING %i flu antigenic map sequences to the reference..." % len(seqList) 235 print("\nCOMPARING %i flu antigenic map sequences to the reference..." % len(seqList))
236 for record in seqList: 236 for record in seqList:
237 #assign SeqRecords to province-specific dictionaries 237 #assign SeqRecords to province-specific dictionaries
238 sort_by_location(record) 238 sort_by_location(record)
239 239
240 #access prov segregated lists in order 240 #access prov segregated lists in order
241 sorted_prov_keys = sorted(prov_lists.keys()) 241 sorted_prov_keys = sorted(prov_lists.keys())
242 print "\nSequence Lists Sorted by Province: " 242 print("\nSequence Lists Sorted by Province: ")
243 for prov in sorted_prov_keys: 243 for prov in sorted_prov_keys:
244 current_list = prov_lists[prov] 244 current_list = prov_lists[prov]
245 #mask AA's identical to reference sequence with dot 245 #mask AA's identical to reference sequence with dot
246 masked_list = [] # empty temporary list to park masked sequences 246 masked_list = [] # empty temporary list to park masked sequences
247 for record in current_list: 247 for record in current_list:
251 251
252 #group sequences in province-sorted list into clades 252 #group sequences in province-sorted list into clades
253 for prov in sorted_prov_keys: 253 for prov in sorted_prov_keys:
254 prov_list = prov_lists[prov] 254 prov_list = prov_lists[prov]
255 by_clades_dict = {} #empty dict for clade:seqRecord list groups 255 by_clades_dict = {} #empty dict for clade:seqRecord list groups
256 print "\n'%s' List (Amino Acids identical to Reference are Masked): " % (prov) 256 print("\n'%s' List (Amino Acids identical to Reference are Masked): " % (prov))
257 for rec in prov_list: 257 for rec in prov_list:
258 clade = extract_clade(rec) 258 clade = extract_clade(rec)
259 if clade in by_clades_dict: 259 if clade in by_clades_dict:
260 #if clade already in dict as key, append record to list (value) 260 #if clade already in dict as key, append record to list (value)
261 by_clades_dict[clade].append(rec) 261 by_clades_dict[clade].append(rec)
262 else: #add clade as key to dict, value is list of 1 SeqRecord 262 else: #add clade as key to dict, value is list of 1 SeqRecord
263 by_clades_dict[clade] = [rec] 263 by_clades_dict[clade] = [rec]
264 #get list of alphabetically sorted clade keys 264 #get list of alphabetically sorted clade keys
265 sorted_clade_keys = sorted(by_clades_dict.keys()) 265 sorted_clade_keys = sorted(by_clades_dict.keys())
266 print "\tNumber of clades: ", len(by_clades_dict) 266 print("\tNumber of clades: ", len(by_clades_dict))
267 #group each list of sequences in clade by sequevars 267 #group each list of sequences in clade by sequevars
268 for key in sorted_clade_keys: 268 for key in sorted_clade_keys:
269 print "\n\tCLADE: %s Number of Members: %i" % (key, len(by_clades_dict[key])) 269 print("\n\tCLADE: %s Number of Members: %i" % (key, len(by_clades_dict[key])))
270 a_list = by_clades_dict[key] 270 a_list = by_clades_dict[key]
271 for seqrec in a_list: 271 for seqrec in a_list:
272 print "\t %s: %s" %(seqrec.id,str(seqrec.seq)) 272 print("\t %s: %s" %(seqrec.id,str(seqrec.seq)))
273 #output the list to csv as aggregated linelist 273 #output the list to csv as aggregated linelist
274 output_aggregated_linelist(a_list) 274 output_aggregated_linelist(a_list)
275 275
276 print("Aggregated Linelist written to file: '%s\n'" % (outFileHandle)) 276 print("Aggregated Linelist written to file: '%s\n'" % (outFileHandle))
277 extrAntigMapFile.close() 277 extrAntigMapFile.close()