Mercurial > repos > public-health-bioinformatics > aggregate_linelisting
comparison aggregate_linelisting.py @ 3:45a01281f796 draft default tip
planemo upload for repository https://github.com/Public-Health-Bioinformatics/flu_classification_suite commit 6f09c69c51ec3d6bd7487f55384b97155355c456
author | public-health-bioinformatics |
---|---|
date | Mon, 04 Feb 2019 18:33:05 -0500 |
parents | 91c7d74bc709 |
children |
comparison
equal
deleted
inserted
replaced
2:eb0701da22d1 | 3:45a01281f796 |
---|---|
158 csv_seq = ",".join(sequence) +"," | 158 csv_seq = ",".join(sequence) +"," |
159 comma_sep_output = name_part + N_part + clade_part + col + csv_seq + substitutions_part + percID_part + "\n" | 159 comma_sep_output = name_part + N_part + clade_part + col + csv_seq + substitutions_part + percID_part + "\n" |
160 #write first member of unique sequence list to csv | 160 #write first member of unique sequence list to csv |
161 agg_lineListFile.write(comma_sep_output) | 161 agg_lineListFile.write(comma_sep_output) |
162 #print sequence records in sequevar to console | 162 #print sequence records in sequevar to console |
163 print "\n\t\t%i SeqRecords matching Sequevar: %s" % (len(listOfSeqs), u) | 163 print("\n\t\t%i SeqRecords matching Sequevar: %s" % (len(listOfSeqs), u)) |
164 | 164 |
165 #to uncollapse sequevar group, print each member of the sequevar list to csv output | 165 #to uncollapse sequevar group, print each member of the sequevar list to csv output |
166 '''for i in range(1,len(listOfSeqs)): | 166 '''for i in range(1,len(listOfSeqs)): |
167 currentRec = listOfSeqs[i] | 167 currentRec = listOfSeqs[i] |
168 province = extract_province(currentRec) | 168 province = extract_province(currentRec) |
212 numPos = len(positions) | 212 numPos = len(positions) |
213 empty_indicesLine = ',' * numPos | 213 empty_indicesLine = ',' * numPos |
214 #print column headers for sample sequences | 214 #print column headers for sample sequences |
215 row4 = "Sequence Name,N,Clade,Extra Substitutions," + empty_indicesLine + "Number of Amino Acid Substitutions in Antigenic Sites,% Identity of Antigenic Site Residues\n" | 215 row4 = "Sequence Name,N,Clade,Extra Substitutions," + empty_indicesLine + "Number of Amino Acid Substitutions in Antigenic Sites,% Identity of Antigenic Site Residues\n" |
216 agg_lineListFile.write(row4) | 216 agg_lineListFile.write(row4) |
217 print ("\nREFERENCE ANTIGENIC MAP: '%s' (%i amino acids)" % (record.id, len(record))) | 217 print("\nREFERENCE ANTIGENIC MAP: '%s' (%i amino acids)" % (record.id, len(record))) |
218 | 218 |
219 with open(cladeDefinitionFile,'r') as cladeFile: | 219 with open(cladeDefinitionFile,'r') as cladeFile: |
220 """Read clade definition file and store clade names in list.""" | 220 """Read clade definition file and store clade names in list.""" |
221 #remove whitespace from the end of each line and split elements at commas | 221 #remove whitespace from the end of each line and split elements at commas |
222 for line in cladeFile: | 222 for line in cladeFile: |
230 for record in SeqIO.parse(extrAntigMapFile, "fasta", alphabet=IUPAC.protein): | 230 for record in SeqIO.parse(extrAntigMapFile, "fasta", alphabet=IUPAC.protein): |
231 record = record.upper() | 231 record = record.upper() |
232 seqList.append(record) #add Seq to list of Sequences | 232 seqList.append(record) #add Seq to list of Sequences |
233 | 233 |
234 #print number of sequences to be processed as user check | 234 #print number of sequences to be processed as user check |
235 print "\nCOMPARING %i flu antigenic map sequences to the reference..." % len(seqList) | 235 print("\nCOMPARING %i flu antigenic map sequences to the reference..." % len(seqList)) |
236 for record in seqList: | 236 for record in seqList: |
237 #assign SeqRecords to province-specific dictionaries | 237 #assign SeqRecords to province-specific dictionaries |
238 sort_by_location(record) | 238 sort_by_location(record) |
239 | 239 |
240 #access prov segregated lists in order | 240 #access prov segregated lists in order |
241 sorted_prov_keys = sorted(prov_lists.keys()) | 241 sorted_prov_keys = sorted(prov_lists.keys()) |
242 print "\nSequence Lists Sorted by Province: " | 242 print("\nSequence Lists Sorted by Province: ") |
243 for prov in sorted_prov_keys: | 243 for prov in sorted_prov_keys: |
244 current_list = prov_lists[prov] | 244 current_list = prov_lists[prov] |
245 #mask AA's identical to reference sequence with dot | 245 #mask AA's identical to reference sequence with dot |
246 masked_list = [] # empty temporary list to park masked sequences | 246 masked_list = [] # empty temporary list to park masked sequences |
247 for record in current_list: | 247 for record in current_list: |
251 | 251 |
252 #group sequences in province-sorted list into clades | 252 #group sequences in province-sorted list into clades |
253 for prov in sorted_prov_keys: | 253 for prov in sorted_prov_keys: |
254 prov_list = prov_lists[prov] | 254 prov_list = prov_lists[prov] |
255 by_clades_dict = {} #empty dict for clade:seqRecord list groups | 255 by_clades_dict = {} #empty dict for clade:seqRecord list groups |
256 print "\n'%s' List (Amino Acids identical to Reference are Masked): " % (prov) | 256 print("\n'%s' List (Amino Acids identical to Reference are Masked): " % (prov)) |
257 for rec in prov_list: | 257 for rec in prov_list: |
258 clade = extract_clade(rec) | 258 clade = extract_clade(rec) |
259 if clade in by_clades_dict: | 259 if clade in by_clades_dict: |
260 #if clade already in dict as key, append record to list (value) | 260 #if clade already in dict as key, append record to list (value) |
261 by_clades_dict[clade].append(rec) | 261 by_clades_dict[clade].append(rec) |
262 else: #add clade as key to dict, value is list of 1 SeqRecord | 262 else: #add clade as key to dict, value is list of 1 SeqRecord |
263 by_clades_dict[clade] = [rec] | 263 by_clades_dict[clade] = [rec] |
264 #get list of alphabetically sorted clade keys | 264 #get list of alphabetically sorted clade keys |
265 sorted_clade_keys = sorted(by_clades_dict.keys()) | 265 sorted_clade_keys = sorted(by_clades_dict.keys()) |
266 print "\tNumber of clades: ", len(by_clades_dict) | 266 print("\tNumber of clades: ", len(by_clades_dict)) |
267 #group each list of sequences in clade by sequevars | 267 #group each list of sequences in clade by sequevars |
268 for key in sorted_clade_keys: | 268 for key in sorted_clade_keys: |
269 print "\n\tCLADE: %s Number of Members: %i" % (key, len(by_clades_dict[key])) | 269 print("\n\tCLADE: %s Number of Members: %i" % (key, len(by_clades_dict[key]))) |
270 a_list = by_clades_dict[key] | 270 a_list = by_clades_dict[key] |
271 for seqrec in a_list: | 271 for seqrec in a_list: |
272 print "\t %s: %s" %(seqrec.id,str(seqrec.seq)) | 272 print("\t %s: %s" %(seqrec.id,str(seqrec.seq))) |
273 #output the list to csv as aggregated linelist | 273 #output the list to csv as aggregated linelist |
274 output_aggregated_linelist(a_list) | 274 output_aggregated_linelist(a_list) |
275 | 275 |
276 print("Aggregated Linelist written to file: '%s\n'" % (outFileHandle)) | 276 print("Aggregated Linelist written to file: '%s\n'" % (outFileHandle)) |
277 extrAntigMapFile.close() | 277 extrAntigMapFile.close() |