Mercurial > repos > public-health-bioinformatics > linelisting
comparison linelisting.py @ 0:bda72dec1f55 draft
planemo upload for repository https://github.com/Public-Health-Bioinformatics/flu_classification_suite commit 856d0b7ab7dc801c168fcdf45cfd2e31f062a37e-dirty
| author | public-health-bioinformatics |
|---|---|
| date | Wed, 09 Jan 2019 15:42:43 -0500 |
| parents | |
| children | 141cbefca027 |
comparison
equal
deleted
inserted
replaced
| -1:000000000000 | 0:bda72dec1f55 |
|---|---|
| 1 #!/usr/bin/env python | |
| 2 '''Reads in a fasta file of extracted antigenic sites and one containing a | |
| 3 reference flu antigenic map, reading them in as protein SeqRecords. Compares each amino | |
| 4 acid of each sample antigenic map to corresponding sites in the reference and replaces | |
| 5 identical amino acids with dots. Writes headers (including amino acid position numbers | |
| 6 read in from the respective index array), the reference amino acid sequence and column | |
| 7 headings required for non-aggregated line lists. Outputs headers and modified (i.e. dotted) | |
| 8 sequences to a csv file.''' | |
| 9 | |
| 10 '''Author: Diane Eisler, Molecular Microbiology & Genomics, BCCDC Public Health Laboratory, Nov 2017''' | |
| 11 | |
| 12 import sys,string,os, time, Bio, re, argparse | |
| 13 from Bio import Seq, SeqIO, SeqUtils, Alphabet, SeqRecord | |
| 14 from Bio.SeqRecord import SeqRecord | |
| 15 from Bio.Alphabet import IUPAC | |
| 16 from Bio.Seq import Seq | |
| 17 | |
| 18 inputAntigenicMaps = sys.argv[1] #batch fasta file with antigenic map sequences | |
| 19 refAntigenicMap = sys.argv[2] #fasta file of reference antigenic map sequence | |
| 20 antigenicSiteIndexArray = sys.argv[3] #antigenic site index array csv file | |
| 21 cladeDefinitionFile = sys.argv[4] #clade definition csv file | |
| 22 outFileHandle = sys.argv[5] #user-specifed output filename | |
| 23 | |
| 24 lineListFile = open(outFileHandle,'w') #open a writable output file | |
| 25 | |
| 26 indicesLine = "" #comma-separated antigenic site positions | |
| 27 cladeList = [] #list of clade names read from clade definition file | |
| 28 ref_seq = "" #reference antigenic map (protein sequence) | |
| 29 seqList = [] #list of aa sequences to compare to reference | |
| 30 | |
| 31 BC_list = [] #empty list for BC samples | |
| 32 AB_list = [] #empty list for AB samples | |
| 33 ON_list = [] #empty list for ON samples | |
| 34 QC_list = [] #empty list for QC samples | |
| 35 nonprov_list = [] #empty list for samples not in above 4 provinces | |
| 36 #dictionary for location-separated sequence lists | |
| 37 segregated_lists = {'1_BC':BC_list,'2_AB':AB_list,'3_ON':ON_list,'4_QC': QC_list, '5_nonprov': nonprov_list} | |
| 38 uniqueSeqs = {} #empty dict with unique seqs as keys and lists of SeqRecords as values | |
| 39 | |
| 40 def replace_matching_aa_with_dot(record): | |
| 41 """Compare amino acids in record to reference sequence, replace matching symbols | |
| 42 with dots, and return record with modified amino acid sequence.""" | |
| 43 orig_seq = str(record.seq) #get sequence string from SeqRecord | |
| 44 mod_seq = "" | |
| 45 #replace only those aa's matching the reference with dots | |
| 46 for i in range(0, len(orig_seq)): | |
| 47 if (orig_seq[i] == ref_seq[i]): | |
| 48 mod_seq = mod_seq + '.' | |
| 49 else: | |
| 50 mod_seq = mod_seq + orig_seq[i] | |
| 51 #assign modified sequence to new SeqRecord and return it | |
| 52 rec = SeqRecord(Seq(mod_seq,IUPAC.protein), id = record.id, name = "", description = "") | |
| 53 return rec | |
| 54 | |
| 55 def extract_clade(record): | |
| 56 """Extract clade name (or 'No_Match') from sequence name and return as clade name. """ | |
| 57 if record.id.endswith('No_Match'): | |
| 58 clade_name = 'No_Match' | |
| 59 end_index = record.id.index(clade_name) | |
| 60 record.id = record.id[:end_index -1] | |
| 61 return clade_name | |
| 62 else: # | |
| 63 for clade in cladeList: | |
| 64 if record.id.endswith(clade): | |
| 65 clade_name = clade | |
| 66 end_index = record.id.index(clade) | |
| 67 record.id = record.id[:end_index -1] | |
| 68 return clade_name | |
| 69 | |
| 70 def sort_by_location(record): | |
| 71 """Search sequence name for province name or 2 letter province code and add SeqRecord to | |
| 72 province-specific dictionary.""" | |
| 73 seq_name = record.id | |
| 74 if ('-BC-' in seq_name) or ('/British_Columbia/' in seq_name): | |
| 75 BC_list.append(record) #add Sequence record to BC_list | |
| 76 elif ('-AB-' in seq_name) or ('/Alberta/' in seq_name): | |
| 77 AB_list.append(record) #add Sequence record to AB_list | |
| 78 elif ('-ON-' in seq_name) or ('/Ontario/' in seq_name): | |
| 79 ON_list.append(record) #add Sequence record to ON_list | |
| 80 elif ('-QC-' in seq_name) or ('/Quebec/' in seq_name): | |
| 81 QC_list.append(record) #add Sequence record to QC_list | |
| 82 else: | |
| 83 nonprov_list.append(record) #add Sequence record to nonprov_list | |
| 84 return | |
| 85 | |
| 86 def extract_province(record): | |
| 87 """Search sequence name for province name or 2 letter province code and return province.""" | |
| 88 seq_name = record.id | |
| 89 if ('-BC-' in seq_name) or ('/British_Columbia/' in seq_name): | |
| 90 province = 'British Columbia' | |
| 91 elif ('-AB-' in seq_name) or ('Alberta' in seq_name): | |
| 92 province = '/Alberta/' | |
| 93 elif ('-ON-' in seq_name) or ('/Ontario/' in seq_name): | |
| 94 province = 'Ontario' | |
| 95 elif ('-QC-' in seq_name) or ('/Quebec/' in seq_name): | |
| 96 province = 'Quebec' | |
| 97 else: | |
| 98 province = "other" | |
| 99 return province | |
| 100 | |
| 101 def get_sequence_length(record): | |
| 102 """Return the length of a sequence in a Sequence record.""" | |
| 103 sequenceLength = len(str((record.seq))) | |
| 104 return sequenceLength | |
| 105 | |
| 106 def get_antigenic_site_substitutions(record): | |
| 107 """Count number of non-dotted amino acids in SeqRecord sequence and return as substitutions.""" | |
| 108 sequenceLength = get_sequence_length(record) | |
| 109 seqString = str(record.seq) | |
| 110 matches = seqString.count('.') | |
| 111 substitutions = sequenceLength - matches | |
| 112 return substitutions | |
| 113 | |
| 114 def calculate_percent_id(record, substitutions): | |
| 115 """Calculate sequence identity to a reference (based on substitutions and sequence length) and return percent id.""" | |
| 116 sequenceLength = get_sequence_length(record) | |
| 117 percentID = (1.00 - (float(substitutions)/float(sequenceLength))) | |
| 118 return percentID | |
| 119 | |
| 120 def output_linelist(sequenceList): | |
| 121 """Output a list of SeqRecords to a non-aggregated line list in csv format.""" | |
| 122 for record in sequenceList: | |
| 123 #get province, clade from sequence record | |
| 124 province = extract_province(record) | |
| 125 clade = extract_clade(record) | |
| 126 #calculate number of substitutions and % id to reference | |
| 127 substitutions = get_antigenic_site_substitutions(record) | |
| 128 percentID = calculate_percent_id(record,substitutions) | |
| 129 name_part = (record.id).rstrip() + ',' | |
| 130 clade_part = clade + ',' | |
| 131 substitutions_part = str(substitutions) + ',' | |
| 132 percID_part = str(percentID) + ',' | |
| 133 col = " ," #empty column | |
| 134 sequence = str(record.seq).strip() | |
| 135 csv_seq = ",".join(sequence) +"," | |
| 136 #write linelisted antigenic maps to csv file | |
| 137 comma_sep_line = name_part + col + clade_part + col + csv_seq + substitutions_part + percID_part + "\n" | |
| 138 lineListFile.write(comma_sep_line) | |
| 139 return | |
| 140 | |
| 141 with open (antigenicSiteIndexArray,'r') as siteIndices: | |
| 142 """Read amino acid positions from antigenic site index array and print as header after one empty row.""" | |
| 143 col = "," #empty column | |
| 144 #read items separated by comma's to position list | |
| 145 for line in siteIndices: | |
| 146 #remove whitespace from the end of each line | |
| 147 indicesLine = line.rstrip() | |
| 148 row1 = "\n" | |
| 149 #add comma-separated AA positions to header line | |
| 150 row2 = col + col + col + col + indicesLine + "\n" | |
| 151 #write first (empty) and 2nd (amino acid position) lines to linelist output file | |
| 152 lineListFile.write(row1) | |
| 153 lineListFile.write(row2) | |
| 154 | |
| 155 with open (refAntigenicMap,'r') as refMapFile: | |
| 156 """Read reference antigenic map from fasta and output amino acids, followed by column headers.""" | |
| 157 #read sequences from fasta to SeqRecord, uppercase, and store sequence string to ref_seq | |
| 158 record = SeqIO.read(refMapFile,"fasta",alphabet=IUPAC.protein) | |
| 159 record = record.upper() | |
| 160 ref_seq = str(record.seq).strip() #store sequence in variable for comparison to sample seqs | |
| 161 col = "," #empty column | |
| 162 name_part = (record.id).rstrip() + ',' | |
| 163 sequence = str(record.seq).strip() | |
| 164 csv_seq = ",".join(sequence) | |
| 165 #output row with reference sequence displayed above sample sequences | |
| 166 row3 = name_part + col + col + col + csv_seq + "\n" | |
| 167 lineListFile.write(row3) | |
| 168 #replaces digits in the indicesLine with empty strings | |
| 169 positions = indicesLine.split(',') | |
| 170 numPos = len(positions) | |
| 171 empty_indicesLine = ',' * numPos | |
| 172 #print column headers for sample sequences | |
| 173 row4 = "Sequence Name,N,Clade,Extra Substitutions," + empty_indicesLine + "Number of Amino Acid Substitutions in Antigenic Sites,% Identity of Antigenic Site Residues\n" | |
| 174 lineListFile.write(row4) | |
| 175 print ("\nREFERENCE ANTIGENIC MAP: '%s' (%i amino acids)" % (record.id, len(record))) | |
| 176 | |
| 177 with open(cladeDefinitionFile,'r') as cladeFile: | |
| 178 """Read clade definition file and store clade names in a list.""" | |
| 179 #remove whitespace from the end of each line and split elements at commas | |
| 180 for line in cladeFile: | |
| 181 elementList = line.rstrip().split(',') | |
| 182 name = elementList[0] #move 1st element to name field | |
| 183 cladeList.append(name) | |
| 184 | |
| 185 with open(inputAntigenicMaps,'r') as extrAntigMapFile: | |
| 186 """Read antigenic maps as protein SeqRecords and add to list.""" | |
| 187 #read Sequences from fasta file, uppercase and add to seqList | |
| 188 for record in SeqIO.parse(extrAntigMapFile, "fasta", alphabet=IUPAC.protein): | |
| 189 record = record.upper() | |
| 190 seqList.append(record) #add Seq to list of Sequences | |
| 191 | |
| 192 #print number of sequences to be process as user check | |
| 193 print "\nCOMPARING %i flu antigenic map sequences to the reference..." % len(seqList) | |
| 194 #parse each antigenic map sequence object | |
| 195 for record in seqList: | |
| 196 #assign Sequence to dictionaries according to location in name | |
| 197 sort_by_location(record) | |
| 198 #sort dictionary keys that access province-segregated lists | |
| 199 sorted_segregated_list_keys = sorted(segregated_lists.keys()) | |
| 200 print "\nSequence Lists Sorted by Province: " | |
| 201 #process each province-segregated SeqRecord list | |
| 202 for listname in sorted_segregated_list_keys: | |
| 203 #acesss list of sequences by the listname key | |
| 204 a_list = segregated_lists[listname] | |
| 205 # sort original SeqRecords by record id (i.e. name) | |
| 206 a_list = [f for f in sorted(a_list, key = lambda x : x.id)] | |
| 207 mod_list = [] # empty temporary list | |
| 208 for record in a_list: | |
| 209 #replace matching amino acid symbols with dots | |
| 210 rec = replace_matching_aa_with_dot(record) | |
| 211 mod_list.append(rec) #populate a list of modified records | |
| 212 segregated_lists[listname] = mod_list | |
| 213 print "\n'%s' List (Amino Acids identical to Reference Masked): " % (listname) | |
| 214 #output the list to csv as non-aggregated linelist | |
| 215 output_linelist(segregated_lists[listname]) | |
| 216 | |
| 217 extrAntigMapFile.close() | |
| 218 refMapFile.close() | |
| 219 lineListFile.close() |
