# HG changeset patch # User public-health-bioinformatics # Date 1547066012 18000 # Node ID 1dc65ec11a40d3ed1f2e46ee2fdadfd6f766d4ea # Parent 1f113d9db8ba59f0a3b30209df367fb7eb790a94 planemo upload for repository https://github.com/Public-Health-Bioinformatics/flu_classification_suite commit 856d0b7ab7dc801c168fcdf45cfd2e31f062a37e-dirty diff -r 1f113d9db8ba -r 1dc65ec11a40 README.md --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/README.md Wed Jan 09 15:33:32 2019 -0500 @@ -0,0 +1,24 @@ +# Flu Classification Suite +Influenza viruses continually evolve to evade population immunity. We have developed a publicly-available Galaxy workflow Flu Analysis Suite, for rapid clade-mapping of sequenced influenza viruses. This suite provides rapid, high-resolution understanding of circulating influenza strain evolution to inform influenza vaccine effectiveness and the need for potential vaccine reformulation. + +# Installation + +# Tools + +## Aggregate Line List +Transforms fasta files of flu antigenic site amino acids into aggregated line lists, comparing antigenic maps to that of a reference sequence and collapsing and enumerating identical sequences. + +## Antigenic Site Extraction +Extracts antigenic amino acids from flu sequence, using a specific index array (i.e. for H3, H1 etc.). + +## Assign Clades +Assign clade designations to influenza amino acid fasta files. + +## Change Fasta Deflines +Renames definition lines in fasta files. Requires a fasta file requiring sequence name changes and a 2-column renaming file (either tab-delimited text or csv). Searches for fasta definition lines matching column 1 and, if found, replaces fasta definition line with string specified in column 2 of the renaming file. + +## Line List +Transforms fasta files of flu antigenic site amino acids into line lists, comparing antigenic maps to that of a reference sequence. + +## Reformat USearch-Collapsed Fasta +Parses format of USearch-collapsed fasta output files and outputs fasta with customized definition line formatting. diff -r 1f113d9db8ba -r 1dc65ec11a40 assign_clades.py --- a/assign_clades.py Wed Jan 09 14:57:14 2019 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,133 +0,0 @@ -#!/usr/bin/env python - -'''Accepts fasta files containing amino acid sequence, reading them in as -amino acid sequence objects. Reads influenza clade defintions (i.e. amino -acids at certain positions) from .csv file into dictionary structure. Searches -each of the amino acid sequence objects for a list of matching clades, assigns -the most 'evolved' (i.e. child as opposed to parent clade) to the sequence. Appends -"_cladename" to the Sequence name and generates a fasta file of original sequences with -modified names.''' - -'''Author: Diane Eisler, Molecular Microbiology & Genomics, BCCDC Public Health Laboratory, Oct 2017''' - -import sys,string,os, time, Bio -from Bio import Seq, SeqIO, SeqUtils, Alphabet, SeqRecord -from Bio.SeqRecord import SeqRecord -from Bio.Alphabet import IUPAC -from Bio.Seq import Seq - -localtime = time.asctime(time.localtime(time.time())) #date and time of analysis -inFileHandle1 = sys.argv[1] #batch fasta file with sequences to be parsed -inFileHandle2 = sys.argv[2] # .csv file containing clade definitions and "depth" -outFileHandle = sys.argv[3] #user-specified name for output file of aa seq's with clade suffixes -outFile= open(outFileHandle,'w') #open a writable, appendable output file -seqList = [] #list of aa sequence objects to parse for clade definitions -cladeList = [] #empty list to hold clade tuples i.e. ("3C.3a", 1 ,{"3":"I", "9":"V"..}) - -'''Searches record for required amino acids at defined positions. If found, assigns -clade name to sequence name by appending underscore and clade name to record id.''' -def call_clade(record): - print "---------------------------------------------------------------------" - print "Parsing %s for matching flu clade definitions..." % (record.id) - matchList = [] #empty list to hold clades that match 100% - #iterate over each tuple in the clade list - for clade in cladeList: - cladeName = clade[0] #temp variable for name - depth = clade[1] #temp variable for depth - sites = clade[2] #temp variable for aa def dictionary - shouldFind = len(sites) #number of sites that should match - found = 0 #a counter to hold matches to antigenic sites - #iterate over each position in sites dictionary - for pos, aa in sites.iteritems(): - #translate pos to corresponding index in target sequence - index = int(pos) - 1 - #if record at index has same amino acid as 'aa', increment 'found' - if record[index] == aa: - found += 1 - if (found == shouldFind): - #add the matching clade tuple to the list of matches - matchList.append(clade) - return matchList - -'''Compares depth level of clades in a list and returns the most granular one''' -def decide_clade_by_depth(matchList): - #empty variable for maximum depth encountered - max_depth = 0 - best_match_name = '' #variable to hold most granular clade - #for each matching clade, check depth of the corresponding tuple - for clade in matchList: - #if the current clade is 'deeper' than the one before it - if clade[1] > max_depth: - #store this depth - max_depth = clade[1] - #store name of the clade - best_match_name = clade[0] - return best_match_name - -'''opens the .csv file of clade definitions and clade "depth" ''' -with open (inFileHandle2, 'r') as clade_file: - #remove whitespace from the end of each line and split elements at commas - for line in clade_file: - #print "Current Line in File:" + line - sites={} #initialize a dictionary for clade - elementList = line.rstrip().split(',') - new_list = [] #start a new list to put non-empty strings into - #remove empty stings in list - for item in elementList: - if item != '': - new_list.append(item) - name = new_list.pop(0) #move 1st element to name field - depth = int(new_list.pop(0)) #move 2nd element to depth field - #read remaining pairs of non-null elements into clade def dictionary - for i in range(0, len(new_list), 2): - #move next 2 items from the list into the dict - pos = new_list[i] - aa = new_list[i + 1] - sites[pos] = aa - #add the clade info as a tuple to the cladeList[] - oneClade =(name, depth, sites) - cladeList.append(oneClade) - print "The List of Clades:" - for clade in cladeList: - print "Clade Name: %s Depth: %i Antigenic Sites: %i" % (clade[0], clade[1], len(clade[2])) - for pos, aa in clade[2].iteritems(): - print "Pos: %s\tAA: %s" % (pos,aa) - -'''opens readable input file of sequences to parse using filename from cmd line, - instantiates as AA Sequence objects, with ppercase sequences''' -with open(inFileHandle1,'r') as inFile: - #read in Sequences from fasta file, uppercase and add to seqList - for record in SeqIO.parse(inFile, "fasta", alphabet=IUPAC.protein): - record = record.upper() - seqList.append(record) #add Seq to list of Sequences - print "\n%i flu HA sequences will be compared to current clade definitions..." % len(seqList) - #parse each target sequence object - for record in seqList: - clade_call = '' #empty variale for final clade call on sequence - matchingCladeList = call_clade(record) #holds matching clade tuples - #if there is more than one clade match - if len(matchingCladeList) > 1: - #choose the most granular clade based on depth - clade_call = decide_clade_by_depth(matchingCladeList) - #if there is only one clade call - elif len(matchingCladeList) > 0: - clade = matchingCladeList[0] #take the first tuple in the list - clade_call = clade[0] #clade name is the first item in the tuple - #empty list return, no matches - else: - clade_call = "No_Match" - print clade_call - seq_name = record.id - mod_name = seq_name + "_" + clade_call - print "New Sequence Name: " + mod_name - record.id = mod_name - - -#output fasta file with clade calls appended to sequence names -SeqIO.write(seqList,outFile,"fasta") - -#print("\n%i Sequences Extracted to Output file: %s" % ((len(extractedSeqList),outFileHandle))) -inFile.close() -clade_file.close() -outFile.close() - diff -r 1f113d9db8ba -r 1dc65ec11a40 assign_clades.xml --- a/assign_clades.xml Wed Jan 09 14:57:14 2019 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,29 +0,0 @@ - - - biopython - - - - - - - - - - - - - - - - - - - - diff -r 1f113d9db8ba -r 1dc65ec11a40 test-data/clades.csv --- a/test-data/clades.csv Wed Jan 09 14:57:14 2019 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,15 +0,0 @@ -3C.2a,1,3,I,144,S,145,S,159,Y,160,T,225,D,311,H,489,N,,,,,,,,,,,,,, -3C.2a_+_T131K_+_R142K_+_R261Q,2,3,I,131,K,142,K,144,S,145,S,159,Y,160,T,225,D,261,Q,311,H,489,N,,,,,,,, -3C.2a_+_N121K_+_S144K,2,3,I,121,K,144,K,145,S,159,Y,160,T,225,D,311,H,489,N,,,,,,,,,,,, -3C.2a_+_Q197K_+_R261Q,2,3,I,144,S,145,S,159,Y,160,T,197,K,225,D,261,Q,311,H,489,N,,,,,,,,,, -3C.2a_+_N31S_+_D53N_+_R142G_+_S144R_+_N171K_+_I192T_+_Q197H_,2,3,I,31,S,53,N,142,G,144,R,145,S,159,Y,160,T,171,K,192,T,197,H,225,D,311,H,489,N,, -3C.2a1_(w/o_N121K),3,3,I,144,S,145,S,159,Y,160,T,171,K,225,D,311,H,406,V,484,E,489,N,,,,,,,, -3C.2a1_+_N121K,4,3,I,121,K,144,S,145,S,159,Y,160,T,171,K,225,D,311,H,406,V,484,E,489,N,,,,,, -3C.2a1_+_N121K_+_R142G_+_I242V,4,3,I,121,K,142,G,144,S,145,S,159,Y,160,T,171,K,225,D,242,V,311,H,406,V,479,E,484,E,489,N -3C.2a1_+_N121K_+_R142G,4,3,I,121,K,142,G,144,S,145,S,159,Y,160,T,171,K,225,D,311,H,406,V,484,E,489,N,,,, -3C.2a1_+_N121K_+_T135K,4,3,I,121,K,135,K,144,S,145,S,159,Y,160,T,171,K,225,D,311,H,406,V,479,E,484,E,489,N,, -3C.2a1_+_N121K_+_K92R_+_H311Q,4,3,I,92,R,121,K,144,S,145,S,159,Y,160,T,171,K,225,D,311,Q,406,V,484,E,489,N,,,, -3C.2a1_+_N121K_+_I140M,4,3,I,121,K,140,M,144,S,145,S,159,Y,160,T,171,K,225,D,311,H,406,V,479,E,484,E,489,N,, -3C.2a1_+_R142G,4,3,I,142,G,144,S,145,S,159,Y,160,T,171,K,225,D,311,H,406,V,484,E,489,N,,,,,, -3C.2a1_+_S47T_+_G78S,4,3,I,47,T,78,S,144,S,145,S,159,Y,160,T,171,K,225,D,311,H,406,V,484,E,489,N,,,, -3C.3a,1,128,A,138,S,142,G,145,S,159,S,225,D,,,,,,,,,,,,,,,,,, diff -r 1f113d9db8ba -r 1dc65ec11a40 test-data/output.fa --- a/test-data/output.fa Wed Jan 09 14:57:14 2019 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,10 +0,0 @@ ->test_3C.3a test -QKIPGNDNSTATLCLGHHAVPNGTIVKTITNDRIEVTNATELVQNSSIGEICDSPHQILD -GENCTLIDALLGDPQCDGFQNKKWDLFVERNKAYSSCYPYDVPDYASLRSLVASSGTLEF -NNESFNWAGVTQNGTSSSCIRGSKSSFFSRLNWLTHLNSKYPALNVTMPNNEQFDKLYIW -GVHHPGTDKDQISLYAQSSGRITVSTKRSQQAVIPNIGSRPRIRDIPSRISIYWTIVKPG -DILLINSTGNLIAPRGYFKIRSGKSSIMRSDAPIGKCKSECITPNGSIPNDKPFQNVNRI -TYGACPRYVKQSTLKLATGMRNVPERQTRGIFGAIAGFIENGWEGMVDGWYGFRHQNSEG -RGQAADLKSTQAAIDQINGKLNRLIGKTNEKFHQIEKEFSEVEGRIQDLEKYVEDTKIDL -WSYNAELLVALENQHTIDLTDSEMNKLFEKTKKQLRENAEDMGNGCFKIYHKCDNACIGS -IRNGTYDHNVYRDEALNNRFQIKGVELKSGYKDW diff -r 1f113d9db8ba -r 1dc65ec11a40 test-data/test_input.fasta --- a/test-data/test_input.fasta Wed Jan 09 14:57:14 2019 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,2 +0,0 @@ ->test -QKIPGNDNSTATLCLGHHAVPNGTIVKTITNDRIEVTNATELVQNSSIGEICDSPHQILDGENCTLIDALLGDPQCDGFQNKKWDLFVERNKAYSSCYPYDVPDYASLRSLVASSGTLEFNNESFNWAGVTQNGTSSSCIRGSKSSFFSRLNWLTHLNSKYPALNVTMPNNEQFDKLYIWGVHHPGTDKDQISLYAQSSGRITVSTKRSQQAVIPNIGSRPRIRDIPSRISIYWTIVKPGDILLINSTGNLIAPRGYFKIRSGKSSIMRSDAPIGKCKSECITPNGSIPNDKPFQNVNRITYGACPRYVKQSTLKLATGMRNVPERQTRGIFGAIAGFIENGWEGMVDGWYGFRHQNSEGRGQAADLKSTQAAIDQINGKLNRLIGKTNEKFHQIEKEFSEVEGRIQDLEKYVEDTKIDLWSYNAELLVALENQHTIDLTDSEMNKLFEKTKKQLRENAEDMGNGCFKIYHKCDNACIGSIRNGTYDHNVYRDEALNNRFQIKGVELKSGYKDW diff -r 1f113d9db8ba -r 1dc65ec11a40 tools/aggregate_linelisting/aggregate_linelisting.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/tools/aggregate_linelisting/aggregate_linelisting.py Wed Jan 09 15:33:32 2019 -0500 @@ -0,0 +1,279 @@ +#!/usr/bin/env python +'''Reads in a fasta file of antigenic maps and one with the reference antigenic map as +protein SeqRecords. Compares amino acids of sample antigenic maps to corresponding sites +in the reference and masks identical amino acids with dots. Writes headers (including +amino acid position numbers read from the respective index array), the reference amino +acid sequence and column headings required for both non-aggregated and aggregated line lists. +Outputs all headers and modified (i.e. dotted) sample sequences to a csv file.''' + +'''Author: Diane Eisler, Molecular Microbiology & Genomics, BCCDC Public Health Laboratory, Jan 2018''' + +import sys,string,os, time, Bio, re, argparse +from Bio import Seq, SeqIO, SeqUtils, Alphabet, SeqRecord +from Bio.SeqRecord import SeqRecord +from Bio.Alphabet import IUPAC +from Bio.Seq import Seq + +inputAntigenicMaps = sys.argv[1] #batch fasta file with antigenic map sequences +refAntigenicMap = sys.argv[2] #fasta file of reference antigenic map sequence +antigenicSiteIndexArray = sys.argv[3] #antigenic site index array csv file +cladeDefinitionFile = sys.argv[4] #clade definition csv file +outFileHandle = sys.argv[5] #user-specifed output filename + +agg_lineListFile = open(outFileHandle,'w') #open a writable output file + +indicesLine = "" #comma-separated antigenic site positions +cladeList = [] #list of clade names read from clade definition file +ref_seq = "" #reference antigenic map (protein sequence) +seqList = [] #list of aa sequences to compare to reference + +BC_list = [] #empty list for BC samples +AB_list = [] #empty list for AB samples +ON_list = [] #empty list for ON samples +QC_list = [] #empty list for QC samples +nonprov_list = [] #empty list for samples not in above 4 provinces +#dictionary for location-separated sequence lists +prov_lists = {'1_BC':BC_list,'2_AB':AB_list,'3_ON':ON_list,'4_QC': QC_list, '5_nonprov': nonprov_list} + +def replace_matching_aa_with_dot(record): + """Compare amino acids in record to reference, mask identical symbols with dots, and return modified record.""" + orig_seq = str(record.seq) #sequence string from SeqRecord + mod_seq = "" + #replace only those aa's matching the reference with dots + for i in range(0, len(orig_seq)): + if (orig_seq[i] == ref_seq[i]): + mod_seq = mod_seq + '.' + else: + mod_seq = mod_seq + orig_seq[i] + #assign modified sequence to new SeqRecord and return it + rec = SeqRecord(Seq(mod_seq,IUPAC.protein), id = record.id, name = "", description = "") + return rec + +def extract_clade(record): + """Extract clade name (or 'No_Match') from sequence name and return as clade name. """ + if record.id.endswith('No_Match'): + clade_name = 'No_Match' + else: # + for clade in cladeList: + if record.id.endswith(clade): + clade_name = clade + return clade_name + +def extract_sample_name(record, clade): + """Extract sample name from sequence name and return sample name. """ + end_index = record.id.index(clade) + sample_name = record.id[:end_index -1] + #return sample name as sequence name minus underscore and clade name + return sample_name + +def sort_by_location(record): + """Search sequence name for province name or 2-letter province code and add SeqRecord to + province-specific dictionary.""" + seq_name = record.id + if ('-BC-' in seq_name) or ('/British_Columbia/' in seq_name): + BC_list.append(record) #add Sequence record to BC_list + elif ('-AB-' in seq_name) or ('/Alberta/' in seq_name): + AB_list.append(record) #add Sequence record to AB_list + elif ('-ON-' in seq_name) or ('/Ontario/' in seq_name): + ON_list.append(record) #add Sequence record to ON_list + elif ('-QC-' in seq_name) or ('/Quebec/' in seq_name): + QC_list.append(record) #add Sequence record to QC_list + else: + nonprov_list.append(record) #add Sequence record to nonprov_list + return + +def extract_province(record): + """Search sequence name for province name or 2-letter province code and return province.""" + seq_name = record.id + if ('-BC-' in seq_name) or ('/British_Columbia/' in seq_name): + province = 'British Columbia' + elif ('-AB-' in seq_name) or ('Alberta' in seq_name): + province = '/Alberta/' + elif ('-ON-' in seq_name) or ('/Ontario/' in seq_name): + province = 'Ontario' + elif ('-QC-' in seq_name) or ('/Quebec/' in seq_name): + province = 'Quebec' + else: + province = "other" + return province + +def get_sequence_length(record): + """Return length of sequence in a SeqRecord.""" + sequenceLength = len(str((record.seq))) + return sequenceLength + +def get_antigenic_site_substitutions(record): + """Count number of non-dotted amino acids in SeqRecord sequence and return as substitutions.""" + sequenceLength = get_sequence_length(record) + seqString = str(record.seq) + matches = seqString.count('.') + substitutions = sequenceLength - matches + return substitutions + +def calculate_percent_id(record, substitutions): + """Calculate percent sequence identity to reference sequence, based on substitutions +and sequence length and return percent id as a ratio (i.e. 0.90 no 90%).""" + sequenceLength = get_sequence_length(record) + percentID = (1.00 - (float(substitutions)/float(sequenceLength))) + return percentID + +def output_aggregated_linelist(a_list): + """Output aggregated line list of SeqRecords in csv format.""" + sequevars = {} #dict of sequevar: SeqRecord list + firstRecordID = None + #examine dotted/masked sequences in list and assign unique ones as dict keys + for rec in a_list: + rec = replace_matching_aa_with_dot(rec) + sequence =str(rec.seq) + #if the sequence is a key in the dict, add SeqRecord to list + if sequence in sequevars: + #if sequence already in dict as a key, increment the value + sequevars[sequence].append(rec) + else: + #if sequence not in dict, add is as new key with list of 1 SeqRecord + sequevars[sequence] = [rec] + #get list of sorted unique sequence keys + sorted_unique_seq_keys = sorted(sequevars.keys()) + #process each list of SeqRecords sharing a unique sequence + for u in sorted_unique_seq_keys: + #access list of sequences by unique sequence + listOfSeqs = sequevars[u] + #sort this list of SeqRecords by record.id (i.e. name) + listOfSeqs = [f for f in sorted(listOfSeqs, key = lambda x : x.id)] + N = len(listOfSeqs) + #output details of first SeqRecord to csv + firstRecord = listOfSeqs[0] + province = extract_province(firstRecord) + clade = extract_clade(firstRecord) + substitutions = get_antigenic_site_substitutions(firstRecord) + percentID = calculate_percent_id(firstRecord,substitutions) + name = extract_sample_name(firstRecord, clade) + name_part = name.rstrip() + ',' + N_part = str(N) + ',' + clade_part = clade + ',' + substitutions_part = str(substitutions) + ',' + percID_part = str(percentID) + ',' + col = " ," #empty column + sequence = str(firstRecord.seq).strip() + csv_seq = ",".join(sequence) +"," + comma_sep_output = name_part + N_part + clade_part + col + csv_seq + substitutions_part + percID_part + "\n" + #write first member of unique sequence list to csv + agg_lineListFile.write(comma_sep_output) + #print sequence records in sequevar to console + print "\n\t\t%i SeqRecords matching Sequevar: %s" % (len(listOfSeqs), u) + + #to uncollapse sequevar group, print each member of the sequevar list to csv output + '''for i in range(1,len(listOfSeqs)): + currentRec = listOfSeqs[i] + province = extract_province(currentRec) + clade = extract_clade(currentRec) + substitutions = get_antigenic_site_substitutions(currentRec) + percentID = calculate_percent_id(currentRec,substitutions) + name_part = (currentRec.id).rstrip() + ',' + N_part = "n/a" + ',' + clade_part = clade + ',' + substitutions_part = str(substitutions) + ',' + percID_part = str(percentID) + ',' + col = " ," #empty column + sequence = str(currentRec.seq).strip() + csv_seq = ",".join(sequence) +"," + comma_sep_output = name_part + N_part + clade_part + col + csv_seq + substitutions_part + percID_part + "\n" + agg_lineListFile.write(comma_sep_output) ''' + return + +with open (antigenicSiteIndexArray,'r') as siteIndices: + """Read amino acid positions from antigenic site index array and print as header after one empty row.""" + col = "," #empty column + #read amino acid positions and remove trailing whitespace + for line in siteIndices: + #remove whitespace from the end of each line + indicesLine = line.rstrip() + row1 = "\n" + #add comma-separated AA positions to header line + row2 = col + col + col + col + indicesLine + "\n" + #write first (empty) and 2nd (amino acid position) lines to output file + agg_lineListFile.write(row1) + agg_lineListFile.write(row2) + +with open (refAntigenicMap,'r') as refMapFile: + """Read reference antigenic map from fasta and output amino acids, followed by column headers.""" + #read sequences from fasta to SeqRecord, uppercase, and store sequence string to ref_seq + record = SeqIO.read(refMapFile,"fasta",alphabet=IUPAC.protein) + record = record.upper() + ref_seq = str(record.seq).strip() #store sequence in variable for comparison to sample seqs + col = "," #empty column + name_part = (record.id).rstrip() + ',' + sequence = str(record.seq).strip() + csv_seq = ",".join(sequence) + #output row with reference sequence displayed above sample sequences + row3 = name_part + col + col + col + csv_seq + "\n" + agg_lineListFile.write(row3) + positions = indicesLine.split(',') + numPos = len(positions) + empty_indicesLine = ',' * numPos + #print column headers for sample sequences + row4 = "Sequence Name,N,Clade,Extra Substitutions," + empty_indicesLine + "Number of Amino Acid Substitutions in Antigenic Sites,% Identity of Antigenic Site Residues\n" + agg_lineListFile.write(row4) + print ("\nREFERENCE ANTIGENIC MAP: '%s' (%i amino acids)" % (record.id, len(record))) + +with open(cladeDefinitionFile,'r') as cladeFile: + """Read clade definition file and store clade names in list.""" + #remove whitespace from the end of each line and split elements at commas + for line in cladeFile: + elementList = line.rstrip().split(',') + name = elementList[0] #move 1st element to name field + cladeList.append(name) + +with open(inputAntigenicMaps,'r') as extrAntigMapFile: + """Read antigenic maps as protein SeqRecords and add to list.""" + #read Sequences from fasta file, uppercase and add to seqList + for record in SeqIO.parse(extrAntigMapFile, "fasta", alphabet=IUPAC.protein): + record = record.upper() + seqList.append(record) #add Seq to list of Sequences + +#print number of sequences to be processed as user check +print "\nCOMPARING %i flu antigenic map sequences to the reference..." % len(seqList) +for record in seqList: + #assign SeqRecords to province-specific dictionaries + sort_by_location(record) + +#access prov segregated lists in order +sorted_prov_keys = sorted(prov_lists.keys()) +print "\nSequence Lists Sorted by Province: " +for prov in sorted_prov_keys: + current_list = prov_lists[prov] + #mask AA's identical to reference sequence with dot + masked_list = [] # empty temporary list to park masked sequences + for record in current_list: + masked_rec = replace_matching_aa_with_dot(record) + masked_list.append(masked_rec) + prov_lists[prov] = masked_list #replace original SeqRecord list with masked list + +#group sequences in province-sorted list into clades +for prov in sorted_prov_keys: + prov_list = prov_lists[prov] + by_clades_dict = {} #empty dict for clade:seqRecord list groups + print "\n'%s' List (Amino Acids identical to Reference are Masked): " % (prov) + for rec in prov_list: + clade = extract_clade(rec) + if clade in by_clades_dict: + #if clade already in dict as key, append record to list (value) + by_clades_dict[clade].append(rec) + else: #add clade as key to dict, value is list of 1 SeqRecord + by_clades_dict[clade] = [rec] + #get list of alphabetically sorted clade keys + sorted_clade_keys = sorted(by_clades_dict.keys()) + print "\tNumber of clades: ", len(by_clades_dict) + #group each list of sequences in clade by sequevars + for key in sorted_clade_keys: + print "\n\tCLADE: %s Number of Members: %i" % (key, len(by_clades_dict[key])) + a_list = by_clades_dict[key] + for seqrec in a_list: + print "\t %s: %s" %(seqrec.id,str(seqrec.seq)) + #output the list to csv as aggregated linelist + output_aggregated_linelist(a_list) + +print("Aggregated Linelist written to file: '%s\n'" % (outFileHandle)) +extrAntigMapFile.close() +refMapFile.close() +agg_lineListFile.close() diff -r 1f113d9db8ba -r 1dc65ec11a40 tools/aggregate_linelisting/aggregate_linelisting.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/tools/aggregate_linelisting/aggregate_linelisting.xml Wed Jan 09 15:33:32 2019 -0500 @@ -0,0 +1,35 @@ + + + biopython + + + + + + + + + + + + + + + + + + + + + + + + diff -r 1f113d9db8ba -r 1dc65ec11a40 tools/aggregate_linelisting/test-data/FluA_H3_antigenic_aa_indices.csv --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/tools/aggregate_linelisting/test-data/FluA_H3_antigenic_aa_indices.csv Wed Jan 09 15:33:32 2019 -0500 @@ -0,0 +1,1 @@ +44,45,46,47,48,50,51,53,54,57,59,62,63,67,75,78,80,81,82,83,86,87,88,91,92,94,96,102,103,109,117,121,122,124,126,128,129,130,131,132,133,135,137,138,140,142,143,144,145,146,150,152,155,156,157,158,159,160,163,164,165,167,168,170,171,172,173,174,175,176,177,179,182,186,187,188,189,190,192,193,194,196,197,198,201,203,207,208,209,212,213,214,215,216,217,218,219,226,227,228,229,230,238,240,242,244,246,247,248,260,261,262,265,273,275,276,278,279,280,294,297,299,300,304,305,307,308,309,310,311,312 diff -r 1f113d9db8ba -r 1dc65ec11a40 tools/aggregate_linelisting/test-data/Flu_Clade_Definitions_H3_20171121.csv --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/tools/aggregate_linelisting/test-data/Flu_Clade_Definitions_H3_20171121.csv Wed Jan 09 15:33:32 2019 -0500 @@ -0,0 +1,12 @@ +3C.2a,1,3,I,144,S,145,S,159,Y,160,T,225,D,311,H,489,N,,,,,,,,,,,,,,,,,,,, +3C.2a_+_T131K_+_R142K_+_R261Q,2,3,I,131,K,142,K,144,S,145,S,159,Y,160,T,225,D,261,Q,311,H,489,N,,,,,,,,,,,,,, +3C.2a_+_N121K_+_S144K,2,3,I,121,K,144,K,145,S,159,Y,160,T,225,D,311,H,489,N,,,,,,,,,,,,,,,,,, +3C.2a_+_N31S_+_D53N_+_R142G_+_S144R_+_N171K_+_I192T_+_Q197H,2,3,I,31,S,53,N,142,G,144,R,145,S,159,Y,160,T,171,K,192,T,197,H,225,D,311,H,489,N,,,,,,,, +3C.2a1,2,3,I,144,S,145,S,159,Y,160,T,171,K,225,D,311,H,406,V,484,E,489,N,,,,,,,,,,,,,, +3C.2a1_+_N121K,3,3,I,121,K,144,S,145,S,159,Y,160,T,171,K,225,D,311,H,406,V,484,E,489,N,,,,,,,,,,,, +3C.2a1_+_N121K_+_K92R_+_H311Q,4,3,I,92,R,121,K,144,S,145,S,159,Y,160,T,171,K,225,D,311,Q,406,V,484,E,489,N,,,,,,,,,, +3C.2a1_+_N121K_+_T135K,4,3,I,121,K,135,K,144,S,145,S,159,Y,160,T,171,K,225,D,311,H,406,V,479,E,484,E,489,N,,,,,,,, +3C.2a1_+_N121K_+_I140M,4,3,I,121,K,140,M,144,S,145,S,159,Y,160,T,171,K,225,D,311,H,406,V,479,E,484,E,489,N,,,,,,,, +3C.2a1_+_N121K_+_R142G,4,3,I,121,K,142,G,144,S,145,S,159,Y,160,T,171,K,225,D,311,H,406,V,484,E,489,N,,,,,,,,,, +3C.2a1_+_N121K_+_R142G_+_I242V,5,3,I,121,K,142,G,144,S,145,S,159,Y,160,T,171,K,225,D,242,V,311,H,406,V,479,E,484,E,489,N,,,,,, +3C.3a,1,128,A,138,S,142,G,145,S,159,S,225,D,,,,,,,,,,,,,,,,,,,,,,,, diff -r 1f113d9db8ba -r 1dc65ec11a40 tools/aggregate_linelisting/test-data/MAP_3C.2a_A_Hong_Kong_4801_2014_X-263B_EGG.fasta --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/tools/aggregate_linelisting/test-data/MAP_3C.2a_A_Hong_Kong_4801_2014_X-263B_EGG.fasta Wed Jan 09 15:33:32 2019 -0500 @@ -0,0 +1,4 @@ +>Clade_3C.2a_A/Hong_Kong/4801/2014_X-263B_EGG +QNSSIEIDSQLENIQGQNKKLFVSKYSVPRTNNSNTGVTQNTSAIRSSSSRNTHLNYKAL +NTMNNEQFDKLIVGTDKDIFPAQSRXKRSAVIPNIGSIPSRIKGILNSTIRSSPGKKSEF +VRIACRYVKHS diff -r 1f113d9db8ba -r 1dc65ec11a40 tools/aggregate_linelisting/test-data/fluA_H3_clade_assigned_antigenic_sites_extracted.fasta --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/tools/aggregate_linelisting/test-data/fluA_H3_clade_assigned_antigenic_sites_extracted.fasta Wed Jan 09 15:33:32 2019 -0500 @@ -0,0 +1,94 @@ +>A-ON-314-2017_3C.3a +QNSSIEIDSQLENIQGQNKKLFVNKYNVPRTNNSNAGVTQNTSSIGSKSSRNTHLNSKAL +NTMNNEQFDKLIVGTDKDISLAQSRTKRSAVIPNIGSIPSRIKGILNSTIRSSPGKKSEF +VRIACRYVKQS + +>A-AB-399-2017_3C.2a_+_N31S_+_D53N_+_R142G_+_S144R_+_N171K_+_I192T_+_Q197H +QNSSIEINSQLENIQGQNKKLFVSKYNVPRTNNSNTGVTQNTSAIGSRSSRNTHLNYTAL +NTMNKEQFDKLIVGTDKDTFLAHSRTKRSAVIPNIGSIPSRIKGLLNSTIRSSPGKKSEF +VRIACRYVKHS + +>A-QC-303-2017_3C.2a_+_N31S_+_D53N_+_R142G_+_S144R_+_N171K_+_I192T_+_Q197H +QNSSIEINSQLENIQGQNKKLFVSKYNVPRTNNSNTGVTQNTSAIGSRSSRNTHLNYTAL +NTMNKEQFDKLIVGTDKDTFLAHSRTKRSAVIPNIGSIPSRIKGLLNSTIRSSPGKKSEF +VRIACRYVKHS + +>A-AB-319-2017_3C.2a_+_N31S_+_D53N_+_R142G_+_S144R_+_N171K_+_I192T_+_Q197H +QNSSIEINSQLENIQGQNKKLFVSKYNVPRTNNSNTGVTQNTSAIGSRSSRNTHLNYTAL +NTMNKEQFDKLIVGTDKDTFLAHSRTKRSAVIPNIGSIPSRIKGLLNSTIRSSPGKKSEF +VRIACRYVKHS + +>A-AB-308-2017_3C.2a1_+_N121K_+_T135K +QNSSIEIDSQLENIQDQNKKLFVSKHNVPRTKDSNTGVTQNKSAIRSSSSRNTHLNYTAL +NTMNKEQFDKLIIGTDKDIFLAQSRTKRSAVIPNIGSIPSRIKGILNSTIRSSPGKKSEF +VRIACRYVKHS + +>A-AB-341-2017_3C.2a1_+_N121K_+_T135K +QNSSIEIDSQLENIQDQNKKLFVSKHNVPRTKDSNTGVTQNKSAIRSSSSRNTHLNYTAL +NTMNKEQFDKLIIGTDKDIFLAQSRTKRSAVIPNIGSIPSRIKGILNSTIRSSPGKKSEF +VRIACRYVKHS + +>A-BC-024-2018_3C.2a1_+_N121K_+_T135K +QNSSIEIDSQLENIQDQNKKLFVSKHNVPRTKNSNTGVTQNKSAIRSSSSRNTHLNYTAL +NTMNKEQFDKLIIGTDKDIFLAQSRTKRSAVIPNIGSIPSRIKGILNSTIRSSPGKKSEF +VRIACRYVKHS + +>A-QC-309-2017_3C.2a1_+_N121K_+_K92R_+_H311Q +QNSSIEIDSQLGNIQDQNKKLFVSRYNVPRTKDSNTGVTQNKSAIGSSSSRNTHLNYTAL +NTMNKEQFDKLIVGTDKDIFLAQSRTKRSAVIPNIGSIPSRIKGILNSTIRSSPGKKSEF +VRIACRYVKQS + +>A-BC-324-2017_3C.2a1_+_N121K_+_K92R_+_H311Q +QNSSMEIDSQLGNIQGQNKKLFVSRYNVPRTKNSNTGVTQNKSAIGSSSSRNTHLNYTAL +NTMNKEQFDKLIVGTDKDIFLAQSRTKRSAVIPNIGSIPSRIKGILNSTIRSSPGKKSEF +VRIACRYVKQS + +>A-QC-315-2017_3C.2a1_+_N121K_+_K92R_+_H311Q +QNSSIEIDSQLGNIQGQNKKLFVSRYNVPRTKNSNAGVTQNKSAIGSSSSRNTHLNYTAL +NTMNKEQFDKLIVGTDKDIFLAQSRTKRSAVIPNIGSIPSRIKGILNSTIRSSPGKKSEF +VRIACRYVKQS + +>A-ON-016-2018_3C.2a_+_N121K_+_S144K +QNSSIEIDSQLENIQGQNKKLFVSKYNVPRTKNSNTGVTQNKSAIRSKSSKNTHLNYTAL +NTMNNEQFDKLIVGTDKDIFLAQSRTKRSAVIPNIGSIPSRIKGILNSTIQSSPGKKSEF +VRIACRYVKHS + +>A-BC-325-2017_3C.2a_+_N121K_+_S144K +QNSSIEIDSQLENIQGQNKKLFVSKYNVPRTKNSNTGVTQNKSAIRSKSSKNTHLNYTAL +NTMNNEQFDKLIVGTDKDIFLAQSRTKRSAVIPNIGSIPSRIKGILNSTIQSSPGKKSEF +VRIACRYVKHS + +>A-ON-003-2018_3C.2a_+_N121K_+_S144K +QNSSIEIDSQLENIQGQNKKLFVSKYNVPRTKNSNTGVTQNKSAIRSKSSKNTHLNYTAL +NTMNNEQFDKLIVGTDKDIFLAQSRTKRSAVIPNIGSIPSRIKGILNSTIQSSPGKKSEF +VRIACRYVKHS + +>A-ON-309-2017_No_Match +QNSSIEIDSQLENIQGQNKKLFVSKYNVPRTNNSNTGVKQNTSAIKSSSSRNTHLNYKAL +NTMNNEQFDKLIVGTDKDIFLAQSKTKISAVIPNIGSIPSRIKGILNSTIQSSPGKKSEF +VRIACRYVKHS + +>A-BC-330-2017_No_Match +QNSSIEIDSQLENIQGQNKKLFVSKYNVPRTNNSNTGVKQNTSAIKSRSSRNTHLNYTAL +NTMNNEQFDKLIVGTDKDIFLAQSRTKRSAVIPNIGSIPSRIKGILNSTIQSSPGKKSEF +VRIACRYVKHS + +>A-AB-415-2017_3C.2a_+_T131K_+_R142K_+_R261Q +QNSSIEIDSQLENIQGQNKKLFVSRYNVPRTNNSNTGVKQNTSAIKSSSSRNTHLNYTAL +NTMNNEQFDKLIVGTDKDIFLAQSRTKRSAVIPNIGFIPSRIKGILNSTIQSSPGKKSEF +VRIACRYVKHS + +>A-AB-400-2017_3C.2a_+_T131K_+_R142K_+_R261Q +QNSSIEIDSQLENIQGQNKKLFVSRYNVPRTNNSNTGVKQNTSAIKSSSSRNTHLNYTAL +NTMNNEQFDKLIVGTDKDIFLAQSRTKRSAVIPNIGSIPSRIKGILNSTIQSSPGKKSEF +VRIACRYVKHS + +>A-AB-416-2017_3C.2a_+_T131K_+_R142K_+_R261Q +QNSSIEIDSQLENIQGQNKKLFVSRYNVPRTNNSNTGVKQNTSAIKSSSSRNTHLNYTAL +NTMNNEQFDKLIVGTDKDIFLAQSRTKRSAVIPNIGSIPSRIKGILNSTIQSSPGKKSEF +VRIACRYVKHS + +>A-QC-316-2017_3C.2a_+_T131K_+_R142K_+_R261Q +QNSSIEIDSQLENIQGQNKKLFVSRYNVPRTNNSNTGVKQNTSAIKSSSSRNTHLNYTAL +NTMNNEQFDKLIVGTDKDIFLAQSRTKRSAVIPNIGSIPSRIKGILNSTIQSSPGKKSEF +VRIACRYVKHS diff -r 1f113d9db8ba -r 1dc65ec11a40 tools/antigenic_site_extraction/antigenic_site_extraction.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/tools/antigenic_site_extraction/antigenic_site_extraction.py Wed Jan 09 15:33:32 2019 -0500 @@ -0,0 +1,95 @@ +#!/usr/bin/env python + +'''Accepts fasta files of amino acid sequence, extracts specific amino acids (defined in a csv index array), +and outputs extracted sequences - representing flu antigenic sites - to fasta (default) or csv.''' + +'''Author: Diane Eisler, Molecular Microbiology & Genomics, BCCDC Public Health Laboratory,Sept 2017''' + +import sys,string,os, time, Bio, argparse +from Bio import Seq, SeqIO, SeqUtils, Alphabet, SeqRecord +from Bio.SeqRecord import SeqRecord +from Bio.Alphabet import IUPAC +from Bio.Seq import Seq + +#parse command line arguments +parser = argparse.ArgumentParser() +parser.add_argument("-c","--csv",help="export extracted antigenic sites to csv file",action="store_true") +parser.add_argument("inFileHandle1") #batch fasta file with sequences to be parsed +parser.add_argument("inFileHandle2") # .csv file containing positions of aa's to extract +parser.add_argument("outFileHandle") #user-specified name for output file of extracted aa seq's +args = parser.parse_args() + +#inFileHandle1 = sys.argv[1] #batch fasta file with sequences to be parsed +#inFileHandle2 = sys.argv[2] # .csv file containing positions of aa's to extract +#outFileHandle = sys.argv[3] #user-specified name for output file of extracted aa seq's + +outFile= open(args.outFileHandle,'w') #open a writable, appendable output file +localtime = time.asctime(time.localtime(time.time())) #date and time of analysis +seqList = [] #list of aa sequence objects to parse for oligo sequences +indexArray = [] # .csv list of aa's corresponding to antigenic site positions +extractedSeqList = [] #list of extracted antigenic sites extracted from seqList + +def extract_aa_from_sequence(record): + """Extract specific amino acids from SeqRecord, create new SeqRecord and append to list.""" + original_sequence = str(record.seq) #pull out the SeqRecord's Seq object and ToString it + new_sequence = "" #set variable to empty + new_id = record.id #store the same sequence id as the original sequence + #iterate over each position in index array, extract corresponding aa and add to string + for pos in indexArray: + char = original_sequence[pos-1] #aa positions must be zero indexed + new_sequence = new_sequence + char + rec = SeqRecord(Seq(new_sequence,IUPAC.protein), id = record.id, name = "", description = "") + extractedSeqList.append(rec) #add new SeqRecord object to the list + +with open (args.inFileHandle2,'r') as inFile2: + '''Open csv file containing amino acid positions to extract and add to list.''' + #read items separated by comma's to position list + positionList = "" + for line in inFile2: + #remove whitespace from the end of each line + strippedLine = line.rstrip() + #split the line at commas and assigned the returned list as indexArray + positionList = strippedLine.split(',') + #Convert string items in positionList from strings to int and add to indexArray + for item in positionList: + indexArray.append(int(item)) + #print number of amino acids to extract and array to console as user check + print "Amino Acid positions to extract: %i " %(len(indexArray)) + print indexArray + +with open(args.inFileHandle1,'r') as inFile: + '''Open fasta of amino acid sequences to parse, uppercase and add to protein Sequence list.''' + #read in Sequences from fasta file, uppercase and add to seqList + for record in SeqIO.parse(inFile, "fasta", alphabet=IUPAC.protein): + record = record.upper() + seqList.append(record) #add Seq to list of Sequences + #print number of sequences to be process as user check + print "\n%i flu sequences will be extracted for antigenic sites..." % len(seqList) + #parse each target sequence object + for record in seqList: + extract_aa_from_sequence(record) + +#print original and extracted sequence +for x in range(0, len(seqList)): + print "Original %s: %i amino acids,\tExtracted: %i" % (seqList[x].id,len(seqList[x]),len(extractedSeqList[x])) + +#determine if output format is fasta (default) or csv +if args.csv: + #write csv file of extracted antigenic sits + for record in extractedSeqList: + #outFile.write(record.id),"," + name_part = (record.id).rstrip() + ',' + sequence = str(record.seq).strip() + csv_seq = ",".join(sequence) + comma_separated_sequence = name_part + csv_seq + "\n" + print comma_separated_sequence + outFile.write(comma_separated_sequence) +else: + #write fasta file of extracted antigenic sites + SeqIO.write(extractedSeqList,outFile,"fasta") + +print("\n%i Sequences Extracted to Output file: %s" % ((len(extractedSeqList),args.outFileHandle))) +inFile.close() +inFile2.close() +outFile.close() + diff -r 1f113d9db8ba -r 1dc65ec11a40 tools/antigenic_site_extraction/antigenic_site_extraction.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/tools/antigenic_site_extraction/antigenic_site_extraction.xml Wed Jan 09 15:33:32 2019 -0500 @@ -0,0 +1,38 @@ + + + biopython + + + + + + + + + + + + + + + + + + + + + + + + + diff -r 1f113d9db8ba -r 1dc65ec11a40 tools/antigenic_site_extraction/test-data/14_H3_aa_seqs_aligned.fasta --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/tools/antigenic_site_extraction/test-data/14_H3_aa_seqs_aligned.fasta Wed Jan 09 15:33:32 2019 -0500 @@ -0,0 +1,28 @@ +>Seq1(3C.2a.3) +QKIPGNDNSTATLCLGHHAVPNGTIVKTITNDRIEVTNATELVQNSSIGEICDSPHQILDGENCTLIDALLGDPQCDGFQNKKWDLFVERSKAYSNCYPYDVPDYASLRSLVASSGTLEFNNESFNWTGVKQNGTSSACIRKSSSSFFSRLNWLTHLNYTYPALNVTMPNNEQFDKLYIWGVHHPGTDKDQIFLYAQSSGRITVSTKRSQQAVIPNIGSRPRIRDIPSRISIYWTIVKPGDILLINSTGNLIAPRGYFKIQSGKSSIMRSDAPIGKCKSECITPNGSIPNDKPFQNVNRITYGACPRYVKHSTLKLATGMRNVPEKQTRGIFGAIAGFIENGWEGMVDGWYGFRHQNSEGRGQAADLKSTQAAIDQINGKLNRLIGKTNEKFHQIEKEFSEVEGRIQDLEKYVEDTKIDLWSYNAELLVALENQHTIDLTDSEMNKLFEKTKKQLRENAEDMGNGCFKIYHKCDNACIGSIRNGTYDHNVYRDEALNNRFQIKGVELKSGYKDW +>Seq2(3C.2a.4) +QKIPGNDNSTATLCLGHHAVPNGTIVKTITNDRIEVTNATELVQNSSIGEICDSPHQILDGENCTLIDALLGDPQCDGFQNKKWDLFVERSKAYSNCYPYDVPDYASLRSLVASSGTLEFKDESFNWTGVTQNGTSSACIRRSKSSFFSRLNWLTHLNYTYPALNVTMPNNEQFDKLYIWGVHHPGTDKDQIFLYAQSSGRITVSTKRSQQAVIPNIGSRPRIRDIPSRISIYWTIVKPGDILLINSTGNIIAPRGYFKIRNGKSSIMRSDAPIGKCKSECITPNGSIPNDKPFQNVNRITYGACPRYVKHSTLKLATGMRNVPEKQTRGIFGAIAGFIENGWEGMVDGWYGFRHQNSEGRGQAADLKSTQAAIDQINGKLNRLIGKTNEKFHQIEKEFSEVEGRIQDLEKYVEDTKIDLWSYNAELLVALENQHTIDLTDSEMNKLFEKTKKQLRENAEDMGNGCFKIYHKCDNACIGSIRNGTYDHNVYRDEALNNRFQIKGVELKSGYKDW +>Seq3(3C.2a.3) +QKIPGNDNSTATLCLGHHAVPNGTIVKTITNDRIEVTNATELVQNSSIGEICNSPHQILDGENCTLIDALLGDPQCDGFQNKKWDLFVERSKAYSNCYPYDVPDYASLRSLVASSGTLEFNNESFNWTGVKQNGTSSACIRKSSSSFFSRLNWLTHLNYTYPALNVTMPNNEQFDKLYIWGVHHPGTDKDQISLYAQSSGRITVSTKRSQQAVIPNIGSRPRIRDIPSRISIYWTIVKPGDILLINSTGNLIAPRGYFKIQSGKSSIMRSDAPIGKCKSECITPNGSIPNDKPFQNVNRITYGACPRYVKHSTLKLATGMRNVPEKQTRGIFGAIAGFIENGWEGMVDGWYGFRHQNSEGRGQAADLKSTQAAIDQINGKLNRLIGKTNEKFHQIEKEFSEVEGRIQDLEKYVEDTKIDLWSYNAELLVALENQHTIDLTDSEMNKLFEKTKKQLRENAEDMGNGCFKIYHKCDNACIGSIRNGTYDHNVYRDEALNNRFQIKGVELKSGYKDW +>Seq4(3C.2a.2) +QKIPGNDNSMATLCLGHHAVPNGTIVKTITNDRIEVTNATELVQNSSIGEICDSPHQILDGENCTLIDALLGDPQCDGFQNKKWDLFVERSKAYSNCYPYDVPDYASLRSLVASSGTLEFNNESFNWTGVTQNGTSSACMRRSSSSFFSRLNWLTHLNYTYPALNVTMPNNEQFDKLYIWGVHHPGTDKDQIFLYAKSSGRITVSTKRSQQAVIPNIGSRPRIRDIPSRISIYWTIVKPGDILLINSTGNLIAPRGYFKIQSGKSSIMRSNAPIGKCKSECITPNGSIPNDKPFQNVNRITYGACPRYVKHSTLKLATGMRNVPEKQTRGIFGAIAGFIENGWEGMMDGWYGFRHQNSEGRGQAADLKSTQAAIDQINGKLNRLIGKTNEKFHQIEKEFSEVEGRIQDLEKYVEDTKIDLWSYDAELLVALENQHTIDLTDSEMNKLFEKTKKQLRENAEDMGNGCFKIYHKCDNACIGSIRNGTYDHNVYRDEALNNRFHIKGVELKSGYKDW +>Seq5(3C.2a.3) +QKIPGNDNSTATLCLGHHAVPNGTIVKTITNDRIEVTNATELVQNSSIGEICDSPHQILDGENCTLIDALLGDPQCDGFQNKKWDLFVERSKAYSNCYPYDVPDYASLRSLVASSGTLEFNNESFNWTGVKQNGTSSACIRKSSSSFFSRLNWLTHLNYTYPALNVTMPNNEQFDKLYIWGVHHPGTDKDQISLYAQSSGRITVSTKRSQQAVIPNIGSRPRIRDIPSRISIYWTIVKPGDILLINSTGNLIAPRGYFKIQSGKSSIMRSDAPIGKCKSECITPNGSIPNDKPFQNVNRITYGACPRYVKHSTLKLATGMRNVPEKQTRGIFGAIAGFIENGWEGMVDGWYGFRHQNSEGRGQAADLKSTQAAIDQINGKLNRLIGKTNEKFHQIEKEFSEVEGRIQDLEKYVEDTKIDLWSYNAELLVALENQHTIDLTDSEMNKLFEKTKKQLRENAEDMGNGCFKIYHKCDNACIGSIRNGTYDHNVYRDEALNNRFQIKGVELKSGYKDW +>Seq5(3C.2a.4) +QKIPGNDNSTATLCLGHHAVPNGTIVKTITNDRIEVTNATELVQNSSIGEICDSPHQILDGENCTLIDALLGDPQCDGFQNKKWDLFVERSKAYSNCYPYDVPDYASLRSLVASFGTLEFKNESFNWTGVTQNGTSSACIRRSKSSFFSRLNWLTHLNYTYPALNVTMPNNEQFDKLYIWGVHHPGTDKDQIFLYAQSSGRITVSTKRSQQAVIPNIGYRPRIRDIPSRISIYWTIVKPGDILLINSTGNLIAPRGYFKIRSGKSSIMRSDAPIGKCKSECITPNGSIPNDKPFQNVNRITYGACPRYVKHSTLKLATGMRNVPEKQTRGIFGAIAGFIENGWEGMVDGWYGFRHQNSEGRGQAADLKSTQAAIDQINGKLNRLIGKTNEKFHQIEKEFSEVEGRIQDLEKYVEDTKIDLWSYNAELLVALENQHTIDLTDSEMNKLFEKTKKQLRENAEDMGNGCFKIYHKCDNACIGSIRNGTYDHNVYRDEALNNRFQIKGVELKSGYKDW +>Seq6(3C.3a) +QKIPGNDNSTATLCLGHHAVPNGTIVKTITNDRIEVTNATELVQNSSIGEICDSPHQILDGENCTLIDALLGDPQCDGFQNKKWDLFVERNKAYSSCYPYDVPDYASLRSLVASSGTLEFNNESFNWAGVTQNGTSSSCIRGSKSSFFSRLNWLTHLNSKYPALNVTMPNNEQFDKLYIWGVHHPGTDKDQISLYAQSSGRITVSTKRSQQAVIPNIGSRPRIRDIPSRISIYWTIVKPGDILLINSTGNLIAPRGYFKIRSGKSSIMRSDAPIGKCKSECITPNGSIPNDKPFQNVNRITYGACPRYVKQSTLKLATGMRNVPERQTRGIFGAIAGFIENGWEGMVDGWYGFRHQNSEGRGQAADLKSTQAAIDQINGKLNRLIGKTNEKFHQIEKEFSEVEGRIQDLEKYVEDTKIDLWSYNAELLVALENQHTIDLTDSEMNKLFEKTKKQLRENAEDMGNGCFKIYHKCDNACIGSIRNGTYDHNVYRDEALNNRFQIKGVELKSGYKDW +>Seq7(3C.3a) +QKIPGNDNSTATLCLGHHAVPNGTIVKTITNDRIEVTNATELVQNSSIGEICDSPHQILDGENCTLIDALLGDPQCDGFQNKKWDLFVERNKAYSNCYPYDVPDYASLRSLVASSGTLEFNNESFNWAGVTQNGTSSSCIRGSKSSFFSRLNWLTHLNSKYPALNVTMPNNEQFDKLYIWGVHHPGTDKDQISLYAQSSGRITVSTKRSQQAVIPNIGSRPRIRDIPSRISIYWTIVKPGDILLINSTGNLIAPRGYFKIRSGKSSIMRSDAPIGKCKSECITPNGSIPNDKPFQNVNRITYGACPRYVKQSTLKLATGMRNVPERQTRGIFGAIAGFIENGWEGMVDGWYGFRHQNSEGRGQAADLKSTQAAIDQINGKLNRLIGKTNEKFHQIEKEFSEVEGRIQDLEKYVEDTKIDLWSYNAELLVALENQHTIDLTDSEMNKLFEKTKKQLRENAEDMGNGCFKIYHKCDNACIGSIRNGTYDHSVYRDEALNNRFQIKGVELKSGYKDW +>Seq8(3C.2a.1) +QKIPGNDNSTATLCLGHHAVPNGTIVKTITNDRIEVTNATELVQNSSIGEICDSPHQILDGENCTLIDALLGDPQCDGFQNKKWDLFVERSKAYSNCYPYDVPDYASLRSLVASSGTLEFKNESFNWTGVTQNGKSSACIRRSSSSFFSRLNWLTHLNYTYPALNVTMPNKEQFDKLYIWGVHHPGTDKDQIFLYAQPSGRITVSTKRSQQAVIPNIGSRPRIRDIPSRISIYWTIVKPGDILLINSTGNLIAPRGYFKIRSGKSSIMRSDAPIGKCKSECITPNGSIPNDKPFQNVNRITYGACPRYVKHSTLKLATGMRNVPEKQTRGIFGAIAGFIENGWEGMVDGWYGFRHQNSEGRGQAADLKSTQAAIDQINGKLNRLIGKTNEKFHQIEKEFSEVEGRVQDLEKYVEDTKIDLWSYNAELLVALENQHTIDLTDSEMNKLFEKTKKQLRENAEDMGNGCFKIYHKCDNACIESIRNETYDHNVYRDEALNNRFQIKGVELKSGYKDW +>Clade_3C.2a_A/Hong_Kong/5738/2014 +QKIPGNDNSTATLCLGHHAVPNGTIVKTITNDRIEVTNATELVQNSSIGEICDSPHQILDGENCTLIDALLGDPQCDGFQNKKWDLFVERSKAYSNCYPYDVPDYASLRSLVASSGTLEFNNESFNWTGVTQNGTSSACIRRSSSSFFSRLNWLTHLNYTYPALNVTMPNNEQFDKLYIWGVHHPGTDKDQIFLYAQSSGRITVSTKRSQQAVIPNIGSRPRIRDIPSRISIYWTIVKPGDILLINSTGNLIAPRGYFKIRSGKSSIMRSDAPIGKCKSECITPNGSIPNDKPFQNVNRITYGACPRYVKH?TLKLATGMRNVPEKQTRGIFGAIAGFIENGWEGMVDGWYGFRHQNSEGRGQAADLKSTQAAIDQINGKLNRLIGKTNEKFHQIEKEFSEVEGRIQDLEKYVEDTKIDLWSYNAELLVALENQHTIDLTDSEMNKLFEKTKKQLRENAEDMGNGCFKIYHKCDNACIGSIRNGTYDHNVYRDEALNNRFQIKGVELKSGYKDW +>Clade_3C.3a_A/Switzerland/9715293/2013 +QKLPGNDNSTATLCLGHHAVPNGTIVKTITNDRIEVTNATELVQNSSIGEICDSPHQILDGENCTLIDALLGDPQCDGFQNKKWDLFVERSKAYSNCYPYDVPDYASLRSLVASSGTLEFNNESFNWAGVTQNGTSSSCIRGSNSSFFSRLNWLTHLNSKYPALNVTMPNNEQFDKLYIWGVHHPGTDKDQIFLYAQSSGRITVSTKRSQQAVIPNIGSRPRIRDIPSRISIYWTIVKPGDILLINSTGNLIAPRGYFKIRSGKSSIMRSDAPIGKCKSECITPNGSIPNDKPFQNVNRITYGACPRYVKQSTLKLATGMRNVPERQTRGIFGAIAGFIENGWEGMVDGWYGFRHQNSEGRGQAADLKSTQAAIDQINGKLNRLIGKTNEKFHQIEKEFSEVEGRIQDLEKYVEDTKIDLWSYNAELLVALENQHTIDLTDSEMNKLFEKTKKQLRENAEDMGNGCFKIYHKCDNACIGSIRNGTYDHDVYRDEALNNRFQIKGVELKSGYKDW +>Seq9(3C.2a.1) +QKIPGNDNSTATLCLGHHAVPNGTIVKTITNDRIEVTNATELVQNSSIGEICDSPHQILDGENCTLIDALLGDPQCDGFQNKKWDLFVERSKAYSNCYPYDVPDYASLRSLVASSGTLEFNNESFNWTGVTQNGTSSACIRRSSSSFFSRLNWLTHLNYTYPALNVTMPNKEQFDKLYIWGVHHPGTDKDQIFLYAQSSGRITVSTKRSQQAVIPNIGSRPRIRDIPSRISIYWTIVKPGDILLINSTGNLIAPRGYFKIRSGKSSIMRSDAPIGKCKSECITPNGSIPNDKPFQNVNRITYGACPRYVKHSTLKLATGMRNVPEKQTRGIFGAIAGFIENGWEGMVDGWYGFRHQNSEGRGQAADLKSTQAAIDQINGKLNRLIGKTNEKFHQIEKEFSEVEGRVQDLEKYVEDTKIDLWSYNAELLVALENQHTIDLTDSEMNKLFEKTKKQLRENAEDMGNGCFKIYHKCDNACIGSIRNETYDHNVYRDEALNNRFQIKGVELKSGYKDW +>Seq10(3C.2a.1) +QKIPGNDNSTATLCLGHHAVPNGTIVKTITNDRIEVTNATELVQNSSIGEICDSPHQILDGENCTLIDALLGDPQCDGFQNKKWDLFVERSKAYSNCYPYDVPDYASLRSLVASSGTLEFKNESFNWTGVTQNGKSSACIRRSSSSFFSRLNWLTHLNYTYPALNVTMPNKEQFDKLYIWGVHHPGTDKDQIFLYAQSSGRITVSTKRSQQAVIPNIGSRPRIRDIPSRISIYWTIVKPGDILLINSTGNLIAPRGYFKIRSGKSSIMRSDAPIGKCKSECITPNGSIPNDKPFQNVNRITYGACPRYVKHSTLKLATGMRNVPEKQTRGIFGAIAGFIENGWEGMVDGWYGFRHQNSEGRGQAADLKSTQAAIDQINGKLNRLIGKTNEKFHQIEKEFSEVEGRVQDLEKYVEDTKIDLWSYNAELLVALENQHTIDLTDSEMNKLFEKTKKQLRENAEDMGNGCFKIYHKCDNACIESIRNETYDHNVYRDEALNNRFQIKGVELKSGYKDW +>Clade_3C.2a.1_A/Bolzano/7/2016 +QKIPGNDNSTATLCLGHHAVPNGTIVKTITNDRIEVTNATELVQNSSIGEICDSPHQILDGENCTLIDALLGDPQCDGFQNKKWDLFVERSKAYSNCYPYDVPDYASLRSLVASSGTLEFNNESFNWTGVTQNGTSSACIRRSSSSFFSRLNWLTHLNYTYPALNVTMPNKEQFDKLYIWGVHHPGTDKDQIFLYAQSSGRITVSTKRSQQAVIPNIGSRPRIRDIPSRISIYWTIVKPGDILLINSTGNLIAPRGYFKIRSGKSSIMRSDAPIGKCKSECITPNGSIPNDKPFQNVNRITYGACPRYVKHSTLKLATGMRNVPEKQARGIFGAIAGFIENGWEGMVDGWYGFRHQNSEGRGQAADLKSTQAAIDQINGKLNRLIGKTNEKFHQIEKEFSEVEGRVQDLEKYVEDTKIDLWSYNAELLVALENQHTIDLTDSEMNKLFEKTKKQLRENAEDMGNGCFKIYHKCDNACIGSIRNETYDHNVYRDEALNNRFQIKGVELKSGYKDW diff -r 1f113d9db8ba -r 1dc65ec11a40 tools/antigenic_site_extraction/test-data/FluA_H3_antigenic_aa_indices.csv --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/tools/antigenic_site_extraction/test-data/FluA_H3_antigenic_aa_indices.csv Wed Jan 09 15:33:32 2019 -0500 @@ -0,0 +1,1 @@ +44,45,46,47,48,50,51,53,54,57,59,62,63,67,75,78,80,81,82,83,86,87,88,91,92,94,96,102,103,109,117,121,122,124,126,128,129,130,131,132,133,135,137,138,140,142,143,144,145,146,150,152,155,156,157,158,159,160,163,164,165,167,168,170,171,172,173,174,175,176,177,179,182,186,187,188,189,190,192,193,194,196,197,198,201,203,207,208,209,212,213,214,215,216,217,218,219,226,227,228,229,230,238,240,242,244,246,247,248,260,261,262,265,273,275,276,278,279,280,294,297,299,300,304,305,307,308,309,310,311,312 diff -r 1f113d9db8ba -r 1dc65ec11a40 tools/assign_clades/assign_clades.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/tools/assign_clades/assign_clades.py Wed Jan 09 15:33:32 2019 -0500 @@ -0,0 +1,133 @@ +#!/usr/bin/env python + +'''Accepts fasta files containing amino acid sequence, reading them in as +amino acid sequence objects. Reads influenza clade defintions (i.e. amino +acids at certain positions) from .csv file into dictionary structure. Searches +each of the amino acid sequence objects for a list of matching clades, assigns +the most 'evolved' (i.e. child as opposed to parent clade) to the sequence. Appends +"_cladename" to the Sequence name and generates a fasta file of original sequences with +modified names.''' + +'''Author: Diane Eisler, Molecular Microbiology & Genomics, BCCDC Public Health Laboratory, Oct 2017''' + +import sys,string,os, time, Bio +from Bio import Seq, SeqIO, SeqUtils, Alphabet, SeqRecord +from Bio.SeqRecord import SeqRecord +from Bio.Alphabet import IUPAC +from Bio.Seq import Seq + +localtime = time.asctime(time.localtime(time.time())) #date and time of analysis +inFileHandle1 = sys.argv[1] #batch fasta file with sequences to be parsed +inFileHandle2 = sys.argv[2] # .csv file containing clade definitions and "depth" +outFileHandle = sys.argv[3] #user-specified name for output file of aa seq's with clade suffixes +outFile= open(outFileHandle,'w') #open a writable, appendable output file +seqList = [] #list of aa sequence objects to parse for clade definitions +cladeList = [] #empty list to hold clade tuples i.e. ("3C.3a", 1 ,{"3":"I", "9":"V"..}) + +'''Searches record for required amino acids at defined positions. If found, assigns +clade name to sequence name by appending underscore and clade name to record id.''' +def call_clade(record): + print "---------------------------------------------------------------------" + print "Parsing %s for matching flu clade definitions..." % (record.id) + matchList = [] #empty list to hold clades that match 100% + #iterate over each tuple in the clade list + for clade in cladeList: + cladeName = clade[0] #temp variable for name + depth = clade[1] #temp variable for depth + sites = clade[2] #temp variable for aa def dictionary + shouldFind = len(sites) #number of sites that should match + found = 0 #a counter to hold matches to antigenic sites + #iterate over each position in sites dictionary + for pos, aa in sites.iteritems(): + #translate pos to corresponding index in target sequence + index = int(pos) - 1 + #if record at index has same amino acid as 'aa', increment 'found' + if record[index] == aa: + found += 1 + if (found == shouldFind): + #add the matching clade tuple to the list of matches + matchList.append(clade) + return matchList + +'''Compares depth level of clades in a list and returns the most granular one''' +def decide_clade_by_depth(matchList): + #empty variable for maximum depth encountered + max_depth = 0 + best_match_name = '' #variable to hold most granular clade + #for each matching clade, check depth of the corresponding tuple + for clade in matchList: + #if the current clade is 'deeper' than the one before it + if clade[1] > max_depth: + #store this depth + max_depth = clade[1] + #store name of the clade + best_match_name = clade[0] + return best_match_name + +'''opens the .csv file of clade definitions and clade "depth" ''' +with open (inFileHandle2, 'r') as clade_file: + #remove whitespace from the end of each line and split elements at commas + for line in clade_file: + #print "Current Line in File:" + line + sites={} #initialize a dictionary for clade + elementList = line.rstrip().split(',') + new_list = [] #start a new list to put non-empty strings into + #remove empty stings in list + for item in elementList: + if item != '': + new_list.append(item) + name = new_list.pop(0) #move 1st element to name field + depth = int(new_list.pop(0)) #move 2nd element to depth field + #read remaining pairs of non-null elements into clade def dictionary + for i in range(0, len(new_list), 2): + #move next 2 items from the list into the dict + pos = new_list[i] + aa = new_list[i + 1] + sites[pos] = aa + #add the clade info as a tuple to the cladeList[] + oneClade =(name, depth, sites) + cladeList.append(oneClade) + print "The List of Clades:" + for clade in cladeList: + print "Clade Name: %s Depth: %i Antigenic Sites: %i" % (clade[0], clade[1], len(clade[2])) + for pos, aa in clade[2].iteritems(): + print "Pos: %s\tAA: %s" % (pos,aa) + +'''opens readable input file of sequences to parse using filename from cmd line, + instantiates as AA Sequence objects, with ppercase sequences''' +with open(inFileHandle1,'r') as inFile: + #read in Sequences from fasta file, uppercase and add to seqList + for record in SeqIO.parse(inFile, "fasta", alphabet=IUPAC.protein): + record = record.upper() + seqList.append(record) #add Seq to list of Sequences + print "\n%i flu HA sequences will be compared to current clade definitions..." % len(seqList) + #parse each target sequence object + for record in seqList: + clade_call = '' #empty variale for final clade call on sequence + matchingCladeList = call_clade(record) #holds matching clade tuples + #if there is more than one clade match + if len(matchingCladeList) > 1: + #choose the most granular clade based on depth + clade_call = decide_clade_by_depth(matchingCladeList) + #if there is only one clade call + elif len(matchingCladeList) > 0: + clade = matchingCladeList[0] #take the first tuple in the list + clade_call = clade[0] #clade name is the first item in the tuple + #empty list return, no matches + else: + clade_call = "No_Match" + print clade_call + seq_name = record.id + mod_name = seq_name + "_" + clade_call + print "New Sequence Name: " + mod_name + record.id = mod_name + + +#output fasta file with clade calls appended to sequence names +SeqIO.write(seqList,outFile,"fasta") + +#print("\n%i Sequences Extracted to Output file: %s" % ((len(extractedSeqList),outFileHandle))) +inFile.close() +clade_file.close() +outFile.close() + diff -r 1f113d9db8ba -r 1dc65ec11a40 tools/assign_clades/assign_clades.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/tools/assign_clades/assign_clades.xml Wed Jan 09 15:33:32 2019 -0500 @@ -0,0 +1,29 @@ + + + biopython + + + + + + + + + + + + + + + + + + + + diff -r 1f113d9db8ba -r 1dc65ec11a40 tools/assign_clades/test-data/clades.csv --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/tools/assign_clades/test-data/clades.csv Wed Jan 09 15:33:32 2019 -0500 @@ -0,0 +1,15 @@ +3C.2a,1,3,I,144,S,145,S,159,Y,160,T,225,D,311,H,489,N,,,,,,,,,,,,,, +3C.2a_+_T131K_+_R142K_+_R261Q,2,3,I,131,K,142,K,144,S,145,S,159,Y,160,T,225,D,261,Q,311,H,489,N,,,,,,,, +3C.2a_+_N121K_+_S144K,2,3,I,121,K,144,K,145,S,159,Y,160,T,225,D,311,H,489,N,,,,,,,,,,,, +3C.2a_+_Q197K_+_R261Q,2,3,I,144,S,145,S,159,Y,160,T,197,K,225,D,261,Q,311,H,489,N,,,,,,,,,, +3C.2a_+_N31S_+_D53N_+_R142G_+_S144R_+_N171K_+_I192T_+_Q197H_,2,3,I,31,S,53,N,142,G,144,R,145,S,159,Y,160,T,171,K,192,T,197,H,225,D,311,H,489,N,, +3C.2a1_(w/o_N121K),3,3,I,144,S,145,S,159,Y,160,T,171,K,225,D,311,H,406,V,484,E,489,N,,,,,,,, +3C.2a1_+_N121K,4,3,I,121,K,144,S,145,S,159,Y,160,T,171,K,225,D,311,H,406,V,484,E,489,N,,,,,, +3C.2a1_+_N121K_+_R142G_+_I242V,4,3,I,121,K,142,G,144,S,145,S,159,Y,160,T,171,K,225,D,242,V,311,H,406,V,479,E,484,E,489,N +3C.2a1_+_N121K_+_R142G,4,3,I,121,K,142,G,144,S,145,S,159,Y,160,T,171,K,225,D,311,H,406,V,484,E,489,N,,,, +3C.2a1_+_N121K_+_T135K,4,3,I,121,K,135,K,144,S,145,S,159,Y,160,T,171,K,225,D,311,H,406,V,479,E,484,E,489,N,, +3C.2a1_+_N121K_+_K92R_+_H311Q,4,3,I,92,R,121,K,144,S,145,S,159,Y,160,T,171,K,225,D,311,Q,406,V,484,E,489,N,,,, +3C.2a1_+_N121K_+_I140M,4,3,I,121,K,140,M,144,S,145,S,159,Y,160,T,171,K,225,D,311,H,406,V,479,E,484,E,489,N,, +3C.2a1_+_R142G,4,3,I,142,G,144,S,145,S,159,Y,160,T,171,K,225,D,311,H,406,V,484,E,489,N,,,,,, +3C.2a1_+_S47T_+_G78S,4,3,I,47,T,78,S,144,S,145,S,159,Y,160,T,171,K,225,D,311,H,406,V,484,E,489,N,,,, +3C.3a,1,128,A,138,S,142,G,145,S,159,S,225,D,,,,,,,,,,,,,,,,,, diff -r 1f113d9db8ba -r 1dc65ec11a40 tools/assign_clades/test-data/output.fa --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/tools/assign_clades/test-data/output.fa Wed Jan 09 15:33:32 2019 -0500 @@ -0,0 +1,10 @@ +>test_3C.3a test +QKIPGNDNSTATLCLGHHAVPNGTIVKTITNDRIEVTNATELVQNSSIGEICDSPHQILD +GENCTLIDALLGDPQCDGFQNKKWDLFVERNKAYSSCYPYDVPDYASLRSLVASSGTLEF +NNESFNWAGVTQNGTSSSCIRGSKSSFFSRLNWLTHLNSKYPALNVTMPNNEQFDKLYIW +GVHHPGTDKDQISLYAQSSGRITVSTKRSQQAVIPNIGSRPRIRDIPSRISIYWTIVKPG +DILLINSTGNLIAPRGYFKIRSGKSSIMRSDAPIGKCKSECITPNGSIPNDKPFQNVNRI +TYGACPRYVKQSTLKLATGMRNVPERQTRGIFGAIAGFIENGWEGMVDGWYGFRHQNSEG +RGQAADLKSTQAAIDQINGKLNRLIGKTNEKFHQIEKEFSEVEGRIQDLEKYVEDTKIDL +WSYNAELLVALENQHTIDLTDSEMNKLFEKTKKQLRENAEDMGNGCFKIYHKCDNACIGS +IRNGTYDHNVYRDEALNNRFQIKGVELKSGYKDW diff -r 1f113d9db8ba -r 1dc65ec11a40 tools/assign_clades/test-data/test_input.fasta --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/tools/assign_clades/test-data/test_input.fasta Wed Jan 09 15:33:32 2019 -0500 @@ -0,0 +1,2 @@ +>test +QKIPGNDNSTATLCLGHHAVPNGTIVKTITNDRIEVTNATELVQNSSIGEICDSPHQILDGENCTLIDALLGDPQCDGFQNKKWDLFVERNKAYSSCYPYDVPDYASLRSLVASSGTLEFNNESFNWAGVTQNGTSSSCIRGSKSSFFSRLNWLTHLNSKYPALNVTMPNNEQFDKLYIWGVHHPGTDKDQISLYAQSSGRITVSTKRSQQAVIPNIGSRPRIRDIPSRISIYWTIVKPGDILLINSTGNLIAPRGYFKIRSGKSSIMRSDAPIGKCKSECITPNGSIPNDKPFQNVNRITYGACPRYVKQSTLKLATGMRNVPERQTRGIFGAIAGFIENGWEGMVDGWYGFRHQNSEGRGQAADLKSTQAAIDQINGKLNRLIGKTNEKFHQIEKEFSEVEGRIQDLEKYVEDTKIDLWSYNAELLVALENQHTIDLTDSEMNKLFEKTKKQLRENAEDMGNGCFKIYHKCDNACIGSIRNGTYDHNVYRDEALNNRFQIKGVELKSGYKDW diff -r 1f113d9db8ba -r 1dc65ec11a40 tools/change_fasta_deflines/change_fasta_deflines.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/tools/change_fasta_deflines/change_fasta_deflines.py Wed Jan 09 15:33:32 2019 -0500 @@ -0,0 +1,62 @@ +#!/usr/bin/env python +import sys, argparse +'''Accepts either csv (default) or tab-delimited files with old/new sequence names, creating a dictionary of +respective key:value pairs. Parses an input fasta file for 'old' names, replacing them with 'new' names, writing +renamed sequences to a fasta file. NOTE: use of tab-delim text file for renaming requires '-t' on cmd line.''' +#USAGE EXAMPLE 1: python change_fasta_def_lines.py csv_rename_file.csv fasta_2_rename.fasta renamedSequences.fasta +#USAGE EXAMPLE 2: python change_fasta_def_lines.py tab_delim_rename_file.txt -t fasta_2_rename.fasta renamedSequences.fasta + +'''Author: Diane Eisler, Molecular Microbiology & Genomics, BCCDC Public Health Laboratory,Sept 2017''' + +#parse command line arguments +parser = argparse.ArgumentParser() +parser.add_argument ("-t", "--tab_delim", help = "name fasta definition lines from tab-delim file", action = "store_true") +parser.add_argument("inFileHandle") #csv file with current fasta file names in column 1 and desired names in col 2 +parser.add_argument("inFileHandle2") #fasta file containing sequences requiring name replacement +parser.add_argument("outFileHandle") #user-specified output filename +args = parser.parse_args() + +#open a writable output file that will be over-written if it already exists +outfile= open(args.outFileHandle,'w') +dict = {} #dictionary to hold old_name:new_name key:value pairs +splitter = ',' #default char to split lines at +#determine if input naming file is csv (default) or tab delim text +if args.tab_delim: + splitter = '\t' #change splitter to tab if comd line args contain '-t' + +#create dictionary using key/value pairs from csv file of old/new names +with open(args.inFileHandle,'r') as inputFile: +#read in each line and split at comma into key:value pairs + for line in inputFile: + #remove whitespace from end of lines, split at comma, assigning to key:value pairs + line2 = line.rstrip() + splitLine = line2.split(splitter) + old_name = splitLine[0] + new_name = splitLine[1] + dict[old_name] = new_name + +#parse fasta deflines for 'old' names and, if found, replace with new names +with open(args.inFileHandle2,'r') as inputFile2: + for line in inputFile2: + #find the definition lines, remove trailing whitespace & '>' + if ">" in line: + originalDefline = line.rstrip().replace(">","",1) + #check for a match to any of the dict key + if dict.has_key(originalDefline): + #find the index of that item in the list + newDefline= dict[originalDefline] + #print("the new name"), newDefline + # print each item to make sure the right name is being entered + outfile.write(">" + newDefline + "\n") + else: + #write out the original defline sequence name + print ("Defline not in dictionary: "), originalDefline + outfile.write(">" + originalDefline + "\n") + else: + #in lines without ">", write out sequence as it was + seq = line.rstrip() + outfile.write(seq+"\n") + +inputFile.close() +inputFile2.close() +outfile.close() diff -r 1f113d9db8ba -r 1dc65ec11a40 tools/change_fasta_deflines/change_fasta_deflines.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/tools/change_fasta_deflines/change_fasta_deflines.xml Wed Jan 09 15:33:32 2019 -0500 @@ -0,0 +1,39 @@ + + + biopython + + + + + + + + + + + + + + + + + + + + + + + + + + + diff -r 1f113d9db8ba -r 1dc65ec11a40 tools/change_fasta_deflines/test-data/csv_rename_file.csv --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/tools/change_fasta_deflines/test-data/csv_rename_file.csv Wed Jan 09 15:33:32 2019 -0500 @@ -0,0 +1,10 @@ +s1,sample_1 +s2,sample_2 +s3,sample_3 +s4,sample_4 +s5,sample_5 +s6,sample_6 +s7,sample_7 +s8,sample_8 +s9,sample_9 +s10,sample_10 diff -r 1f113d9db8ba -r 1dc65ec11a40 tools/change_fasta_deflines/test-data/fasta_2_rename.fasta --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/tools/change_fasta_deflines/test-data/fasta_2_rename.fasta Wed Jan 09 15:33:32 2019 -0500 @@ -0,0 +1,20 @@ +>s1 +QKIPGNDNSTATLCLGHHAVPNGTIVKTITNDRIEVTNATELVQNSSIGEICDSPHQILDGENCTLIDALLGDPQCDGFQNKKWDLFVERSKAYSNCYPYDVPDYASLRSLVASSGTLEFNNESFNWTGVKQNGTSSACIRKSSSSFFSRLNWLTHLNYTYPALNVTMPNNEQFDKLYIWGVHHPGTDKDQIFLYAQSSGRITVSTKRSQQAVIPNIGSRPRIRDIPSRISIYWTIVKPGDILLINSTGNLIAPRGYFKIQSGKSSIMRSDAPIGKCKSECITPNGSIPNDKPFQNVNRITYGACPRYVKHSTLKLATGMRNVPEKQTRGIFGAIAGFIENGWEGMVDGWYGFRHQNSEGRGQAADLKSTQAAIDQINGKLNRLIGKTNEKFHQIEKEFSEVEGRIQDLEKYVEDTKIDLWSYNAELLVALENQHTIDLTDSEMNKLFEKTKKQLRENAEDMGNGCFKIYHKCDNACIGSIRNGTYDHNVYRDEALNNRFQIKGVELKSGYKDWILWISFAISCFLLCVALLGFIMWACQKGNIRCNICI +>s2 +QKIPGNDNSTATLCLGHHAVPNGTIVKTITNDRIEVTNATELVQNSSIGEICDSPHQILDGENCTLIDALLGDPQCDGFQNKKWDLFVERSKAYSNCYPYDVPDYASLRSLVASSGTLEFNNESFNWTGVKQNGTSSACIRKSSSSFFSRLNWLTHLNYTYPALNVTMPNNEQFDKLYIWGVHHPGTDKDQIFLYAQSSGRITVSTKRSQQAVIPNIGSRPRIRDIPSRISIYWTIVKPGDILLINSTGNLIAPRGYFKIQSGKSSIMRSDAPIGKCKSECITPNGSIPNDKPFQNVNRITYGACPRYVKHSTLKLATGMRNVPEKQTRGIFGAIAGFIENGWEGMVDGWYGFRHQNSEGRGQAADLKSTQAAIDQINGKLNRLIGKTNEKFHQIEKEFSEVEGRIQDLEKYVEDTKIDLWSYNAELLVALENQHTIDLTDSEMNKLFEKTKKQLRENAEDMGNGCFKIYHKCDNACIGSIRNGTYDHNVYRDEALNNRFQIKGVELKSGYKDWILWISFAISCFLLCVALLGFIMWACQKGNIRCNICI +>s3 +QKIPGNDNSTATLCLGHHAVPNGTIVKTITNDRIEVTNATELVQNSSIGEICDSPHQILDGENCTLIDALLGDPQCDGFQNKKWDLFVERSKAYSNCYPYDVPDYASLRSLVASSGTLEFNNESFNWTGVKQNGTSSACIRKSSSSFFSRLNWLTHLNYTYPALNVTMPNNEQFDKLYIWGVHHPGTDKDQIFLYAQSSGRITVSTKRSQQAVIPNIGSRPRIRDIPSRISIYWTIVKPGDILLINSTGNLIAPRGYFKIQSGKSSIMRSDAPIGKCKSECITPNGSIPNDKPFQNVNRITYGACPRYVKHSTLKLATGMRNVPEKQTRGIFGAIAGFIENGWEGMVDGWYGFRHQNSEGRGQAADLKSTQAAIDQINGKLNRLIGKTNEKFHQIEKEFSEVEGRIQDLEKYVEDTKIDLWSYNAELLVALENQHTIDLTDSEMNKLFEKTKKQLRENAEDMGNGCFKIYHKCDNACIGSIRNGTYDHNVYRDEALNNRFQIKGVELKSGYKDWILWISFAISCFLLCVALLGFIMWACQKGNIRCNICI +>s4 +QKIPGNDNSTATLCLGHHAVPNGTIVKTITNDRIEVTNATELVQNSSIGEICDSPHQILDGGNCTLIDALLGDPQCDGFQNKKWDLFVERSRAYSNCYPYDVPDYASLRSLVASSGTLEFKNESFNWTGVTQNGTSSACIRGSSSSFFSRLNWLTHLNYTYPALNVTMPNKEQFDKLYIWGVHHPGTDKDQIFLYAQSSGRITVSTKRSQQAVIPNIGSRPRIRDIPSRISIYWTIVKPGDILLINSTGNLIAPRGYFKIRSGKSSIMRSDAPIGKCKSECITPNGSIPNDKPFQNVNRITYGACPRYVKQSTLKLATGMRNVPEKQTRGIFGAIAGFIENGWEGMVDGWYGFRHQNSEGRGQAADLKSTQAAIDQINGKLNRLIGKTNEKFHQIEKEFSEVEGRVQDLEKYVEDTKIDLWSYNAELLVALENQHTIDLTDSEMNKLFEKTKKQLRENAEDMGNGCFKIYHKCDNACIGSIRNETYDHNVYRDEALNNRFQIKGVELKSGYKDWILWISFAISCFLLCIALLGFIMWACQKGNIRCNICI +>s5 +QKIPGNDNSTATLCLGHHAVPNGTIVKTITNDRIEVTNATELVQNSSIGEICDSPHQILDGENCTLIDALLGDPQCDGFQNKKWDLFVERSKAYSNCYPYDVPDYASLRSLVASSGTLEFNNESFNWTGVKQNGTSSACIRKSSSSFFSRLNWLTHLNYTYPALNVTMPNNEQFDKLYIWGVHHPGTDKDQIFLYAQSSGRITVSTKRSQQAVIPNIGSRPRIRDIPSRISIYWTIVKPGDILLINSTGNLIAPRGYFKIQSGKSSIMRSDAPIGKCKSECITPNGSIPNDKPFQNVNRITYGACPRYVKHSTLKLATGMRNVPEKQTRGIFGAIAGFIENGWEGMVDGWYGFRHQNSEGRGQAADLKSTQAAIDQINGKLNRLIGKTNEKFHQIEKEFSEVEGRIQDLEKYVEDTKIDLWSYNAELLVALENQHTIDLTDSEMNKLFEKTKKQLRENAEDMGNGCFKIYHKCDNACIGSIRNGTYDHNVYRDEALNNRFQIKGVELKSGYKDWILWISFAISCFLLCVALLGFIMWACQKGNIRCNICI +>s6 +QKIPGNDNSTATLCLGHHAVPNGTIVKTITNDRIEVTNATELVQNSSIGEICDSPHQILDGENCTLIDALLGDPQCDGFQNKKWDLFVERSKAYSNCYPYDVPDYASLRSLVASSGTLEFNNESFNWTGVKQNGTSSACIRKSSSSFFSRLNWLTHLNYTYPALNVTMPNNEQFDKLYIWGVHHPGTDKDQIFLYAQSSGRITVSTKRSQQAVIPNIGSRPRIRDIPSRISIYWTIVKPGDILLINSTGNLIAPRGYFKIQSGKSSIMRSDAPIGKCKSECITPNGSIPNDKPFQNVNRITYGACPRYVKHSTLKLATGMRNVPEKQTRGIFGAIAGFIENGWEGMVDGWYGFRHQNSEGRGQAADLKSTQAAIDQINGKLNRLIGKTNEKFHQIEKEFSEVEGRIQDLEKYVEDTKIDLWSYNAELLVALENQHTIDLTDSEMNKLFEKTKKQLRENAEDMGNGCFKIYHKCDNACIGSIRNGTYDHNVYRDEALNNRFQIKGVELKSGYKDWILWISFAISCFLLCVALLGFIMWACQKGNIRCNICI +>s7 +QKIPGNDNSTATLCLGHHAVPNGTIVKTITNDRIEVTNATELVQNSSIGEICDSPHQILDGENCTLIDALLGDPQCDGFQNKKWDLFVERSKAYSNCYPYDVPDYASLRSLVASSGTLEFNNESFNWTGVKQNGTSSACIRKSSSSFFSRLNWLTHLNYTYPALNVTMPNNEQFDKLYIWGVHHPGTDKDQIFLYAQSSGRITVSTKRSQQAVIPNIGSRPRIRDIPSRISIYWTIVKPGDILLINSTGNLIAPRGYFKIQSGKSSIMRSDAPIGKCKSECITPNGSIPNDKPFQNVNRITYGACPRYVKHSTLKLATGMRNVPEKQTRGIFGAIAGFIENGWEGMVDGWYGFRHQNSEGRGQAADLKSTQAAIDQINGKLNRLIGKTNEKFHQIEKEFSEVEGRIQDLEKYVEDTKIDLWSYNAELLVALENQHTIDLTDSEMNKLFEKTKKQLRENAEDMGNGCFKIYHKCDNACIGSIRNGTYDHNVYRDEALNNRFQIKGVELKSGYKDWILWISFAISCFLLCVALLGFIMWACQKGNIRCNICI +>s8 +QKIPGNDNSTATLCLGHHAVPNGTIVKTITNDRIEVTNATELVQNSSIGEICDSPHQILDGENCTLIDALLGDPQCDGFQNKKWDLFVERSKAYSNCYPYDVPDYASLRSLVASSGTLEFNNESFNWTGVKQNGTSSACIRKSSSSFFSRLNWLTHLNYTYPALNVTMPNNEQFDKLYIWGVHHPGTDKDQIFLYAQSSGRITVSTKRSQQAVIPNIGSRPRIRDIPSRISIYWTIVKPGDILLINSTGNLIAPRGYFKIQSGKSSIMRSDAPIGKCKSECITPNGSIPNDKPFQNVNRITYGACPRYVKHSTLKLATGMRNVPEKQTRGIFGAIAGFIENGWEGMVDGWYGFRHQNSEGRGQAADLKSTQAAIDQINGKLNRLIGKTNEKFHQIEKEFSEVEGRIQDLEKYVEDTKIDLWSYNAELLVALENQHTIDLTDSEMNKLFEKTKKQLRENAEDMGNGCFKIYHKCDNACIGSIRNGTYDHNVYRDEALNNRFQIKGVELKSGYKDWILWISFAISCFLLCVALLGFIMWACQKGNIRCNICI +>s9 +QKIPGNDNSTATLCLGHHAVPNGTIVKTITNDRIEVTNATELVQNSSIGEICDSPHQILDGENCTLIDALLGDPQCDGFQNKKWDLFVERSKAYSNCYPYDVPDYASLRSLVASSGTLEFNNESFNWTGVKQNGTSSACIRKSSSSFFSRLNWLTHLNYTYPALNVTMPNNEQFDKLYIWGVHHPGTDKDQIFLYAQSSGRITVSTKRSQQAVIPNIGSRPRIRDIPSRISIYWTIVKPGDILLINSTGNLIAPRGYFKIQSGKSSIMRSDAPIGKCKSECITPNGSIPNDKPFQNVNRITYGACPRYVKHSTLKLATGMRNVPEKQTRGIFGAIAGFIENGWEGMVDGWYGFRHQNSEGRGQAADLKSTQAAIDQINGKLNRLIGKTNEKFHQIEKEFSEVEGRIQDLEKYVEDTKIDLWSYNAELLVALENQHTIDLTDSEMNKLFEKTKKQLRENAEDMGNGCFKIYHKCDNACIGSIRNGTYDHNVYRDEALNNRFQIKGVELKSGYKDWILWISFAISCFLLCVALLGFIMWACQKGNIRCNICI +>s10 +QKIPGNDNSTATLCLGHHAVPNGTIVKTITNDRIEVTNATELVQNSSIGEICDSPHQILDGENCTLIDALLGDPQCDGFQNKKWDLFVERSRAYSNCYPYDVPDYASLRSLVASSGTLEFKNESFNWTGVTQNGNSSACIRRSSSSFFSRLNWLTHLNYTYPALNVTMPNKEQFDKLYIWGVHHPGTDKDQIFLYAQSSGRITVSTKRSQQAVIPNIGSRPRIRDIPSRISIYWTIVKPGDILLINSTGNLIAPRGYFKIRSGKSSIMRSDAPIGKCKSECITPNGSIPNDKPFQNVNRITYGACPRYVKQSTLKLATGMRNVPEKQTRGIFGAIAGFIENGWEGMVDGWYGFRHQNSEGRGQAADLKSTQAAIDQINGKLNRLIGKTNEKFHQIEKEFSEVEGRVQDLEKYIEDTKIDLWSYNAELLVALENQHTIDLTDSEMNKLFEKTKKQLRENAEDMGNGCFKIYHKCDNACIGSIRNETYDHNVYRDEALNNRFQIKGVELKSGYKDWILWISFAISCFLLCVALLGFIMWACQKGNIRCNICI diff -r 1f113d9db8ba -r 1dc65ec11a40 tools/change_fasta_deflines/test-data/tab_delim_rename_file.txt --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/tools/change_fasta_deflines/test-data/tab_delim_rename_file.txt Wed Jan 09 15:33:32 2019 -0500 @@ -0,0 +1,10 @@ +s1 sample_1 +s2 sample_2 +s3 sample_3 +s4 sample_4 +s5 sample_5 +s6 sample_6 +s7 sample_7 +s8 sample_8 +s9 sample_9 +s10 sample_10 diff -r 1f113d9db8ba -r 1dc65ec11a40 tools/linelisting/linelisting.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/tools/linelisting/linelisting.py Wed Jan 09 15:33:32 2019 -0500 @@ -0,0 +1,219 @@ +#!/usr/bin/env python +'''Reads in a fasta file of extracted antigenic sites and one containing a +reference flu antigenic map, reading them in as protein SeqRecords. Compares each amino +acid of each sample antigenic map to corresponding sites in the reference and replaces +identical amino acids with dots. Writes headers (including amino acid position numbers +read in from the respective index array), the reference amino acid sequence and column +headings required for non-aggregated line lists. Outputs headers and modified (i.e. dotted) +sequences to a csv file.''' + +'''Author: Diane Eisler, Molecular Microbiology & Genomics, BCCDC Public Health Laboratory, Nov 2017''' + +import sys,string,os, time, Bio, re, argparse +from Bio import Seq, SeqIO, SeqUtils, Alphabet, SeqRecord +from Bio.SeqRecord import SeqRecord +from Bio.Alphabet import IUPAC +from Bio.Seq import Seq + +inputAntigenicMaps = sys.argv[1] #batch fasta file with antigenic map sequences +refAntigenicMap = sys.argv[2] #fasta file of reference antigenic map sequence +antigenicSiteIndexArray = sys.argv[3] #antigenic site index array csv file +cladeDefinitionFile = sys.argv[4] #clade definition csv file +outFileHandle = sys.argv[5] #user-specifed output filename + +lineListFile = open(outFileHandle,'w') #open a writable output file + +indicesLine = "" #comma-separated antigenic site positions +cladeList = [] #list of clade names read from clade definition file +ref_seq = "" #reference antigenic map (protein sequence) +seqList = [] #list of aa sequences to compare to reference + +BC_list = [] #empty list for BC samples +AB_list = [] #empty list for AB samples +ON_list = [] #empty list for ON samples +QC_list = [] #empty list for QC samples +nonprov_list = [] #empty list for samples not in above 4 provinces +#dictionary for location-separated sequence lists +segregated_lists = {'1_BC':BC_list,'2_AB':AB_list,'3_ON':ON_list,'4_QC': QC_list, '5_nonprov': nonprov_list} +uniqueSeqs = {} #empty dict with unique seqs as keys and lists of SeqRecords as values + +def replace_matching_aa_with_dot(record): + """Compare amino acids in record to reference sequence, replace matching symbols + with dots, and return record with modified amino acid sequence.""" + orig_seq = str(record.seq) #get sequence string from SeqRecord + mod_seq = "" + #replace only those aa's matching the reference with dots + for i in range(0, len(orig_seq)): + if (orig_seq[i] == ref_seq[i]): + mod_seq = mod_seq + '.' + else: + mod_seq = mod_seq + orig_seq[i] + #assign modified sequence to new SeqRecord and return it + rec = SeqRecord(Seq(mod_seq,IUPAC.protein), id = record.id, name = "", description = "") + return rec + +def extract_clade(record): + """Extract clade name (or 'No_Match') from sequence name and return as clade name. """ + if record.id.endswith('No_Match'): + clade_name = 'No_Match' + end_index = record.id.index(clade_name) + record.id = record.id[:end_index -1] + return clade_name + else: # + for clade in cladeList: + if record.id.endswith(clade): + clade_name = clade + end_index = record.id.index(clade) + record.id = record.id[:end_index -1] + return clade_name + +def sort_by_location(record): + """Search sequence name for province name or 2 letter province code and add SeqRecord to + province-specific dictionary.""" + seq_name = record.id + if ('-BC-' in seq_name) or ('/British_Columbia/' in seq_name): + BC_list.append(record) #add Sequence record to BC_list + elif ('-AB-' in seq_name) or ('/Alberta/' in seq_name): + AB_list.append(record) #add Sequence record to AB_list + elif ('-ON-' in seq_name) or ('/Ontario/' in seq_name): + ON_list.append(record) #add Sequence record to ON_list + elif ('-QC-' in seq_name) or ('/Quebec/' in seq_name): + QC_list.append(record) #add Sequence record to QC_list + else: + nonprov_list.append(record) #add Sequence record to nonprov_list + return + +def extract_province(record): + """Search sequence name for province name or 2 letter province code and return province.""" + seq_name = record.id + if ('-BC-' in seq_name) or ('/British_Columbia/' in seq_name): + province = 'British Columbia' + elif ('-AB-' in seq_name) or ('Alberta' in seq_name): + province = '/Alberta/' + elif ('-ON-' in seq_name) or ('/Ontario/' in seq_name): + province = 'Ontario' + elif ('-QC-' in seq_name) or ('/Quebec/' in seq_name): + province = 'Quebec' + else: + province = "other" + return province + +def get_sequence_length(record): + """Return the length of a sequence in a Sequence record.""" + sequenceLength = len(str((record.seq))) + return sequenceLength + +def get_antigenic_site_substitutions(record): + """Count number of non-dotted amino acids in SeqRecord sequence and return as substitutions.""" + sequenceLength = get_sequence_length(record) + seqString = str(record.seq) + matches = seqString.count('.') + substitutions = sequenceLength - matches + return substitutions + +def calculate_percent_id(record, substitutions): + """Calculate sequence identity to a reference (based on substitutions and sequence length) and return percent id.""" + sequenceLength = get_sequence_length(record) + percentID = (1.00 - (float(substitutions)/float(sequenceLength))) + return percentID + +def output_linelist(sequenceList): + """Output a list of SeqRecords to a non-aggregated line list in csv format.""" + for record in sequenceList: + #get province, clade from sequence record + province = extract_province(record) + clade = extract_clade(record) + #calculate number of substitutions and % id to reference + substitutions = get_antigenic_site_substitutions(record) + percentID = calculate_percent_id(record,substitutions) + name_part = (record.id).rstrip() + ',' + clade_part = clade + ',' + substitutions_part = str(substitutions) + ',' + percID_part = str(percentID) + ',' + col = " ," #empty column + sequence = str(record.seq).strip() + csv_seq = ",".join(sequence) +"," + #write linelisted antigenic maps to csv file + comma_sep_line = name_part + col + clade_part + col + csv_seq + substitutions_part + percID_part + "\n" + lineListFile.write(comma_sep_line) + return + +with open (antigenicSiteIndexArray,'r') as siteIndices: + """Read amino acid positions from antigenic site index array and print as header after one empty row.""" + col = "," #empty column + #read items separated by comma's to position list + for line in siteIndices: + #remove whitespace from the end of each line + indicesLine = line.rstrip() + row1 = "\n" + #add comma-separated AA positions to header line + row2 = col + col + col + col + indicesLine + "\n" + #write first (empty) and 2nd (amino acid position) lines to linelist output file + lineListFile.write(row1) + lineListFile.write(row2) + +with open (refAntigenicMap,'r') as refMapFile: + """Read reference antigenic map from fasta and output amino acids, followed by column headers.""" + #read sequences from fasta to SeqRecord, uppercase, and store sequence string to ref_seq + record = SeqIO.read(refMapFile,"fasta",alphabet=IUPAC.protein) + record = record.upper() + ref_seq = str(record.seq).strip() #store sequence in variable for comparison to sample seqs + col = "," #empty column + name_part = (record.id).rstrip() + ',' + sequence = str(record.seq).strip() + csv_seq = ",".join(sequence) + #output row with reference sequence displayed above sample sequences + row3 = name_part + col + col + col + csv_seq + "\n" + lineListFile.write(row3) + #replaces digits in the indicesLine with empty strings + positions = indicesLine.split(',') + numPos = len(positions) + empty_indicesLine = ',' * numPos + #print column headers for sample sequences + row4 = "Sequence Name,N,Clade,Extra Substitutions," + empty_indicesLine + "Number of Amino Acid Substitutions in Antigenic Sites,% Identity of Antigenic Site Residues\n" + lineListFile.write(row4) + print ("\nREFERENCE ANTIGENIC MAP: '%s' (%i amino acids)" % (record.id, len(record))) + +with open(cladeDefinitionFile,'r') as cladeFile: + """Read clade definition file and store clade names in a list.""" + #remove whitespace from the end of each line and split elements at commas + for line in cladeFile: + elementList = line.rstrip().split(',') + name = elementList[0] #move 1st element to name field + cladeList.append(name) + +with open(inputAntigenicMaps,'r') as extrAntigMapFile: + """Read antigenic maps as protein SeqRecords and add to list.""" + #read Sequences from fasta file, uppercase and add to seqList + for record in SeqIO.parse(extrAntigMapFile, "fasta", alphabet=IUPAC.protein): + record = record.upper() + seqList.append(record) #add Seq to list of Sequences + +#print number of sequences to be process as user check +print "\nCOMPARING %i flu antigenic map sequences to the reference..." % len(seqList) +#parse each antigenic map sequence object +for record in seqList: + #assign Sequence to dictionaries according to location in name + sort_by_location(record) +#sort dictionary keys that access province-segregated lists +sorted_segregated_list_keys = sorted(segregated_lists.keys()) +print "\nSequence Lists Sorted by Province: " +#process each province-segregated SeqRecord list +for listname in sorted_segregated_list_keys: + #acesss list of sequences by the listname key + a_list = segregated_lists[listname] + # sort original SeqRecords by record id (i.e. name) + a_list = [f for f in sorted(a_list, key = lambda x : x.id)] + mod_list = [] # empty temporary list + for record in a_list: + #replace matching amino acid symbols with dots + rec = replace_matching_aa_with_dot(record) + mod_list.append(rec) #populate a list of modified records + segregated_lists[listname] = mod_list + print "\n'%s' List (Amino Acids identical to Reference Masked): " % (listname) + #output the list to csv as non-aggregated linelist + output_linelist(segregated_lists[listname]) + +extrAntigMapFile.close() +refMapFile.close() +lineListFile.close() \ No newline at end of file diff -r 1f113d9db8ba -r 1dc65ec11a40 tools/linelisting/linelisting.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/tools/linelisting/linelisting.xml Wed Jan 09 15:33:32 2019 -0500 @@ -0,0 +1,35 @@ + + + biopython + + + + + + + + + + + + + + + + + + + + + + + + diff -r 1f113d9db8ba -r 1dc65ec11a40 tools/linelisting/test-data/FluA_H3_antigenic_aa_indices.csv --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/tools/linelisting/test-data/FluA_H3_antigenic_aa_indices.csv Wed Jan 09 15:33:32 2019 -0500 @@ -0,0 +1,1 @@ +44,45,46,47,48,50,51,53,54,57,59,62,63,67,75,78,80,81,82,83,86,87,88,91,92,94,96,102,103,109,117,121,122,124,126,128,129,130,131,132,133,135,137,138,140,142,143,144,145,146,150,152,155,156,157,158,159,160,163,164,165,167,168,170,171,172,173,174,175,176,177,179,182,186,187,188,189,190,192,193,194,196,197,198,201,203,207,208,209,212,213,214,215,216,217,218,219,226,227,228,229,230,238,240,242,244,246,247,248,260,261,262,265,273,275,276,278,279,280,294,297,299,300,304,305,307,308,309,310,311,312 diff -r 1f113d9db8ba -r 1dc65ec11a40 tools/linelisting/test-data/Flu_Clade_Definitions_H3_20171121.csv --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/tools/linelisting/test-data/Flu_Clade_Definitions_H3_20171121.csv Wed Jan 09 15:33:32 2019 -0500 @@ -0,0 +1,12 @@ +3C.2a,1,3,I,144,S,145,S,159,Y,160,T,225,D,311,H,489,N,,,,,,,,,,,,,,,,,,,, +3C.2a_+_T131K_+_R142K_+_R261Q,2,3,I,131,K,142,K,144,S,145,S,159,Y,160,T,225,D,261,Q,311,H,489,N,,,,,,,,,,,,,, +3C.2a_+_N121K_+_S144K,2,3,I,121,K,144,K,145,S,159,Y,160,T,225,D,311,H,489,N,,,,,,,,,,,,,,,,,, +3C.2a_+_N31S_+_D53N_+_R142G_+_S144R_+_N171K_+_I192T_+_Q197H,2,3,I,31,S,53,N,142,G,144,R,145,S,159,Y,160,T,171,K,192,T,197,H,225,D,311,H,489,N,,,,,,,, +3C.2a1,2,3,I,144,S,145,S,159,Y,160,T,171,K,225,D,311,H,406,V,484,E,489,N,,,,,,,,,,,,,, +3C.2a1_+_N121K,3,3,I,121,K,144,S,145,S,159,Y,160,T,171,K,225,D,311,H,406,V,484,E,489,N,,,,,,,,,,,, +3C.2a1_+_N121K_+_K92R_+_H311Q,4,3,I,92,R,121,K,144,S,145,S,159,Y,160,T,171,K,225,D,311,Q,406,V,484,E,489,N,,,,,,,,,, +3C.2a1_+_N121K_+_T135K,4,3,I,121,K,135,K,144,S,145,S,159,Y,160,T,171,K,225,D,311,H,406,V,479,E,484,E,489,N,,,,,,,, +3C.2a1_+_N121K_+_I140M,4,3,I,121,K,140,M,144,S,145,S,159,Y,160,T,171,K,225,D,311,H,406,V,479,E,484,E,489,N,,,,,,,, +3C.2a1_+_N121K_+_R142G,4,3,I,121,K,142,G,144,S,145,S,159,Y,160,T,171,K,225,D,311,H,406,V,484,E,489,N,,,,,,,,,, +3C.2a1_+_N121K_+_R142G_+_I242V,5,3,I,121,K,142,G,144,S,145,S,159,Y,160,T,171,K,225,D,242,V,311,H,406,V,479,E,484,E,489,N,,,,,, +3C.3a,1,128,A,138,S,142,G,145,S,159,S,225,D,,,,,,,,,,,,,,,,,,,,,,,, diff -r 1f113d9db8ba -r 1dc65ec11a40 tools/linelisting/test-data/MAP_3C.2a_A_Hong_Kong_4801_2014_X-263B_EGG.fasta --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/tools/linelisting/test-data/MAP_3C.2a_A_Hong_Kong_4801_2014_X-263B_EGG.fasta Wed Jan 09 15:33:32 2019 -0500 @@ -0,0 +1,4 @@ +>Clade_3C.2a_A/Hong_Kong/4801/2014_X-263B_EGG +QNSSIEIDSQLENIQGQNKKLFVSKYSVPRTNNSNTGVTQNTSAIRSSSSRNTHLNYKAL +NTMNNEQFDKLIVGTDKDIFPAQSRXKRSAVIPNIGSIPSRIKGILNSTIRSSPGKKSEF +VRIACRYVKHS diff -r 1f113d9db8ba -r 1dc65ec11a40 tools/linelisting/test-data/fluA_H3_clade_assigned_antigenic_sites_extracted.fasta --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/tools/linelisting/test-data/fluA_H3_clade_assigned_antigenic_sites_extracted.fasta Wed Jan 09 15:33:32 2019 -0500 @@ -0,0 +1,94 @@ +>A-ON-314-2017_3C.3a +QNSSIEIDSQLENIQGQNKKLFVNKYNVPRTNNSNAGVTQNTSSIGSKSSRNTHLNSKAL +NTMNNEQFDKLIVGTDKDISLAQSRTKRSAVIPNIGSIPSRIKGILNSTIRSSPGKKSEF +VRIACRYVKQS + +>A-AB-399-2017_3C.2a_+_N31S_+_D53N_+_R142G_+_S144R_+_N171K_+_I192T_+_Q197H +QNSSIEINSQLENIQGQNKKLFVSKYNVPRTNNSNTGVTQNTSAIGSRSSRNTHLNYTAL +NTMNKEQFDKLIVGTDKDTFLAHSRTKRSAVIPNIGSIPSRIKGLLNSTIRSSPGKKSEF +VRIACRYVKHS + +>A-QC-303-2017_3C.2a_+_N31S_+_D53N_+_R142G_+_S144R_+_N171K_+_I192T_+_Q197H +QNSSIEINSQLENIQGQNKKLFVSKYNVPRTNNSNTGVTQNTSAIGSRSSRNTHLNYTAL +NTMNKEQFDKLIVGTDKDTFLAHSRTKRSAVIPNIGSIPSRIKGLLNSTIRSSPGKKSEF +VRIACRYVKHS + +>A-AB-319-2017_3C.2a_+_N31S_+_D53N_+_R142G_+_S144R_+_N171K_+_I192T_+_Q197H +QNSSIEINSQLENIQGQNKKLFVSKYNVPRTNNSNTGVTQNTSAIGSRSSRNTHLNYTAL +NTMNKEQFDKLIVGTDKDTFLAHSRTKRSAVIPNIGSIPSRIKGLLNSTIRSSPGKKSEF +VRIACRYVKHS + +>A-AB-308-2017_3C.2a1_+_N121K_+_T135K +QNSSIEIDSQLENIQDQNKKLFVSKHNVPRTKDSNTGVTQNKSAIRSSSSRNTHLNYTAL +NTMNKEQFDKLIIGTDKDIFLAQSRTKRSAVIPNIGSIPSRIKGILNSTIRSSPGKKSEF +VRIACRYVKHS + +>A-AB-341-2017_3C.2a1_+_N121K_+_T135K +QNSSIEIDSQLENIQDQNKKLFVSKHNVPRTKDSNTGVTQNKSAIRSSSSRNTHLNYTAL +NTMNKEQFDKLIIGTDKDIFLAQSRTKRSAVIPNIGSIPSRIKGILNSTIRSSPGKKSEF +VRIACRYVKHS + +>A-BC-024-2018_3C.2a1_+_N121K_+_T135K +QNSSIEIDSQLENIQDQNKKLFVSKHNVPRTKNSNTGVTQNKSAIRSSSSRNTHLNYTAL +NTMNKEQFDKLIIGTDKDIFLAQSRTKRSAVIPNIGSIPSRIKGILNSTIRSSPGKKSEF +VRIACRYVKHS + +>A-QC-309-2017_3C.2a1_+_N121K_+_K92R_+_H311Q +QNSSIEIDSQLGNIQDQNKKLFVSRYNVPRTKDSNTGVTQNKSAIGSSSSRNTHLNYTAL +NTMNKEQFDKLIVGTDKDIFLAQSRTKRSAVIPNIGSIPSRIKGILNSTIRSSPGKKSEF +VRIACRYVKQS + +>A-BC-324-2017_3C.2a1_+_N121K_+_K92R_+_H311Q +QNSSMEIDSQLGNIQGQNKKLFVSRYNVPRTKNSNTGVTQNKSAIGSSSSRNTHLNYTAL +NTMNKEQFDKLIVGTDKDIFLAQSRTKRSAVIPNIGSIPSRIKGILNSTIRSSPGKKSEF +VRIACRYVKQS + +>A-QC-315-2017_3C.2a1_+_N121K_+_K92R_+_H311Q +QNSSIEIDSQLGNIQGQNKKLFVSRYNVPRTKNSNAGVTQNKSAIGSSSSRNTHLNYTAL +NTMNKEQFDKLIVGTDKDIFLAQSRTKRSAVIPNIGSIPSRIKGILNSTIRSSPGKKSEF +VRIACRYVKQS + +>A-ON-016-2018_3C.2a_+_N121K_+_S144K +QNSSIEIDSQLENIQGQNKKLFVSKYNVPRTKNSNTGVTQNKSAIRSKSSKNTHLNYTAL +NTMNNEQFDKLIVGTDKDIFLAQSRTKRSAVIPNIGSIPSRIKGILNSTIQSSPGKKSEF +VRIACRYVKHS + +>A-BC-325-2017_3C.2a_+_N121K_+_S144K +QNSSIEIDSQLENIQGQNKKLFVSKYNVPRTKNSNTGVTQNKSAIRSKSSKNTHLNYTAL +NTMNNEQFDKLIVGTDKDIFLAQSRTKRSAVIPNIGSIPSRIKGILNSTIQSSPGKKSEF +VRIACRYVKHS + +>A-ON-003-2018_3C.2a_+_N121K_+_S144K +QNSSIEIDSQLENIQGQNKKLFVSKYNVPRTKNSNTGVTQNKSAIRSKSSKNTHLNYTAL +NTMNNEQFDKLIVGTDKDIFLAQSRTKRSAVIPNIGSIPSRIKGILNSTIQSSPGKKSEF +VRIACRYVKHS + +>A-ON-309-2017_No_Match +QNSSIEIDSQLENIQGQNKKLFVSKYNVPRTNNSNTGVKQNTSAIKSSSSRNTHLNYKAL +NTMNNEQFDKLIVGTDKDIFLAQSKTKISAVIPNIGSIPSRIKGILNSTIQSSPGKKSEF +VRIACRYVKHS + +>A-BC-330-2017_No_Match +QNSSIEIDSQLENIQGQNKKLFVSKYNVPRTNNSNTGVKQNTSAIKSRSSRNTHLNYTAL +NTMNNEQFDKLIVGTDKDIFLAQSRTKRSAVIPNIGSIPSRIKGILNSTIQSSPGKKSEF +VRIACRYVKHS + +>A-AB-415-2017_3C.2a_+_T131K_+_R142K_+_R261Q +QNSSIEIDSQLENIQGQNKKLFVSRYNVPRTNNSNTGVKQNTSAIKSSSSRNTHLNYTAL +NTMNNEQFDKLIVGTDKDIFLAQSRTKRSAVIPNIGFIPSRIKGILNSTIQSSPGKKSEF +VRIACRYVKHS + +>A-AB-400-2017_3C.2a_+_T131K_+_R142K_+_R261Q +QNSSIEIDSQLENIQGQNKKLFVSRYNVPRTNNSNTGVKQNTSAIKSSSSRNTHLNYTAL +NTMNNEQFDKLIVGTDKDIFLAQSRTKRSAVIPNIGSIPSRIKGILNSTIQSSPGKKSEF +VRIACRYVKHS + +>A-AB-416-2017_3C.2a_+_T131K_+_R142K_+_R261Q +QNSSIEIDSQLENIQGQNKKLFVSRYNVPRTNNSNTGVKQNTSAIKSSSSRNTHLNYTAL +NTMNNEQFDKLIVGTDKDIFLAQSRTKRSAVIPNIGSIPSRIKGILNSTIQSSPGKKSEF +VRIACRYVKHS + +>A-QC-316-2017_3C.2a_+_T131K_+_R142K_+_R261Q +QNSSIEIDSQLENIQGQNKKLFVSRYNVPRTNNSNTGVKQNTSAIKSSSSRNTHLNYTAL +NTMNNEQFDKLIVGTDKDIFLAQSRTKRSAVIPNIGSIPSRIKGILNSTIQSSPGKKSEF +VRIACRYVKHS diff -r 1f113d9db8ba -r 1dc65ec11a40 tools/reformat_usearch_collapsed_fasta/reformat_usearch_collapsed_fasta.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/tools/reformat_usearch_collapsed_fasta/reformat_usearch_collapsed_fasta.py Wed Jan 09 15:33:32 2019 -0500 @@ -0,0 +1,42 @@ +#!/usr/bin/env python +import sys, re + +'''Accepts a sequence-collapsed fasta output from USEARCH (drive5) software and reformats the fasta +definition lines by replacing occurences of ';size=N;' with '_xN' and writing output to fasta +(N = number of identical sequences represented by the collapsed sequence). If N is not greater +than 1 (i.e. only 1 sample with that sequence), replaces ';size=N;' with ''. For example, +'>sequence_A;size=2;' is replaced with '>sequence_A_x2', whereas '>sequence_B;size=1;' is +replaced with 'sequence_B'. +#USAGE EXAMPLE: python reformat_usearch_collapsed_fasta.py usearch_collapsed_sequences.fasta output.fasta + +Author: Diane Eisler, Molecular Microbiology & Genomics, BCCDC Public Health Laboratory,Feb 2018''' + +inFileHandle = sys.argv[1] #input fasta filename +outFileHandle = sys.argv[2] #output fasta filename +outFile = open(outFileHandle,'w') #open a writable output file + +separator = "_x" #the string separating sequence name from number of sequences, N +regex = re.compile(";size=[0-9]{0,};") #regex snippet from debuggex + +#parse fasta definition lines for pattern matching regex +with open(inFileHandle,'r') as inFile: + for line in inFile: + if ">" in line: + #look for regex pattern in fasta definition line + matchArray = regex.findall(line) + if len(matchArray) > 0: #replace the matching substring + substringToReplace = matchArray[0] + endIndex = len(substringToReplace) + digits = substringToReplace[6:endIndex -1] #digits between ';size=' and ';' + if int(digits) > 1: #show number of sequences if greater than 1 + replacementString = separator + digits + else: + replacementString = "" #otherwise, just display sequence name + newDefline = line.rstrip().replace(substringToReplace, replacementString) + outFile.write(newDefline + "\n") + else: #in lines without ">", write out sequence unmodified + seq = line.rstrip() + outFile.write(seq+"\n") + +inFile.close() +outFile.close() diff -r 1f113d9db8ba -r 1dc65ec11a40 tools/reformat_usearch_collapsed_fasta/reformat_usearch_collapsed_fasta.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/tools/reformat_usearch_collapsed_fasta/reformat_usearch_collapsed_fasta.xml Wed Jan 09 15:33:32 2019 -0500 @@ -0,0 +1,30 @@ + + + biopython + + + + + + + + + + + + + + + + + + + + + + diff -r 1f113d9db8ba -r 1dc65ec11a40 tools/reformat_usearch_collapsed_fasta/test-data/10_usearch_collapsed_sequences.fasta --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/tools/reformat_usearch_collapsed_fasta/test-data/10_usearch_collapsed_sequences.fasta Wed Jan 09 15:33:32 2019 -0500 @@ -0,0 +1,210 @@ +>SampleA;size=40; +CAAAAAATTCCTGGAAATGACAATAGCACGGCAACGCTGTGCCTTGGGCACCATGCAGTACCAAATGGAACGATAGTGAA +AACAATCACAAATGACCGAATTGAAGTTACTAATGCTACTGAGTTGGTTCAGAATTCCTCAATAGGTGAAATATGCGACA +GTCCTCATCAGATCCTTGATGGAGAGAACTGCACACTAATAGATGCTCTATTGGGAGACCCTCAGTGTGATGGCTTTCAA +AATAAGAAATGGGACCTTTTTGTTGAACGAAGCAAAGCCTACAGCAACTGTTACCCTTATGATGTGCCGGATTATGCCTC +CCTTAGGTCACTAGTTGCCTCATCTGGCACACTGGAGTTTAAAAATGAAAGCTTCAATTGGACTGGAGTCACTCAAAACG +GAACAAGTTCTGCTTGCATAAGGGGATCTAGTAGTAGTTTCTTTAGTAGATTAAATTGGTTGACCCACTTAAACTACACA +TATCCAGCATTGAACGTGACTATGCCAAACAAGGAACAATTTGACAAATTGTACATTTGGGGGGTTCACCACCCGGGTAC +GGACAAGGACCAAATCTTCCTGTATGCTCAATCATCAGGAAGAATCACAGTATCTACCAAAAGAAGCCAACAAGCTGTAA +TCCCAAATATCGGATCTAGACCCAGAATAAGGGATATCCCTAGCAGAATAAGCATCTATTGGACAATAGTAAAACCGGGA +GACGTACTTTTGATTAACAGCACAGGGAATCTAATTGCTCCTAGGGGTTACTTCAAAATACGAAGTGGGAAAAGCTCAAT +AATGAGATCAGATGCACCCATTGGCAAATGCAAGTCTGAATGCATCACTCCAAATGGAAGCATTCCCAATGACAAACCAT +TCCAAAATGTAAACAGGATCACATACGGGGCCTGTCCCAGATATGTTAAGCATAGCACTCTGAAATTGGCAACAGGAATG +CGAAATGTACCAGAGAAACAAACTAGAGGCATATTTGGCGCAATAGCGGGTTTCATAGAAAATGGTTGGGAGGGAATGGT +GGATGGTTGGTACGGTTTCAGGCATCAAAATTCTGAGGGAAGAGGACAAGCAGCAGATCTCAAAAGCACTCAAGCAGCAA +TCGATCAAATCAATGGGAAGCTGAATCGGTTGATCGGAAAAACCAACGAGAAATTCCATCAGATTGAAAAAGAATTCTCA +GAAGTAGAAGGAAGAGTTCAAGACCTTGAGAAATATGTTGAGGACACTAAAATAGATCTCTGGTCATACAACGCGGAGCT +TCTTGTTGCCCTGGAGAACCAACATACAATTGATCTAACTGACTCAGAAATGAACAAACTGTTTGAAAAAACAAAGAAGC +AACTGAGGGAAAATGCTGAGGATATGGGAAATGGTTGTTTCAAAATATACCACAAATGTGACAATGCCTGCATAGAATCA +ATAAGAAATGAAACTTATGACCACAATGTGTACAGGGATGAAGCATTAAACAACCGGTTCCAGATCAAGGGAGTTGAGCT +GAAGTCAGGGTACAAAGATTGG +>SampleB;size=24; +CAAAAAATTCCTGGAAATGACAATAGCACGGCAACGCTGTGCCTTGGGCACCATGCAGTACCAAACGGAACGATAGTGAA +AACAATCACAAATGACCGAATTGAAGTTACTAATGCTACTGAGTTGGTTCAGAATTCCTCAATAGGTGAAATATGCGACA +GTCCTCATCAGATCCTTGATGGAGAGAACTGCACACTAATAGATGCTCTATTGGGAGACCCTCAGTGTGATGGCTTTCAA +AATAAGAAATGGGACCTTTTTGTTGAACGAAGCAAAGCCTACAGCAACTGTTACCCTTATGATGTGCCGGATTATGCCTC +CCTTAGGTCACTAGTTGCCTCATCCGGCACACTGGAGTTTAAAAATGAAAGCTTCAATTGGACTGGAGTCACTCAAAACG +GAACAAGTTCTGCTTGCATAAGGGGATCTAGTAGTAGTTTCTTTAGTAGATTAAATTGGTTGACCCACTTAAACTACACA +TATCCAGCATTGAACGTGACTATGCCAAACAAGGAACAATTTGACAAATTGTACATTTGGGGGGTTCACCACCCGGGTAC +GGACAAGGACCAAATCTTCCTGTATGCTCAATCATCAGGAAGAATCACAGTATCTACCAAAAGAAGCCAACAAGCTGTAA +TCCCAAATATCGGATCTAGACCCAGAATAAGGGATATCCCTAGCAGAATAAGCATCTATTGGACAATAGTAAAACCGGGA +GACATACTTTTGATTAACAGCACAGGGAATCTAATTGCTCCTAGGGGTTACTTCAAAATACGAAGTGGGAAAAGCTCAAT +AATGAGATCAGATGCACCCATTGGCAAATGCAAGTCTGAATGCATCACTCCAAATGGAAGCATTCCCAATGACAAACCAT +TCCAAAATGTAAACAGGATCACATACGGGGCCTGTCCCAGATATGTTAAGCATAGCACTCTGAAATTGGCAACAGGAATG +CGAAATGTACCAGAGAAACAAACTAGAGGCATATTTGGCGCAATAGCGGGTTTCATAGAAAATGGTTGGGAGGGAATGGT +GGATGGTTGGTACGGTTTCAGGCATCAAAATTCTGAGGGAAGAGGACAAGCAGCAGATCTCAAAAGCACTCAAGCAGCAA +TCGATCAAATCAATGGGAAGCTGAATCGGTTGATCGGAAAAACCAACGAGAAATTCCATCAGATTGAAAAAGAATTCTCA +GAAGTAGAAGGAAGAGTTCAAGACCTTGAGAAATATGTTGAGGACACTAAAATAGATCTCTGGTCATACAACGCGGAGCT +TCTTGTTGCCCTGGAGAACCAACATACAATTGATCTAACTGACTCAGAAATGAACAAACTGTTTGAAAAAACAAAGAAGC +AACTGAGGGAAAATGCTGAGGATATGGGAAATGGTTGTTTCAAAATATACCACAAATGTGACAATGCCTGCATAGGATCA +ATAAGAAATGAAACTTATGACCACAATGTGTACAGGGATGAAGCATTAAACAACCGGTTCCAGATCAAGGGAGTTGAGCT +GAAGTCAGGGTACAAAGATTGG +>SampleC;size=22; +CAAAAAATTCCTGGAAATGACAATAGCACGGCAACGCTGTGCCTTGGGCACCATGCAGTACCAAACGGAACGATAGTGAA +AACAATCACAAATGACCGAATTGAAGTTACTAATGCTACTGAGTTGGTTCAGAATTCCTCAATAGGTGAAATATGCGACA +GTCCTCATCAGGTCCTTGATGGAGAGAACTGCACACTAATAGATGCTCTATTGGGGGACCCTCAGTGTGACGGCTTTCAA +AATAAGAAATGGGACCTTTTTGTTGAACGAAGCAGAGCCTACAGCAACTGTTACCCTTATGATGTGCCGGATTATGCCTC +CCTTAGGTCACTAGTTGCCTCATCCGGCACACTGGAGTTTAAAAATGAAAGCTTCAATTGGACTGGAGTCACTCAAAACG +GAACAAGTTCTGCTTGCATAAGGAGATCTAGTAGTAGTTTCTTTAGTAGATTAAATTGGTTGACCCACTTAAACTACACA +TATCCAGCACTGAACGTGACTATGCCAAACAAGGAACAATTTGACAAATTGTACATTTGGGGGGTCCACCACCCGGGTAC +GGACAAGGACCAAATCTTCCTGTATGCTCGATCATCAGGAAGAATCACAGTATCTACCAAAAGAAGCCAACAAGCTGTAA +TCCCAAATATCGGATCTAGACCCAGAATAAGGGATATCCCTAGCAGAATAAGCATCTATTGGACAATAGTAAAACCGGGA +GACATACTTTTGATTAACAGCACAGGGAATCTAATTGCTCCTAGGGGTTACTTCAAAATACGAAGTGGGAAAAGCTCAAT +AATGAGATCAGATGCACCCATTGGCAAATGCAAATCTGAATGCATCACTCCAAATGGAAGCATTCCCAATGACAAACCAT +TCCAAAATGTAAACAGGATCACATACGGGGCCTGTCCCAGATATGTTAAGCAAAGCACTCTGAAATTGGCAACAGGAATG +CGAAATGTACCAGAGAAACAAACTAGAGGCATATTTGGCGCAATAGCGGGTTTCATAGAAAATGGTTGGGAGGGAATGGT +GGATGGTTGGTACGGTTTCAGGCATCAAAATTCTGAGGGAAGAGGACAAGCAGCAGATCTCAAAAGCACTCAAGCAGCAA +TCGATCAAATCAATGGGAAGCTGAATCGATTGATCGGAAAAACCAACGAGAAATTCCATCAGATTGAAAAAGAATTCTCA +GAAGTAGAAGGAAGAGTTCAAGACCTTGAGAAATATGTTGAGGACACTAAAATAGATCTCTGGTCATACAACGCGGAGCT +TCTTGTTGCCCTGGAGAACCAACATACAATTGATCTAACTGACTCAGAAATGAACAAACTGTTTGAAAAAACAAAGAAGC +AACTGAGGGAAAATGCTGAGGATATGGGAAATGGTTGTTTCAAAATATACCACAAATGTGACAATGCCTGCATAGGATCA +ATAAGAAATGAAACTTATGACCACAATGTGTACAGGGATGAAGCATTAAACAACCGGTTCCAGATCAAGGGAGTTGAGCT +GAAGTCAGGGTACAAAGATTGG +>SampleD;size=13; +CAAAAAATTCCTGGAAATGACAATAGCACGGCAACGCTGTGCCTTGGGCACCATGCAGTACCAAATGGAACGATAGTGAA +AACAATCACAAATGACCGAATTGAAGTTACTAATGCTACTGAGTTGGTTCAGAATTCCTCAATAGGTGAAATATGCGACA +GTCCTCATCAGATCCTTGATGGAGAGAACTGCACACTAATTGATGCTCTATTGGGAGACCCTCAGTGTGATGGCTTTCAA +AATAAGAAATGGGACCTTTTTGTTGAACGAAGCAAAGCCTACAGCAACTGTTACCCTTATGATGTGCCGGATTATGCCTC +CCTTAGGTCACTAGTTGCCTCATCCGGCACACTGGAGTTTAAAAATGAAAGCTTCAATTGGACTGGAGTCACTCAAAACG +GAAAAAGTTCTGCTTGCATAAGGAGATCTAGTAGTAGTTTCTTTAGTAGATTAAATTGGTTGACCCACTTAAACTACACA +TATCCAGCATTGAACGTGAGTATGCCAAACAAGGAACAATTTGACAAATTGTACATTTGGGGGGTTCATCACCCGGGTAC +GGACAAGGACCAAATCTTCCTGTATGCTCAATCATCAGGAAGAATCACAGTATCTACCAAAAGAAGCCAACAAGCTGTAA +TCCCAAATATCGGATCTAGACCCAGAATAAGGGATATCCCTAGCAGAATAAGCATCTATTGGACAATAGTAAAACCGGGA +GACATACTTTTGATTAACAGCACAGGAAATCTAATTGCTCCTAGGGGCTACTTCAAAATACGAAGTGGGAAAAGCTCAAT +AATGAGATCAGATGCACCCATTGGCAAATGCAAGTCTGAATGCATCACTCCAAATGGAAGCATTCCCAATGACAAACCAT +TCCAAAATGTAAACAGGATCACATACGGGGCCTGTCCCAGATATGTTAAGCATAGCACTCTGAAATTGGCAACAGGAATG +CGAAATGTACCAGAGAAACAAACTAGAGGCATATTTGGCGCAATAGCGGGTTTCATAGAAAATGGTTGGGAGGGAATGGT +GGATGGTTGGTACGGTTTCAGGCATCAAAATTCTGAGGGGAGAGGACAAGCAGCAGATCTCAAAAGCACTCAAGCAGCAA +TCGATCAAATCAATGGGAAGCTGAATCGGTTGATCGGAAAAACCAACGAGAAATTCCATCAGATTGAAAAAGAATTCTCA +GAAGTAGAAGGAAGAGTTCAAGACCTTGAGAAATATGTTGAGGACACTAAAATAGATCTCTGGTCATACAACGCGGAGCT +TCTTGTTGCCCTGGAGAACCAACATACAATTGATCTAACTGACTCAGAAATGAACAAACTGTTTGAAAAAACAAAGAAGC +AACTGAGGGAAAATGCTGAGGATATGGGAAATGGTTGTTTCAAAATATACCACAAATGTGACAATGCCTGCATAGAATCA +ATAAGAAATGAAACTTATGACCACAATGTGTACAGGGATGAAGCATTAAACAACCGGTTCCAGATCAAGGGAGTTGAGCT +GAAGTCAGGGTACAAAGATTGG +>SampleE;size=9; +CAAAAAATTCCTGGAAATGACAATAGCACGGCAACGCTGTGCCTTGGGCACCATGCAGTACCAAATGGAACGATAGTGAA +AACAATCACAAATGACCGAATTGAAGTTACTAATGCTACTGAGTTGGTTCAGAATTCCTCAATAGGTGAAATATGCGACA +GTCCTCATCAGATCCTTGATGGAGAGAACTGCACACTAATTGATGCTCTATTGGGAGACCCTCAGTGTGATGGCTTTCAA +AATAAGAAATGGGACCTTTTTGTTGAACGAAGCAAAGCCTACAGCAACTGTTACCCTTATGATGTGCCGGATTATGCCTC +CCTTAGGTCACTAGTTGCCTCATCCGGCACACTGGAGTTTAAAAATGAAAGCTTCAATTGGACTGGAGTCACTCAAAATG +GAAAAAGTTCTGCTTGCATAAGGAGATCTAGTAGTAGTTTCTTTAGTAGATTAAATTGGTTGACCCACTTAAACTACACA +TATCCAGCATTGAACGTGACTATGCCAAACAAGGAACAATTTGACAAATTGTACATTTGGGGGGTTCATCACCCGGGTAC +GGACAAGGACCAAATCTTCCTGTATGCTCAATCATCAGGAAGAATCACAGTATCTACCAAAAGAAGCCAACAAGCTGTAA +TCCCAAATATCGGATCTAGACCCAGAATAAGGGATATCCCTAGCAGAATAAGCATCTATTGGACAATAGTAAAACCGGGA +GACATACTTTTGATTAACAGCACAGGGAATCTAATTGCTCCTAGGGGTTACTTCAAAATACGAAGTGGGAAAAGCTCAAT +AATGAGATCAGATGCACCCATTGGCAAATGCAAGTCTGAATGCATCACTCCAAATGGAAGCATTCCCAATGACAAACCAT +TCCAAAATGTAAACAGGATCACATACGGGGCCTGTCCCAGATATGTTAAGCATAGCACTCTGAAATTGGCAACAGGAATG +CGAAATGTACCAGAGAAACAAACTAGAGGCATATTTGGCGCAATAGCGGGTTTCATAGAAAATGGTTGGGAGGGAATGGT +GGATGGTTGGTACGGTTTCAGGCATCAAAATTCTGAGGGAAGAGGACAAGCAGCAGATCTCAAAAGCACTCAAGCAGCAA +TCGATCAAATCAATGGGAAGCTGAATCGGTTGATCGGAAAAACCAACGAGAAATTCCATCAGATTGAAAAAGAATTCTCA +GAAGTAGAAGGAAGAGTTCAAGACCTTGAGAAATATGTTGAGGACACTAAAATAGATCTCTGGTCATACAACGCGGAGCT +TCTTGTTGCCCTGGAGAACCAACATACAATTGATCTAACTGACTCAGAAATGAACAAACTGTTTGAAAAAACAAAGAAGC +AACTGAGGGAAAATGCTGAGGATATGGGAAATGGTTGTTTCAAAATATACCACAAATGTGACAATGCCTGCATAGAATCA +ATAAGAAATGAAACTTATGACCACAATGTGTACAGGGATGAAGCATTAAACAACCGGTTCCAGATCAAGGGAGTTGAGCT +GAAGTCAGGGTACAAAGATTGG +>SampleF;size=8; +CAAAAAATTCCTGGAAATGACAATAGCACGGCAACGCTGTGCCTTGGGCACCATGCAGTACCAAATGGAACGATAGTGAA +AACAATCACAAATGACCGAATTGAAGTTACTAATGCTACTGAGTTGGTTCAGAATTCCTCAATAGGTGAAATATGCGACA +GTCCTCATCAGATCCTTGATGGAGAGAACTGCACACTAATTGATGCTCTATTGGGAGACCCTCAGTGTGATGGCTTTCAA +AATAAGAAATGGGACCTTTTTGTTGAACGAAGCAAAGCCTACAGCAGCTGTTACCCTTATGATGTGCCGGATTATGCCTC +CCTTAGGTCACTAGTTGCCTCATCCGGCACACTGGAGTTTAAAAATGAAAGCTTCAATTGGACTGGAGTCACTCAAAACG +GAAAAAGTTCTGCTTGCATAAGGAGATCTAGTAGTAGTTTCTTTAGTAGATTAAATTGGTTAACCCACTTAAACTACACA +TATCCAGCATTGAACGTGACTATGCCAAACAAGGAACAATTTGACAAATTGTACATTTGGGGGGTTCATCACCCGGGTAC +GGACAAGGACCAAATCTTCCTGTATGCTCAATCATCAGGAAGAATCACAGTATCTACCAAAAGAAGCCAACAAGCTGTAA +TCCCAAATATCGGATCTAGACCCAGAATAAGGGATATCCCTAGCAGAATAAGCATCTATTGGACAATAGTAAAACCGGGA +GACATACTTTTGATTAACAGCACAGGGAATCTAATTGCTCCTAGGGGTTACTTCAAAATACGAAGTGGGAAAAGCTCAAT +AATGAGATCAGATGCACCCATTGGCAAATGCAAGTCTGAATGCATCACTCCAAATGGAAGCATTCCCAATGACAAACCAT +TCCAAAATGTAAACAGGATCACATACGGGGCCTGTCCCAGATATGTTAAGCATAGCACTCTGAAATTGGCAACAGGAATG +CGAAATGTACCAGAGAAACAAACTAGAGGCATATTTGGCGCAATAGCGGGTTTCATAGAAAATGGTTGGGAGGGAATGGT +GGATGGTTGGTACGGTTTCAGGCATCAAAATTCTGAGGGAAGAGGACAAGCAGCAGATCTCAAAAGCACTCAAGCAGCAA +TCGATCAAATCAATGGGAAGCTGAATCGGTTGATCGGAAAAACCAACGAGAAATTCCATCAGATTGAAAAAGAATTCTCA +GAAGTAGAAGGAAGAGTTCAAGACCTTGAGAAATATGTTGAGGACACTAAAATAGATCTCTGGTCATACAACGCGGAGCT +TCTTGTTGCCCTGGAGAACCAACATACAATTGATCTAACTGACTCAGAAATGAACAAACTGTTTGAAAAAACAAAGAAGC +AACTGAGGGAAAATGCTGAGGATATGGGAAATGGTTGTTTCAAAATATACCACAAATGTGACAATGCCTGCATAGAATCA +ATAAGAAATGAAACTTATGACCACAATGTGTACAGGGATGAAGCATTAAACAACCGGTTCCAGATCAAGGGAGTTGAGCT +GAAGTCAGGGTACAAAGATTGG +>SampleG;size=8; +CAAAAAATTCCTGGAAATGACAATAGCACGGCAACGCTGTGCCTTGGGCACCATGCAGTACCAAACGGAACGATAGTGAA +AACAATCACAAATGACCGAATTGAAGTTACTAATGCTACTGAGTTGGTTCAGAATTCCTCAATAGGTGAAATATGCAACA +GTCCTCATCAGATCCTTGATGGAGAAAACTGCACACTAATAGATGCTCTATTGGGAGACCCTCAGTGTGATGGCTTTCAA +AATAAGAAATGGGACCTTTTTGTTGAACGAAGCAAAGCCTACAGCAACTGTTACCCTTATGATGTGCCGGATTATGCCTC +CCTTAGGTCACTAGTTGCCTCATCCGGCACACTGGAGTTTAACAATGAAAGCTTCAATTGGACTGGAGTCAAACAAAACG +GAACAAGTTCTGCTTGTATAAGGAAATCTAGTAGTAGTTTCTTTAGTAGATTAAATTGGTTGACCCACTTAAACTACACA +TATCCAGCATTGAACGTGACTATGCCAAACAATGAACAATTTGACAAATTGTACATTTGGGGGGTTCACCACCCGGGTAC +GGACAAGGACCAAATCTCCCTGTATGCTCAATCATCAGGAAGAATCACAGTATCTACCAAAAGAAGCCAACAAGCTGTAA +TCCCAAATATCGGATCCAGACCCAGAATAAGGGATATCCCTAGCAGAATAAGCATCTATTGGACAATAGTAAAACCGGGA +GACATACTTTTGATTAACAGCACAGGGAATCTAATTGCTCCTAGGGGTTACTTCAAAATACAAAGTGGGAAAAGCTCAAT +AATGAGATCAGATGCACCCATTGGCAAATGCAAGTCTGAATGCATCACTCCAAATGGAAGCATTCCCAATGACAAACCAT +TCCAAAATGTAAACAGGATCACATACGGGGCCTGTCCCAGATATGTTAAGCATAGCACTCTGAAATTGGCAACAGGAATG +CGAAATGTACCAGAGAAACAAACTAGGGGCATATTTGGCGCAATAGCGGGTTTCATAGAAAATGGTTGGGAGGGAATGGT +GGATGGTTGGTACGGTTTCAGGCATCAAAATTCTGAGGGAAGGGGACAAGCAGCAGATCTCAAAAGCACTCAAGCAGCAA +TCGATCAAATCAATGGGAAGCTGAATCGGTTGATCGGGAAAACCAACGAGAAATTCCATCAGATTGAAAAAGAATTCTCA +GAAGTAGAAGGAAGAATTCAGGACCTTGAGAAATATGTTGAGGACACTAAAATAGATCTCTGGTCATACAACGCGGAGCT +TCTTGTTGCCCTGGAGAACCAACATACAATTGATCTAACTGACTCAGAAATGAACAAACTGTTTGAAAAAACAAAGAAGC +AACTGAGGGAAAATGCTGAGGATATGGGAAATGGTTGTTTCAAAATATACCACAAATGTGACAATGCCTGCATAGGTTCA +ATAAGAAATGGAACTTATGACCACAATGTGTACAGGGATGAAGCATTAAACAACCGGTTCCAGATCAAGGGAGTTGAACT +GAAGTCAGGGTACAAAGATTGG +>SampleH;size=7; +CAAAAAATTCCTGGAAATGACAATAGCACGGCAACGCTGTGCCTTGGGCACCATGCAGTACCAAACGGAACGATAGTGAA +AACAATCACAAATGACCGAATTGAAGTTACTAATGCTACTGAGTTGGTTCAGAATTCCTCAATAGGTGAAATATGCGACA +GTCCTCATCAGATCCTTGATGGAGAGAACTGCACACTAATAGATGCTCTATTGGGAGACCCTCAGTGTGATGGCTTTCAA +AATAAGAAATGGGACCTTTTTGTTGAACGAAGCAAAGCCTACAGCAACTGTTACCCTTATGATGTGCCGGATTATGCCTC +CCTTAGGTCACTAGTTGCCTCATCCGGCACACTGGAGTTTAAAAATGAAAGCTTCAATTGGACTGGAGTCACTCAAAACG +GAACAAGTTCTGCTTGCATAAGGGGATCTAGTAGTAGTTTCTTTAGTAGATTAAATTGGTTGACCCACTTAAACTACACA +TATCCAGCATTGAACGTGACTATGCCAAACAAGGAACAATTTGACAAATTGTACATTTGGGGGGTTCACCACCCGGGTAC +GGACAAGGACCAAATCTTCCTGTATGCTCAATCATCAGGAAGAATCACAGTATCTACCAAAAGAAGCCAACAAGCTGTAA +TCCCAAATATCGGATCTAGACCCAGAATAAGGGATATCCCTAGCAGAATAAGCATCTATTGGACAATAGTAAAACCGGGA +GACATACTTTTGATTAACAGCACGGGGAATCTAATTGCTCCTAGGGGTTACTTCAAAATACGAAGTGGGAAAAGCTCAAT +AATGAGATCAGATGCACCCATTGGCAAATGCAAGTCTGAATGCATCACTCCAAATGGAAGCATTCCCAATGACAAACCAT +TCCAAAATGTAAACAGGATCACATACGGGGCCTGTCCCAGATATGTTAAGCATAGCACTCTGAAATTGGCAACAGGAATG +CGAAATGTACCAGAGAAACAAACTAGAGGCATATTTGGCGCAATAGCGGGTTTCATAGAAAATGGTTGGGAGGGAATGGT +GGATGGTTGGTACGGTTTCAGGCATCAAAATTCTGAGGGAAGAGGACAAGCAGCAGATCTCAAAAGCACGCAGGCAGCAA +TCGATCAAATCAATGGGAAGCTGAATCGGTTGATCGGAAAAACCAACGAGAAATTCCATCAGATTGAAAAAGAATTCTCA +GAAGTAGAAGGAAGAGTTCAAGACCTTGAGAAATATGTTGAGGACACTAAAATAGATCTTTGGTCATACAACGCGGAGCT +TCTTGTTGCCCTGGAGAACCAACATACAATTGATCTAACTGACTCAGAAATGAACAAACTGTTTGAAAAAACAAAGAAGC +AACTGAGGGAAAATGCTGAGGATATGGGAAATGGTTGTTTCAAAATATACCACAAATGTGACAATGCCTGCATAGGATCA +ATAAGAAATGAAACTTATGACCACAATGTGTACAGGGATGAAGCATTAAACAACCGGTTCCAGATCAAGGGAGTTGAGCT +GAAGTCAGGGTACAAAGATTGG +>SampleI;size=6; +CAAAAAATTCCTGGAAATGACAATAGCACGGCAACGCTGTGCCTTGGGCACCATGCAGTACCAAATGGAACGATAGTGAA +AACAATCACAAATGACCGAATTGAAGTTACTAATGCTACTGAGTTGGTTCAGAATTCCTCAATAGGTGAAATATGCGACA +GTCCTCATCAGATCCTTGATGGAGAGAACTGCACACTAATTGATGCTCTATTGGGAGACCCTCAGTGTGATGGCTTTCAA +AATAAGAAATGGGACCTTTTTGTTGAACGAAGCAAAGCCTACAGCAACTGTTACCCTTATGATGTGCCGGATTATGCCTC +CCTTAGGTCACTAGTTGCCTCATCCGGCACACTGGAGTTTAAAAATGAAAGCTTCAATTGGACTGGAGTCACTCAAAACG +GAAAAAGTTCTGCTTGCATAAGGAGATCTAGTAGTAGTTTCTTTAGTAGATTAAATTGGTTGACCCACTTAAACTACACA +TATCCAGCATTGAACGTGACTATGCCAAACAAGGAACAATTTGACAAATTGTACATTTGGGGGGTTCATCACCCGGGTAC +GGACAAGGACCAAATCTTCCTGTATGCTCAATCATCAGGAAGAATCACAGTATCTACCAAAAGAAGCCAACAAGCTGTAA +TCCCAAATATCGGATCTAGACCCAGAATAAGGGATATCCCTAGCAGAATAAGCATCTATTGGACAATAGTAAAACCGGGA +GACATACTTTTGATTAACAGCACAGGGAATCTAATTGCTCCTAGGGGTTACTTCAAAATACGAAGTGGGAAAAGCTCAAT +AATGAGATCAGATGCACCCATTGGCAAATGCAAGTCTGAATGCATCACTCCAAATGGAAGCATTCCCAATGACAAACCAT +TCCAAAATGTAAACAGGATCACATACGGGGCCTGTCCCAGATATGTTAAGCATAGCACTCTGAAATTGGCAACAGGAATG +CGAAATGTACCAGAGAAACAAACTAGAGGCATATTTGGCGCAATAGCGGGTTTCATAGAAAATGGTTGGGAGGGAATGGT +GGATGGTTGGTACGGTTTCAGGCATCAAAATTCTGAGGGAAGAGGACAAGCAGCAGATCTCAAAAGCACTCAAGCAGCAA +TCGATCAAATCAATGGGAAGCTGAATCGGTTGATCGGAAAAACCAACGAGAAATTCCATCAGATTGAAAAAGAATTCTCA +GAAGTAGAAGGAAGAGTTCAAGACCTTGAGAAATATGTTGAGGACACTAAAATAGATCTCTGGTCATACAACGCGGAGCT +TCTTGTTGCCCTGGAGAACCAACATACAATTGATCTAACTGACTCAGAAATGAACAAACTGTTTGAAAAAACAAAGAAGC +AACTGAGGGAAAATGCTGAGGATATGGGAAATGGTTGTTTCAAAATATACCACAAATGTGACAATGCCTGCATAGAATCA +ATAAGAAATGAAACTTATGACCACAATGTGTACAGGGATGAAGCATTAAACAACCGGTTCCAGATCAAGGGAGTTGAGCT +GAAGTCAGGGTACAAAGATTGG +>SampleJ;size=5; +CAAAAAATTCCTGGAAATGACAATAGCACGGCAACGCTGTGCCTTGGGCACCATGCAGTACCAAACGGAACGATAGTGAA +AACAATCACAAATGACCGAATTGAAGTTACTAATGCTACTGAGTTGGTTCAGAATTCCTCAATAGGTGAAATATGCGACA +GTCCTCATCAGATCCTTGATGGAGAAAACTGCACACTAATAGATGCTCTATTGGGAGACCCTCAGTGTGATGGCTTTCAA +AATAAGAAATGGGACCTTTTTGTTGAACGAAGCAAAGCCTACAGCAACTGTTACCCTTATGATGTGCCGGATTATGCCTC +CCTTAGGTCACTAGTTGCCTCATCCGGCACACTGGAGTTTAACAATGAAAGCTTCAATTGGACTGGAGTCAAACAAAACG +GAACAAGTTCTGCTTGTATAAGGAAATCTAGTAGTAGTTTCTTTAGTAGATTAAATTGGTTGACCCACTTAAACTACACA +TATCCAGCATTGAACGTGACTATGCCAAACAATGAACAATTTGACAAATTGTACATTTGGGGGGTTCACCACCCGGGTAC +GGACAAGGACCAAATCTTCCTGTATGCTCAATCATCAGGAAAAATCACAGTATCTACCAAAAGAAGCCAACAAGCTGTAA +TCCCAAATATCGGATCCAGACCCAGAATAAGGGATATCCCTAGCAGAATAAGCATCTATTGGACAATAGTAAAACCGGGA +GACATACTTTTGATTAACAGCACAGGGAATCTAATTGCTCCTAGGGGTTACTTCAAAATACAAAGTGGGAAAAGCTCAAT +AATGAGATCAGATGCACCCATTGGCAAATGCAAGTCTGAATGCATCACTCCAAATGGAAGCATTCCCAATGACAAACCAT +TCCAAAATGTAAACAGGATCACATACGGGGCCTGTCCCAGATATGTTAAGCATAGCACTCTGAAATTGGCAACAGGAATG +CGAAATGTACCAGAGAAACAAACTAGGGGCATATTTGGCGCAATAGCGGGTTTCATAGAAAATGGTTGGGAGGGAATGGT +GGATGGTTGGTACGGTTTCAGGCATCAAAATTCTGAGGGAAGAGGACAAGCAGCAGATCTCAAAAGCACTCAAGCAGCAA +TCGATCAAATCAATGGGAAGCTGAATCGGTTGATCGGGAAAACCAACGAGAAATTCCATCAGATTGAAAAAGAATTCTCA +GAAGTAGAAGGAAGAATTCAGGACCTTGAGAAATATGTTGAGGACACTAAAATAGATCTCTGGTCATACAACGCGGAGCT +TCTTGTTGCCCTGGAGAACCAACATACAATTGATCTAACTGACTCAGAAATGAACAAACTGTTTGAAAAAACAAAGAAGC +AACTGAGGGAAAATGCTGAGGATATGGGAAATGGTTGTTTCAAAATATACCACAAATGTGACAATGCCTGCATAGGTTCA +ATAAGAAATGGAACTTATGACCACAATGTGTACAGGGATGAAGCATTAAACAACCGGTTCCAGATCAAGGGAGTTGAGCT +GAAGTCAGGGTACAAAGATTGG \ No newline at end of file