Mercurial > repos > rijst > snptools
changeset 2:7e46920d9664 draft
Deleted selected files
author | rijst |
---|---|
date | Wed, 12 Dec 2012 06:31:15 -0500 |
parents | 14dfc38234cd |
children | 1f00946b18c2 |
files | gbk_to_fasta.py gbk_to_fasta.xml snpsplit.py snpsplit.xml tablemerger.py tablemerger.xml trams.py trams.xml |
diffstat | 8 files changed, 0 insertions(+), 737 deletions(-) [+] |
line wrap: on
line diff
--- a/gbk_to_fasta.py Wed Dec 12 06:26:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,29 +0,0 @@ -import sys - -if len(sys.argv) < 3: - exit("Not enough arguments passed, pleas provide names of input- and output file") - -input_name = sys.argv[1] -output_name = sys.argv[2] - -from Bio import GenBank - -try: seq_record = GenBank.RecordParser().parse(open(input_name)) -except: exit("Error reading %s, check file correctness." % input_name) - -try: out_file = open(output_name, 'w') -except IOError as e: - exit("Error trying to open '%s': {1}".format(e.errno, e.strerror)) - -accession = definition = '' -if seq_record.accession[0] != '': accession = '|gb|'+seq_record.accession[0] -if seq_record.definition != '': definition = '|'+seq_record.definition - -out_file.write(">gi|%s%s%s\n" % (seq_record.gi,accession,definition)) - -i = 0 -while i < len(seq_record.sequence): - out_file.write(seq_record.sequence[i:i+70]+"\n") - i += 70 - -out_file.close()
--- a/gbk_to_fasta.xml Wed Dec 12 06:26:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,13 +0,0 @@ -<tool id="gbk2fasta" name="Genbank-to-Fasta"> - <description>produces a Fasta file from a genbank file</description> - <command interpreter="python">gbk_to_fasta.py $input $output</command> - <inputs> - <param name="input" type="data" format="text" label="Genbank file" /> - </inputs> - <outputs> - <data name="output" format="fasta" label="${tool.name} on ${on_string}: converted Genbank file" /> - </outputs> - <help> - This tool produces a fasta file from a genbank file containing a sequence. - </help> -</tool>
--- a/snpsplit.py Wed Dec 12 06:26:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,45 +0,0 @@ -'''This script takes a tab-delimited file containting position, ref base, mut base and splits any multicharacter ref or mut base entries into seperate lines and calculating the new positions''' - -import sys - -if len(sys.argv) != 3: - exit("snpsplit takes exactly two arguments (input and output file), no more and no less") - -input_name = sys.argv[1] -output_name = sys.argv[2] - -try: - in_file = open(input_name) -except IOError as e: - exit("Error trying to open '"+input_name+"': {1}".format(e.errno, e.strerror)) - -try: - out_file = open(output_name, 'w') -except IOError as e: - exit("Error trying to open '"+output_name+"': {1}".format(e.errno, e.strerror)) - -def splitter(cells): - global out_lines - for i in range(0,len(cells[1])): - if cells[1][i] == cells[2][i]: continue - out_file.write(str(int(cells[0])+i)+'\t'+cells[1][i]+'\t'+cells[2][i]+'\n') - out_lines += 1 - -in_lines=out_lines=0 -out_file.write("Position\tRef\tMut\n") -for line in in_file: - in_lines += 1 - cells = line.rstrip().split('\t') - if not str(line[0]).isdigit(): - out_file.write(line) - continue - - # Can only deal with SNPs/MNPs, not indels. - if len(cells[1]) != len(cells[2]): continue - splitter(cells) - -in_file.close() -out_file.close() - -print "Lines read: %s" % in_lines -print "Lines printed: %s" % out_lines
--- a/snpsplit.xml Wed Dec 12 06:26:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,33 +0,0 @@ -<tool id="snpsplit" name="SNP splitter"> - <description>splits multicharacter entries into separate lines</description> - <command interpreter="python">snpsplit.py $input $output</command> - <inputs> - <param name="input" type="data" format="tabular" label="SNP table" /> - </inputs> - <outputs> - <data name="output" format="tabular" label="${tool.name} on ${on_string}" /> - </outputs> - <help> - -Input: tab delimited, format Position Ref Mut -Position is 1-based genomic coordinate -Ref is the reference sequence -Mut is the mutant sequence - -Ref en Mut sequences consisting of more than one character will be split up into separate lines. Example: -Input: -123 CGT ATG -Output: -123 C A -124 G T -125 T G - -Bases that are the same in both columns, will be skipped. Example: -Input: -123 CGT AGG -Output: -123 C A -125 T G - - </help> -</tool>
--- a/tablemerger.py Wed Dec 12 06:26:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,73 +0,0 @@ -'''Takes tab-delimited SNP tables from user input and merges them into one.''' - -import sys -files = [] -filenames = [] - -try: - output = open(sys.argv[1], "w") - output.write('Position\tReference') -except: - exit("No output file given or unable to open output file.") -for name in sys.argv[2:]: - try: - files.append(open(name, "rU")) - except: - continue - -# Fetch headers and print them to output file; -headers = [header.readline()[:-1].split('\t')[2:] for header in files] -columns = [len(strains) for strains in headers] -for strain in [a for b in headers for a in b]: - output.write('\t'+strain) - output.flush() - -file_active = [True]*len(files) -snps = [row.readline()[:-1].split('\t') for row in files] - -while True in file_active: - for h in range(0,len(snps)): - if file_active[h]: - cur_pos = [h] - lowest = int(snps[h][0]) - break - i = 1 - - # Determine lowest position - while i < len(snps): - if int(snps[i][0]) < lowest and file_active[i]: - lowest = int(snps[i][0]) - cur_pos = [i] - elif int(snps[i][0]) == lowest: - cur_pos.append(i) - i+=1 - - # Check if all SNPs sharing a position have the same reference base, exit with message otherwise; - if len(cur_pos) > 1: - ref_base = snps[cur_pos[0]][1].lower() - for j in cur_pos[1:]: - if snps[j][1].lower() != ref_base: - error = '\nError: Reference bases not the same for position %s, present in following files:' % lowest - for k in cur_pos: - error += ' '+filenames[k] - exit(error+'.') - - # Write line to output file containing position, ref base and snps/empty cells; - output.write('\n'+snps[cur_pos[0]][0]+'\t'+snps[cur_pos[0]][1].lower()) - for l,row in enumerate(snps): - if l in cur_pos: - for snp in row[2:]: - output.write('\t'+snp) - else: - output.write('\t'*columns[l]) - - # Read new line in files that had snp at current position; - for m in cur_pos: - line = files[m].readline() - if line == '': file_active[m] = False - else: - snps[m] = line.split('\t') - snps[m][-1] = snps[m][-1].rstrip()# Remove newline character at end of line; - -for it in files: it.close() -output.close()
--- a/tablemerger.xml Wed Dec 12 06:26:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,34 +0,0 @@ -<tool id="tablemerger" name="SNP table merger"> - <description>merges any number of SNP tables into one</description> - <command interpreter="python">tablemerger.py $output - #for $f in $inputs: - $f.in - #end for - </command> - <inputs> - <repeat name="inputs" title="Input files"> - <param name="in" type="data" format="tabular" label="Input SNP table" /> - </repeat> - </inputs> - <outputs> - <data name="output" format="tabular" label="${tool.name} on various SNP tables" /> - </outputs> - <help> -This tool takes any number of tab-delimited SNP tables and merges them together.It puts SNPs at the same position on the same line and ignores bases that are the same in two strains. -Example input: -Position Ref Strain1 -123 A G -125 C T - -Position Ref Strain2 -123 A T -124 G C -125 C T - -Would give output: -Position Ref Strain1 Strain2 -123 A G T -124 G C -125 C T T - </help> -</tool>
--- a/trams.py Wed Dec 12 06:26:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,487 +0,0 @@ -################# -transl_table = 11 -intro_message = ''' +------------------------------------------------------------------+ - | Tool for Rapid Annotation of Microbial SNPs (TRAMS): a simple | - | program for rapid annotation of genomic variation in prokaryotes | - | | - | Developed by: Richard A. Reumerman, Paul R. Herron, | - | Paul A. Hoskisson and Vartul Sangal | - +------------------------------------------------------------------+\n''' -################# - -import sys -import time -start = time.clock() - -# Command line files: SNP REF REF-TYPE ANNOT OVERL SUM; -if len(sys.argv) < 7: - exit("\nNot enough arguments given.\nUsage: TRAMS_Galaxy.py [SNP.] [REF.] [ANNOT.] [OVERL.] [SUM.]") -try: - file_snps = open(sys.argv[1], "rU") -except IOError as e: - exit("Error trying to open '"+sys.argv[1]+"': {1}".format(e.errno, e.strerror)) -try: - file_ref = open(sys.argv[2], "rU") -except IOError as e: - exit("Error trying to open '"+sys.argv[2]+"': {1}".format(e.errno, e.strerror)) - -filetype_reference = sys.argv[3] - -try: - file_out = open(sys.argv[4], "w") -except IOError as e: - exit("Error trying to open '"+sys.argv[4]+"': {1}".format(e.errno, e.strerror)) -try: - file_overlap = open(sys.argv[5], "w") -except IOError as e: - exit("Error trying to open '"+sys.argv[5]+"': {1}".format(e.errno, e.strerror)) -try: - file_summary = open(sys.argv[6], "w") -except IOError as e: - exit("Error trying to open '"+sys.argv[6]+"': {1}".format(e.errno, e.strerror)) - -import Bio -from Bio import SeqIO, SeqFeature -from Bio.SeqRecord import SeqRecord -from Bio.Seq import Seq -from Bio.Alphabet import generic_dna, IUPAC -from Bio.Data import CodonTable - -modules_loaded = time.clock() - -def non_coding_calc(gene, pos = 0): - '''This function takes a pseudogene and returns the number of bases - located in between the sub-features before 'pos'. Returns 0 if 'pseudo' = False. - Input: {start, subfeats, pseudo}, pos (default = 0)''' - if not gene['pseudo']: return 0 - - non_coding_bases = 0 - prev_subfeat_end = gene['start'] - if gene['strand'] == -1: - for subfeature in gene['subfeats']: - if subfeature.location._start.position < pos: - prev_subfeat_end = subfeature.location._end.position - continue - non_coding_bases += (subfeature.location._start.position - prev_subfeat_end) - prev_subfeat_end = subfeature.location._end.position - else: - for subfeature in gene['subfeats']: - non_coding_bases += (subfeature.location._start.position - prev_subfeat_end) - prev_subfeat_end = subfeature.location._end.position - if prev_subfeat_end >= pos and pos != 0: break - - return non_coding_bases - - -def region_calc(bounds,length): - regions = [] - lastend=i=0 - while i < len(bounds): - if bounds[i]['start'] > lastend:# Intergenic region present; - regions.append([lastend,bounds[i]['start'],-1]) - lastend = bounds[i]['start'] - else: - regions.append([bounds[i]['start'],bounds[i]['end'],i]) - if bounds[i]['end'] > lastend: - lastend = bounds[i]['end'] - i += 1 - - if regions[-1][1] < length:# Final tail of genome; - regions.append([lastend,length,-1]) - - return regions - - -def overlap_calc(bounds): - '''This function takes an array of feature starts and ends and - returns an array of starts and ends of all overlapping regions. - Input: [{start,end}]''' - i = 0 - overlaps = [] - while i < len(bounds) - 1: - for downstr in bounds[i+1:]: - if downstr[0] < bounds[i][1]:# Features overlap; - if downstr[1] < bounds[i][1]:# Complete overlap; - overlaps.append([downstr[0],downstr[1],bounds[i][2],downstr[2],[0,0]]) - else:# Partial overlap; - overlaps.append([downstr[0],bounds[i][1],bounds[i][2],downstr[2],[0,0]]) - else:# No use looking further; - break - - i += 1 - - return overlaps - - -def match_feature(bounds,pos,prev=0): - '''This function checks if a position is located inside a feature and - returns the feature's number if found or -1 if none is found. - Input: {start,end},pos,prev_feat (default = 0)''' - for i in range(prev, len(bounds)): - if (pos >= bounds[i]['start']) and (pos < bounds[i]['end']): - return i - elif pos < bounds[i]['start']:# No use looking further - return -1 - - return -1 - - -def write_output(line,target=file_out): - '''This function takes the 2 dimensional array containing all the SNP - data. It contains an array of information on the feature and an array - for each strain for which SNPs are given. - Input: [[pos],[ref],[cells],[cells],etc]''' - target.write('\n'+str(line[0][0])) - for cell in line[1]: - target.write('\t'+str(cell)) - for strain in line[2:]: - target.write('\t') - for cell in strain: - target.write('\t'+str(cell)) - - target.flush() - - -def new_codon_calc(ref_codon, new_base, pos_in_cod): - return str(ref_codon[0:pos_in_cod-1]+new_base+ref_codon[pos_in_cod:len(ref_codon)]) - - -def mut_type_check(ref_res, ref_codon, pos_in_gene, new_base, new_codon): - if str(new_codon).lower() == str(ref_codon).lower(): - return ['','','',''] - new_residue = Seq(new_codon).translate(table=transl_table) - if str(new_residue) == str(ref_res): - mut_type = 'synonymous' - elif (pos_in_gene / 3) < 1 and str(ref_codon).upper() in CodonTable.unambiguous_dna_by_id[transl_table].start_codons:# position 0,1 or 2 and SNP is in start codon; - if str(new_codon).upper() in CodonTable.unambiguous_dna_by_id[transl_table].start_codons: mut_type = 'nonsynonymous' - else: mut_type = 'nonstart' - elif str(new_residue) == '*': mut_type = 'nonsense' - elif str(ref_res) == '*': mut_type = 'nonstop' - else: mut_type = 'nonsynonymous' - - return [mut_type,new_base,new_codon,new_residue] - - -def codon_process(codon): - '''This function processes a codon. It loops through it 3 times, - once to determine which is the highest position mutated, once to - fill in the cells for the output file and once to output all lines. - Input: [empty,start_pos,[line1],[line2],[line3],strand] - It also uses global variable strain_nr''' - lastposition = [-1]*(strain_nr-1) - new_codons = ['']*(strain_nr-1) - if codon[-1] == -1:# Change codon position order for -1 features; - temp = codon [1:-1] - temp.reverse() - codon[1:-1] = temp - for i,line in enumerate(codon[1:-1],1): - if line == '': continue - for j,strain in enumerate(line[2:]): - if strain[1] in ['a','g','c','t']: - lastposition[j] = i - new_codons[j] = codon[i][1][8] - - for i,line in enumerate(codon[1:-1],1): - if codon[-1] == -1: pos_in_cod = 4-i - else: pos_in_cod = i - - if line == '': continue - for j,strain in enumerate(line[2:]): - if i == lastposition[j]: # Check for synonymous etc.; - new_codons[j] = new_codon_calc(new_codons[j],strain[1],pos_in_cod) - codon[i][j+2] = mut_type_check(line[1][9],line[1][8],codon[0],strain[1],new_codons[j]) - straininfo[j][codon[i][j+2][0]] += 1# Counting; - elif strain[1] in ['a','g','c','t']: - codon[i][j+2] = ['MNP',strain[1],'',''] - straininfo[j]['mnps'] += 1 - new_codons[j] = new_codon_calc(new_codons[j],strain[1],pos_in_cod) - elif strain[0] == 'Allele missing': codon[i][j+2] = strain - else: codon[i][j+2] = ['']*4 - - for line in codon[1:-1]: - if line != '': write_output(line) - -def feature_props(feature): - properties = {'type':feature.type,'strand':feature.location._strand, - 'sequence':feature.extract(seq_record.seq),'pseudo': False, - 'locus_tag':'','gene_name':'','product':'', - 'start':int(feature.location._start.position), - 'end':int(feature.location._end.position)} - if 'pseudo' in feature.qualifiers: - properties['pseudo'] = True - properties['type'] = 'pseudogene' - properties['pure_seq'] = properties['sequence'] - if properties['strand'] == -1: - properties['sequence'] = seq_record.seq[feature.location._start.position:feature.location._end.position].reverse_complement() - else: - properties['sequence'] = seq_record.seq[feature.location._start.position:feature.location._end.position] - if feature.sub_features: properties['subfeats'] = feature.sub_features - if 'locus_tag' in feature.qualifiers: properties['locus_tag'] = feature.qualifiers['locus_tag'][0] - if 'gene' in feature.qualifiers: properties['gene_name']= feature.qualifiers['gene'][0] - if feature.type in ['tRNA','rRNA','CDS']: properties['product'] = feature.qualifiers['product'][0] - - return properties - -# Read embl/genbank file for information on sequence features; -try: - seq_record = SeqIO.parse(file_ref, filetype_reference).next() -except: - file_ref.close() - quit("Error reading "+sys.argv[2]+", please check file for errors.") -file_ref.close() - -# Loop through genome features and save relevant properties; -feats = []# Dictionary of properties; - -feature_types = {'intergenic':0,'gene':0,'pseudogene':0} -feat_temp_store = '' -for feature in seq_record.features: - # Check if gene is defined as other feature (e.g. CDS). Else, save info from 'gene'; - if feat_temp_store != '': - if (feature.location._start.position == feat_temp_store.location._start.position and - feature.location._end.position == feat_temp_store.location._end.position):# Gene also defined as other feature; - feat_temp_store = '' - else:# Gene not also defined as CDS; - feats.append(feature_props(feat_temp_store)) - feat_temp_store = '' - elif feature.type == 'gene': - feat_temp_store = feature - - if not feature.type in ['source','gene','misc_feature']: - if not feature.type in feature_types and feature.type != 'CDS': feature_types[feature.type] = 0 - feats.append(feature_props(feature)) - - -feat_props = sorted(feats, key=lambda cells:int(cells['start'])) -feat_boundaries = [{'start':item['start'],'end':item['end']} for item in feat_props] -regions = region_calc(feat_boundaries,len(seq_record.seq)) -feat_overlap = overlap_calc(regions) - -reference_loaded = time.clock() - -# Create array of SNPs from input file for processing; -lines = [line.split('\t') for line in file_snps if line.strip()] -file_snps.close() -# First line contains headers, extract number of strains etc; -headers = lines[0] -snp_table = sorted(lines[1:], key=lambda cells:int(cells[0])) - -snps_loaded = time.clock() - -# Print output file headers; -headers[-1] = headers[-1].rstrip()# Remove newline character; -strain_nr = len(headers)-1 -strains_found = 'Found '+str(strain_nr)+' strains: '+headers[1]+' (reference)' -first_line = '\t'+headers[1]+'\t'*9 -second_line = 'Position\tFeature\tLocus tag\tGene\tProduct\tStart\tEnd\tStrand\tRef. base\tRef. codon\tRef. res.' -straininfo = [0]*(len(headers[2:])) -for i,strain in enumerate(headers[2:]): - straininfo[i] = {'snps':0,'mnps':0,'synonymous':0,'nonsynonymous':0,'nonstart':0,'nonstop':0,'nonsense':0} - straininfo[i].update(feature_types) - strains_found += ', '+strain - first_line += '\t\t'+strain+'\t'*3 - second_line += '\t\tSNP type\tNew base\tNew codon\tNew res.' - -file_out.write(first_line+'\n'+second_line) -file_out.flush() - -# Loop through SNPs from array and process them; -props = {}# Properties of a feature; -prev_snp = ''# Position of previous SNP; -to_write = []# Information of current SNP; -compl_bases = {'a':'t','t':'a','g':'c','c':'g'} -firstsnp = True# First snp of region, or of codon in cases of 3 positions in codon mutated; -prev_start=j=k=0 -overlap_snps = [] -codon = ['']*5# Array of codon positions. First item is position of first base of codon in the gene; - -for region in regions: - firstsnp = True - i = prev_start - while i < len(snp_table):# Loop through SNPs - snp_entry = snp_table[i] - if not str(snp_entry[0]).isdigit():# Not a valid line, skip; - i += 1 - continue - - pos = int(snp_entry[0])-1 - if pos < region[0]:# Not inside region yet; - i += 1 - continue - elif firstsnp and pos < region[1]: - prev_start = i - elif pos >= region[1]:# End of region, process and next; - if not firstsnp and codon != ['','','','','']: - codon_process(codon) - break - - # Documentation of SNPs in feature overlaps; - while j < len(feat_overlap)-1 and pos > feat_overlap[j][1]: j += 1 - k = j - while k < len(feat_overlap) and pos >= feat_overlap[k][0]: - if pos < feat_overlap[k][1]: - if feat_overlap[k][4][0] == 0: - feat_overlap[k][4][0] = pos - feat_overlap[k][4][1] = pos - k += 1 - - - snp_entry[-1] = snp_entry[-1].rstrip()# Remove newline character at end of line; - mnp=in_feat=False - snp_feat = region[2] - ref_base = snp_entry[1] - - to_write = [[pos+1]] - - # Output feature properties and reference situation; - if snp_feat == -1: - codon = ['']*5 - to_write.append(['intergenic','','','','','','',ref_base.upper(),'','']) - elif feat_props[snp_feat]['type'] not in ['CDS','gene','pseudogene']:# In feature, but non-coding; - codon = ['']*5 - props = feat_props[snp_feat] - if props['strand'] == -1: ref_base = (compl_bases[snp_entry[1].lower()]) - else: ref_base = snp_entry[1] - to_write.append([props['type'],props['locus_tag'],props['gene_name'], - props['product'],props['start']+1,props['end'], - '',ref_base.upper(),'','']) - else:# in CDS/gene feature, check codon etc; - props = feat_props[snp_feat] - sequence = props['sequence'] - if props['strand'] == -1: - pos_in_gene = props['end'] - pos - 1# Python counting - ref_base = (compl_bases[snp_entry[1].lower()]) - else: - pos_in_gene = pos - props['start']# Python counting - ref_base = snp_entry[1] - - in_feat = True - if props['pseudo'] and 'subfeats' in props:# Pseudogene that needs special attention; - in_feat = False - subfeat_boundaries = [{'start':item.location._start.position,'end':item.location._end.position} - for item in props['subfeats']] - snp_subfeat = match_feature(subfeat_boundaries,pos) - if snp_subfeat != -1: - in_feat = True - pos_in_gene -= non_coding_calc({'start':props['start'],'subfeats':props['subfeats'], - 'pseudo':True,'strand':props['strand']},pos) - sequence = props['pure_seq'] - - if not in_feat:# In pseudogene non-coding region; - codon = ['']*5 - to_write.append(['non coding',props['locus_tag'],props['gene_name'],props['product'], - props['start']+1,props['end'],props['strand'],ref_base.upper(), - '','']) - else:# In coding region; - pos_in_cod = (pos_in_gene+1)%3 - if pos_in_cod == 0: pos_in_cod = 3# Remainder of division 0 means 3rd place in codon; - - old_codon = sequence[pos_in_gene-pos_in_cod+1:pos_in_gene-pos_in_cod+4].upper() - old_residue = old_codon.translate(table=transl_table) - to_write.append([props['type'],props['locus_tag'],props['gene_name'],props['product'], - props['start']+1,props['end'],props['strand'],ref_base.upper(), - old_codon,old_residue]) - - if in_feat and not firstsnp and (pos >= prev_snp):# Check if snp is in same codon as previous snp. Position check for overlapping features; - if props['strand'] == 1 and (pos - prev_snp + 1) < pos_in_cod:# Same codon (Positive strand); - mnp = True - elif props['strand'] == -1 and (pos - prev_snp + 1) <= (3 - pos_in_cod):# Same codon (negative strand); - mnp = True - - # Process previous codon if not MNP; - if in_feat and not mnp: - if not firstsnp: - codon_process(codon) - codon = [pos_in_gene-pos_in_cod+1,'','','',props['strand']] - - - for l, snp in enumerate(snp_entry[2:]):# Loop through SNPs/strains; - - snp = snp.lower() - if snp == '':# Empty cell; - to_write.append(['','','','']) - continue - - if snp == '-': # Feature not present in this strain; - to_write.append(['Allele missing','','','']) - continue - - if snp_feat == -1:# Intergenic; - if snp == ref_base.lower(): - to_write.append(['']*4) - else: - to_write.append(['',snp,'','']) - straininfo[l]['intergenic'] += 1 - straininfo[l]['snps'] += 1 - continue - - if props['strand'] == -1: - snp = compl_bases[snp] - - if snp == ref_base.lower(): - to_write.append(['']*4) - else: - to_write.append(['',snp,'','']) - straininfo[l]['snps'] += 1 - if props['type'] != 'CDS': - straininfo[l][props['type']] += 1 - - - - if props['type'] in ['CDS','gene','pseudogene'] and in_feat: - codon[pos_in_cod] = to_write - else: - write_output(to_write) - - if firstsnp: firstsnp = False - prev_snp = pos+1 - i += 1 - - -if codon != ['','','','','']: codon_process(codon) - -file_out.close() - -end = time.clock() - -file_summary.write("\n") -file_summary.write(intro_message) -file_summary.write('\n'+strains_found+'.\n') - -file_summary.write("\nFinished annotation. Total time: %s s\n\n" % round(end-start,1)) - - -file_overlap.write('SNP start\tSNP end\tFeature 1\tLocus tag\tProduct\t\tFeature 2\tLocus tag\tProduct') -for overlap in feat_overlap: - if overlap[4] != [0,0]: - overlap[4][0]+=1 - overlap[4][1]+=1 - if overlap[4][0] == overlap[4][1]: overlap[4][1] = '' - write_output([[str(overlap[4][0])],[str(overlap[4][1]),feat_props[overlap[2]]['type'],feat_props[overlap[2]]['locus_tag'],feat_props[overlap[2]]['product']], - [feat_props[overlap[3]]['type'],feat_props[overlap[3]]['locus_tag'],feat_props[overlap[3]]['product']]], - file_overlap) - - -for i,strain in enumerate(headers[2:]): - file_summary.write("\n") - info = straininfo[i] - file_summary.write("+ Strain %s:\n" % strain) - file_summary.write(" %s SNPs found\n" % info['snps']) - file_summary.write(" Number of SNPs found CDS features: %s\n" % (info['mnps']+info['nonstart']+info['nonstop']+info['nonsense']+ - info['synonymous']+info['nonsynonymous'])) - file_summary.write(" (of which in pseudogenes: %s)\n" % info['pseudogene']) - file_summary.write(" - MNPs: %s\n" % info['mnps']) - file_summary.write(" - Synonymous: %s\n" % info['synonymous']) - file_summary.write(" - Nonsynonymous: %s\n" % info['nonsynonymous']) - file_summary.write(" - Nonsense: %s\n" % info['nonsense']) - file_summary.write(" - Nonstart: %s\n" % info['nonstart']) - file_summary.write(" - Nonstop: %s\n" % info['nonstop']) - file_summary.write(" Intergenic: %s\n" % info['intergenic']) - - for typ in feature_types: - if typ not in ['intergenic','pseudogene'] and info[typ] != 0: - file_summary.write(" %s: %s\n" % (typ,info[typ])) - file_summary.flush() - -file_overlap.close() -file_summary.close()
--- a/trams.xml Wed Dec 12 06:26:42 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,23 +0,0 @@ -<tool id="trams" name="TRAMS"> - <description>Tool for Rapid Annotation of Microbial SNPs</description> - <command interpreter="python">trams.py $input1 $input2 $ref_format $annot $overl $sum</command> - <inputs> - <param name="input1" type="data" format="tabular" label="SNP table" /> - <param name="input2" type="data" format="txt" label="Reference genome" help="Reference genome file, should be genbank or embl file." /> - <param name="ref_format" type="select"> - <label>Reference file type</label> - <option value="gb">Genbank</option> - <option value="embl">EMBL</option> - </param> - </inputs> - <outputs> - <data name="annot" format="tabular" label="${tool.name} on ${on_string}: SNP annotation file" /> - <data name="overl" format="tabular" label="${tool.name} on ${on_string}: Overlap file" /> - <data name="sum" format="txt" label="${tool.name} on ${on_string}: Summary file" /> - </outputs> - <help> - -This tool annotates SNPs from a SNP table and a genbank file. - - </help> -</tool>