Mercurial > repos > abims-sbr > mutcount
view scripts/S02a_codon_counting.py @ 3:263caa68d7bb draft
planemo upload for repository htpps://github.com/abims-sbr/adaptearch commit 76e603ecd0118c8060d972b675a13db858956eb6
| author | abims-sbr |
|---|---|
| date | Wed, 17 Jan 2018 11:35:37 -0500 |
| parents | 988467f963f0 |
| children | 5766f80370e7 |
line wrap: on
line source
#!/usr/bin/env python # coding:utf-8 # Command line : ./S02a_codon_counting.py <concatenation_file_from_phylogeny> #python codoncounting.py alignement_file #Warning!! alignement_file should always be preceded by a path, at least ./ #Warning!! pairs.txt should be located in the same folder as alignement_file #the script counts the number of codons, amino acids, and types of amino acids in sequences, as well as the mutation bias from one item to another between 2 sequences #counting is then compared to empirical p-values, obtained from bootstrapped sequences obtained from a subset of sequences #the script takes as input the DNA alignment (fasta format): python codoncounting.py file_path.fasta #in the output files, the pvalues indicate the position of the observed data in a distribution of empirical countings obtained from a resample of the data. Values above 0.95 indicate a significantly higher counting, values under 0.05 a significantly lower counting #the script automatically reads the sequences to compare from a file that must be called pairs.txt and located with the .fasta file #in the pairs.txt file, sequences (let's assume X, Y, Z, U, V) pairs must be written as 'X Y\nU V\nZ V' #in this case, codoncounting will count the occurence of codons, amino acids, and types of amino acids in X, U, Z, and count the mutation bias from Y to X, V to U and V to Z #you can add comments in the pairs.txt file inbetween lines, beginning with '#'. E.G. 'X Y\n#This is my comment\nU V\nZ V' #X, Y, Z, U, V must be character strings contained in the sequences names in the .fasta file (and be specific to each of them) #in pairs.txt, you must write how should be built the bootstrapped resampling of sequences. This must be formated as:'X Y\nbackground: length iterration plusminus listofspecies\nU V\nZ V', explanation below #backgrounds must be excplicitely written in the pairs.txt file (the script still integers default parameters). This implies that the first line of pairs.txt should be a background line #by default, once the background has been determined, it will be applied to each subsequent analysis until another background is written #e.g. 'background: length1 iterration1 plusminus1 listofspecies1\nU V\nZ V\nbackground: length2 iterration2 plusminus2 listofspecies2\nX Y' the first background is applied to U V and Z V and the 2nd background to X Y #the script resamples random pairs of aligned codon to determine what countings can be expected under the hypothesis of an homogenous dataset #countings are performed on each generated random alignement, thousands of alignments allow to draw a gaussian distribution of the countings #then the script simply checks whether the observed data are within the 5% lowest or 5% highest values of the distribution #in background: length iterration plusminus listofspecies #-> length is the number of pairs of codons in each generated alignments (effect on the robustness on the countings performed on this alignement) #-> iterration is the number of alignments that will be generated (effect on the resolution of the gaussian distribution) #-> plusminus can be either '+' or '-', '+' indicates that the following species only must be resampled, '-' that the following species must be excluded from the resampling #-> listofspecies is the list of species (names contained in the sequences names from the fasta file) that must be included or excluded from the sampling. You can also write 'all' to include every species (in this case, plusminus parameter is ignored) #full example: background 5000 10000 + melanogaster elegans sapiens #iterration shouldn't be lower that 1000 to have a relatively smooth gaussian distribution, length shouldn't be lower as 1000 to detect codons with relatively low occurence (<1%) #for the list of species, you can try to form subgroups depending on the studied parameter (e.g. comparing a terrestrial species with a background composed of marine species) #added: also counts GC3 and IVYWREL #added: also counts GC12, CvP bias and EK/QH #added: also counts purine load and PAYRE/SDGM def viable(seqs,pos): r=0 compt=0 for i in range(len(seqs)): if i%2==1: if not '-' in seqs[i][pos:pos+3]: compt+=1 if compt>1: r=1 return r def substrcountings(table,pvalues): string=[] names=[] numbers=[] stats=['pvalues'] for f in table: names.append(',%s' % f) numbers.append(',%f' % table[f]) stats.append(',%f' % pvalues[f]) names.append('\n') numbers.append('\n') stats.append('\n') string=names+numbers+stats return string def substrbiases(table,pvalues): string=[] names=[] numbers=[] stats=['pvalues'] for f in table: for g in table[f]: names.append(',%s>%s' % (g,f)) numbers.append(',%f' % table[f][g]) stats.append(',%f' % pvalues[f][g]) names.append('\n') numbers.append('\n') stats.append('\n') string=names+numbers+stats return string def strcountings(codons, aa, classif,codonspvalues,aapvalues,classifpvalues,GC3,GC12,IVYWREL,EKQH,PAYRESDGM,purineload,CvP): subcodons=['occurence of codons\n']+substrcountings(codons,codonspvalues) subaa=subcodons+['\noccurence of amino_acids\n']+substrcountings(aa,aapvalues) subclassif=subaa+['\noccurence of amino_acid types\n']+substrcountings(classif,classifpvalues) string=subclassif+[('\nGC3 GC12 IVYWREL EK/QH PAYRE/SDGM Purine_load CvP\n%f,%f,%f,%f,%f,%f,%f\n' % (GC3, GC12, IVYWREL, EKQH, PAYRESDGM, purineload, CvP))] return string def strbiases(codons, aa, classif,codonspvalues,aapvalues,classifpvalues,fileaa, fileaatype, p): # Writing to files (tab separated; the tab characters are in the lists values) aafreq, aapval = substrbiases(aa,aapvalues)[401:801], substrbiases(aa,aapvalues)[803:-1] aatypefreq, aatypepval = substrbiases(classif,classifpvalues)[17:33], substrbiases(classif,classifpvalues)[35:-1] fileaa.write("%s>%s" %(p[1], p[0])) for value in aafreq: fileaa.write(str(value)) fileaa.write("\n") fileaa.write("%s>%s_pvalue" %(p[1], p[0])) for value in aapval: fileaa.write(str(value)) fileaa.write("\n") fileaatype.write("%s>%s" %(p[1], p[0])) for value in aatypefreq: fileaatype.write(str(value)) fileaatype.write("\n") fileaatype.write("%s>%s_pvalue" %(p[1], p[0])) for value in aatypepval: fileaatype.write(str(value)) fileaatype.write("\n") # Origin code subcodons=['codons mutations biases\n']+substrbiases(codons,codonspvalues) subaa=subcodons+['\namino acids mutation biases\n']+substrbiases(aa,aapvalues) subclassif=subaa+['\ntypes of amino_acids mutation biasecodoncountingv22.pys\n']+substrbiases(classif,classifpvalues) return subclassif def testpvalue(bootstrap,value,iterration): #computes where the observed value is located in the expected counting distribution maxval=iterration-1 minval=0 testval=(maxval+minval)/2 while maxval-minval > 1: if value > bootstrap[testval]: minval=testval testval=(maxval+minval)/2 elif value < bootstrap[testval]: maxval=testval testval=(maxval+minval)/2 else: break pvalue=(testval+1)/float(iterration+1) return pvalue def gettables(content,reversecode,code,classif): #generates the tables contening all the countings if content==[]: codonscount={k:[] for k in reversecode} aacount={k:[] for k in code} aaclassifcount={k:[] for k in classif} codons={} for k in reversecode: codons[k]={} for q in reversecode: codons[k][q]=[] aa={} for k in code: aa[k]={} for q in code: aa[k][q]=[] aaclassif={} for k in classif: aaclassif[k]={} for q in classif: aaclassif[k][q]=[] elif content==0: codonscount={k:0 for k in reversecode} aacount={k:0 for k in code} aaclassifcount={k:0 for k in classif} codons={} for k in reversecode: codons[k]={} for q in reversecode: codons[k][q]=0 aa={} for k in code: aa[k]={} for q in code: aa[k][q]=0 aaclassif={} for k in classif: aaclassif[k]={} for q in classif: aaclassif[k][q]=0 return codonscount, aacount, aaclassifcount, codons, aa, aaclassif def countings(seq1,seq2,code,classif,reversecode,reverseclassif): # COMPTAGES (output counts.txt) functions used : gettables() #countings actually counts occurence and mutation bias of codons, amino acids and types of amino acids codonscount, aacount, aaclassifcount, codons, aa, aaclassif=gettables(0,reversecode,code,classif) codonscount2, aacount2, aaclassifcount2, _, _, _=gettables(0,reversecode,code,classif) G12=0 C12=0 G3=0 C3=0 A=0 T=0 i=0 compcodons=0 while i<len(seq1)-1: if not '-' in seq1[i:i+3]: #coutings of occurences is obtained from the maximum of full codons being in the sequence compcodons+=1 if (seq1[i]=='g'): G12+=1 if (seq1[i]=='c'): C12+=1 if (seq1[i+1]=='g'): G12+=1 if (seq1[i+1]=='c'): C12+=1 if (seq1[i+2]=='g'): G3+=1 if (seq1[i+2]=='c'): C3+=1 if (seq1[i]=='a'): A+=1 if (seq1[i]=='t'): T+=1 if (seq1[i+1]=='a'): A+=1 if (seq1[i+1]=='t'): T+=1 if (seq1[i+2]=='a'): A+=1 if (seq1[i+2]=='t'): T+=1 codonscount[seq1[i:i+3]]+=1 aacount[reversecode[seq1[i:i+3]]]+=1 aaclassifcount[reverseclassif[reversecode[seq1[i:i+3]]]]+=1 if (not '-' in seq1[i:i+3]) and (not '-' in seq2[i:i+3]) and (not seq1[i:i+3]==seq2[i:i+3]): #mutation biases are count from non ambiguous pairs of codons in the 2 sequences codons[seq1[i:i+3]][seq2[i:i+3]]+=1 codonscount2[seq2[i:i+3]]+=1 aacount2[reversecode[seq2[i:i+3]]]+=1 aaclassifcount2[reverseclassif[reversecode[seq2[i:i+3]]]]+=1 i+=3 IVYWREL=aacount['ile']+aacount['val']+aacount['tyr']+aacount['trp']+aacount['arg']+aacount['glu']+aacount['leu'] EKQH=(aacount['glu']+aacount['lys'])/max([float(aacount['gln']+aacount['his']),1]) PAYRESDGM=(aacount['pro']+aacount['ala']+aacount['tyr']+aacount['arg']+aacount['glu'])/max([float(aacount['ser']+aacount['asp']+aacount['gly']+aacount['met']),1]) for i in codons: for j in codons[i]: if not reversecode[i]==reversecode[j]: aa[reversecode[i]][reversecode[j]]+=codons[i][j] for i in aa: for j in aa[i]: if not reverseclassif[i]==reverseclassif[j]: aaclassif[reverseclassif[i]][reverseclassif[j]]+=aa[i][j] for f in codons: #mutations from codon C in sequence X to codon c in sequence Y are normalized by the total number of codons C in sequence X for g in codons[f]: codons[f][g]=codons[f][g]/float(max([codonscount2[g],1])) for f in aa: for g in aa[f]: aa[f][g]=aa[f][g]/float(max([aacount2[g],1])) for f in aaclassif: for g in aaclassif[f]: aaclassif[f][g]=aaclassif[f][g]/float(max([aaclassifcount2[g],1])) for f in codonscount: #occurences are normalized by the total number of non ambiguous codons in the sequence codonscount[f]=codonscount[f]/float(compcodons) for f in aacount: aacount[f]=aacount[f]/float(compcodons) for f in aaclassifcount: aaclassifcount[f]=aaclassifcount[f]/float(compcodons) for f in codons: for g in codons[f]: if f==g: break if (codons[g][f]==0) and (codons[f][g])>0: codons[f][g]=1 elif (codons[g][f]==0) and (codons[f][g])==0: codons[f][g]=0 else: x=codons[f][g]/codons[g][f] codons[f][g]=-pow(2,1-x)+1 for f in aa: for g in aa[f]: if f==g: break if (aa[g][f]==0) and (aa[f][g])>0: aa[f][g]=1 elif (aa[g][f]==0) and (aa[f][g])==0: aa[f][g]=0 else: x=aa[f][g]/aa[g][f] aa[f][g]=-pow(2,1-x)+1 for f in aaclassif: for g in aaclassif[f]: if f==g: break if (aaclassif[g][f]==0) and (aaclassif[f][g])>0: aaclassif[f][g]=1 elif (aaclassif[g][f]==0) and (aaclassif[f][g])==0: aaclassif[f][g]=0 else: x=aaclassif[f][g]/aaclassif[g][f] aaclassif[f][g]=-pow(2,1-x)+1 GC3=(G3+C3)/float(compcodons) GC12=(G12+C12)/float(2*compcodons) purineload=(1000*(G12+G3+A-C12-C3-T))/float(3*compcodons) IVYWREL=IVYWREL/float(compcodons) CvP=aaclassifcount['charged']-aaclassifcount['polar'] return codonscount, aacount, aaclassifcount, codons, aa, aaclassif, GC3, GC12, IVYWREL, EKQH, PAYRESDGM, purineload, CvP def sampling(inputfile,length,iterration,plusminus,species,code,classif,reversecode,reverseclassif): # functions used : gettables, countings #sampling provides 'iterations' pairs of sequences of 'length' non ambiguous codons obtained from the dataset specified by 'species' and 'plusminus' #sort of bootstrap codonscount, aacount, aaclassifcount, codons, aa, aaclassif=gettables([],reversecode,code,classif) inputfile.seek(0) #generates the bootstrapped sequences lines=[] comp=-1 if species==['all']: while 1: line=inputfile.readline()[:-1] if not line: break lines.append(line) comp+=1 else: if plusminus=='+': while 1: line1=inputfile.readline()[:-1] if not line1: break line2=inputfile.readline()[:-1] if any(spe in line1 for spe in species): lines.append(line1) lines.append(line2) comp+=2 elif plusminus=='-': while 1: line1=inputfile.readline()[:-1] if not line1: break line2=inputfile.readline()[:-1] if not any(spe in line1 for spe in species): lines.append(line1) lines.append(line2) comp+=2 l=len(lines[1])-1 for z in range(iterration): if z%(iterration/10)==0: print str(10*z/(iterration/10))+' %' seqa=[] seqb=[] for i in range(length): site=random.randrange(0,l,3) while viable(lines,site)==0: site=random.randrange(0,l,3) a='---' b='---' while ('-' in a) or ('-' in b): posa=2 posb=2 while posa==posb: posa=random.randrange(1,comp+1,2) posb=random.randrange(1,comp+1,2) a=lines[posa][site:site+3] b=lines[posb][site:site+3] seqa.append(a) seqb.append(b) seqa=(''.join(seqa)).lower() seqb=(''.join(seqb)).lower() codonscounttemp, aacounttemp, aaclassifcounttemp, codonstemp, aatemp, aaclassiftemp, _, _, _, _, _, _, _=countings(seqa,seqb,code,classif,reversecode,reverseclassif) for f in codonscount: codonscount[f].append(codonscounttemp[f]) for f in aacount: aacount[f].append(aacounttemp[f]) for f in aaclassifcount: aaclassifcount[f].append(aaclassifcounttemp[f]) for f in codons: for g in codons[f]: codons[f][g].append(codonstemp[f][g]) for f in aa: for g in aa[f]: aa[f][g].append(aatemp[f][g]) for f in aaclassif: for g in aaclassif[f]: aaclassif[f][g].append(aaclassiftemp[f][g]) #counts the occurences and mutation biases for each the bootstrapped sequence print '100 %' for f in codonscount: codonscount[f].sort() for f in aacount: aacount[f].sort() for f in aaclassifcount: aaclassifcount[f].sort() for f in codons: for g in codons[f]: codons[f][g].sort() for f in aa: for g in aa[f]: aa[f][g].sort() for f in aaclassif: for g in aaclassif[f]: aaclassif[f][g].sort() return codonscount, aacount, aaclassifcount, codons, aa, aaclassif ################ ######RUN####### ################ import string, os, sys, re, random from math import pow codons_counts = open("codons_counts.csv", "w") aa_counts = open("aa_counts.csv", "w") aatypes_counts = open("aatypes_counts.csv", "w") gc_counts = open("gc_counts.csv", "w") # ou bien utiliser le reversecode plus bas column_index_codons = "Species,tat,tgt,tct,ttt,tgc,tgg,tac,ttc,tcg,tta,ttg,tcc,tca,gca,gta,gcc,gtc,gcg,gtg,caa,gtt,gct,acc,ggt,cga,cgc,gat,aag,cgg,act,ggg,gga,ggc,gag,aaa,gac,cgt,gaa,ctt,atg,aca,acg,atc,aac,ata,agg,cct,agc,aga,cat,aat,att,ctg,cta,ctc,cac,ccg,agt,cag,cca,ccc\n" column_index_aa = "Species,cys,asn,his,ile,ser,gln,lys,met,pro,thr,phe,ala,gly,val,leu,asp,arg,trp,glu,tyr\n" column_index_aatypes = "Species,aromatics,polar,unpolar,charged\n" column_index_gc = "Species,GC3,GC12,IVYWREL,EKQH,PAYRESDGM,purineload,CvP\n" codons_counts.write(column_index_codons) aa_counts.write(column_index_aa) aatypes_counts.write(column_index_aatypes) gc_counts.write(column_index_gc) aa_transitions = open("aa_transitions.csv", "w") aatypes_transitions = open("aatypes_transitions.csv", "w") column_index_aa_transitions = "Species,cys>cys,asn>cys,his>cys,ile>cys,ser>cys,gln>cys,lys>cys,met>cys,pro>cys,thr>cys,phe>cys,ala>cys,gly>cys,val>cys,leu>cys,asp>cys,arg>cys,trp>cys,glu>cys,tyr>cys,cys>asn,asn>asn,his>asn,ile>asn,ser>asn,gln>asn,lys>asn,met>asn,pro>asn,thr>asn,phe>asn,ala>asn,gly>asn,val>asn,leu>asn,asp>asn,arg>asn,trp>asn,glu>asn,tyr>asn,cys>his,asn>his,his>his,ile>his,ser>his,gln>his,lys>his,met>his,pro>his,thr>his,phe>his,ala>his,gly>his,val>his,leu>his,asp>his,arg>his,trp>his,glu>his,tyr>his,cys>ile,asn>ile,his>ile,ile>ile,ser>ile,gln>ile,lys>ile,met>ile,pro>ile,thr>ile,phe>ile,ala>ile,gly>ile,val>ile,leu>ile,asp>ile,arg>ile,trp>ile,glu>ile,tyr>ile,cys>ser,asn>ser,his>ser,ile>ser,ser>ser,gln>ser,lys>ser,met>ser,pro>ser,thr>ser,phe>ser,ala>ser,gly>ser,val>ser,leu>ser,asp>ser,arg>ser,trp>ser,glu>ser,tyr>ser,cys>gln,asn>gln,his>gln,ile>gln,ser>gln,gln>gln,lys>gln,met>gln,pro>gln,thr>gln,phe>gln,ala>gln,gly>gln,val>gln,leu>gln,asp>gln,arg>gln,trp>gln,glu>gln,tyr>gln,cys>lys,asn>lys,his>lys,ile>lys,ser>lys,gln>lys,lys>lys,met>lys,pro>lys,thr>lys,phe>lys,ala>lys,gly>lys,val>lys,leu>lys,asp>lys,arg>lys,trp>lys,glu>lys,tyr>lys,cys>met,asn>met,his>met,ile>met,ser>met,gln>met,lys>met,met>met,pro>met,thr>met,phe>met,ala>met,gly>met,val>met,leu>met,asp>met,arg>met,trp>met,glu>met,tyr>met,cys>pro,asn>pro,his>pro,ile>pro,ser>pro,gln>pro,lys>pro,met>pro,pro>pro,thr>pro,phe>pro,ala>pro,gly>pro,val>pro,leu>pro,asp>pro,arg>pro,trp>pro,glu>pro,tyr>pro,cys>thr,asn>thr,his>thr,ile>thr,ser>thr,gln>thr,lys>thr,met>thr,pro>thr,thr>thr,phe>thr,ala>thr,gly>thr,val>thr,leu>thr,asp>thr,arg>thr,trp>thr,glu>thr,tyr>thr,cys>phe,asn>phe,his>phe,ile>phe,ser>phe,gln>phe,lys>phe,met>phe,pro>phe,thr>phe,phe>phe,ala>phe,gly>phe,val>phe,leu>phe,asp>phe,arg>phe,trp>phe,glu>phe,tyr>phe,cys>ala,asn>ala,his>ala,ile>ala,ser>ala,gln>ala,lys>ala,met>ala,pro>ala,thr>ala,phe>ala,ala>ala,gly>ala,val>ala,leu>ala,asp>ala,arg>ala,trp>ala,glu>ala,tyr>ala,cys>gly,asn>gly,his>gly,ile>gly,ser>gly,gln>gly,lys>gly,met>gly,pro>gly,thr>gly,phe>gly,ala>gly,gly>gly,val>gly,leu>gly,asp>gly,arg>gly,trp>gly,glu>gly,tyr>gly,cys>val,asn>val,his>val,ile>val,ser>val,gln>val,lys>val,met>val,pro>val,thr>val,phe>val,ala>val,gly>val,val>val,leu>val,asp>val,arg>val,trp>val,glu>val,tyr>val,cys>leu,asn>leu,his>leu,ile>leu,ser>leu,gln>leu,lys>leu,met>leu,pro>leu,thr>leu,phe>leu,ala>leu,gly>leu,val>leu,leu>leu,asp>leu,arg>leu,trp>leu,glu>leu,tyr>leu,cys>asp,asn>asp,his>asp,ile>asp,ser>asp,gln>asp,lys>asp,met>asp,pro>asp,thr>asp,phe>asp,ala>asp,gly>asp,val>asp,leu>asp,asp>asp,arg>asp,trp>asp,glu>asp,tyr>asp,cys>arg,asn>arg,his>arg,ile>arg,ser>arg,gln>arg,lys>arg,met>arg,pro>arg,thr>arg,phe>arg,ala>arg,gly>arg,val>arg,leu>arg,asp>arg,arg>arg,trp>arg,glu>arg,tyr>arg,cys>trp,asn>trp,his>trp,ile>trp,ser>trp,gln>trp,lys>trp,met>trp,pro>trp,thr>trp,phe>trp,ala>trp,gly>trp,val>trp,leu>trp,asp>trp,arg>trp,trp>trp,glu>trp,tyr>trp,cys>glu,asn>glu,his>glu,ile>glu,ser>glu,gln>glu,lys>glu,met>glu,pro>glu,thr>glu,phe>glu,ala>glu,gly>glu,val>glu,leu>glu,asp>glu,arg>glu,trp>glu,glu>glu,tyr>glu,cys>tyr,asn>tyr,his>tyr,ile>tyr,ser>tyr,gln>tyr,lys>tyr,met>tyr,pro>tyr,thr>tyr,phe>tyr,ala>tyr,gly>tyr,val>tyr,leu>tyr,asp>tyr,arg>tyr,trp>tyr,glu>tyr,tyr>tyr\n" column_index_aatypes_transitions = "Species,aromatics>aromatics,polar>aromatics,unpolar>aromatics,charged>aromatics,aromatics>polar,polar>polar,unpolar>polar,charged>polar,aromatics>unpolar,polar>unpolar,unpolar>unpolar,charged>unpolar,aromatics>charged,polar>charged,unpolar>charged,charged>charged\n" aa_transitions.write(column_index_aa_transitions) aatypes_transitions.write(column_index_aatypes_transitions) PATH = sys.argv[1] length=1000 iterration=100 background=0 speciesboot=['all'] stringcounts=[' '] stringbiases=[' '] towrite=1 code={'phe':['ttt','ttc'],'leu':['tta','ttg','ctt','ctc','cta','ctg'],'ile':['att','atc','ata'],'met':['atg'],'val':['gtt','gtc','gta','gtg'],'ser':['tct','tcc','tca','tcg','agt','agc'],'pro':['cct','cca','ccg','ccc'],'thr':['act','acc','aca','acg'],'ala':['gct','gcc','gca','gcg'],'tyr':['tat','tac'],'his':['cat','cac'],'gln':['caa','cag'],'asn':['aat','aac'],'lys':['aaa','aag'],'asp':['gat','gac'],'glu':['gaa','gag'],'cys':['tgt','tgc'],'trp':['tgg'],'arg':['cgt','cgc','cga','cgg','aga','agg'],'gly':['ggt','ggc','gga','ggg']} classif={'unpolar':['gly','ala','val','leu','met','ile'],'polar':['ser','thr','cys','pro','asn','gln'],'charged':['lys','arg','his','asp','glu'],'aromatics':['phe','tyr','trp']} reversecode={v:k for k in code for v in code[k]} reverseclassif={v:k for k in classif for v in classif[k]} # -------------------------------------------------------------- pairs=open("pairs.txt","r") #finds the pairs to analyze pairlist=[] while 1: line=pairs.readline() if not line: #if not lastline.startswith('#'): #pairlist[-1][1]=string.split(lastline,' ')[1] break #lastline=line if not line.startswith('#'): if not line.startswith('background:'): if background==0: pairlist.append([string.split(line,' ')[0],string.split(line,' ')[1][:-1]]) elif background==1: pairlist.append([string.split(line,' ')[0],string.split(line,' ')[1][:-1],length,iterration,plusminus,species]) background=0 else: parameters=string.split(line[:-1],': ')[1] listparameters=string.split(parameters,' ') length=int(listparameters[0]) iterration=int(listparameters[1]) plusminus=listparameters[2] species=listparameters[3:] background=1 # -------------------------------------------------------------- concat=open(sys.argv[1],"r") last_iter = 1 for p in pairlist: #pairs analysis concat.seek(0) while 1: # Everything lowercase line=concat.readline() seq=concat.readline() if p[0] in line: seq1=seq[:-1].lower() elif p[1] in line: seq2=seq[:-1].lower() if not line: break if len(p)>2: #bootstrap simulations if the background changed # Only len(pairlist[0]) > 2 length=p[2] iterration=p[3] plusminus=p[4] species=p[5] # ex : 'all' applause='background: '+str(length)+' '+str(iterration)+' '+str(plusminus)+' '+' '.join(species) print applause stringcounts.append(applause+'\n\n\n') stringbiases.append(applause+'\n\n\n') # variables below are computed on the first iteration and then used for the rest of the loop codonscountboot, aacountboot, aaclassifcountboot, codonsboot, aaboot, aaclassifboot=sampling(concat,length,iterration,plusminus,species,code,classif,reversecode,reverseclassif) print str(p[0])+' vs '+str(p[1]) # ----------------------------- # Mise en memoire des calculs pour les comptages par sps + Ecriture fichiers de sortie codonscount, aacount, aaclassifcount, codons, aa, aaclassif, GC3, GC12, IVYWREL, EKQH, PAYRESDGM, purineload, CvP=countings(seq1,seq2,code,classif,reversecode,reverseclassif) codonscountpvalue, aacountpvalue, aaclassifcountpvalue, codonspvalue, aapvalue, aaclassifpvalue=gettables(0,reversecode,code,classif) for f in codons: #tests the countings and saves the pvalues for g in codons[f]: codonspvalue[f][g]=testpvalue(codonsboot[f][g],codons[f][g],iterration) for f in aa: for g in aa[f]: aapvalue[f][g]=testpvalue(aaboot[f][g],aa[f][g],iterration) for f in aaclassif: for g in aaclassif[f]: aaclassifpvalue[f][g]=testpvalue(aaclassifboot[f][g],aaclassif[f][g],iterration) for substring in stringcounts: #to not write twice the same counting with the same background if (("counting of %s" % p[0]) in substring) and (applause in substring): towrite=0 if towrite: for f in codonscount: codonscountpvalue[f]=testpvalue(codonscountboot[f],codonscount[f],iterration) for f in aacount: aacountpvalue[f]=testpvalue(aacountboot[f],aacount[f],iterration) for f in aaclassifcount: aaclassifcountpvalue[f]=testpvalue(aaclassifcountboot[f],aaclassifcount[f],iterration) ## Writing countings into separated output files ## codons_counts.write(p[0] + ",") # Species name #codons_counts.write(",") # next cell for value in codonscount.values()[0:-1]: codons_counts.write(str(value) + ",") # write codons_counts line for the species codons_counts.write(str(codonscount.values()[-1])) codons_counts.write("\n") # next line codons_counts.write(p[0] + "_pvalue,") # Same species, but line for computed pvalues for value in codonscountpvalue.values()[0:-1]: codons_counts.write(str(value) + ",") # write pvalues line for the species codons_counts.write(str(codonscountpvalue.values()[-1])) codons_counts.write("\n") aa_counts.write(p[0] + ",") # Same method as above for value in aacount.values()[0:-1]: aa_counts.write(str(value) + ",") aa_counts.write(str(aacount.values()[-1])) aa_counts.write("\n") aa_counts.write(p[0] + "_pvalue,") for value in aacountpvalue.values()[0:-1]: aa_counts.write(str(value) + ",") aa_counts.write(str(aacountpvalue.values()[-1])) aa_counts.write("\n") aatypes_counts.write(p[0] + ",") for value in aaclassifcount.values()[0:-1]: aatypes_counts.write(str(value) + ",") aatypes_counts.write(str(aaclassifcount.values()[-1])) aatypes_counts.write("\n") aatypes_counts.write(p[0] + "_pvalue,") for value in aaclassifcountpvalue.values()[0:-1]: aatypes_counts.write(str(value) + ",") aatypes_counts.write(str(aaclassifcountpvalue.values()[-1])) aatypes_counts.write("\n") gc_counts.write(p[0]) gc_counts.write(",") gc_counts.write(str(GC3)+","+str(GC12)+","+str(IVYWREL)+","+str(EKQH)+","+str(PAYRESDGM)+","+str(purineload)+","+str(CvP)+"\n") """ IMPROVMENT BELOW : Countings was not done on the last species of the list. It is now compute when the loop reaches the last iteration, thanks to the countings() function : it is called with a switch between the arguments 'seq1' and seq2'. """ if last_iter == len(pairlist): codonscount2, aacount2, aaclassifcount2, codons2, aa2, aaclassif2, GC3_b, GC12_b, IVYWREL_b, EKQH_b, PAYRESDGM_b, purineload_b, CvP_b=countings(seq2,seq1,code,classif,reversecode,reverseclassif) codonscountpvalue2, aacountpvalue2, aaclassifcountpvalue2, codonspvalue2, aapvalue2, aaclassifpvalue2=gettables(0,reversecode,code,classif) for f in codons2: #tests the countings and saves the pvalues for g in codons2[f]: codonspvalue2[f][g]=testpvalue(codonsboot[f][g],codons[f][g],iterration) for f in aa2: for g in aa2[f]: aapvalue2[f][g]=testpvalue(aaboot[f][g],aa[f][g],iterration) for f in aaclassif2: for g in aaclassif2[f]: aaclassifpvalue2[f][g]=testpvalue(aaclassifboot[f][g],aaclassif[f][g],iterration) for f in codonscount2: codonscountpvalue2[f]=testpvalue(codonscountboot[f],codonscount[f],iterration) for f in aacount2: aacountpvalue2[f]=testpvalue(aacountboot[f],aacount[f],iterration) for f in aaclassifcount2: aaclassifcountpvalue2[f]=testpvalue(aaclassifcountboot[f],aaclassifcount[f],iterration) # write last line of each countings file codons_counts.write(p[1] + ",") # second species of the couple, the last one to be written in the file for value in codonscount2.values()[0:-1]: codons_counts.write(str(value) + ",") codons_counts.write(str(codonscount2.values()[-1])) codons_counts.write("\n") codons_counts.write(p[1] + "_pvalue,") # Same species, but line for computed pvalues for value in codonscountpvalue2.values()[0:-1]: codons_counts.write(str(value) + ",") # write pvalues line for the species codons_counts.write(str(codonscountpvalue2.values()[-1])) codons_counts.write("\n") aa_counts.write(p[1] + ",") for value in aacount2.values()[0:-1]: aa_counts.write(str(value) + ",") aa_counts.write(str(aacount2.values()[-1])) aa_counts.write("\n") aa_counts.write(p[1] + "_pvalue,") for value in aacountpvalue2.values()[0:-1]: aa_counts.write(str(value) + ",") # write pvalues line for the species aa_counts.write(str(aacountpvalue2.values()[-1])) aa_counts.write("\n") aatypes_counts.write(p[1] + ",") for value in aaclassifcount2.values()[0:-1]: aatypes_counts.write(str(value) + ",") aatypes_counts.write(str(aaclassifcount2.values()[-1])) aatypes_counts.write("\n") aatypes_counts.write(p[1] + "_pvalue,") for value in aaclassifcountpvalue2.values()[0:-1]: aatypes_counts.write(str(value) + ",") # write pvalues line for the species aatypes_counts.write(str(aaclassifcountpvalue2.values()[-1])) aatypes_counts.write("\n") gc_counts.write(p[1]) gc_counts.write(",") gc_counts.write(str(GC3_b)+","+str(GC12_b)+","+str(IVYWREL_b)+","+str(EKQH_b)+","+str(PAYRESDGM_b)+","+str(purineload_b)+","+str(CvP_b)) # end writing stringcounts=stringcounts[:-1]+[''.join([stringcounts[-1],("counting of %s\n\n" % p[0])+''.join(strcountings(codonscount, aacount, aaclassifcount,codonscountpvalue,aacountpvalue,aaclassifcountpvalue,GC3,GC12,IVYWREL,EKQH,PAYRESDGM,purineload,CvP))+'\n\n'])] else: towrite=1 # ----------------------------- stringbiases.append("mutation biases from %s to %s\n\n" % (p[1], p[0])) stringbiases=stringbiases+strbiases(codons, aa, aaclassif,codonspvalue,aapvalue,aaclassifpvalue,aa_transitions,aatypes_transitions,p) stringbiases.append('\n\n') print 'done' last_iter +=1 concat.close() # -------------------------------------------------------------- stringcounts=''.join(stringcounts) stringbiases=''.join(stringbiases) # results=open(os.path.dirname(PATH)+"/"+string(os.path.split(PATH)[1],' ')[0]+'_results.txt',"w") #results=open('./codoncounting_results.txt',"w") results1=open('./counts.txt', "w") results1.write("%s" % stringcounts) results1.close() results2=open('./biases.txt', "w") results2.write("%s" % stringbiases) results2.close() #results.write("%s" % stringcounts) #results.write("\n\n") #results.write("%s" % stringbiases) #results.close() codons_counts.close() aa_counts.close() aatypes_counts.close() gc_counts.close() aa_transitions.close() aatypes_transitions.close()
