Mercurial > repos > abims-sbr > mutcount
diff scripts/S02a_codon_counting.py @ 2:988467f963f0 draft
planemo upload for repository htpps://github.com/abims-sbr/adaptearch commit cf1b9c905931ca2ca25faa4844d45c908756472f
| author | abims-sbr |
|---|---|
| date | Wed, 17 Jan 2018 08:57:49 -0500 |
| parents | 78dd6454f6f0 |
| children | 5766f80370e7 |
line wrap: on
line diff
--- a/scripts/S02a_codon_counting.py Wed Sep 27 10:04:08 2017 -0400 +++ b/scripts/S02a_codon_counting.py Wed Jan 17 08:57:49 2018 -0500 @@ -1,4 +1,7 @@ -#!/usr/bin/python +#!/usr/bin/env python +# coding:utf-8 + +# Command line : ./S02a_codon_counting.py <concatenation_file_from_phylogeny> #python codoncounting.py alignement_file #Warning!! alignement_file should always be preceded by a path, at least ./ @@ -56,13 +59,13 @@ numbers=[] stats=['pvalues'] for f in table: - names.append(' %s' % f) - numbers.append(' %f' % table[f]) - stats.append(' %f' % pvalues[f]) + names.append(',%s' % f) + numbers.append(',%f' % table[f]) + stats.append(',%f' % pvalues[f]) names.append('\n') numbers.append('\n') stats.append('\n') - string=names+numbers+stats + string=names+numbers+stats return string @@ -74,37 +77,60 @@ stats=['pvalues'] for f in table: for g in table[f]: - names.append(' %s to %s' % (g,f)) - numbers.append(' %f' % table[f][g]) - stats.append(' %f' % pvalues[f][g]) + names.append(',%s>%s' % (g,f)) + numbers.append(',%f' % table[f][g]) + stats.append(',%f' % pvalues[f][g]) names.append('\n') numbers.append('\n') - stats.append('\n') - string=names+numbers+stats + stats.append('\n') + string=names+numbers+stats return string - def strcountings(codons, aa, classif,codonspvalues,aapvalues,classifpvalues,GC3,GC12,IVYWREL,EKQH,PAYRESDGM,purineload,CvP): subcodons=['occurence of codons\n']+substrcountings(codons,codonspvalues) subaa=subcodons+['\noccurence of amino_acids\n']+substrcountings(aa,aapvalues) subclassif=subaa+['\noccurence of amino_acid types\n']+substrcountings(classif,classifpvalues) - string=subclassif+[('\nGC3 GC12 IVYWREL EK/QH PAYRE/SDGM Purine_load CvP\n%f %f %f %f %f %f %f\n' % (GC3, GC12, IVYWREL, EKQH, PAYRESDGM, purineload, CvP))] + string=subclassif+[('\nGC3 GC12 IVYWREL EK/QH PAYRE/SDGM Purine_load CvP\n%f,%f,%f,%f,%f,%f,%f\n' % (GC3, GC12, IVYWREL, EKQH, PAYRESDGM, purineload, CvP))] return string -def strbiases(codons, aa, classif,codonspvalues,aapvalues,classifpvalues): +def strbiases(codons, aa, classif,codonspvalues,aapvalues,classifpvalues,fileaa, fileaatype, p): + + # Writing to files (tab separated; the tab characters are in the lists values) - subcodons=['codons mutations biases\n']+substrbiases(codons,codonspvalues) + aafreq, aapval = substrbiases(aa,aapvalues)[401:801], substrbiases(aa,aapvalues)[803:-1] + aatypefreq, aatypepval = substrbiases(classif,classifpvalues)[17:33], substrbiases(classif,classifpvalues)[35:-1] + + fileaa.write("%s>%s" %(p[1], p[0])) + for value in aafreq: + fileaa.write(str(value)) + fileaa.write("\n") + fileaa.write("%s>%s_pvalue" %(p[1], p[0])) + for value in aapval: + fileaa.write(str(value)) + fileaa.write("\n") + + fileaatype.write("%s>%s" %(p[1], p[0])) + for value in aatypefreq: + fileaatype.write(str(value)) + fileaatype.write("\n") + fileaatype.write("%s>%s_pvalue" %(p[1], p[0])) + for value in aatypepval: + fileaatype.write(str(value)) + fileaatype.write("\n") + + # Origin code + + subcodons=['codons mutations biases\n']+substrbiases(codons,codonspvalues) subaa=subcodons+['\namino acids mutation biases\n']+substrbiases(aa,aapvalues) - subclassif=subaa+['\ntypes of amino_acids mutation biasecodoncountingv22.pys\n']+substrbiases(classif,classifpvalues) + subclassif=subaa+['\ntypes of amino_acids mutation biasecodoncountingv22.pys\n']+substrbiases(classif,classifpvalues) return subclassif - def testpvalue(bootstrap,value,iterration): #computes where the observed value is located in the expected counting distribution maxval=iterration-1 @@ -123,7 +149,6 @@ return pvalue - def gettables(content,reversecode,code,classif): #generates the tables contening all the countings if content==[]: @@ -170,13 +195,13 @@ return codonscount, aacount, aaclassifcount, codons, aa, aaclassif - def countings(seq1,seq2,code,classif,reversecode,reverseclassif): +# COMPTAGES (output counts.txt) functions used : gettables() #countings actually counts occurence and mutation bias of codons, amino acids and types of amino acids - codonscount, aacount, aaclassifcount, codons, aa, aaclassif=gettables(0,reversecode,code,classif) + codonscount, aacount, aaclassifcount, codons, aa, aaclassif=gettables(0,reversecode,code,classif) codonscount2, aacount2, aaclassifcount2, _, _, _=gettables(0,reversecode,code,classif) - + G12=0 C12=0 G3=0 @@ -299,11 +324,10 @@ IVYWREL=IVYWREL/float(compcodons) CvP=aaclassifcount['charged']-aaclassifcount['polar'] - return codonscount, aacount, aaclassifcount, codons, aa, aaclassif, GC3, GC12, IVYWREL, EKQH, PAYRESDGM, purineload, CvP - def sampling(inputfile,length,iterration,plusminus,species,code,classif,reversecode,reverseclassif): +# functions used : gettables, countings #sampling provides 'iterations' pairs of sequences of 'length' non ambiguous codons obtained from the dataset specified by 'species' and 'plusminus' #sort of bootstrap @@ -409,6 +433,32 @@ import string, os, sys, re, random from math import pow + +codons_counts = open("codons_counts.csv", "w") +aa_counts = open("aa_counts.csv", "w") +aatypes_counts = open("aatypes_counts.csv", "w") +gc_counts = open("gc_counts.csv", "w") + +# ou bien utiliser le reversecode plus bas +column_index_codons = "Species,tat,tgt,tct,ttt,tgc,tgg,tac,ttc,tcg,tta,ttg,tcc,tca,gca,gta,gcc,gtc,gcg,gtg,caa,gtt,gct,acc,ggt,cga,cgc,gat,aag,cgg,act,ggg,gga,ggc,gag,aaa,gac,cgt,gaa,ctt,atg,aca,acg,atc,aac,ata,agg,cct,agc,aga,cat,aat,att,ctg,cta,ctc,cac,ccg,agt,cag,cca,ccc\n" +column_index_aa = "Species,cys,asn,his,ile,ser,gln,lys,met,pro,thr,phe,ala,gly,val,leu,asp,arg,trp,glu,tyr\n" +column_index_aatypes = "Species,aromatics,polar,unpolar,charged\n" +column_index_gc = "Species,GC3,GC12,IVYWREL,EKQH,PAYRESDGM,purineload,CvP\n" + +codons_counts.write(column_index_codons) +aa_counts.write(column_index_aa) +aatypes_counts.write(column_index_aatypes) +gc_counts.write(column_index_gc) + +aa_transitions = open("aa_transitions.csv", "w") +aatypes_transitions = open("aatypes_transitions.csv", "w") + +column_index_aa_transitions = "Species,cys>cys,asn>cys,his>cys,ile>cys,ser>cys,gln>cys,lys>cys,met>cys,pro>cys,thr>cys,phe>cys,ala>cys,gly>cys,val>cys,leu>cys,asp>cys,arg>cys,trp>cys,glu>cys,tyr>cys,cys>asn,asn>asn,his>asn,ile>asn,ser>asn,gln>asn,lys>asn,met>asn,pro>asn,thr>asn,phe>asn,ala>asn,gly>asn,val>asn,leu>asn,asp>asn,arg>asn,trp>asn,glu>asn,tyr>asn,cys>his,asn>his,his>his,ile>his,ser>his,gln>his,lys>his,met>his,pro>his,thr>his,phe>his,ala>his,gly>his,val>his,leu>his,asp>his,arg>his,trp>his,glu>his,tyr>his,cys>ile,asn>ile,his>ile,ile>ile,ser>ile,gln>ile,lys>ile,met>ile,pro>ile,thr>ile,phe>ile,ala>ile,gly>ile,val>ile,leu>ile,asp>ile,arg>ile,trp>ile,glu>ile,tyr>ile,cys>ser,asn>ser,his>ser,ile>ser,ser>ser,gln>ser,lys>ser,met>ser,pro>ser,thr>ser,phe>ser,ala>ser,gly>ser,val>ser,leu>ser,asp>ser,arg>ser,trp>ser,glu>ser,tyr>ser,cys>gln,asn>gln,his>gln,ile>gln,ser>gln,gln>gln,lys>gln,met>gln,pro>gln,thr>gln,phe>gln,ala>gln,gly>gln,val>gln,leu>gln,asp>gln,arg>gln,trp>gln,glu>gln,tyr>gln,cys>lys,asn>lys,his>lys,ile>lys,ser>lys,gln>lys,lys>lys,met>lys,pro>lys,thr>lys,phe>lys,ala>lys,gly>lys,val>lys,leu>lys,asp>lys,arg>lys,trp>lys,glu>lys,tyr>lys,cys>met,asn>met,his>met,ile>met,ser>met,gln>met,lys>met,met>met,pro>met,thr>met,phe>met,ala>met,gly>met,val>met,leu>met,asp>met,arg>met,trp>met,glu>met,tyr>met,cys>pro,asn>pro,his>pro,ile>pro,ser>pro,gln>pro,lys>pro,met>pro,pro>pro,thr>pro,phe>pro,ala>pro,gly>pro,val>pro,leu>pro,asp>pro,arg>pro,trp>pro,glu>pro,tyr>pro,cys>thr,asn>thr,his>thr,ile>thr,ser>thr,gln>thr,lys>thr,met>thr,pro>thr,thr>thr,phe>thr,ala>thr,gly>thr,val>thr,leu>thr,asp>thr,arg>thr,trp>thr,glu>thr,tyr>thr,cys>phe,asn>phe,his>phe,ile>phe,ser>phe,gln>phe,lys>phe,met>phe,pro>phe,thr>phe,phe>phe,ala>phe,gly>phe,val>phe,leu>phe,asp>phe,arg>phe,trp>phe,glu>phe,tyr>phe,cys>ala,asn>ala,his>ala,ile>ala,ser>ala,gln>ala,lys>ala,met>ala,pro>ala,thr>ala,phe>ala,ala>ala,gly>ala,val>ala,leu>ala,asp>ala,arg>ala,trp>ala,glu>ala,tyr>ala,cys>gly,asn>gly,his>gly,ile>gly,ser>gly,gln>gly,lys>gly,met>gly,pro>gly,thr>gly,phe>gly,ala>gly,gly>gly,val>gly,leu>gly,asp>gly,arg>gly,trp>gly,glu>gly,tyr>gly,cys>val,asn>val,his>val,ile>val,ser>val,gln>val,lys>val,met>val,pro>val,thr>val,phe>val,ala>val,gly>val,val>val,leu>val,asp>val,arg>val,trp>val,glu>val,tyr>val,cys>leu,asn>leu,his>leu,ile>leu,ser>leu,gln>leu,lys>leu,met>leu,pro>leu,thr>leu,phe>leu,ala>leu,gly>leu,val>leu,leu>leu,asp>leu,arg>leu,trp>leu,glu>leu,tyr>leu,cys>asp,asn>asp,his>asp,ile>asp,ser>asp,gln>asp,lys>asp,met>asp,pro>asp,thr>asp,phe>asp,ala>asp,gly>asp,val>asp,leu>asp,asp>asp,arg>asp,trp>asp,glu>asp,tyr>asp,cys>arg,asn>arg,his>arg,ile>arg,ser>arg,gln>arg,lys>arg,met>arg,pro>arg,thr>arg,phe>arg,ala>arg,gly>arg,val>arg,leu>arg,asp>arg,arg>arg,trp>arg,glu>arg,tyr>arg,cys>trp,asn>trp,his>trp,ile>trp,ser>trp,gln>trp,lys>trp,met>trp,pro>trp,thr>trp,phe>trp,ala>trp,gly>trp,val>trp,leu>trp,asp>trp,arg>trp,trp>trp,glu>trp,tyr>trp,cys>glu,asn>glu,his>glu,ile>glu,ser>glu,gln>glu,lys>glu,met>glu,pro>glu,thr>glu,phe>glu,ala>glu,gly>glu,val>glu,leu>glu,asp>glu,arg>glu,trp>glu,glu>glu,tyr>glu,cys>tyr,asn>tyr,his>tyr,ile>tyr,ser>tyr,gln>tyr,lys>tyr,met>tyr,pro>tyr,thr>tyr,phe>tyr,ala>tyr,gly>tyr,val>tyr,leu>tyr,asp>tyr,arg>tyr,trp>tyr,glu>tyr,tyr>tyr\n" +column_index_aatypes_transitions = "Species,aromatics>aromatics,polar>aromatics,unpolar>aromatics,charged>aromatics,aromatics>polar,polar>polar,unpolar>polar,charged>polar,aromatics>unpolar,polar>unpolar,unpolar>unpolar,charged>unpolar,aromatics>charged,polar>charged,unpolar>charged,charged>charged\n" + +aa_transitions.write(column_index_aa_transitions) +aatypes_transitions.write(column_index_aatypes_transitions) + PATH = sys.argv[1] length=1000 iterration=100 @@ -424,6 +474,8 @@ reversecode={v:k for k in code for v in code[k]} reverseclassif={v:k for k in classif for v in classif[k]} +# -------------------------------------------------------------- + pairs=open("pairs.txt","r") #finds the pairs to analyze pairlist=[] while 1: @@ -449,12 +501,15 @@ species=listparameters[3:] background=1 +# -------------------------------------------------------------- concat=open(sys.argv[1],"r") -for p in pairlist: #pairs analysis +last_iter = 1 +for p in pairlist: #pairs analysis concat.seek(0) while 1: + # Everything lowercase line=concat.readline() seq=concat.readline() if p[0] in line: @@ -465,16 +520,21 @@ break if len(p)>2: #bootstrap simulations if the background changed + # Only len(pairlist[0]) > 2 length=p[2] iterration=p[3] plusminus=p[4] - species=p[5] + species=p[5] # ex : 'all' applause='background: '+str(length)+' '+str(iterration)+' '+str(plusminus)+' '+' '.join(species) print applause stringcounts.append(applause+'\n\n\n') stringbiases.append(applause+'\n\n\n') + # variables below are computed on the first iteration and then used for the rest of the loop codonscountboot, aacountboot, aaclassifcountboot, codonsboot, aaboot, aaclassifboot=sampling(concat,length,iterration,plusminus,species,code,classif,reversecode,reverseclassif) print str(p[0])+' vs '+str(p[1]) + + # ----------------------------- + # Mise en memoire des calculs pour les comptages par sps + Ecriture fichiers de sortie codonscount, aacount, aaclassifcount, codons, aa, aaclassif, GC3, GC12, IVYWREL, EKQH, PAYRESDGM, purineload, CvP=countings(seq1,seq2,code,classif,reversecode,reverseclassif) codonscountpvalue, aacountpvalue, aaclassifcountpvalue, codonspvalue, aapvalue, aaclassifpvalue=gettables(0,reversecode,code,classif) @@ -501,25 +561,154 @@ aacountpvalue[f]=testpvalue(aacountboot[f],aacount[f],iterration) for f in aaclassifcount: aaclassifcountpvalue[f]=testpvalue(aaclassifcountboot[f],aaclassifcount[f],iterration) + + ## Writing countings into separated output files ## + + codons_counts.write(p[0] + ",") # Species name + #codons_counts.write(",") # next cell + for value in codonscount.values()[0:-1]: + codons_counts.write(str(value) + ",") # write codons_counts line for the species + codons_counts.write(str(codonscount.values()[-1])) + codons_counts.write("\n") # next line + + codons_counts.write(p[0] + "_pvalue,") # Same species, but line for computed pvalues + for value in codonscountpvalue.values()[0:-1]: + codons_counts.write(str(value) + ",") # write pvalues line for the species + codons_counts.write(str(codonscountpvalue.values()[-1])) + codons_counts.write("\n") + + aa_counts.write(p[0] + ",") # Same method as above + for value in aacount.values()[0:-1]: + aa_counts.write(str(value) + ",") + aa_counts.write(str(aacount.values()[-1])) + aa_counts.write("\n") + + aa_counts.write(p[0] + "_pvalue,") + for value in aacountpvalue.values()[0:-1]: + aa_counts.write(str(value) + ",") + aa_counts.write(str(aacountpvalue.values()[-1])) + aa_counts.write("\n") + + aatypes_counts.write(p[0] + ",") + for value in aaclassifcount.values()[0:-1]: + aatypes_counts.write(str(value) + ",") + aatypes_counts.write(str(aaclassifcount.values()[-1])) + aatypes_counts.write("\n") + + aatypes_counts.write(p[0] + "_pvalue,") + for value in aaclassifcountpvalue.values()[0:-1]: + aatypes_counts.write(str(value) + ",") + aatypes_counts.write(str(aaclassifcountpvalue.values()[-1])) + aatypes_counts.write("\n") + + gc_counts.write(p[0]) + gc_counts.write(",") + gc_counts.write(str(GC3)+","+str(GC12)+","+str(IVYWREL)+","+str(EKQH)+","+str(PAYRESDGM)+","+str(purineload)+","+str(CvP)+"\n") + + """ IMPROVMENT BELOW : + Countings was not done on the last species of the list. It is now compute when the loop reaches the last iteration, thanks to the countings() + function : it is called with a switch between the arguments 'seq1' and seq2'. + """ + + if last_iter == len(pairlist): + codonscount2, aacount2, aaclassifcount2, codons2, aa2, aaclassif2, GC3_b, GC12_b, IVYWREL_b, EKQH_b, PAYRESDGM_b, purineload_b, CvP_b=countings(seq2,seq1,code,classif,reversecode,reverseclassif) + codonscountpvalue2, aacountpvalue2, aaclassifcountpvalue2, codonspvalue2, aapvalue2, aaclassifpvalue2=gettables(0,reversecode,code,classif) + + for f in codons2: #tests the countings and saves the pvalues + for g in codons2[f]: + codonspvalue2[f][g]=testpvalue(codonsboot[f][g],codons[f][g],iterration) + for f in aa2: + for g in aa2[f]: + aapvalue2[f][g]=testpvalue(aaboot[f][g],aa[f][g],iterration) + for f in aaclassif2: + for g in aaclassif2[f]: + aaclassifpvalue2[f][g]=testpvalue(aaclassifboot[f][g],aaclassif[f][g],iterration) + + for f in codonscount2: + codonscountpvalue2[f]=testpvalue(codonscountboot[f],codonscount[f],iterration) + for f in aacount2: + aacountpvalue2[f]=testpvalue(aacountboot[f],aacount[f],iterration) + for f in aaclassifcount2: + aaclassifcountpvalue2[f]=testpvalue(aaclassifcountboot[f],aaclassifcount[f],iterration) + + # write last line of each countings file + + codons_counts.write(p[1] + ",") # second species of the couple, the last one to be written in the file + for value in codonscount2.values()[0:-1]: + codons_counts.write(str(value) + ",") + codons_counts.write(str(codonscount2.values()[-1])) + codons_counts.write("\n") + + codons_counts.write(p[1] + "_pvalue,") # Same species, but line for computed pvalues + for value in codonscountpvalue2.values()[0:-1]: + codons_counts.write(str(value) + ",") # write pvalues line for the species + codons_counts.write(str(codonscountpvalue2.values()[-1])) + codons_counts.write("\n") + + aa_counts.write(p[1] + ",") + for value in aacount2.values()[0:-1]: + aa_counts.write(str(value) + ",") + aa_counts.write(str(aacount2.values()[-1])) + aa_counts.write("\n") + + aa_counts.write(p[1] + "_pvalue,") + for value in aacountpvalue2.values()[0:-1]: + aa_counts.write(str(value) + ",") # write pvalues line for the species + aa_counts.write(str(aacountpvalue2.values()[-1])) + aa_counts.write("\n") + + aatypes_counts.write(p[1] + ",") + for value in aaclassifcount2.values()[0:-1]: + aatypes_counts.write(str(value) + ",") + aatypes_counts.write(str(aaclassifcount2.values()[-1])) + aatypes_counts.write("\n") + + aatypes_counts.write(p[1] + "_pvalue,") + for value in aaclassifcountpvalue2.values()[0:-1]: + aatypes_counts.write(str(value) + ",") # write pvalues line for the species + aatypes_counts.write(str(aaclassifcountpvalue2.values()[-1])) + aatypes_counts.write("\n") + + gc_counts.write(p[1]) + gc_counts.write(",") + gc_counts.write(str(GC3_b)+","+str(GC12_b)+","+str(IVYWREL_b)+","+str(EKQH_b)+","+str(PAYRESDGM_b)+","+str(purineload_b)+","+str(CvP_b)) + + # end writing stringcounts=stringcounts[:-1]+[''.join([stringcounts[-1],("counting of %s\n\n" % p[0])+''.join(strcountings(codonscount, aacount, aaclassifcount,codonscountpvalue,aacountpvalue,aaclassifcountpvalue,GC3,GC12,IVYWREL,EKQH,PAYRESDGM,purineload,CvP))+'\n\n'])] else: towrite=1 + # ----------------------------- stringbiases.append("mutation biases from %s to %s\n\n" % (p[1], p[0])) - stringbiases=stringbiases+strbiases(codons, aa, aaclassif,codonspvalue,aapvalue,aaclassifpvalue) - stringbiases.append('\n\n') + stringbiases=stringbiases+strbiases(codons, aa, aaclassif,codonspvalue,aapvalue,aaclassifpvalue,aa_transitions,aatypes_transitions,p) + stringbiases.append('\n\n') print 'done' + last_iter +=1 concat.close() +# -------------------------------------------------------------- + stringcounts=''.join(stringcounts) stringbiases=''.join(stringbiases) + # results=open(os.path.dirname(PATH)+"/"+string(os.path.split(PATH)[1],' ')[0]+'_results.txt',"w") -results=open('./codoncounting_results.txt',"w") -results.write("%s" % stringcounts) -results.write("\n\n") -results.write("%s" % stringbiases) -results.close() +#results=open('./codoncounting_results.txt',"w") +results1=open('./counts.txt', "w") +results1.write("%s" % stringcounts) +results1.close() +results2=open('./biases.txt', "w") +results2.write("%s" % stringbiases) +results2.close() +#results.write("%s" % stringcounts) +#results.write("\n\n") +#results.write("%s" % stringbiases) +#results.close() - +codons_counts.close() +aa_counts.close() +aatypes_counts.close() +gc_counts.close() +aa_transitions.close() +aatypes_transitions.close()
