diff scripts/S02a_codon_counting.py @ 2:988467f963f0 draft

planemo upload for repository htpps://github.com/abims-sbr/adaptearch commit cf1b9c905931ca2ca25faa4844d45c908756472f
author abims-sbr
date Wed, 17 Jan 2018 08:57:49 -0500
parents 78dd6454f6f0
children 5766f80370e7
line wrap: on
line diff
--- a/scripts/S02a_codon_counting.py	Wed Sep 27 10:04:08 2017 -0400
+++ b/scripts/S02a_codon_counting.py	Wed Jan 17 08:57:49 2018 -0500
@@ -1,4 +1,7 @@
-#!/usr/bin/python
+#!/usr/bin/env python
+# coding:utf-8
+
+# Command line : ./S02a_codon_counting.py <concatenation_file_from_phylogeny>
 
 #python codoncounting.py alignement_file
 #Warning!! alignement_file should always be preceded by a path, at least ./
@@ -56,13 +59,13 @@
   numbers=[]
   stats=['pvalues']
   for f in table:
-    names.append('	%s' % f)
-    numbers.append('	%f' % table[f])
-    stats.append('	%f' % pvalues[f])
+    names.append(',%s' % f)
+    numbers.append(',%f' % table[f])
+    stats.append(',%f' % pvalues[f])
   names.append('\n')
   numbers.append('\n')
   stats.append('\n')
-  string=names+numbers+stats
+  string=names+numbers+stats  
 
   return string
 
@@ -74,37 +77,60 @@
   stats=['pvalues']
   for f in table:
     for g in table[f]:
-      names.append('	%s to %s' % (g,f))
-      numbers.append('	%f' % table[f][g])
-      stats.append('	%f' % pvalues[f][g])
+      names.append(',%s>%s' % (g,f))
+      numbers.append(',%f' % table[f][g])
+      stats.append(',%f' % pvalues[f][g])
   names.append('\n')
   numbers.append('\n')
-  stats.append('\n')
-  string=names+numbers+stats
+  stats.append('\n')  
+  string=names+numbers+stats  
 
   return string
 
-
 def strcountings(codons, aa, classif,codonspvalues,aapvalues,classifpvalues,GC3,GC12,IVYWREL,EKQH,PAYRESDGM,purineload,CvP):
 
   subcodons=['occurence of codons\n']+substrcountings(codons,codonspvalues)
   subaa=subcodons+['\noccurence of amino_acids\n']+substrcountings(aa,aapvalues)
   subclassif=subaa+['\noccurence of amino_acid types\n']+substrcountings(classif,classifpvalues)
 
-  string=subclassif+[('\nGC3	GC12	IVYWREL	EK/QH	PAYRE/SDGM	Purine_load	CvP\n%f	%f	%f	%f	%f	%f	%f\n' % (GC3, GC12, IVYWREL, EKQH, PAYRESDGM, purineload, CvP))]
+  string=subclassif+[('\nGC3	GC12	IVYWREL	EK/QH	PAYRE/SDGM	Purine_load	CvP\n%f,%f,%f,%f,%f,%f,%f\n' % (GC3, GC12, IVYWREL, EKQH, PAYRESDGM, purineload, CvP))]
 
   return string
 
 
-def strbiases(codons, aa, classif,codonspvalues,aapvalues,classifpvalues):
+def strbiases(codons, aa, classif,codonspvalues,aapvalues,classifpvalues,fileaa, fileaatype, p):
+
+  # Writing to files (tab separated; the tab characters are in the lists values)
 
-  subcodons=['codons mutations biases\n']+substrbiases(codons,codonspvalues)
+  aafreq, aapval = substrbiases(aa,aapvalues)[401:801], substrbiases(aa,aapvalues)[803:-1]
+  aatypefreq, aatypepval = substrbiases(classif,classifpvalues)[17:33], substrbiases(classif,classifpvalues)[35:-1]
+      
+  fileaa.write("%s>%s" %(p[1], p[0]))
+  for value in aafreq:
+    fileaa.write(str(value))
+  fileaa.write("\n")
+  fileaa.write("%s>%s_pvalue" %(p[1], p[0]))
+  for value in aapval:
+    fileaa.write(str(value))
+  fileaa.write("\n")
+  
+  fileaatype.write("%s>%s" %(p[1], p[0]))
+  for value in aatypefreq:
+    fileaatype.write(str(value))
+  fileaatype.write("\n")
+  fileaatype.write("%s>%s_pvalue" %(p[1], p[0]))
+  for value in aatypepval:
+    fileaatype.write(str(value))
+  fileaatype.write("\n")
+
+  # Origin code
+
+  subcodons=['codons mutations biases\n']+substrbiases(codons,codonspvalues)  
   subaa=subcodons+['\namino acids mutation biases\n']+substrbiases(aa,aapvalues)
-  subclassif=subaa+['\ntypes of amino_acids mutation biasecodoncountingv22.pys\n']+substrbiases(classif,classifpvalues)
+  subclassif=subaa+['\ntypes of amino_acids mutation biasecodoncountingv22.pys\n']+substrbiases(classif,classifpvalues)  
 
   return subclassif
 
-
 def testpvalue(bootstrap,value,iterration): #computes where the observed value is located in the expected counting distribution
 
   maxval=iterration-1
@@ -123,7 +149,6 @@
 
   return pvalue
 
-
 def gettables(content,reversecode,code,classif): #generates the tables contening all the countings
 
   if content==[]:
@@ -170,13 +195,13 @@
 
   return codonscount, aacount, aaclassifcount, codons, aa, aaclassif
 
-
 def countings(seq1,seq2,code,classif,reversecode,reverseclassif):
+# COMPTAGES (output counts.txt) functions used : gettables()
 #countings actually counts occurence and mutation bias of codons, amino acids and types of amino acids
   
-  codonscount, aacount, aaclassifcount, codons, aa, aaclassif=gettables(0,reversecode,code,classif)
+  codonscount, aacount, aaclassifcount, codons, aa, aaclassif=gettables(0,reversecode,code,classif)  
   codonscount2, aacount2, aaclassifcount2, _, _, _=gettables(0,reversecode,code,classif)
-
+ 
   G12=0
   C12=0
   G3=0
@@ -299,11 +324,10 @@
   IVYWREL=IVYWREL/float(compcodons)
   CvP=aaclassifcount['charged']-aaclassifcount['polar']
   
-
   return codonscount, aacount, aaclassifcount, codons, aa, aaclassif, GC3, GC12, IVYWREL, EKQH, PAYRESDGM, purineload, CvP
 
-
 def sampling(inputfile,length,iterration,plusminus,species,code,classif,reversecode,reverseclassif):
+# functions used : gettables, countings
 #sampling provides 'iterations' pairs of sequences of 'length' non ambiguous codons obtained from the dataset specified by 'species' and 'plusminus'
 #sort of bootstrap
 
@@ -409,6 +433,32 @@
  
 import string, os, sys, re, random
 from math import pow
+
+codons_counts = open("codons_counts.csv", "w")
+aa_counts = open("aa_counts.csv", "w")
+aatypes_counts = open("aatypes_counts.csv", "w")
+gc_counts = open("gc_counts.csv", "w")
+
+# ou bien utiliser le reversecode plus bas
+column_index_codons = "Species,tat,tgt,tct,ttt,tgc,tgg,tac,ttc,tcg,tta,ttg,tcc,tca,gca,gta,gcc,gtc,gcg,gtg,caa,gtt,gct,acc,ggt,cga,cgc,gat,aag,cgg,act,ggg,gga,ggc,gag,aaa,gac,cgt,gaa,ctt,atg,aca,acg,atc,aac,ata,agg,cct,agc,aga,cat,aat,att,ctg,cta,ctc,cac,ccg,agt,cag,cca,ccc\n"
+column_index_aa = "Species,cys,asn,his,ile,ser,gln,lys,met,pro,thr,phe,ala,gly,val,leu,asp,arg,trp,glu,tyr\n"
+column_index_aatypes = "Species,aromatics,polar,unpolar,charged\n"
+column_index_gc = "Species,GC3,GC12,IVYWREL,EKQH,PAYRESDGM,purineload,CvP\n"
+
+codons_counts.write(column_index_codons)
+aa_counts.write(column_index_aa)
+aatypes_counts.write(column_index_aatypes)
+gc_counts.write(column_index_gc)
+
+aa_transitions = open("aa_transitions.csv", "w")
+aatypes_transitions = open("aatypes_transitions.csv", "w")
+
+column_index_aa_transitions = "Species,cys>cys,asn>cys,his>cys,ile>cys,ser>cys,gln>cys,lys>cys,met>cys,pro>cys,thr>cys,phe>cys,ala>cys,gly>cys,val>cys,leu>cys,asp>cys,arg>cys,trp>cys,glu>cys,tyr>cys,cys>asn,asn>asn,his>asn,ile>asn,ser>asn,gln>asn,lys>asn,met>asn,pro>asn,thr>asn,phe>asn,ala>asn,gly>asn,val>asn,leu>asn,asp>asn,arg>asn,trp>asn,glu>asn,tyr>asn,cys>his,asn>his,his>his,ile>his,ser>his,gln>his,lys>his,met>his,pro>his,thr>his,phe>his,ala>his,gly>his,val>his,leu>his,asp>his,arg>his,trp>his,glu>his,tyr>his,cys>ile,asn>ile,his>ile,ile>ile,ser>ile,gln>ile,lys>ile,met>ile,pro>ile,thr>ile,phe>ile,ala>ile,gly>ile,val>ile,leu>ile,asp>ile,arg>ile,trp>ile,glu>ile,tyr>ile,cys>ser,asn>ser,his>ser,ile>ser,ser>ser,gln>ser,lys>ser,met>ser,pro>ser,thr>ser,phe>ser,ala>ser,gly>ser,val>ser,leu>ser,asp>ser,arg>ser,trp>ser,glu>ser,tyr>ser,cys>gln,asn>gln,his>gln,ile>gln,ser>gln,gln>gln,lys>gln,met>gln,pro>gln,thr>gln,phe>gln,ala>gln,gly>gln,val>gln,leu>gln,asp>gln,arg>gln,trp>gln,glu>gln,tyr>gln,cys>lys,asn>lys,his>lys,ile>lys,ser>lys,gln>lys,lys>lys,met>lys,pro>lys,thr>lys,phe>lys,ala>lys,gly>lys,val>lys,leu>lys,asp>lys,arg>lys,trp>lys,glu>lys,tyr>lys,cys>met,asn>met,his>met,ile>met,ser>met,gln>met,lys>met,met>met,pro>met,thr>met,phe>met,ala>met,gly>met,val>met,leu>met,asp>met,arg>met,trp>met,glu>met,tyr>met,cys>pro,asn>pro,his>pro,ile>pro,ser>pro,gln>pro,lys>pro,met>pro,pro>pro,thr>pro,phe>pro,ala>pro,gly>pro,val>pro,leu>pro,asp>pro,arg>pro,trp>pro,glu>pro,tyr>pro,cys>thr,asn>thr,his>thr,ile>thr,ser>thr,gln>thr,lys>thr,met>thr,pro>thr,thr>thr,phe>thr,ala>thr,gly>thr,val>thr,leu>thr,asp>thr,arg>thr,trp>thr,glu>thr,tyr>thr,cys>phe,asn>phe,his>phe,ile>phe,ser>phe,gln>phe,lys>phe,met>phe,pro>phe,thr>phe,phe>phe,ala>phe,gly>phe,val>phe,leu>phe,asp>phe,arg>phe,trp>phe,glu>phe,tyr>phe,cys>ala,asn>ala,his>ala,ile>ala,ser>ala,gln>ala,lys>ala,met>ala,pro>ala,thr>ala,phe>ala,ala>ala,gly>ala,val>ala,leu>ala,asp>ala,arg>ala,trp>ala,glu>ala,tyr>ala,cys>gly,asn>gly,his>gly,ile>gly,ser>gly,gln>gly,lys>gly,met>gly,pro>gly,thr>gly,phe>gly,ala>gly,gly>gly,val>gly,leu>gly,asp>gly,arg>gly,trp>gly,glu>gly,tyr>gly,cys>val,asn>val,his>val,ile>val,ser>val,gln>val,lys>val,met>val,pro>val,thr>val,phe>val,ala>val,gly>val,val>val,leu>val,asp>val,arg>val,trp>val,glu>val,tyr>val,cys>leu,asn>leu,his>leu,ile>leu,ser>leu,gln>leu,lys>leu,met>leu,pro>leu,thr>leu,phe>leu,ala>leu,gly>leu,val>leu,leu>leu,asp>leu,arg>leu,trp>leu,glu>leu,tyr>leu,cys>asp,asn>asp,his>asp,ile>asp,ser>asp,gln>asp,lys>asp,met>asp,pro>asp,thr>asp,phe>asp,ala>asp,gly>asp,val>asp,leu>asp,asp>asp,arg>asp,trp>asp,glu>asp,tyr>asp,cys>arg,asn>arg,his>arg,ile>arg,ser>arg,gln>arg,lys>arg,met>arg,pro>arg,thr>arg,phe>arg,ala>arg,gly>arg,val>arg,leu>arg,asp>arg,arg>arg,trp>arg,glu>arg,tyr>arg,cys>trp,asn>trp,his>trp,ile>trp,ser>trp,gln>trp,lys>trp,met>trp,pro>trp,thr>trp,phe>trp,ala>trp,gly>trp,val>trp,leu>trp,asp>trp,arg>trp,trp>trp,glu>trp,tyr>trp,cys>glu,asn>glu,his>glu,ile>glu,ser>glu,gln>glu,lys>glu,met>glu,pro>glu,thr>glu,phe>glu,ala>glu,gly>glu,val>glu,leu>glu,asp>glu,arg>glu,trp>glu,glu>glu,tyr>glu,cys>tyr,asn>tyr,his>tyr,ile>tyr,ser>tyr,gln>tyr,lys>tyr,met>tyr,pro>tyr,thr>tyr,phe>tyr,ala>tyr,gly>tyr,val>tyr,leu>tyr,asp>tyr,arg>tyr,trp>tyr,glu>tyr,tyr>tyr\n"
+column_index_aatypes_transitions = "Species,aromatics>aromatics,polar>aromatics,unpolar>aromatics,charged>aromatics,aromatics>polar,polar>polar,unpolar>polar,charged>polar,aromatics>unpolar,polar>unpolar,unpolar>unpolar,charged>unpolar,aromatics>charged,polar>charged,unpolar>charged,charged>charged\n"
+
+aa_transitions.write(column_index_aa_transitions)
+aatypes_transitions.write(column_index_aatypes_transitions)
+
 PATH = sys.argv[1]
 length=1000
 iterration=100
@@ -424,6 +474,8 @@
 reversecode={v:k for k in code for v in code[k]}
 reverseclassif={v:k for k in classif for v in classif[k]} 
 
+# --------------------------------------------------------------
+
 pairs=open("pairs.txt","r") #finds the pairs to analyze
 pairlist=[]
 while 1:
@@ -449,12 +501,15 @@
       species=listparameters[3:]
       background=1
 
+# --------------------------------------------------------------
 
 concat=open(sys.argv[1],"r")
 
-for p in pairlist: #pairs analysis
+last_iter = 1
+for p in pairlist: #pairs analysis  
   concat.seek(0)
   while 1:
+    # Everything lowercase
     line=concat.readline()
     seq=concat.readline()
     if p[0] in line:
@@ -465,16 +520,21 @@
       break
   
   if len(p)>2: #bootstrap simulations if the background changed
+    # Only len(pairlist[0]) > 2
     length=p[2]
     iterration=p[3]
     plusminus=p[4]
-    species=p[5]
+    species=p[5] # ex : 'all'
     applause='background: '+str(length)+' '+str(iterration)+' '+str(plusminus)+' '+' '.join(species)
     print applause
     stringcounts.append(applause+'\n\n\n')
     stringbiases.append(applause+'\n\n\n')
+    # variables below are computed on the first iteration and then used for the rest of the loop
     codonscountboot, aacountboot, aaclassifcountboot, codonsboot, aaboot, aaclassifboot=sampling(concat,length,iterration,plusminus,species,code,classif,reversecode,reverseclassif)
   print str(p[0])+' vs '+str(p[1])
+
+  # -----------------------------
+  # Mise en memoire des calculs pour les comptages par sps + Ecriture fichiers de sortie
   codonscount, aacount, aaclassifcount, codons, aa, aaclassif, GC3, GC12, IVYWREL, EKQH, PAYRESDGM, purineload, CvP=countings(seq1,seq2,code,classif,reversecode,reverseclassif)
   codonscountpvalue, aacountpvalue, aaclassifcountpvalue, codonspvalue, aapvalue, aaclassifpvalue=gettables(0,reversecode,code,classif)
 
@@ -501,25 +561,154 @@
       aacountpvalue[f]=testpvalue(aacountboot[f],aacount[f],iterration)
     for f in aaclassifcount:
       aaclassifcountpvalue[f]=testpvalue(aaclassifcountboot[f],aaclassifcount[f],iterration)
+   
+    ## Writing countings into separated output files ##
+
+    codons_counts.write(p[0] + ",") # Species name
+    #codons_counts.write(",") # next cell
+    for value in codonscount.values()[0:-1]:
+      codons_counts.write(str(value) + ",") # write codons_counts line for the species
+    codons_counts.write(str(codonscount.values()[-1]))
+    codons_counts.write("\n") # next line
+
+    codons_counts.write(p[0] + "_pvalue,") # Same species, but line for computed pvalues    
+    for value in codonscountpvalue.values()[0:-1]:
+      codons_counts.write(str(value) + ",") # write pvalues line for the species
+    codons_counts.write(str(codonscountpvalue.values()[-1]))
+    codons_counts.write("\n")
+
+    aa_counts.write(p[0] + ",") # Same method as above    
+    for value in aacount.values()[0:-1]:
+      aa_counts.write(str(value) + ",")
+    aa_counts.write(str(aacount.values()[-1]))
+    aa_counts.write("\n")
+
+    aa_counts.write(p[0] + "_pvalue,")
+    for value in aacountpvalue.values()[0:-1]:
+      aa_counts.write(str(value) + ",")
+    aa_counts.write(str(aacountpvalue.values()[-1]))
+    aa_counts.write("\n")
+
+    aatypes_counts.write(p[0] + ",")
+    for value in aaclassifcount.values()[0:-1]:
+      aatypes_counts.write(str(value) + ",")
+    aatypes_counts.write(str(aaclassifcount.values()[-1]))
+    aatypes_counts.write("\n")
+
+    aatypes_counts.write(p[0] + "_pvalue,")
+    for value in aaclassifcountpvalue.values()[0:-1]:
+      aatypes_counts.write(str(value) + ",")
+    aatypes_counts.write(str(aaclassifcountpvalue.values()[-1]))
+    aatypes_counts.write("\n")
+
+    gc_counts.write(p[0])
+    gc_counts.write(",")
+    gc_counts.write(str(GC3)+","+str(GC12)+","+str(IVYWREL)+","+str(EKQH)+","+str(PAYRESDGM)+","+str(purineload)+","+str(CvP)+"\n")    
+
+    """ IMPROVMENT BELOW :
+    Countings was not done on the last species of the list. It is now compute when the loop reaches the last iteration, thanks to the countings()
+    function : it is called with a switch between the arguments 'seq1' and seq2'.    
+    """
+    
+    if last_iter == len(pairlist):
+      codonscount2, aacount2, aaclassifcount2, codons2, aa2, aaclassif2, GC3_b, GC12_b, IVYWREL_b, EKQH_b, PAYRESDGM_b, purineload_b, CvP_b=countings(seq2,seq1,code,classif,reversecode,reverseclassif)
+      codonscountpvalue2, aacountpvalue2, aaclassifcountpvalue2, codonspvalue2, aapvalue2, aaclassifpvalue2=gettables(0,reversecode,code,classif)
+
+      for f in codons2: #tests the countings and saves the pvalues
+        for g in codons2[f]:
+          codonspvalue2[f][g]=testpvalue(codonsboot[f][g],codons[f][g],iterration)
+      for f in aa2:
+        for g in aa2[f]:
+          aapvalue2[f][g]=testpvalue(aaboot[f][g],aa[f][g],iterration)
+      for f in aaclassif2:
+        for g in aaclassif2[f]:
+          aaclassifpvalue2[f][g]=testpvalue(aaclassifboot[f][g],aaclassif[f][g],iterration)
+
+      for f in codonscount2:
+        codonscountpvalue2[f]=testpvalue(codonscountboot[f],codonscount[f],iterration)
+      for f in aacount2:
+        aacountpvalue2[f]=testpvalue(aacountboot[f],aacount[f],iterration)
+      for f in aaclassifcount2:
+        aaclassifcountpvalue2[f]=testpvalue(aaclassifcountboot[f],aaclassifcount[f],iterration)
+
+      # write last line of each countings file
+
+      codons_counts.write(p[1] + ",") # second species of the couple, the last one to be written in the file
+      for value in codonscount2.values()[0:-1]:
+        codons_counts.write(str(value) + ",")
+      codons_counts.write(str(codonscount2.values()[-1]))
+      codons_counts.write("\n")
+
+      codons_counts.write(p[1] + "_pvalue,") # Same species, but line for computed pvalues
+      for value in codonscountpvalue2.values()[0:-1]:
+        codons_counts.write(str(value) + ",") # write pvalues line for the species
+      codons_counts.write(str(codonscountpvalue2.values()[-1]))  
+      codons_counts.write("\n")
+
+      aa_counts.write(p[1] + ",")
+      for value in aacount2.values()[0:-1]:
+        aa_counts.write(str(value) + ",")
+      aa_counts.write(str(aacount2.values()[-1]))
+      aa_counts.write("\n")
+
+      aa_counts.write(p[1] + "_pvalue,")
+      for value in aacountpvalue2.values()[0:-1]:
+        aa_counts.write(str(value) + ",") # write pvalues line for the species
+      aa_counts.write(str(aacountpvalue2.values()[-1]))
+      aa_counts.write("\n")
+
+      aatypes_counts.write(p[1] + ",")
+      for value in aaclassifcount2.values()[0:-1]:
+        aatypes_counts.write(str(value) + ",")
+      aatypes_counts.write(str(aaclassifcount2.values()[-1]))
+      aatypes_counts.write("\n")
+
+      aatypes_counts.write(p[1] + "_pvalue,")
+      for value in aaclassifcountpvalue2.values()[0:-1]:
+        aatypes_counts.write(str(value) + ",") # write pvalues line for the species
+      aatypes_counts.write(str(aaclassifcountpvalue2.values()[-1]))
+      aatypes_counts.write("\n")
+
+      gc_counts.write(p[1])
+      gc_counts.write(",")
+      gc_counts.write(str(GC3_b)+","+str(GC12_b)+","+str(IVYWREL_b)+","+str(EKQH_b)+","+str(PAYRESDGM_b)+","+str(purineload_b)+","+str(CvP_b))
+
+    # end writing
     
     stringcounts=stringcounts[:-1]+[''.join([stringcounts[-1],("counting of %s\n\n" % p[0])+''.join(strcountings(codonscount, aacount, aaclassifcount,codonscountpvalue,aacountpvalue,aaclassifcountpvalue,GC3,GC12,IVYWREL,EKQH,PAYRESDGM,purineload,CvP))+'\n\n'])]
   else:
     towrite=1
+  # -----------------------------
 
   stringbiases.append("mutation biases from %s to %s\n\n" % (p[1], p[0]))
-  stringbiases=stringbiases+strbiases(codons, aa, aaclassif,codonspvalue,aapvalue,aaclassifpvalue)
-  stringbiases.append('\n\n')
+  stringbiases=stringbiases+strbiases(codons, aa, aaclassif,codonspvalue,aapvalue,aaclassifpvalue,aa_transitions,aatypes_transitions,p)
+  stringbiases.append('\n\n') 
   print 'done'
+  last_iter +=1
 
 concat.close()
 
+# --------------------------------------------------------------
+
 stringcounts=''.join(stringcounts)
 stringbiases=''.join(stringbiases)
+
 # results=open(os.path.dirname(PATH)+"/"+string(os.path.split(PATH)[1],' ')[0]+'_results.txt',"w")
-results=open('./codoncounting_results.txt',"w")
-results.write("%s" % stringcounts)
-results.write("\n\n")
-results.write("%s" % stringbiases)
-results.close()
+#results=open('./codoncounting_results.txt',"w")
+results1=open('./counts.txt', "w")
+results1.write("%s" % stringcounts)
+results1.close()
+results2=open('./biases.txt', "w")
+results2.write("%s" % stringbiases)
+results2.close()
+#results.write("%s" % stringcounts)
+#results.write("\n\n")
+#results.write("%s" % stringbiases)
+#results.close()
 
-
+codons_counts.close()
+aa_counts.close()
+aatypes_counts.close()
+gc_counts.close()
+aa_transitions.close()
+aatypes_transitions.close()