mutcount: scripts/S02b_study_seq_composition

comparison scripts/S02b_study_seq_composition_nuc.py @ 0:78dd6454f6f0 draft

planemo upload for repository htpps://github.com/abims-sbr/adaptearch commit 73670b26c75bb6c1a6332481920f3036314de364

author	abims-sbr
date	Tue, 02 May 2017 04:20:51 -0400
parents
children	8de21b6eb110

comparison

equal deleted inserted replaced

--1:000000000000
+:78dd6454f6f0
+#!/usr/bin/env python
+## Author: Eric FONTANILLAS
+## Date: 21.12.10
+## Object: Test for compositional bias in genome and proteome as marker of thermal adaptation (comparison between 2 "hot" species: Ap and Ps and one "cold" species: Pg)
+import sys,os,zipfile,shutil,subprocess
+#############
+### DEF 0 ###
+#############import sys,os,zipfile
+def simplify_fasta_name(fasta_name,LT):
+for abbreviation in LT:
+if abbreviation in fasta_name:
+new_fasta_name = abbreviation
+return(new_fasta_name)
+##########################################
+###########
+## DEF1 ##
+###########
+## Generates bash, with key = fasta name; value = sequence (WITH GAP, IF ANY, REMOVED IN THIS FUNCTION)
+def dico(fasta_file,LT):
+count_fastaName=0
+F1 = open(fasta_file, "r")
+bash1 = {}
+while 1:
+nextline = F1.readline()
+#print nextline
+if not nextline :
+break
+if nextline[0] == ">":
+count_fastaName = count_fastaName + 1
+fasta_name = nextline[1:-1]
+nextline = F1.readline()
+sequence = nextline[:-1]
+if fasta_name not in bash1.keys():
+fasta_name = simplify_fasta_name(fasta_name,LT)  ### DEF 0 ###
+bash1[fasta_name] = sequence
+else:
+print fasta_name
+# Find alignment length
+kk = bash1.keys()
+key0 = kk[0]
+seq0 = bash1[key0]
+ln_seq = len(seq0)
+F1.close()
+return(bash1)
+#####################################
+##################
+###### DEF2 ######
+##################
+def base_composition(seq):
+count_A=string.count(seq, "A") + string.count(seq, "a")
+count_T=string.count(seq, "T") + string.count(seq, "t")
+count_C=string.count(seq, "C") + string.count(seq, "c")
+count_G=string.count(seq, "G") + string.count(seq, "g")
+## 3 ## Nucleotide proportions
+ln = count_C+count_G+count_T+count_A
+if (ln!=0):
+CG = count_C+count_G
+AT = count_T+count_A
+AG = count_A+count_G
+TC = count_T+count_C
+## 1 ## Search for compositional bias in genome as marker of thermal adaptation: CG vs AT
+ratio_CG_AT = float(CG)/float(AT)
+percent_CG = float(CG)/(float(AT) + float(CG))*100
+## 2 ## Search for compositional bias in genome as marker of thermal adaptation: AG vs TC
+ratio_purine_pyrimidine=float(AG)/float(TC)
+percent_purine=float(AG)/(float(AG)+float(TC))*100
+prop_A = float(count_A)/float(ln)
+prop_T = float(count_T)/float(ln)
+prop_C = float(count_C)/float(ln)
+prop_G = float(count_G)/float(ln)
+else:
+percent_CG=0
+percent_purine=0
+prop_A=0
+prop_T=0
+prop_C=0
+prop_G=0
+return(percent_CG, percent_purine, prop_A, prop_T, prop_C, prop_G)
+##############################################
+##################
+###### DEF3 ######
+##################
+def purine_loading(seq):
+count_A=string.count(seq, "A") + string.count(seq, "a")
+count_T=string.count(seq, "T") + string.count(seq, "t")
+count_C=string.count(seq, "C") + string.count(seq, "c")
+count_G=string.count(seq, "G") + string.count(seq, "g")
+## 3 ## Nucleotide proportions
+TOTAL = count_C+count_G+count_T+count_A
+if (TOTAL!=0):
+## PLI : Purine loading indice (Forsdyke et al.)
+# (G-C)/N * 1000 et (A-T)/N * 1000
+DIFF_GC = count_G - count_C
+DIFF_AT = count_A - count_T
+# Per bp
+PLI_GC = float(DIFF_GC)/float(TOTAL)
+PLI_AT = float(DIFF_AT)/float(TOTAL)
+# Per 1000 bp
+PLI_GC_1000 = PLI_GC*1000
+PLI_AT_1000 = PLI_AT*1000
+else:
+DIFF_GC=0
+DIFF_AT=0
+PLI_GC=0
+PLI_AT=0
+PLI_GC_1000=0
+PLI_AT_1000=0
+return(TOTAL, DIFF_GC, DIFF_AT,PLI_GC,PLI_AT,PLI_GC_1000,PLI_AT_1000)
+##############################################
+###################
+### RUN RUN RUN ###
+###################
+import string, os,sys,zipfile
+##Create specific folders
+Path_IN_loci_NUC = "./IN_NUC"
+outpath= "./OUT"
+os.makedirs(Path_IN_loci_NUC)
+os.makedirs(outpath)
+#Check if the file is a zip or fasta file
+the_zip_file = zipfile.ZipFile(sys.argv[1])
+ret = the_zip_file.testzip()
+if ret is not None:
+shutil.copy2(sys.argv[1], './IN_NUC/input.fasta')
+else:
+cmd="unzip %s -d ./IN_NUC"%(sys.argv[1])
+os.system(cmd)
+## 1 ## List taxa
+LT=[]
+cmd="grep '>' %s" % sys.argv[2]
+result = subprocess.check_output(cmd, shell=True)
+result=result.split('\n')
+for i in result:
+sp=i[1:]
+if sp !='':
+LT.append(sp)
+print LT
+#LT = ["Ap", "Ac", "Pg"]
+#os.system("grep '>' %s" %(sys.argv[1]))
+## 2 ## PathIN
+# fileIN_properties = open("01_AminoAcid_Properties2.csv", "r")
+Lloci_NUC = os.listdir(Path_IN_loci_NUC)
+## 3 ## PathOUT
+## 3.1 ## NUC composition
+fileOUT_NUC=open("./OUT/10_nuc_compositions.csv","w")
+fileOUT_NUC.write("LOCUS,")
+for taxa in LT:
+fileOUT_NUC.write("%s_prop_A,%s_prop_T,%s_prop_C,%s_prop_G," %(taxa,taxa,taxa,taxa))
+fileOUT_NUC.write("\n")
+## 3.2 ## NUC percent_GC
+fileOUT_percent_GC=open("./OUT/11_percent_GC.csv","w")
+fileOUT_percent_GC.write("LOCUS,")
+for taxa in LT:
+fileOUT_percent_GC.write("%s_percent_GC," %(taxa))
+fileOUT_percent_GC.write("\n")
+## 3.3 ## NUC percent_purine
+fileOUT_percent_purine=open("./OUT/12_percent_purine.csv","w")
+fileOUT_percent_purine.write("LOCUS,")
+for taxa in LT:
+fileOUT_percent_purine.write("%s_percent_purine," %(taxa))
+fileOUT_percent_purine.write("\n")
+## 3.4 ## Purine Load
+fileOUT_Purine_Load=open("./OUT/12_Purine_Load_Indice.csv", "w")
+fileOUT_Purine_Load.write("LOCUS,")
+for taxa in LT:
+fileOUT_Purine_Load.write("%s_TOTAL,%s_DIFF_GC,%s_DIFF_AT,%s_PLI_GC1000,%s_PLI_AT1000," %(taxa,taxa,taxa,taxa,taxa))
+fileOUT_Purine_Load.write("\n")
+#####################
+## 4 ## Process Loci
+#####################
+for locus in Lloci_NUC:
+print locus
+path_locus = "%s/%s" %(Path_IN_loci_NUC, locus)
+bash = dico(path_locus,LT)
+fileOUT_NUC.write("%s," %locus)
+fileOUT_percent_GC.write("%s," %locus)
+fileOUT_percent_purine.write("%s," %locus)
+fileOUT_Purine_Load.write("%s," %locus)
+#print bash
+for taxa in LT:
+print taxa
+	if taxa in bash.keys():
+seq = bash[taxa]
+percent_GC, percent_purine,prop_A, prop_T, prop_C, prop_G = base_composition(seq)   ### DEF2 ###
+TOTAL, DIFF_GC, DIFF_AT,PLI_GC,PLI_AT,PLI_GC_1000,PLI_AT_1000 = purine_loading(seq) ### DEF3 ###
+fileOUT_NUC.write("%.5f,%.5f,%.5f,%.5f," %(prop_A,prop_T,prop_C,prop_G))
+fileOUT_percent_GC.write("%.5f," %percent_GC)
+fileOUT_percent_purine.write("%.5f," %percent_purine)
+fileOUT_Purine_Load.write("%d,%d,%d,%.5f,%.5f," %(TOTAL, DIFF_GC, DIFF_AT,PLI_GC_1000, PLI_AT_1000))
+fileOUT_NUC.write("\n")
+fileOUT_percent_GC.write("\n")
+fileOUT_percent_purine.write("\n")
+fileOUT_Purine_Load.write("\n")
+fileOUT_NUC.close()
+fileOUT_percent_GC.close()
+fileOUT_percent_purine.close()
+fileOUT_Purine_Load.close()

Mercurial > repos > abims-sbr > mutcount

comparison scripts/S02b_study_seq_composition_nuc.py @ 0:78dd6454f6f0 draft