Mercurial > repos > abims-sbr > mutcount
diff scripts/S02b_study_seq_composition_nuc.py @ 0:78dd6454f6f0 draft
planemo upload for repository htpps://github.com/abims-sbr/adaptearch commit 73670b26c75bb6c1a6332481920f3036314de364
| author | abims-sbr |
|---|---|
| date | Tue, 02 May 2017 04:20:51 -0400 |
| parents | |
| children | 8de21b6eb110 |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/scripts/S02b_study_seq_composition_nuc.py Tue May 02 04:20:51 2017 -0400 @@ -0,0 +1,248 @@ +#!/usr/bin/env python +## Author: Eric FONTANILLAS +## Date: 21.12.10 +## Object: Test for compositional bias in genome and proteome as marker of thermal adaptation (comparison between 2 "hot" species: Ap and Ps and one "cold" species: Pg) + + +import sys,os,zipfile,shutil,subprocess + +############# +### DEF 0 ### +#############import sys,os,zipfile +def simplify_fasta_name(fasta_name,LT): + for abbreviation in LT: + if abbreviation in fasta_name: + new_fasta_name = abbreviation + + return(new_fasta_name) +########################################## + +########### +## DEF1 ## +########### +## Generates bash, with key = fasta name; value = sequence (WITH GAP, IF ANY, REMOVED IN THIS FUNCTION) + +def dico(fasta_file,LT): + + count_fastaName=0 + F1 = open(fasta_file, "r") + + bash1 = {} + while 1: + nextline = F1.readline() + #print nextline + if not nextline : + break + + if nextline[0] == ">": + count_fastaName = count_fastaName + 1 + fasta_name = nextline[1:-1] + nextline = F1.readline() + sequence = nextline[:-1] + + if fasta_name not in bash1.keys(): + fasta_name = simplify_fasta_name(fasta_name,LT) ### DEF 0 ### + bash1[fasta_name] = sequence + else: + print fasta_name + + # Find alignment length + kk = bash1.keys() + key0 = kk[0] + seq0 = bash1[key0] + ln_seq = len(seq0) + + F1.close() + + return(bash1) +##################################### + + + +################## +###### DEF2 ###### +################## +def base_composition(seq): + count_A=string.count(seq, "A") + string.count(seq, "a") + count_T=string.count(seq, "T") + string.count(seq, "t") + count_C=string.count(seq, "C") + string.count(seq, "c") + count_G=string.count(seq, "G") + string.count(seq, "g") + ## 3 ## Nucleotide proportions + ln = count_C+count_G+count_T+count_A + if (ln!=0): + + CG = count_C+count_G + AT = count_T+count_A + + AG = count_A+count_G + TC = count_T+count_C + + ## 1 ## Search for compositional bias in genome as marker of thermal adaptation: CG vs AT + ratio_CG_AT = float(CG)/float(AT) + percent_CG = float(CG)/(float(AT) + float(CG))*100 + + ## 2 ## Search for compositional bias in genome as marker of thermal adaptation: AG vs TC + ratio_purine_pyrimidine=float(AG)/float(TC) + percent_purine=float(AG)/(float(AG)+float(TC))*100 + + + prop_A = float(count_A)/float(ln) + prop_T = float(count_T)/float(ln) + prop_C = float(count_C)/float(ln) + prop_G = float(count_G)/float(ln) + else: + percent_CG=0 + percent_purine=0 + prop_A=0 + prop_T=0 + prop_C=0 + prop_G=0 + + return(percent_CG, percent_purine, prop_A, prop_T, prop_C, prop_G) +############################################## + +################## +###### DEF3 ###### +################## +def purine_loading(seq): + count_A=string.count(seq, "A") + string.count(seq, "a") + count_T=string.count(seq, "T") + string.count(seq, "t") + count_C=string.count(seq, "C") + string.count(seq, "c") + count_G=string.count(seq, "G") + string.count(seq, "g") + ## 3 ## Nucleotide proportions + TOTAL = count_C+count_G+count_T+count_A + if (TOTAL!=0): + + ## PLI : Purine loading indice (Forsdyke et al.) + # (G-C)/N * 1000 et (A-T)/N * 1000 + + DIFF_GC = count_G - count_C + DIFF_AT = count_A - count_T + + # Per bp + PLI_GC = float(DIFF_GC)/float(TOTAL) + PLI_AT = float(DIFF_AT)/float(TOTAL) + + # Per 1000 bp + PLI_GC_1000 = PLI_GC*1000 + PLI_AT_1000 = PLI_AT*1000 + else: + DIFF_GC=0 + DIFF_AT=0 + PLI_GC=0 + PLI_AT=0 + PLI_GC_1000=0 + PLI_AT_1000=0 + + return(TOTAL, DIFF_GC, DIFF_AT,PLI_GC,PLI_AT,PLI_GC_1000,PLI_AT_1000) +############################################## + +################### +### RUN RUN RUN ### +################### +import string, os,sys,zipfile + + + + + +##Create specific folders +Path_IN_loci_NUC = "./IN_NUC" +outpath= "./OUT" +os.makedirs(Path_IN_loci_NUC) +os.makedirs(outpath) + + + + +#Check if the file is a zip or fasta file + +the_zip_file = zipfile.ZipFile(sys.argv[1]) +ret = the_zip_file.testzip() + +if ret is not None: + shutil.copy2(sys.argv[1], './IN_NUC/input.fasta') +else: + cmd="unzip %s -d ./IN_NUC"%(sys.argv[1]) + os.system(cmd) + + +## 1 ## List taxa +LT=[] +cmd="grep '>' %s" % sys.argv[2] +result = subprocess.check_output(cmd, shell=True) +result=result.split('\n') +for i in result: + sp=i[1:] + if sp !='': + LT.append(sp) +print LT + +#LT = ["Ap", "Ac", "Pg"] +#os.system("grep '>' %s" %(sys.argv[1])) + +## 2 ## PathIN +# fileIN_properties = open("01_AminoAcid_Properties2.csv", "r") +Lloci_NUC = os.listdir(Path_IN_loci_NUC) + + +## 3 ## PathOUT +## 3.1 ## NUC composition +fileOUT_NUC=open("./OUT/10_nuc_compositions.csv","w") +fileOUT_NUC.write("LOCUS,") +for taxa in LT: + fileOUT_NUC.write("%s_prop_A,%s_prop_T,%s_prop_C,%s_prop_G," %(taxa,taxa,taxa,taxa)) +fileOUT_NUC.write("\n") + +## 3.2 ## NUC percent_GC +fileOUT_percent_GC=open("./OUT/11_percent_GC.csv","w") +fileOUT_percent_GC.write("LOCUS,") +for taxa in LT: + fileOUT_percent_GC.write("%s_percent_GC," %(taxa)) +fileOUT_percent_GC.write("\n") + +## 3.3 ## NUC percent_purine +fileOUT_percent_purine=open("./OUT/12_percent_purine.csv","w") +fileOUT_percent_purine.write("LOCUS,") +for taxa in LT: + fileOUT_percent_purine.write("%s_percent_purine," %(taxa)) +fileOUT_percent_purine.write("\n") + +## 3.4 ## Purine Load +fileOUT_Purine_Load=open("./OUT/12_Purine_Load_Indice.csv", "w") +fileOUT_Purine_Load.write("LOCUS,") +for taxa in LT: + fileOUT_Purine_Load.write("%s_TOTAL,%s_DIFF_GC,%s_DIFF_AT,%s_PLI_GC1000,%s_PLI_AT1000," %(taxa,taxa,taxa,taxa,taxa)) +fileOUT_Purine_Load.write("\n") + +##################### +## 4 ## Process Loci +##################### +for locus in Lloci_NUC: + print locus + path_locus = "%s/%s" %(Path_IN_loci_NUC, locus) + bash = dico(path_locus,LT) + + fileOUT_NUC.write("%s," %locus) + fileOUT_percent_GC.write("%s," %locus) + fileOUT_percent_purine.write("%s," %locus) + fileOUT_Purine_Load.write("%s," %locus) + #print bash + for taxa in LT: + print taxa + if taxa in bash.keys(): + seq = bash[taxa] + percent_GC, percent_purine,prop_A, prop_T, prop_C, prop_G = base_composition(seq) ### DEF2 ### + TOTAL, DIFF_GC, DIFF_AT,PLI_GC,PLI_AT,PLI_GC_1000,PLI_AT_1000 = purine_loading(seq) ### DEF3 ### + fileOUT_NUC.write("%.5f,%.5f,%.5f,%.5f," %(prop_A,prop_T,prop_C,prop_G)) + fileOUT_percent_GC.write("%.5f," %percent_GC) + fileOUT_percent_purine.write("%.5f," %percent_purine) + fileOUT_Purine_Load.write("%d,%d,%d,%.5f,%.5f," %(TOTAL, DIFF_GC, DIFF_AT,PLI_GC_1000, PLI_AT_1000)) + fileOUT_NUC.write("\n") + fileOUT_percent_GC.write("\n") + fileOUT_percent_purine.write("\n") + fileOUT_Purine_Load.write("\n") +fileOUT_NUC.close() +fileOUT_percent_GC.close() +fileOUT_percent_purine.close() +fileOUT_Purine_Load.close()
