Mercurial > repos > abims-sbr > mutcount
comparison scripts/S02b_study_seq_composition_nuc.py @ 0:78dd6454f6f0 draft
planemo upload for repository htpps://github.com/abims-sbr/adaptearch commit 73670b26c75bb6c1a6332481920f3036314de364
| author | abims-sbr |
|---|---|
| date | Tue, 02 May 2017 04:20:51 -0400 |
| parents | |
| children | 8de21b6eb110 |
comparison
equal
deleted
inserted
replaced
| -1:000000000000 | 0:78dd6454f6f0 |
|---|---|
| 1 #!/usr/bin/env python | |
| 2 ## Author: Eric FONTANILLAS | |
| 3 ## Date: 21.12.10 | |
| 4 ## Object: Test for compositional bias in genome and proteome as marker of thermal adaptation (comparison between 2 "hot" species: Ap and Ps and one "cold" species: Pg) | |
| 5 | |
| 6 | |
| 7 import sys,os,zipfile,shutil,subprocess | |
| 8 | |
| 9 ############# | |
| 10 ### DEF 0 ### | |
| 11 #############import sys,os,zipfile | |
| 12 def simplify_fasta_name(fasta_name,LT): | |
| 13 for abbreviation in LT: | |
| 14 if abbreviation in fasta_name: | |
| 15 new_fasta_name = abbreviation | |
| 16 | |
| 17 return(new_fasta_name) | |
| 18 ########################################## | |
| 19 | |
| 20 ########### | |
| 21 ## DEF1 ## | |
| 22 ########### | |
| 23 ## Generates bash, with key = fasta name; value = sequence (WITH GAP, IF ANY, REMOVED IN THIS FUNCTION) | |
| 24 | |
| 25 def dico(fasta_file,LT): | |
| 26 | |
| 27 count_fastaName=0 | |
| 28 F1 = open(fasta_file, "r") | |
| 29 | |
| 30 bash1 = {} | |
| 31 while 1: | |
| 32 nextline = F1.readline() | |
| 33 #print nextline | |
| 34 if not nextline : | |
| 35 break | |
| 36 | |
| 37 if nextline[0] == ">": | |
| 38 count_fastaName = count_fastaName + 1 | |
| 39 fasta_name = nextline[1:-1] | |
| 40 nextline = F1.readline() | |
| 41 sequence = nextline[:-1] | |
| 42 | |
| 43 if fasta_name not in bash1.keys(): | |
| 44 fasta_name = simplify_fasta_name(fasta_name,LT) ### DEF 0 ### | |
| 45 bash1[fasta_name] = sequence | |
| 46 else: | |
| 47 print fasta_name | |
| 48 | |
| 49 # Find alignment length | |
| 50 kk = bash1.keys() | |
| 51 key0 = kk[0] | |
| 52 seq0 = bash1[key0] | |
| 53 ln_seq = len(seq0) | |
| 54 | |
| 55 F1.close() | |
| 56 | |
| 57 return(bash1) | |
| 58 ##################################### | |
| 59 | |
| 60 | |
| 61 | |
| 62 ################## | |
| 63 ###### DEF2 ###### | |
| 64 ################## | |
| 65 def base_composition(seq): | |
| 66 count_A=string.count(seq, "A") + string.count(seq, "a") | |
| 67 count_T=string.count(seq, "T") + string.count(seq, "t") | |
| 68 count_C=string.count(seq, "C") + string.count(seq, "c") | |
| 69 count_G=string.count(seq, "G") + string.count(seq, "g") | |
| 70 ## 3 ## Nucleotide proportions | |
| 71 ln = count_C+count_G+count_T+count_A | |
| 72 if (ln!=0): | |
| 73 | |
| 74 CG = count_C+count_G | |
| 75 AT = count_T+count_A | |
| 76 | |
| 77 AG = count_A+count_G | |
| 78 TC = count_T+count_C | |
| 79 | |
| 80 ## 1 ## Search for compositional bias in genome as marker of thermal adaptation: CG vs AT | |
| 81 ratio_CG_AT = float(CG)/float(AT) | |
| 82 percent_CG = float(CG)/(float(AT) + float(CG))*100 | |
| 83 | |
| 84 ## 2 ## Search for compositional bias in genome as marker of thermal adaptation: AG vs TC | |
| 85 ratio_purine_pyrimidine=float(AG)/float(TC) | |
| 86 percent_purine=float(AG)/(float(AG)+float(TC))*100 | |
| 87 | |
| 88 | |
| 89 prop_A = float(count_A)/float(ln) | |
| 90 prop_T = float(count_T)/float(ln) | |
| 91 prop_C = float(count_C)/float(ln) | |
| 92 prop_G = float(count_G)/float(ln) | |
| 93 else: | |
| 94 percent_CG=0 | |
| 95 percent_purine=0 | |
| 96 prop_A=0 | |
| 97 prop_T=0 | |
| 98 prop_C=0 | |
| 99 prop_G=0 | |
| 100 | |
| 101 return(percent_CG, percent_purine, prop_A, prop_T, prop_C, prop_G) | |
| 102 ############################################## | |
| 103 | |
| 104 ################## | |
| 105 ###### DEF3 ###### | |
| 106 ################## | |
| 107 def purine_loading(seq): | |
| 108 count_A=string.count(seq, "A") + string.count(seq, "a") | |
| 109 count_T=string.count(seq, "T") + string.count(seq, "t") | |
| 110 count_C=string.count(seq, "C") + string.count(seq, "c") | |
| 111 count_G=string.count(seq, "G") + string.count(seq, "g") | |
| 112 ## 3 ## Nucleotide proportions | |
| 113 TOTAL = count_C+count_G+count_T+count_A | |
| 114 if (TOTAL!=0): | |
| 115 | |
| 116 ## PLI : Purine loading indice (Forsdyke et al.) | |
| 117 # (G-C)/N * 1000 et (A-T)/N * 1000 | |
| 118 | |
| 119 DIFF_GC = count_G - count_C | |
| 120 DIFF_AT = count_A - count_T | |
| 121 | |
| 122 # Per bp | |
| 123 PLI_GC = float(DIFF_GC)/float(TOTAL) | |
| 124 PLI_AT = float(DIFF_AT)/float(TOTAL) | |
| 125 | |
| 126 # Per 1000 bp | |
| 127 PLI_GC_1000 = PLI_GC*1000 | |
| 128 PLI_AT_1000 = PLI_AT*1000 | |
| 129 else: | |
| 130 DIFF_GC=0 | |
| 131 DIFF_AT=0 | |
| 132 PLI_GC=0 | |
| 133 PLI_AT=0 | |
| 134 PLI_GC_1000=0 | |
| 135 PLI_AT_1000=0 | |
| 136 | |
| 137 return(TOTAL, DIFF_GC, DIFF_AT,PLI_GC,PLI_AT,PLI_GC_1000,PLI_AT_1000) | |
| 138 ############################################## | |
| 139 | |
| 140 ################### | |
| 141 ### RUN RUN RUN ### | |
| 142 ################### | |
| 143 import string, os,sys,zipfile | |
| 144 | |
| 145 | |
| 146 | |
| 147 | |
| 148 | |
| 149 ##Create specific folders | |
| 150 Path_IN_loci_NUC = "./IN_NUC" | |
| 151 outpath= "./OUT" | |
| 152 os.makedirs(Path_IN_loci_NUC) | |
| 153 os.makedirs(outpath) | |
| 154 | |
| 155 | |
| 156 | |
| 157 | |
| 158 #Check if the file is a zip or fasta file | |
| 159 | |
| 160 the_zip_file = zipfile.ZipFile(sys.argv[1]) | |
| 161 ret = the_zip_file.testzip() | |
| 162 | |
| 163 if ret is not None: | |
| 164 shutil.copy2(sys.argv[1], './IN_NUC/input.fasta') | |
| 165 else: | |
| 166 cmd="unzip %s -d ./IN_NUC"%(sys.argv[1]) | |
| 167 os.system(cmd) | |
| 168 | |
| 169 | |
| 170 ## 1 ## List taxa | |
| 171 LT=[] | |
| 172 cmd="grep '>' %s" % sys.argv[2] | |
| 173 result = subprocess.check_output(cmd, shell=True) | |
| 174 result=result.split('\n') | |
| 175 for i in result: | |
| 176 sp=i[1:] | |
| 177 if sp !='': | |
| 178 LT.append(sp) | |
| 179 print LT | |
| 180 | |
| 181 #LT = ["Ap", "Ac", "Pg"] | |
| 182 #os.system("grep '>' %s" %(sys.argv[1])) | |
| 183 | |
| 184 ## 2 ## PathIN | |
| 185 # fileIN_properties = open("01_AminoAcid_Properties2.csv", "r") | |
| 186 Lloci_NUC = os.listdir(Path_IN_loci_NUC) | |
| 187 | |
| 188 | |
| 189 ## 3 ## PathOUT | |
| 190 ## 3.1 ## NUC composition | |
| 191 fileOUT_NUC=open("./OUT/10_nuc_compositions.csv","w") | |
| 192 fileOUT_NUC.write("LOCUS,") | |
| 193 for taxa in LT: | |
| 194 fileOUT_NUC.write("%s_prop_A,%s_prop_T,%s_prop_C,%s_prop_G," %(taxa,taxa,taxa,taxa)) | |
| 195 fileOUT_NUC.write("\n") | |
| 196 | |
| 197 ## 3.2 ## NUC percent_GC | |
| 198 fileOUT_percent_GC=open("./OUT/11_percent_GC.csv","w") | |
| 199 fileOUT_percent_GC.write("LOCUS,") | |
| 200 for taxa in LT: | |
| 201 fileOUT_percent_GC.write("%s_percent_GC," %(taxa)) | |
| 202 fileOUT_percent_GC.write("\n") | |
| 203 | |
| 204 ## 3.3 ## NUC percent_purine | |
| 205 fileOUT_percent_purine=open("./OUT/12_percent_purine.csv","w") | |
| 206 fileOUT_percent_purine.write("LOCUS,") | |
| 207 for taxa in LT: | |
| 208 fileOUT_percent_purine.write("%s_percent_purine," %(taxa)) | |
| 209 fileOUT_percent_purine.write("\n") | |
| 210 | |
| 211 ## 3.4 ## Purine Load | |
| 212 fileOUT_Purine_Load=open("./OUT/12_Purine_Load_Indice.csv", "w") | |
| 213 fileOUT_Purine_Load.write("LOCUS,") | |
| 214 for taxa in LT: | |
| 215 fileOUT_Purine_Load.write("%s_TOTAL,%s_DIFF_GC,%s_DIFF_AT,%s_PLI_GC1000,%s_PLI_AT1000," %(taxa,taxa,taxa,taxa,taxa)) | |
| 216 fileOUT_Purine_Load.write("\n") | |
| 217 | |
| 218 ##################### | |
| 219 ## 4 ## Process Loci | |
| 220 ##################### | |
| 221 for locus in Lloci_NUC: | |
| 222 print locus | |
| 223 path_locus = "%s/%s" %(Path_IN_loci_NUC, locus) | |
| 224 bash = dico(path_locus,LT) | |
| 225 | |
| 226 fileOUT_NUC.write("%s," %locus) | |
| 227 fileOUT_percent_GC.write("%s," %locus) | |
| 228 fileOUT_percent_purine.write("%s," %locus) | |
| 229 fileOUT_Purine_Load.write("%s," %locus) | |
| 230 #print bash | |
| 231 for taxa in LT: | |
| 232 print taxa | |
| 233 if taxa in bash.keys(): | |
| 234 seq = bash[taxa] | |
| 235 percent_GC, percent_purine,prop_A, prop_T, prop_C, prop_G = base_composition(seq) ### DEF2 ### | |
| 236 TOTAL, DIFF_GC, DIFF_AT,PLI_GC,PLI_AT,PLI_GC_1000,PLI_AT_1000 = purine_loading(seq) ### DEF3 ### | |
| 237 fileOUT_NUC.write("%.5f,%.5f,%.5f,%.5f," %(prop_A,prop_T,prop_C,prop_G)) | |
| 238 fileOUT_percent_GC.write("%.5f," %percent_GC) | |
| 239 fileOUT_percent_purine.write("%.5f," %percent_purine) | |
| 240 fileOUT_Purine_Load.write("%d,%d,%d,%.5f,%.5f," %(TOTAL, DIFF_GC, DIFF_AT,PLI_GC_1000, PLI_AT_1000)) | |
| 241 fileOUT_NUC.write("\n") | |
| 242 fileOUT_percent_GC.write("\n") | |
| 243 fileOUT_percent_purine.write("\n") | |
| 244 fileOUT_Purine_Load.write("\n") | |
| 245 fileOUT_NUC.close() | |
| 246 fileOUT_percent_GC.close() | |
| 247 fileOUT_percent_purine.close() | |
| 248 fileOUT_Purine_Load.close() |
