view scripts/S02b_study_seq_composition_nuc.py @ 1:8de21b6eb110 draft

planemo upload for repository htpps://github.com/abims-sbr/adaptearch commit 44a89d5eeb82789bfc643b33c11f391281b6374b
author abims-sbr
date Wed, 27 Sep 2017 10:04:08 -0400
parents 78dd6454f6f0
children 988467f963f0
line wrap: on
line source

#!/usr/bin/env python
## Author: Eric FONTANILLAS
## Date: 21.12.10
## Object: Test for compositional bias in genome and proteome as marker of thermal adaptation (comparison between 2 "hot" species: Ap and Ps and one "cold" species: Pg)


import sys,os,shutil,subprocess, string

#############
### DEF 0 ###

def simplify_fasta_name(fasta_name,LT):
    for abbreviation in LT:
        if abbreviation in fasta_name:
            new_fasta_name = abbreviation

    return(new_fasta_name)
##########################################

###########
## DEF1 ##
###########
## Generates bash, with key = fasta name; value = sequence (WITH GAP, IF ANY, REMOVED IN THIS FUNCTION)

def dico(fasta_file,LT):

    count_fastaName=0
    F1 = open(fasta_file, "r")
    
    bash1 = {}
    while 1:
        nextline = F1.readline()
        #print nextline
        if not nextline :
            break
        
        if nextline[0] == ">":
            count_fastaName = count_fastaName + 1
            fasta_name = nextline[1:-1]
            nextline = F1.readline()
            sequence = nextline[:-1]
            
            if fasta_name not in bash1.keys():
                fasta_name = simplify_fasta_name(fasta_name,LT)  ### DEF 0 ###
                bash1[fasta_name] = sequence
            else:
                print fasta_name

    # Find alignment length
    kk = bash1.keys()
    key0 = kk[0]
    seq0 = bash1[key0]
    ln_seq = len(seq0)

    F1.close()
    
    return(bash1)
#####################################



##################
###### DEF2 ######
##################
def base_composition(seq):
      count_A=string.count(seq, "A") + string.count(seq, "a")
      count_T=string.count(seq, "T") + string.count(seq, "t")
      count_C=string.count(seq, "C") + string.count(seq, "c")
      count_G=string.count(seq, "G") + string.count(seq, "g")
      ## 3 ## Nucleotide proportions
      ln = count_C+count_G+count_T+count_A
      if (ln!=0):

            CG = count_C+count_G
            AT = count_T+count_A
      
            AG = count_A+count_G
            TC = count_T+count_C

            ## 1 ## Search for compositional bias in genome as marker of thermal adaptation: CG vs AT
            ratio_CG_AT = float(CG)/float(AT)
            percent_CG = float(CG)/(float(AT) + float(CG))*100
      
            ## 2 ## Search for compositional bias in genome as marker of thermal adaptation: AG vs TC
            ratio_purine_pyrimidine=float(AG)/float(TC)
            percent_purine=float(AG)/(float(AG)+float(TC))*100      
      
     
            prop_A = float(count_A)/float(ln)
            prop_T = float(count_T)/float(ln)
            prop_C = float(count_C)/float(ln)
            prop_G = float(count_G)/float(ln)
      else:
            percent_CG=0
            percent_purine=0
            prop_A=0
            prop_T=0
            prop_C=0
            prop_G=0
      
      return(percent_CG, percent_purine, prop_A, prop_T, prop_C, prop_G)
##############################################

##################
###### DEF3 ######
##################
def purine_loading(seq):
      count_A=string.count(seq, "A") + string.count(seq, "a")
      count_T=string.count(seq, "T") + string.count(seq, "t")
      count_C=string.count(seq, "C") + string.count(seq, "c")
      count_G=string.count(seq, "G") + string.count(seq, "g")
      ## 3 ## Nucleotide proportions
      TOTAL = count_C+count_G+count_T+count_A
      if (TOTAL!=0):
     
            ## PLI : Purine loading indice (Forsdyke et al.)
            # (G-C)/N * 1000 et (A-T)/N * 1000
      
            DIFF_GC = count_G - count_C
            DIFF_AT = count_A - count_T

            # Per bp
            PLI_GC = float(DIFF_GC)/float(TOTAL)
            PLI_AT = float(DIFF_AT)/float(TOTAL)
      
            # Per 1000 bp
            PLI_GC_1000 = PLI_GC*1000
            PLI_AT_1000 = PLI_AT*1000
      else:
            DIFF_GC=0
            DIFF_AT=0
            PLI_GC=0
            PLI_AT=0
            PLI_GC_1000=0
            PLI_AT_1000=0

      return(TOTAL, DIFF_GC, DIFF_AT,PLI_GC,PLI_AT,PLI_GC_1000,PLI_AT_1000)
##############################################

###################
### RUN RUN RUN ###
###################

##Create specific folders
Path_IN_loci_NUC = "./IN_NUC"
outpath= "./OUT"
os.makedirs(Path_IN_loci_NUC)
os.makedirs(outpath)

infiles = str.split(sys.argv[1], ",")
for file in infiles:
    os.system("cp %s %s" %(file, Path_IN_loci_NUC))

## 1 ## List taxa
LT=[]
cmd="grep '>' %s" % sys.argv[2]
result = subprocess.check_output(cmd, shell=True)
result=result.split('\n')
for i in result:   
    sp=i[1:]
    if sp !='':
        LT.append(sp)
print LT

#LT = ["Ap", "Ac", "Pg"]
#os.system("grep '>' %s" %(sys.argv[1]))

## 2 ## PathIN
# fileIN_properties = open("01_AminoAcid_Properties2.csv", "r")
Lloci_NUC = os.listdir(Path_IN_loci_NUC)


## 3 ## PathOUT
## 3.1 ## NUC composition
fileOUT_NUC=open("./OUT/nuc_compositions.csv","w")
fileOUT_NUC.write("LOCUS,")
for taxa in LT:
    fileOUT_NUC.write("%s_prop_A,%s_prop_T,%s_prop_C,%s_prop_G," %(taxa,taxa,taxa,taxa))
fileOUT_NUC.write("\n")

## 3.2 ## NUC percent_GC
fileOUT_percent_GC=open("./OUT/percent_GC.csv","w")
fileOUT_percent_GC.write("LOCUS,")
for taxa in LT:
    fileOUT_percent_GC.write("%s_percent_GC," %(taxa))
fileOUT_percent_GC.write("\n")

## 3.3 ## NUC percent_purine
fileOUT_percent_purine=open("./OUT/percent_purine.csv","w")
fileOUT_percent_purine.write("LOCUS,")
for taxa in LT:
    fileOUT_percent_purine.write("%s_percent_purine," %(taxa))
fileOUT_percent_purine.write("\n")

## 3.4 ## Purine Load
fileOUT_Purine_Load=open("./OUT/Purine_Load_Indice.csv", "w")
fileOUT_Purine_Load.write("LOCUS,")
for taxa in LT:
    fileOUT_Purine_Load.write("%s_TOTAL,%s_DIFF_GC,%s_DIFF_AT,%s_PLI_GC1000,%s_PLI_AT1000," %(taxa,taxa,taxa,taxa,taxa))
fileOUT_Purine_Load.write("\n")

#####################
## 4 ## Process Loci
#####################
for locus in Lloci_NUC:
    print locus
    path_locus = "%s/%s" %(Path_IN_loci_NUC, locus)
    bash = dico(path_locus,LT)

    fileOUT_NUC.write("%s," %locus)
    fileOUT_percent_GC.write("%s," %locus)
    fileOUT_percent_purine.write("%s," %locus)
    fileOUT_Purine_Load.write("%s," %locus)
    
    if taxa in bash.keys():
            seq = bash[taxa]
            percent_GC, percent_purine,prop_A, prop_T, prop_C, prop_G = base_composition(seq)   ### DEF2 ###
            TOTAL, DIFF_GC, DIFF_AT,PLI_GC,PLI_AT,PLI_GC_1000,PLI_AT_1000 = purine_loading(seq) ### DEF3 ###
            fileOUT_NUC.write("%.5f,%.5f,%.5f,%.5f," %(prop_A,prop_T,prop_C,prop_G))
            fileOUT_percent_GC.write("%.5f," %percent_GC)
            fileOUT_percent_purine.write("%.5f," %percent_purine)
            fileOUT_Purine_Load.write("%d,%d,%d,%.5f,%.5f," %(TOTAL, DIFF_GC, DIFF_AT,PLI_GC_1000, PLI_AT_1000))
    fileOUT_NUC.write("\n")
    fileOUT_percent_GC.write("\n")
    fileOUT_percent_purine.write("\n")
    fileOUT_Purine_Load.write("\n")
fileOUT_NUC.close()
fileOUT_percent_GC.close()
fileOUT_percent_purine.close()
fileOUT_Purine_Load.close()