view hmcan_wrapper.py @ 0:c7901290c849 draft

Uploaded
author jbrayet
date Wed, 13 Jan 2016 10:19:39 -0500
parents
children
line wrap: on
line source

#!/usr/bin/env python

import sys, subprocess, tempfile, os, shutil, glob

# samtools is added to PATH here

def calculate_window_size ( chr_len_file, input_file, format ):
    ''' this function calculates the window size used to evaluate copy number variation.
     
    W = L/(T*CV*CV)
    
    L=genome length
    T=total read count (mapped) present in the input file (bed, bam, sam)
    CV=coefficient of variation (0.05)
    W=window size ''' 
    
    cv=0.05
    #float b/c calculs is done in 'virgule flottante'
    proc1=subprocess.Popen ("awk '{sum=sum+$2} END {print sum}' %s" % chr_len_file, shell=True, stdout=subprocess.PIPE)
    genome_len = float (proc1.communicate()[0])
    if (proc1.returncode !=0):
        raise Exception ("In calculate_window_size(), returncode of proc1 (genome_len) = i%" % proc1.returncode)
    
    if format == "bed" :
    
        #proc_RC_bed=subprocess.Popen ("wc -l < %s > lines;read nbr_reads < lines;echo $nbr_reads" % input_file, shell=True, stdout=subprocess.PIPE)
        proc_RC_bed=subprocess.Popen ("grep -c 'chr' < %s" % input_file, shell=True, stdout=subprocess.PIPE)
        read_count = int (proc_RC_bed.communicate()[0] )
    
        #print ("read_count = %i\n" % read_count)    
        if (proc_RC_bed.returncode !=0):
            raise Exception ("In calculate_window_size(), returncode of proc_RC_bed (read_count) = i%" % proc_RC_bed.returncode)
    
    elif format == "bam":        
        # get index, get read count (can't pipe into index)
        proc_index=subprocess.Popen ("samtools index %s" % input_file, shell=True)
        #print "samtools index "+str(input_file)
        proc_index.wait()
        proc_RC_bam=subprocess.Popen ("samtools idxstats %s | awk '{s=s+$3} END {print s}'" % input_file, shell=True, stdout=subprocess.PIPE )
        read_count=int(proc_RC_bam.communicate()[0])

    else :
        # format = sam : convert to bam, get index, get read count
        proc_SamToBam=subprocess.Popen ("samtools view -bSh %s > %s.bam" % (input_file,input_file), shell=True) 
        proc_SamToBam.wait()
        proc_index=subprocess.Popen ("samtools index %s.bam" % input_file, shell=True)
        proc_index.wait()
        # get read count
        proc_RC_bam=subprocess.Popen ("samtools idxstats %s.bam | awk '{s+=$3} END {print s}'" % input_file, shell=True, stdout=subprocess.PIPE )
        read_count=int(proc_RC_bam.communicate()[0])        
    
    
    # w's type is float, & round returns float.0
    return int(round(genome_len/(read_count *cv*cv )))


def correct_hg19 (hg19Len,corrected_hg19Len ):
    ''' this function corrects hg19.len file format : adds tab between the columns instead of space '''    
        
    i=open(hg19Len)
    o=open(corrected_hg19Len, "w")
    for line in i.readlines():
        col=line.split(" ")
        col.insert(1, "\t")
        o.write("".join(col))
    
    i.close()
    o.close()

    return 
    
    
def main():

    input_chip_file = sys.argv[1]
    input_control_file = sys.argv[2]
    hmcan_config_file = sys.argv[3]
    gccount_config_file = sys.argv[4]
    project_name = sys.argv[5]
    output_peaks_file = sys.argv[6]
    output_regions_file = sys.argv[7]
    output_density_file = sys.argv[8]
    output_posterior_proba_file = sys.argv[9]
    hmcan_log_report = sys.argv[10]
    chr_len_file = sys.argv[11]
    format = sys.argv[12]
    genome = sys.argv[13]
    root_dir = sys.argv[14]

    #binary files
    GCCOUNT="/usr/bin/HMCan/HMCanV1.20/Utils/GCCount/gccount"
    HMCAN="/usr/bin/HMCan/HMCanV1.20/src/HMCan"

    ###### CREATE ANNOTATION FILES #########
    databasePath = root_dir+"/database/files"
    
    subprocess.Popen('mkdir -p '+databasePath+'/nebulaAnnotations', shell=True)
    subprocess.Popen('mkdir -p '+databasePath+'/nebulaAnnotations/'+genome, shell=True)

    nebulaGenomePath=databasePath+"/nebulaAnnotations/"+genome

    FAIFILE='n'
    LENFILE='n'
    DICTFILE='n'
    CHROFILE='n'
    MAPFILE='n'

    if not os.path.isdir(nebulaGenomePath+"/chromosomes"):
        subprocess.Popen('mkdir -p '+nebulaGenomePath+'/chromosomes', shell=True)
        CHROFILE='y'

    if not os.path.isfile(nebulaGenomePath+"/"+genome+".len"):
        FAIFILE='y'
        LENFILE='y'

    if not os.path.isfile(nebulaGenomePath+"/out50m2_"+genome+".gem.mappability"):
        MAPFILE='y'
    
    FILEPATH=databasePath.replace("database/files","tool-data")

    cmd='bash /usr/bin/create_annotation_files.sh '+FAIFILE+" "+LENFILE+" "+DICTFILE+" "+CHROFILE+" "+FILEPATH+" "+genome+" "+MAPFILE+" "+nebulaGenomePath
    process=subprocess.Popen(cmd, shell=True)
    process.wait()
    ############### END ##############
    
    # create tmp dir.Returns absolute path to a created tmp_dir  
    tmp_dir=tempfile.mkdtemp()
    #tmp_dir="/data/tmp/HMcanTest"
    
    try:

## GCCOUNT
        
        os.chdir(tmp_dir)
        
        #add 'window size' and 'step' to gccount config file
        window_size=calculate_window_size (nebulaGenomePath+"/"+genome+".len", input_control_file, format)

        #window_size=10000 
        #print ("window_size= %i\n" % window_size)


#************** EDIT config files (improved)*****************        
        
        chr_len_file=nebulaGenomePath+"/"+genome+".len"
        
        #Frist, correct hg19.len file, in case HMCan is run for hg19 (because hg19.len is not tabular on the server, better change it here)
        hg19="hg19"
        if (hg19 in genome):
            correct_hg19 (chr_len_file,nebulaGenomePath+"/corrected_hg19Len_file.len")
            param_len=nebulaGenomePath+"/corrected_hg19Len_file.len"

        else:
            #to make my life easier..
            param_len= chr_len_file

        #Edit gc count config file
        cmd_step ="sed -i \"/step/c\step = %s\" %s" % ( window_size , gccount_config_file )
        cmd_window ="sed -i \"/window/c\window = %s\" %s" % ( window_size , gccount_config_file )
        cmd_chrLen = "sed -i \"s~chrLenFile =.*~chrLenFile = "+param_len+"~g\" "+gccount_config_file
        cmd_chrFiles = "sed -i \"s~chrFiles =.*~chrFiles = "+nebulaGenomePath+"/chromosomes~g\" "+gccount_config_file
        cmd_gemMappabilityFile = "sed -i \"s~gemMappabilityFile =.*~gemMappabilityFile = "+nebulaGenomePath+"/out50m2_"+genome+".gem.mappability~g\" "+gccount_config_file

        
        s=subprocess.Popen(args=cmd_step, shell=True)
        s.wait()
        
        w=subprocess.Popen(args=cmd_window, shell=True)
        w.wait()
            
        cL=subprocess.Popen(args=cmd_chrLen, shell=True)
        cL.wait()

        cF=subprocess.Popen(args=cmd_chrFiles, shell=True)
        cF.wait()
            
        cM=subprocess.Popen(args=cmd_gemMappabilityFile, shell=True)
        cM.wait()

#*********** END edit config files ****************************        

                
        ##call gccount  /dev/null
        gc_proc=subprocess.Popen(args= "%s %s >> %s 2>&1" % (GCCOUNT, " ".join(['-conf', gccount_config_file]), hmcan_log_report), shell=True, stderr=subprocess.PIPE)
        gc_stderr_data = gc_proc.communicate()[1] # waits for process to terminate, returns (stdoutdata, stderrdata)
        #if (gc_proc.returncode !=0): 
        #    raise Exception("GCcount returncode = %i\n GCcount stderr : %s\n" % (gc_proc.returncode,gc_stderr_data ))
        
        # Find outputs: .cnp files 
        # glob returns list of absolute paths of files.cnp in tmp_dir
        cnp_list=glob.glob('*.cnp')
        if (len(cnp_list) == 0): 
            raise Exception("Error while running Gccount : program did not generate outputs for HMCan (file.cnp) !\n GCCOUNT stderr : %s\n" % gc_stderr_data) 
        cnp_file=cnp_list[0]
        #print "GC PROFIL file :%s\n" % cnp_file
            
## HMCAN
        # EDIT hmcan config file : add 'GCIndex' and 'largeBinLength' 
        cmd_gc="sed -i \"/GCIndex/c\GCIndex %s\" %s" % ( cnp_file , hmcan_config_file )
        cmd_bin="sed -i \"/largeBinLength/c\largeBinLength %s\" %s" % ( window_size , hmcan_config_file )
        cmd_chrFiles = "sed -i \"s~genomePath.*~genomePath "+nebulaGenomePath+"/chromosomes~g\" "+hmcan_config_file

        g=subprocess.Popen(args=cmd_gc, shell=True)
        g.wait()
        
        b=subprocess.Popen(args=cmd_bin, shell=True)
        b.wait()
        
        cF=subprocess.Popen(args=cmd_chrFiles, shell=True)
        cF.wait()
        
        #call HMCan , hmcan_log_report
        hmcan_proc= subprocess.Popen(args= "%s %s >> %s 2>&1" % (HMCAN, " ".join([input_chip_file, input_control_file, hmcan_config_file , project_name]), hmcan_log_report ), shell=True, stderr=subprocess.PIPE)
        hmcan_stderr_data=hmcan_proc.communicate()[1]
        if (hmcan_proc.returncode !=0):
            raise Exception    ("Error occured, HMCan returncode = %i\n HMCan stderr : %s\n" % ( hmcan_proc.returncode, hmcan_stderr_data ) )
    
        #deal with outputs : copy outputs from tmp_dir to dataset_XXX :
        
        #shutil.move( os.path.join (tmp_dir,"%s_peaks.bed" % project_name ), output_peaks_file) 
        shutil.move( os.path.join (tmp_dir,"%s_peaks.narrowPeak" % project_name ), output_peaks_file)
        shutil.move( os.path.join( tmp_dir, "%s_regions.bed" % project_name ) ,output_regions_file )
        if (output_posterior_proba_file !='None') :
            shutil.move( os.path.join( tmp_dir, "%s_posterior.wig" % project_name ) ,output_posterior_proba_file ) 
        if (output_density_file != 'None'):
            shutil.move( os.path.join( tmp_dir, "%s.wig" % project_name ) ,output_density_file )
    
    #Handle exceptions
    except OSError:
        sys.stderr.write ( "Error occured while runing HMCan : trying to execute a non existing file.\n" )
        #remove tmp_dir created:
        if os.path.exists( tmp_dir ):
            shutil.rmtree(tmp_dir)
        raise
        
    except ValueError:
        sys.stderr.write ( "Error occured while runing HMCan :  Popen is called with invalid arguments.\n" )
        if os.path.exists( tmp_dir ):
            shutil.rmtree(tmp_dir)
        raise
    
    #Handle other    
    except Exception:
        sys.stderr.write ( "Error occured while runing HMCan\n" )        
        if os.path.exists( tmp_dir ):
            shutil.rmtree(tmp_dir)
        raise
        
    #clean up my mess 
    if os.path.exists( tmp_dir ):
            shutil.rmtree(tmp_dir)


#main

if __name__=='__main__': 
    main()