changeset 0:c7901290c849 draft

Uploaded
author jbrayet
date Wed, 13 Jan 2016 10:19:39 -0500
parents
children 9106f862728e
files hmcan_wrapper.py
diffstat 1 files changed, 258 insertions(+), 0 deletions(-) [+]
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/hmcan_wrapper.py	Wed Jan 13 10:19:39 2016 -0500
@@ -0,0 +1,258 @@
+#!/usr/bin/env python
+
+import sys, subprocess, tempfile, os, shutil, glob
+
+# samtools is added to PATH here
+
+def calculate_window_size ( chr_len_file, input_file, format ):
+    ''' this function calculates the window size used to evaluate copy number variation.
+     
+    W = L/(T*CV*CV)
+    
+    L=genome length
+    T=total read count (mapped) present in the input file (bed, bam, sam)
+    CV=coefficient of variation (0.05)
+    W=window size ''' 
+    
+    cv=0.05
+    #float b/c calculs is done in 'virgule flottante'
+    proc1=subprocess.Popen ("awk '{sum=sum+$2} END {print sum}' %s" % chr_len_file, shell=True, stdout=subprocess.PIPE)
+    genome_len = float (proc1.communicate()[0])
+    if (proc1.returncode !=0):
+        raise Exception ("In calculate_window_size(), returncode of proc1 (genome_len) = i%" % proc1.returncode)
+    
+    if format == "bed" :
+    
+        #proc_RC_bed=subprocess.Popen ("wc -l < %s > lines;read nbr_reads < lines;echo $nbr_reads" % input_file, shell=True, stdout=subprocess.PIPE)
+        proc_RC_bed=subprocess.Popen ("grep -c 'chr' < %s" % input_file, shell=True, stdout=subprocess.PIPE)
+        read_count = int (proc_RC_bed.communicate()[0] )
+    
+        #print ("read_count = %i\n" % read_count)    
+        if (proc_RC_bed.returncode !=0):
+            raise Exception ("In calculate_window_size(), returncode of proc_RC_bed (read_count) = i%" % proc_RC_bed.returncode)
+    
+    elif format == "bam":        
+        # get index, get read count (can't pipe into index)
+        proc_index=subprocess.Popen ("samtools index %s" % input_file, shell=True)
+        #print "samtools index "+str(input_file)
+        proc_index.wait()
+        proc_RC_bam=subprocess.Popen ("samtools idxstats %s | awk '{s=s+$3} END {print s}'" % input_file, shell=True, stdout=subprocess.PIPE )
+        read_count=int(proc_RC_bam.communicate()[0])
+
+    else :
+        # format = sam : convert to bam, get index, get read count
+        proc_SamToBam=subprocess.Popen ("samtools view -bSh %s > %s.bam" % (input_file,input_file), shell=True) 
+        proc_SamToBam.wait()
+        proc_index=subprocess.Popen ("samtools index %s.bam" % input_file, shell=True)
+        proc_index.wait()
+        # get read count
+        proc_RC_bam=subprocess.Popen ("samtools idxstats %s.bam | awk '{s+=$3} END {print s}'" % input_file, shell=True, stdout=subprocess.PIPE )
+        read_count=int(proc_RC_bam.communicate()[0])        
+    
+    
+    # w's type is float, & round returns float.0
+    return int(round(genome_len/(read_count *cv*cv )))
+
+
+def correct_hg19 (hg19Len,corrected_hg19Len ):
+    ''' this function corrects hg19.len file format : adds tab between the columns instead of space '''    
+        
+    i=open(hg19Len)
+    o=open(corrected_hg19Len, "w")
+    for line in i.readlines():
+        col=line.split(" ")
+        col.insert(1, "\t")
+        o.write("".join(col))
+    
+    i.close()
+    o.close()
+
+    return 
+    
+    
+def main():
+
+    input_chip_file = sys.argv[1]
+    input_control_file = sys.argv[2]
+    hmcan_config_file = sys.argv[3]
+    gccount_config_file = sys.argv[4]
+    project_name = sys.argv[5]
+    output_peaks_file = sys.argv[6]
+    output_regions_file = sys.argv[7]
+    output_density_file = sys.argv[8]
+    output_posterior_proba_file = sys.argv[9]
+    hmcan_log_report = sys.argv[10]
+    chr_len_file = sys.argv[11]
+    format = sys.argv[12]
+    genome = sys.argv[13]
+    root_dir = sys.argv[14]
+
+    #binary files
+    GCCOUNT="/usr/bin/HMCan/HMCanV1.20/Utils/GCCount/gccount"
+    HMCAN="/usr/bin/HMCan/HMCanV1.20/src/HMCan"
+
+    ###### CREATE ANNOTATION FILES #########
+    databasePath = root_dir+"/database/files"
+    
+    subprocess.Popen('mkdir -p '+databasePath+'/nebulaAnnotations', shell=True)
+    subprocess.Popen('mkdir -p '+databasePath+'/nebulaAnnotations/'+genome, shell=True)
+
+    nebulaGenomePath=databasePath+"/nebulaAnnotations/"+genome
+
+    FAIFILE='n'
+    LENFILE='n'
+    DICTFILE='n'
+    CHROFILE='n'
+    MAPFILE='n'
+
+    if not os.path.isdir(nebulaGenomePath+"/chromosomes"):
+        subprocess.Popen('mkdir -p '+nebulaGenomePath+'/chromosomes', shell=True)
+        CHROFILE='y'
+
+    if not os.path.isfile(nebulaGenomePath+"/"+genome+".len"):
+        FAIFILE='y'
+        LENFILE='y'
+
+    if not os.path.isfile(nebulaGenomePath+"/out50m2_"+genome+".gem.mappability"):
+        MAPFILE='y'
+    
+    FILEPATH=databasePath.replace("database/files","tool-data")
+
+    cmd='bash /usr/bin/create_annotation_files.sh '+FAIFILE+" "+LENFILE+" "+DICTFILE+" "+CHROFILE+" "+FILEPATH+" "+genome+" "+MAPFILE+" "+nebulaGenomePath
+    process=subprocess.Popen(cmd, shell=True)
+    process.wait()
+    ############### END ##############
+    
+    # create tmp dir.Returns absolute path to a created tmp_dir  
+    tmp_dir=tempfile.mkdtemp()
+    #tmp_dir="/data/tmp/HMcanTest"
+    
+    try:
+
+## GCCOUNT
+        
+        os.chdir(tmp_dir)
+        
+        #add 'window size' and 'step' to gccount config file
+        window_size=calculate_window_size (nebulaGenomePath+"/"+genome+".len", input_control_file, format)
+
+        #window_size=10000 
+        #print ("window_size= %i\n" % window_size)
+
+
+#************** EDIT config files (improved)*****************        
+        
+        chr_len_file=nebulaGenomePath+"/"+genome+".len"
+        
+        #Frist, correct hg19.len file, in case HMCan is run for hg19 (because hg19.len is not tabular on the server, better change it here)
+        hg19="hg19"
+        if (hg19 in genome):
+            correct_hg19 (chr_len_file,nebulaGenomePath+"/corrected_hg19Len_file.len")
+            param_len=nebulaGenomePath+"/corrected_hg19Len_file.len"
+
+        else:
+            #to make my life easier..
+            param_len= chr_len_file
+
+        #Edit gc count config file
+        cmd_step ="sed -i \"/step/c\step = %s\" %s" % ( window_size , gccount_config_file )
+        cmd_window ="sed -i \"/window/c\window = %s\" %s" % ( window_size , gccount_config_file )
+        cmd_chrLen = "sed -i \"s~chrLenFile =.*~chrLenFile = "+param_len+"~g\" "+gccount_config_file
+        cmd_chrFiles = "sed -i \"s~chrFiles =.*~chrFiles = "+nebulaGenomePath+"/chromosomes~g\" "+gccount_config_file
+        cmd_gemMappabilityFile = "sed -i \"s~gemMappabilityFile =.*~gemMappabilityFile = "+nebulaGenomePath+"/out50m2_"+genome+".gem.mappability~g\" "+gccount_config_file
+
+        
+        s=subprocess.Popen(args=cmd_step, shell=True)
+        s.wait()
+        
+        w=subprocess.Popen(args=cmd_window, shell=True)
+        w.wait()
+            
+        cL=subprocess.Popen(args=cmd_chrLen, shell=True)
+        cL.wait()
+
+        cF=subprocess.Popen(args=cmd_chrFiles, shell=True)
+        cF.wait()
+            
+        cM=subprocess.Popen(args=cmd_gemMappabilityFile, shell=True)
+        cM.wait()
+
+#*********** END edit config files ****************************        
+
+                
+        ##call gccount  /dev/null
+        gc_proc=subprocess.Popen(args= "%s %s >> %s 2>&1" % (GCCOUNT, " ".join(['-conf', gccount_config_file]), hmcan_log_report), shell=True, stderr=subprocess.PIPE)
+        gc_stderr_data = gc_proc.communicate()[1] # waits for process to terminate, returns (stdoutdata, stderrdata)
+        #if (gc_proc.returncode !=0): 
+        #    raise Exception("GCcount returncode = %i\n GCcount stderr : %s\n" % (gc_proc.returncode,gc_stderr_data ))
+        
+        # Find outputs: .cnp files 
+        # glob returns list of absolute paths of files.cnp in tmp_dir
+        cnp_list=glob.glob('*.cnp')
+        if (len(cnp_list) == 0): 
+            raise Exception("Error while running Gccount : program did not generate outputs for HMCan (file.cnp) !\n GCCOUNT stderr : %s\n" % gc_stderr_data) 
+        cnp_file=cnp_list[0]
+        #print "GC PROFIL file :%s\n" % cnp_file
+            
+## HMCAN
+        # EDIT hmcan config file : add 'GCIndex' and 'largeBinLength' 
+        cmd_gc="sed -i \"/GCIndex/c\GCIndex %s\" %s" % ( cnp_file , hmcan_config_file )
+        cmd_bin="sed -i \"/largeBinLength/c\largeBinLength %s\" %s" % ( window_size , hmcan_config_file )
+        cmd_chrFiles = "sed -i \"s~genomePath.*~genomePath "+nebulaGenomePath+"/chromosomes~g\" "+hmcan_config_file
+
+        g=subprocess.Popen(args=cmd_gc, shell=True)
+        g.wait()
+        
+        b=subprocess.Popen(args=cmd_bin, shell=True)
+        b.wait()
+        
+        cF=subprocess.Popen(args=cmd_chrFiles, shell=True)
+        cF.wait()
+        
+        #call HMCan , hmcan_log_report
+        hmcan_proc= subprocess.Popen(args= "%s %s >> %s 2>&1" % (HMCAN, " ".join([input_chip_file, input_control_file, hmcan_config_file , project_name]), hmcan_log_report ), shell=True, stderr=subprocess.PIPE)
+        hmcan_stderr_data=hmcan_proc.communicate()[1]
+        if (hmcan_proc.returncode !=0):
+            raise Exception    ("Error occured, HMCan returncode = %i\n HMCan stderr : %s\n" % ( hmcan_proc.returncode, hmcan_stderr_data ) )
+    
+        #deal with outputs : copy outputs from tmp_dir to dataset_XXX :
+        
+        #shutil.move( os.path.join (tmp_dir,"%s_peaks.bed" % project_name ), output_peaks_file) 
+        shutil.move( os.path.join (tmp_dir,"%s_peaks.narrowPeak" % project_name ), output_peaks_file)
+        shutil.move( os.path.join( tmp_dir, "%s_regions.bed" % project_name ) ,output_regions_file )
+        if (output_posterior_proba_file !='None') :
+            shutil.move( os.path.join( tmp_dir, "%s_posterior.wig" % project_name ) ,output_posterior_proba_file ) 
+        if (output_density_file != 'None'):
+            shutil.move( os.path.join( tmp_dir, "%s.wig" % project_name ) ,output_density_file )
+    
+    #Handle exceptions
+    except OSError:
+        sys.stderr.write ( "Error occured while runing HMCan : trying to execute a non existing file.\n" )
+        #remove tmp_dir created:
+        if os.path.exists( tmp_dir ):
+            shutil.rmtree(tmp_dir)
+        raise
+        
+    except ValueError:
+        sys.stderr.write ( "Error occured while runing HMCan :  Popen is called with invalid arguments.\n" )
+        if os.path.exists( tmp_dir ):
+            shutil.rmtree(tmp_dir)
+        raise
+    
+    #Handle other    
+    except Exception:
+        sys.stderr.write ( "Error occured while runing HMCan\n" )        
+        if os.path.exists( tmp_dir ):
+            shutil.rmtree(tmp_dir)
+        raise
+        
+    #clean up my mess 
+    if os.path.exists( tmp_dir ):
+            shutil.rmtree(tmp_dir)
+
+
+#main
+
+if __name__=='__main__': 
+    main()
+