Mercurial > repos > jbrayet > hmcan_1_0_docker
changeset 0:c7901290c849 draft
Uploaded
author | jbrayet |
---|---|
date | Wed, 13 Jan 2016 10:19:39 -0500 |
parents | |
children | 9106f862728e |
files | hmcan_wrapper.py |
diffstat | 1 files changed, 258 insertions(+), 0 deletions(-) [+] |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/hmcan_wrapper.py Wed Jan 13 10:19:39 2016 -0500 @@ -0,0 +1,258 @@ +#!/usr/bin/env python + +import sys, subprocess, tempfile, os, shutil, glob + +# samtools is added to PATH here + +def calculate_window_size ( chr_len_file, input_file, format ): + ''' this function calculates the window size used to evaluate copy number variation. + + W = L/(T*CV*CV) + + L=genome length + T=total read count (mapped) present in the input file (bed, bam, sam) + CV=coefficient of variation (0.05) + W=window size ''' + + cv=0.05 + #float b/c calculs is done in 'virgule flottante' + proc1=subprocess.Popen ("awk '{sum=sum+$2} END {print sum}' %s" % chr_len_file, shell=True, stdout=subprocess.PIPE) + genome_len = float (proc1.communicate()[0]) + if (proc1.returncode !=0): + raise Exception ("In calculate_window_size(), returncode of proc1 (genome_len) = i%" % proc1.returncode) + + if format == "bed" : + + #proc_RC_bed=subprocess.Popen ("wc -l < %s > lines;read nbr_reads < lines;echo $nbr_reads" % input_file, shell=True, stdout=subprocess.PIPE) + proc_RC_bed=subprocess.Popen ("grep -c 'chr' < %s" % input_file, shell=True, stdout=subprocess.PIPE) + read_count = int (proc_RC_bed.communicate()[0] ) + + #print ("read_count = %i\n" % read_count) + if (proc_RC_bed.returncode !=0): + raise Exception ("In calculate_window_size(), returncode of proc_RC_bed (read_count) = i%" % proc_RC_bed.returncode) + + elif format == "bam": + # get index, get read count (can't pipe into index) + proc_index=subprocess.Popen ("samtools index %s" % input_file, shell=True) + #print "samtools index "+str(input_file) + proc_index.wait() + proc_RC_bam=subprocess.Popen ("samtools idxstats %s | awk '{s=s+$3} END {print s}'" % input_file, shell=True, stdout=subprocess.PIPE ) + read_count=int(proc_RC_bam.communicate()[0]) + + else : + # format = sam : convert to bam, get index, get read count + proc_SamToBam=subprocess.Popen ("samtools view -bSh %s > %s.bam" % (input_file,input_file), shell=True) + proc_SamToBam.wait() + proc_index=subprocess.Popen ("samtools index %s.bam" % input_file, shell=True) + proc_index.wait() + # get read count + proc_RC_bam=subprocess.Popen ("samtools idxstats %s.bam | awk '{s+=$3} END {print s}'" % input_file, shell=True, stdout=subprocess.PIPE ) + read_count=int(proc_RC_bam.communicate()[0]) + + + # w's type is float, & round returns float.0 + return int(round(genome_len/(read_count *cv*cv ))) + + +def correct_hg19 (hg19Len,corrected_hg19Len ): + ''' this function corrects hg19.len file format : adds tab between the columns instead of space ''' + + i=open(hg19Len) + o=open(corrected_hg19Len, "w") + for line in i.readlines(): + col=line.split(" ") + col.insert(1, "\t") + o.write("".join(col)) + + i.close() + o.close() + + return + + +def main(): + + input_chip_file = sys.argv[1] + input_control_file = sys.argv[2] + hmcan_config_file = sys.argv[3] + gccount_config_file = sys.argv[4] + project_name = sys.argv[5] + output_peaks_file = sys.argv[6] + output_regions_file = sys.argv[7] + output_density_file = sys.argv[8] + output_posterior_proba_file = sys.argv[9] + hmcan_log_report = sys.argv[10] + chr_len_file = sys.argv[11] + format = sys.argv[12] + genome = sys.argv[13] + root_dir = sys.argv[14] + + #binary files + GCCOUNT="/usr/bin/HMCan/HMCanV1.20/Utils/GCCount/gccount" + HMCAN="/usr/bin/HMCan/HMCanV1.20/src/HMCan" + + ###### CREATE ANNOTATION FILES ######### + databasePath = root_dir+"/database/files" + + subprocess.Popen('mkdir -p '+databasePath+'/nebulaAnnotations', shell=True) + subprocess.Popen('mkdir -p '+databasePath+'/nebulaAnnotations/'+genome, shell=True) + + nebulaGenomePath=databasePath+"/nebulaAnnotations/"+genome + + FAIFILE='n' + LENFILE='n' + DICTFILE='n' + CHROFILE='n' + MAPFILE='n' + + if not os.path.isdir(nebulaGenomePath+"/chromosomes"): + subprocess.Popen('mkdir -p '+nebulaGenomePath+'/chromosomes', shell=True) + CHROFILE='y' + + if not os.path.isfile(nebulaGenomePath+"/"+genome+".len"): + FAIFILE='y' + LENFILE='y' + + if not os.path.isfile(nebulaGenomePath+"/out50m2_"+genome+".gem.mappability"): + MAPFILE='y' + + FILEPATH=databasePath.replace("database/files","tool-data") + + cmd='bash /usr/bin/create_annotation_files.sh '+FAIFILE+" "+LENFILE+" "+DICTFILE+" "+CHROFILE+" "+FILEPATH+" "+genome+" "+MAPFILE+" "+nebulaGenomePath + process=subprocess.Popen(cmd, shell=True) + process.wait() + ############### END ############## + + # create tmp dir.Returns absolute path to a created tmp_dir + tmp_dir=tempfile.mkdtemp() + #tmp_dir="/data/tmp/HMcanTest" + + try: + +## GCCOUNT + + os.chdir(tmp_dir) + + #add 'window size' and 'step' to gccount config file + window_size=calculate_window_size (nebulaGenomePath+"/"+genome+".len", input_control_file, format) + + #window_size=10000 + #print ("window_size= %i\n" % window_size) + + +#************** EDIT config files (improved)***************** + + chr_len_file=nebulaGenomePath+"/"+genome+".len" + + #Frist, correct hg19.len file, in case HMCan is run for hg19 (because hg19.len is not tabular on the server, better change it here) + hg19="hg19" + if (hg19 in genome): + correct_hg19 (chr_len_file,nebulaGenomePath+"/corrected_hg19Len_file.len") + param_len=nebulaGenomePath+"/corrected_hg19Len_file.len" + + else: + #to make my life easier.. + param_len= chr_len_file + + #Edit gc count config file + cmd_step ="sed -i \"/step/c\step = %s\" %s" % ( window_size , gccount_config_file ) + cmd_window ="sed -i \"/window/c\window = %s\" %s" % ( window_size , gccount_config_file ) + cmd_chrLen = "sed -i \"s~chrLenFile =.*~chrLenFile = "+param_len+"~g\" "+gccount_config_file + cmd_chrFiles = "sed -i \"s~chrFiles =.*~chrFiles = "+nebulaGenomePath+"/chromosomes~g\" "+gccount_config_file + cmd_gemMappabilityFile = "sed -i \"s~gemMappabilityFile =.*~gemMappabilityFile = "+nebulaGenomePath+"/out50m2_"+genome+".gem.mappability~g\" "+gccount_config_file + + + s=subprocess.Popen(args=cmd_step, shell=True) + s.wait() + + w=subprocess.Popen(args=cmd_window, shell=True) + w.wait() + + cL=subprocess.Popen(args=cmd_chrLen, shell=True) + cL.wait() + + cF=subprocess.Popen(args=cmd_chrFiles, shell=True) + cF.wait() + + cM=subprocess.Popen(args=cmd_gemMappabilityFile, shell=True) + cM.wait() + +#*********** END edit config files **************************** + + + ##call gccount /dev/null + gc_proc=subprocess.Popen(args= "%s %s >> %s 2>&1" % (GCCOUNT, " ".join(['-conf', gccount_config_file]), hmcan_log_report), shell=True, stderr=subprocess.PIPE) + gc_stderr_data = gc_proc.communicate()[1] # waits for process to terminate, returns (stdoutdata, stderrdata) + #if (gc_proc.returncode !=0): + # raise Exception("GCcount returncode = %i\n GCcount stderr : %s\n" % (gc_proc.returncode,gc_stderr_data )) + + # Find outputs: .cnp files + # glob returns list of absolute paths of files.cnp in tmp_dir + cnp_list=glob.glob('*.cnp') + if (len(cnp_list) == 0): + raise Exception("Error while running Gccount : program did not generate outputs for HMCan (file.cnp) !\n GCCOUNT stderr : %s\n" % gc_stderr_data) + cnp_file=cnp_list[0] + #print "GC PROFIL file :%s\n" % cnp_file + +## HMCAN + # EDIT hmcan config file : add 'GCIndex' and 'largeBinLength' + cmd_gc="sed -i \"/GCIndex/c\GCIndex %s\" %s" % ( cnp_file , hmcan_config_file ) + cmd_bin="sed -i \"/largeBinLength/c\largeBinLength %s\" %s" % ( window_size , hmcan_config_file ) + cmd_chrFiles = "sed -i \"s~genomePath.*~genomePath "+nebulaGenomePath+"/chromosomes~g\" "+hmcan_config_file + + g=subprocess.Popen(args=cmd_gc, shell=True) + g.wait() + + b=subprocess.Popen(args=cmd_bin, shell=True) + b.wait() + + cF=subprocess.Popen(args=cmd_chrFiles, shell=True) + cF.wait() + + #call HMCan , hmcan_log_report + hmcan_proc= subprocess.Popen(args= "%s %s >> %s 2>&1" % (HMCAN, " ".join([input_chip_file, input_control_file, hmcan_config_file , project_name]), hmcan_log_report ), shell=True, stderr=subprocess.PIPE) + hmcan_stderr_data=hmcan_proc.communicate()[1] + if (hmcan_proc.returncode !=0): + raise Exception ("Error occured, HMCan returncode = %i\n HMCan stderr : %s\n" % ( hmcan_proc.returncode, hmcan_stderr_data ) ) + + #deal with outputs : copy outputs from tmp_dir to dataset_XXX : + + #shutil.move( os.path.join (tmp_dir,"%s_peaks.bed" % project_name ), output_peaks_file) + shutil.move( os.path.join (tmp_dir,"%s_peaks.narrowPeak" % project_name ), output_peaks_file) + shutil.move( os.path.join( tmp_dir, "%s_regions.bed" % project_name ) ,output_regions_file ) + if (output_posterior_proba_file !='None') : + shutil.move( os.path.join( tmp_dir, "%s_posterior.wig" % project_name ) ,output_posterior_proba_file ) + if (output_density_file != 'None'): + shutil.move( os.path.join( tmp_dir, "%s.wig" % project_name ) ,output_density_file ) + + #Handle exceptions + except OSError: + sys.stderr.write ( "Error occured while runing HMCan : trying to execute a non existing file.\n" ) + #remove tmp_dir created: + if os.path.exists( tmp_dir ): + shutil.rmtree(tmp_dir) + raise + + except ValueError: + sys.stderr.write ( "Error occured while runing HMCan : Popen is called with invalid arguments.\n" ) + if os.path.exists( tmp_dir ): + shutil.rmtree(tmp_dir) + raise + + #Handle other + except Exception: + sys.stderr.write ( "Error occured while runing HMCan\n" ) + if os.path.exists( tmp_dir ): + shutil.rmtree(tmp_dir) + raise + + #clean up my mess + if os.path.exists( tmp_dir ): + shutil.rmtree(tmp_dir) + + +#main + +if __name__=='__main__': + main() +