Mercurial > repos > jbrayet > hmcan_1_0_docker
comparison hmcan_wrapper.py @ 0:c7901290c849 draft
Uploaded
| author | jbrayet |
|---|---|
| date | Wed, 13 Jan 2016 10:19:39 -0500 |
| parents | |
| children |
comparison
equal
deleted
inserted
replaced
| -1:000000000000 | 0:c7901290c849 |
|---|---|
| 1 #!/usr/bin/env python | |
| 2 | |
| 3 import sys, subprocess, tempfile, os, shutil, glob | |
| 4 | |
| 5 # samtools is added to PATH here | |
| 6 | |
| 7 def calculate_window_size ( chr_len_file, input_file, format ): | |
| 8 ''' this function calculates the window size used to evaluate copy number variation. | |
| 9 | |
| 10 W = L/(T*CV*CV) | |
| 11 | |
| 12 L=genome length | |
| 13 T=total read count (mapped) present in the input file (bed, bam, sam) | |
| 14 CV=coefficient of variation (0.05) | |
| 15 W=window size ''' | |
| 16 | |
| 17 cv=0.05 | |
| 18 #float b/c calculs is done in 'virgule flottante' | |
| 19 proc1=subprocess.Popen ("awk '{sum=sum+$2} END {print sum}' %s" % chr_len_file, shell=True, stdout=subprocess.PIPE) | |
| 20 genome_len = float (proc1.communicate()[0]) | |
| 21 if (proc1.returncode !=0): | |
| 22 raise Exception ("In calculate_window_size(), returncode of proc1 (genome_len) = i%" % proc1.returncode) | |
| 23 | |
| 24 if format == "bed" : | |
| 25 | |
| 26 #proc_RC_bed=subprocess.Popen ("wc -l < %s > lines;read nbr_reads < lines;echo $nbr_reads" % input_file, shell=True, stdout=subprocess.PIPE) | |
| 27 proc_RC_bed=subprocess.Popen ("grep -c 'chr' < %s" % input_file, shell=True, stdout=subprocess.PIPE) | |
| 28 read_count = int (proc_RC_bed.communicate()[0] ) | |
| 29 | |
| 30 #print ("read_count = %i\n" % read_count) | |
| 31 if (proc_RC_bed.returncode !=0): | |
| 32 raise Exception ("In calculate_window_size(), returncode of proc_RC_bed (read_count) = i%" % proc_RC_bed.returncode) | |
| 33 | |
| 34 elif format == "bam": | |
| 35 # get index, get read count (can't pipe into index) | |
| 36 proc_index=subprocess.Popen ("samtools index %s" % input_file, shell=True) | |
| 37 #print "samtools index "+str(input_file) | |
| 38 proc_index.wait() | |
| 39 proc_RC_bam=subprocess.Popen ("samtools idxstats %s | awk '{s=s+$3} END {print s}'" % input_file, shell=True, stdout=subprocess.PIPE ) | |
| 40 read_count=int(proc_RC_bam.communicate()[0]) | |
| 41 | |
| 42 else : | |
| 43 # format = sam : convert to bam, get index, get read count | |
| 44 proc_SamToBam=subprocess.Popen ("samtools view -bSh %s > %s.bam" % (input_file,input_file), shell=True) | |
| 45 proc_SamToBam.wait() | |
| 46 proc_index=subprocess.Popen ("samtools index %s.bam" % input_file, shell=True) | |
| 47 proc_index.wait() | |
| 48 # get read count | |
| 49 proc_RC_bam=subprocess.Popen ("samtools idxstats %s.bam | awk '{s+=$3} END {print s}'" % input_file, shell=True, stdout=subprocess.PIPE ) | |
| 50 read_count=int(proc_RC_bam.communicate()[0]) | |
| 51 | |
| 52 | |
| 53 # w's type is float, & round returns float.0 | |
| 54 return int(round(genome_len/(read_count *cv*cv ))) | |
| 55 | |
| 56 | |
| 57 def correct_hg19 (hg19Len,corrected_hg19Len ): | |
| 58 ''' this function corrects hg19.len file format : adds tab between the columns instead of space ''' | |
| 59 | |
| 60 i=open(hg19Len) | |
| 61 o=open(corrected_hg19Len, "w") | |
| 62 for line in i.readlines(): | |
| 63 col=line.split(" ") | |
| 64 col.insert(1, "\t") | |
| 65 o.write("".join(col)) | |
| 66 | |
| 67 i.close() | |
| 68 o.close() | |
| 69 | |
| 70 return | |
| 71 | |
| 72 | |
| 73 def main(): | |
| 74 | |
| 75 input_chip_file = sys.argv[1] | |
| 76 input_control_file = sys.argv[2] | |
| 77 hmcan_config_file = sys.argv[3] | |
| 78 gccount_config_file = sys.argv[4] | |
| 79 project_name = sys.argv[5] | |
| 80 output_peaks_file = sys.argv[6] | |
| 81 output_regions_file = sys.argv[7] | |
| 82 output_density_file = sys.argv[8] | |
| 83 output_posterior_proba_file = sys.argv[9] | |
| 84 hmcan_log_report = sys.argv[10] | |
| 85 chr_len_file = sys.argv[11] | |
| 86 format = sys.argv[12] | |
| 87 genome = sys.argv[13] | |
| 88 root_dir = sys.argv[14] | |
| 89 | |
| 90 #binary files | |
| 91 GCCOUNT="/usr/bin/HMCan/HMCanV1.20/Utils/GCCount/gccount" | |
| 92 HMCAN="/usr/bin/HMCan/HMCanV1.20/src/HMCan" | |
| 93 | |
| 94 ###### CREATE ANNOTATION FILES ######### | |
| 95 databasePath = root_dir+"/database/files" | |
| 96 | |
| 97 subprocess.Popen('mkdir -p '+databasePath+'/nebulaAnnotations', shell=True) | |
| 98 subprocess.Popen('mkdir -p '+databasePath+'/nebulaAnnotations/'+genome, shell=True) | |
| 99 | |
| 100 nebulaGenomePath=databasePath+"/nebulaAnnotations/"+genome | |
| 101 | |
| 102 FAIFILE='n' | |
| 103 LENFILE='n' | |
| 104 DICTFILE='n' | |
| 105 CHROFILE='n' | |
| 106 MAPFILE='n' | |
| 107 | |
| 108 if not os.path.isdir(nebulaGenomePath+"/chromosomes"): | |
| 109 subprocess.Popen('mkdir -p '+nebulaGenomePath+'/chromosomes', shell=True) | |
| 110 CHROFILE='y' | |
| 111 | |
| 112 if not os.path.isfile(nebulaGenomePath+"/"+genome+".len"): | |
| 113 FAIFILE='y' | |
| 114 LENFILE='y' | |
| 115 | |
| 116 if not os.path.isfile(nebulaGenomePath+"/out50m2_"+genome+".gem.mappability"): | |
| 117 MAPFILE='y' | |
| 118 | |
| 119 FILEPATH=databasePath.replace("database/files","tool-data") | |
| 120 | |
| 121 cmd='bash /usr/bin/create_annotation_files.sh '+FAIFILE+" "+LENFILE+" "+DICTFILE+" "+CHROFILE+" "+FILEPATH+" "+genome+" "+MAPFILE+" "+nebulaGenomePath | |
| 122 process=subprocess.Popen(cmd, shell=True) | |
| 123 process.wait() | |
| 124 ############### END ############## | |
| 125 | |
| 126 # create tmp dir.Returns absolute path to a created tmp_dir | |
| 127 tmp_dir=tempfile.mkdtemp() | |
| 128 #tmp_dir="/data/tmp/HMcanTest" | |
| 129 | |
| 130 try: | |
| 131 | |
| 132 ## GCCOUNT | |
| 133 | |
| 134 os.chdir(tmp_dir) | |
| 135 | |
| 136 #add 'window size' and 'step' to gccount config file | |
| 137 window_size=calculate_window_size (nebulaGenomePath+"/"+genome+".len", input_control_file, format) | |
| 138 | |
| 139 #window_size=10000 | |
| 140 #print ("window_size= %i\n" % window_size) | |
| 141 | |
| 142 | |
| 143 #************** EDIT config files (improved)***************** | |
| 144 | |
| 145 chr_len_file=nebulaGenomePath+"/"+genome+".len" | |
| 146 | |
| 147 #Frist, correct hg19.len file, in case HMCan is run for hg19 (because hg19.len is not tabular on the server, better change it here) | |
| 148 hg19="hg19" | |
| 149 if (hg19 in genome): | |
| 150 correct_hg19 (chr_len_file,nebulaGenomePath+"/corrected_hg19Len_file.len") | |
| 151 param_len=nebulaGenomePath+"/corrected_hg19Len_file.len" | |
| 152 | |
| 153 else: | |
| 154 #to make my life easier.. | |
| 155 param_len= chr_len_file | |
| 156 | |
| 157 #Edit gc count config file | |
| 158 cmd_step ="sed -i \"/step/c\step = %s\" %s" % ( window_size , gccount_config_file ) | |
| 159 cmd_window ="sed -i \"/window/c\window = %s\" %s" % ( window_size , gccount_config_file ) | |
| 160 cmd_chrLen = "sed -i \"s~chrLenFile =.*~chrLenFile = "+param_len+"~g\" "+gccount_config_file | |
| 161 cmd_chrFiles = "sed -i \"s~chrFiles =.*~chrFiles = "+nebulaGenomePath+"/chromosomes~g\" "+gccount_config_file | |
| 162 cmd_gemMappabilityFile = "sed -i \"s~gemMappabilityFile =.*~gemMappabilityFile = "+nebulaGenomePath+"/out50m2_"+genome+".gem.mappability~g\" "+gccount_config_file | |
| 163 | |
| 164 | |
| 165 s=subprocess.Popen(args=cmd_step, shell=True) | |
| 166 s.wait() | |
| 167 | |
| 168 w=subprocess.Popen(args=cmd_window, shell=True) | |
| 169 w.wait() | |
| 170 | |
| 171 cL=subprocess.Popen(args=cmd_chrLen, shell=True) | |
| 172 cL.wait() | |
| 173 | |
| 174 cF=subprocess.Popen(args=cmd_chrFiles, shell=True) | |
| 175 cF.wait() | |
| 176 | |
| 177 cM=subprocess.Popen(args=cmd_gemMappabilityFile, shell=True) | |
| 178 cM.wait() | |
| 179 | |
| 180 #*********** END edit config files **************************** | |
| 181 | |
| 182 | |
| 183 ##call gccount /dev/null | |
| 184 gc_proc=subprocess.Popen(args= "%s %s >> %s 2>&1" % (GCCOUNT, " ".join(['-conf', gccount_config_file]), hmcan_log_report), shell=True, stderr=subprocess.PIPE) | |
| 185 gc_stderr_data = gc_proc.communicate()[1] # waits for process to terminate, returns (stdoutdata, stderrdata) | |
| 186 #if (gc_proc.returncode !=0): | |
| 187 # raise Exception("GCcount returncode = %i\n GCcount stderr : %s\n" % (gc_proc.returncode,gc_stderr_data )) | |
| 188 | |
| 189 # Find outputs: .cnp files | |
| 190 # glob returns list of absolute paths of files.cnp in tmp_dir | |
| 191 cnp_list=glob.glob('*.cnp') | |
| 192 if (len(cnp_list) == 0): | |
| 193 raise Exception("Error while running Gccount : program did not generate outputs for HMCan (file.cnp) !\n GCCOUNT stderr : %s\n" % gc_stderr_data) | |
| 194 cnp_file=cnp_list[0] | |
| 195 #print "GC PROFIL file :%s\n" % cnp_file | |
| 196 | |
| 197 ## HMCAN | |
| 198 # EDIT hmcan config file : add 'GCIndex' and 'largeBinLength' | |
| 199 cmd_gc="sed -i \"/GCIndex/c\GCIndex %s\" %s" % ( cnp_file , hmcan_config_file ) | |
| 200 cmd_bin="sed -i \"/largeBinLength/c\largeBinLength %s\" %s" % ( window_size , hmcan_config_file ) | |
| 201 cmd_chrFiles = "sed -i \"s~genomePath.*~genomePath "+nebulaGenomePath+"/chromosomes~g\" "+hmcan_config_file | |
| 202 | |
| 203 g=subprocess.Popen(args=cmd_gc, shell=True) | |
| 204 g.wait() | |
| 205 | |
| 206 b=subprocess.Popen(args=cmd_bin, shell=True) | |
| 207 b.wait() | |
| 208 | |
| 209 cF=subprocess.Popen(args=cmd_chrFiles, shell=True) | |
| 210 cF.wait() | |
| 211 | |
| 212 #call HMCan , hmcan_log_report | |
| 213 hmcan_proc= subprocess.Popen(args= "%s %s >> %s 2>&1" % (HMCAN, " ".join([input_chip_file, input_control_file, hmcan_config_file , project_name]), hmcan_log_report ), shell=True, stderr=subprocess.PIPE) | |
| 214 hmcan_stderr_data=hmcan_proc.communicate()[1] | |
| 215 if (hmcan_proc.returncode !=0): | |
| 216 raise Exception ("Error occured, HMCan returncode = %i\n HMCan stderr : %s\n" % ( hmcan_proc.returncode, hmcan_stderr_data ) ) | |
| 217 | |
| 218 #deal with outputs : copy outputs from tmp_dir to dataset_XXX : | |
| 219 | |
| 220 #shutil.move( os.path.join (tmp_dir,"%s_peaks.bed" % project_name ), output_peaks_file) | |
| 221 shutil.move( os.path.join (tmp_dir,"%s_peaks.narrowPeak" % project_name ), output_peaks_file) | |
| 222 shutil.move( os.path.join( tmp_dir, "%s_regions.bed" % project_name ) ,output_regions_file ) | |
| 223 if (output_posterior_proba_file !='None') : | |
| 224 shutil.move( os.path.join( tmp_dir, "%s_posterior.wig" % project_name ) ,output_posterior_proba_file ) | |
| 225 if (output_density_file != 'None'): | |
| 226 shutil.move( os.path.join( tmp_dir, "%s.wig" % project_name ) ,output_density_file ) | |
| 227 | |
| 228 #Handle exceptions | |
| 229 except OSError: | |
| 230 sys.stderr.write ( "Error occured while runing HMCan : trying to execute a non existing file.\n" ) | |
| 231 #remove tmp_dir created: | |
| 232 if os.path.exists( tmp_dir ): | |
| 233 shutil.rmtree(tmp_dir) | |
| 234 raise | |
| 235 | |
| 236 except ValueError: | |
| 237 sys.stderr.write ( "Error occured while runing HMCan : Popen is called with invalid arguments.\n" ) | |
| 238 if os.path.exists( tmp_dir ): | |
| 239 shutil.rmtree(tmp_dir) | |
| 240 raise | |
| 241 | |
| 242 #Handle other | |
| 243 except Exception: | |
| 244 sys.stderr.write ( "Error occured while runing HMCan\n" ) | |
| 245 if os.path.exists( tmp_dir ): | |
| 246 shutil.rmtree(tmp_dir) | |
| 247 raise | |
| 248 | |
| 249 #clean up my mess | |
| 250 if os.path.exists( tmp_dir ): | |
| 251 shutil.rmtree(tmp_dir) | |
| 252 | |
| 253 | |
| 254 #main | |
| 255 | |
| 256 if __name__=='__main__': | |
| 257 main() | |
| 258 |
