comparison hmcan_wrapper.py @ 0:c7901290c849 draft

Uploaded
author jbrayet
date Wed, 13 Jan 2016 10:19:39 -0500
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:c7901290c849
1 #!/usr/bin/env python
2
3 import sys, subprocess, tempfile, os, shutil, glob
4
5 # samtools is added to PATH here
6
7 def calculate_window_size ( chr_len_file, input_file, format ):
8 ''' this function calculates the window size used to evaluate copy number variation.
9
10 W = L/(T*CV*CV)
11
12 L=genome length
13 T=total read count (mapped) present in the input file (bed, bam, sam)
14 CV=coefficient of variation (0.05)
15 W=window size '''
16
17 cv=0.05
18 #float b/c calculs is done in 'virgule flottante'
19 proc1=subprocess.Popen ("awk '{sum=sum+$2} END {print sum}' %s" % chr_len_file, shell=True, stdout=subprocess.PIPE)
20 genome_len = float (proc1.communicate()[0])
21 if (proc1.returncode !=0):
22 raise Exception ("In calculate_window_size(), returncode of proc1 (genome_len) = i%" % proc1.returncode)
23
24 if format == "bed" :
25
26 #proc_RC_bed=subprocess.Popen ("wc -l < %s > lines;read nbr_reads < lines;echo $nbr_reads" % input_file, shell=True, stdout=subprocess.PIPE)
27 proc_RC_bed=subprocess.Popen ("grep -c 'chr' < %s" % input_file, shell=True, stdout=subprocess.PIPE)
28 read_count = int (proc_RC_bed.communicate()[0] )
29
30 #print ("read_count = %i\n" % read_count)
31 if (proc_RC_bed.returncode !=0):
32 raise Exception ("In calculate_window_size(), returncode of proc_RC_bed (read_count) = i%" % proc_RC_bed.returncode)
33
34 elif format == "bam":
35 # get index, get read count (can't pipe into index)
36 proc_index=subprocess.Popen ("samtools index %s" % input_file, shell=True)
37 #print "samtools index "+str(input_file)
38 proc_index.wait()
39 proc_RC_bam=subprocess.Popen ("samtools idxstats %s | awk '{s=s+$3} END {print s}'" % input_file, shell=True, stdout=subprocess.PIPE )
40 read_count=int(proc_RC_bam.communicate()[0])
41
42 else :
43 # format = sam : convert to bam, get index, get read count
44 proc_SamToBam=subprocess.Popen ("samtools view -bSh %s > %s.bam" % (input_file,input_file), shell=True)
45 proc_SamToBam.wait()
46 proc_index=subprocess.Popen ("samtools index %s.bam" % input_file, shell=True)
47 proc_index.wait()
48 # get read count
49 proc_RC_bam=subprocess.Popen ("samtools idxstats %s.bam | awk '{s+=$3} END {print s}'" % input_file, shell=True, stdout=subprocess.PIPE )
50 read_count=int(proc_RC_bam.communicate()[0])
51
52
53 # w's type is float, & round returns float.0
54 return int(round(genome_len/(read_count *cv*cv )))
55
56
57 def correct_hg19 (hg19Len,corrected_hg19Len ):
58 ''' this function corrects hg19.len file format : adds tab between the columns instead of space '''
59
60 i=open(hg19Len)
61 o=open(corrected_hg19Len, "w")
62 for line in i.readlines():
63 col=line.split(" ")
64 col.insert(1, "\t")
65 o.write("".join(col))
66
67 i.close()
68 o.close()
69
70 return
71
72
73 def main():
74
75 input_chip_file = sys.argv[1]
76 input_control_file = sys.argv[2]
77 hmcan_config_file = sys.argv[3]
78 gccount_config_file = sys.argv[4]
79 project_name = sys.argv[5]
80 output_peaks_file = sys.argv[6]
81 output_regions_file = sys.argv[7]
82 output_density_file = sys.argv[8]
83 output_posterior_proba_file = sys.argv[9]
84 hmcan_log_report = sys.argv[10]
85 chr_len_file = sys.argv[11]
86 format = sys.argv[12]
87 genome = sys.argv[13]
88 root_dir = sys.argv[14]
89
90 #binary files
91 GCCOUNT="/usr/bin/HMCan/HMCanV1.20/Utils/GCCount/gccount"
92 HMCAN="/usr/bin/HMCan/HMCanV1.20/src/HMCan"
93
94 ###### CREATE ANNOTATION FILES #########
95 databasePath = root_dir+"/database/files"
96
97 subprocess.Popen('mkdir -p '+databasePath+'/nebulaAnnotations', shell=True)
98 subprocess.Popen('mkdir -p '+databasePath+'/nebulaAnnotations/'+genome, shell=True)
99
100 nebulaGenomePath=databasePath+"/nebulaAnnotations/"+genome
101
102 FAIFILE='n'
103 LENFILE='n'
104 DICTFILE='n'
105 CHROFILE='n'
106 MAPFILE='n'
107
108 if not os.path.isdir(nebulaGenomePath+"/chromosomes"):
109 subprocess.Popen('mkdir -p '+nebulaGenomePath+'/chromosomes', shell=True)
110 CHROFILE='y'
111
112 if not os.path.isfile(nebulaGenomePath+"/"+genome+".len"):
113 FAIFILE='y'
114 LENFILE='y'
115
116 if not os.path.isfile(nebulaGenomePath+"/out50m2_"+genome+".gem.mappability"):
117 MAPFILE='y'
118
119 FILEPATH=databasePath.replace("database/files","tool-data")
120
121 cmd='bash /usr/bin/create_annotation_files.sh '+FAIFILE+" "+LENFILE+" "+DICTFILE+" "+CHROFILE+" "+FILEPATH+" "+genome+" "+MAPFILE+" "+nebulaGenomePath
122 process=subprocess.Popen(cmd, shell=True)
123 process.wait()
124 ############### END ##############
125
126 # create tmp dir.Returns absolute path to a created tmp_dir
127 tmp_dir=tempfile.mkdtemp()
128 #tmp_dir="/data/tmp/HMcanTest"
129
130 try:
131
132 ## GCCOUNT
133
134 os.chdir(tmp_dir)
135
136 #add 'window size' and 'step' to gccount config file
137 window_size=calculate_window_size (nebulaGenomePath+"/"+genome+".len", input_control_file, format)
138
139 #window_size=10000
140 #print ("window_size= %i\n" % window_size)
141
142
143 #************** EDIT config files (improved)*****************
144
145 chr_len_file=nebulaGenomePath+"/"+genome+".len"
146
147 #Frist, correct hg19.len file, in case HMCan is run for hg19 (because hg19.len is not tabular on the server, better change it here)
148 hg19="hg19"
149 if (hg19 in genome):
150 correct_hg19 (chr_len_file,nebulaGenomePath+"/corrected_hg19Len_file.len")
151 param_len=nebulaGenomePath+"/corrected_hg19Len_file.len"
152
153 else:
154 #to make my life easier..
155 param_len= chr_len_file
156
157 #Edit gc count config file
158 cmd_step ="sed -i \"/step/c\step = %s\" %s" % ( window_size , gccount_config_file )
159 cmd_window ="sed -i \"/window/c\window = %s\" %s" % ( window_size , gccount_config_file )
160 cmd_chrLen = "sed -i \"s~chrLenFile =.*~chrLenFile = "+param_len+"~g\" "+gccount_config_file
161 cmd_chrFiles = "sed -i \"s~chrFiles =.*~chrFiles = "+nebulaGenomePath+"/chromosomes~g\" "+gccount_config_file
162 cmd_gemMappabilityFile = "sed -i \"s~gemMappabilityFile =.*~gemMappabilityFile = "+nebulaGenomePath+"/out50m2_"+genome+".gem.mappability~g\" "+gccount_config_file
163
164
165 s=subprocess.Popen(args=cmd_step, shell=True)
166 s.wait()
167
168 w=subprocess.Popen(args=cmd_window, shell=True)
169 w.wait()
170
171 cL=subprocess.Popen(args=cmd_chrLen, shell=True)
172 cL.wait()
173
174 cF=subprocess.Popen(args=cmd_chrFiles, shell=True)
175 cF.wait()
176
177 cM=subprocess.Popen(args=cmd_gemMappabilityFile, shell=True)
178 cM.wait()
179
180 #*********** END edit config files ****************************
181
182
183 ##call gccount /dev/null
184 gc_proc=subprocess.Popen(args= "%s %s >> %s 2>&1" % (GCCOUNT, " ".join(['-conf', gccount_config_file]), hmcan_log_report), shell=True, stderr=subprocess.PIPE)
185 gc_stderr_data = gc_proc.communicate()[1] # waits for process to terminate, returns (stdoutdata, stderrdata)
186 #if (gc_proc.returncode !=0):
187 # raise Exception("GCcount returncode = %i\n GCcount stderr : %s\n" % (gc_proc.returncode,gc_stderr_data ))
188
189 # Find outputs: .cnp files
190 # glob returns list of absolute paths of files.cnp in tmp_dir
191 cnp_list=glob.glob('*.cnp')
192 if (len(cnp_list) == 0):
193 raise Exception("Error while running Gccount : program did not generate outputs for HMCan (file.cnp) !\n GCCOUNT stderr : %s\n" % gc_stderr_data)
194 cnp_file=cnp_list[0]
195 #print "GC PROFIL file :%s\n" % cnp_file
196
197 ## HMCAN
198 # EDIT hmcan config file : add 'GCIndex' and 'largeBinLength'
199 cmd_gc="sed -i \"/GCIndex/c\GCIndex %s\" %s" % ( cnp_file , hmcan_config_file )
200 cmd_bin="sed -i \"/largeBinLength/c\largeBinLength %s\" %s" % ( window_size , hmcan_config_file )
201 cmd_chrFiles = "sed -i \"s~genomePath.*~genomePath "+nebulaGenomePath+"/chromosomes~g\" "+hmcan_config_file
202
203 g=subprocess.Popen(args=cmd_gc, shell=True)
204 g.wait()
205
206 b=subprocess.Popen(args=cmd_bin, shell=True)
207 b.wait()
208
209 cF=subprocess.Popen(args=cmd_chrFiles, shell=True)
210 cF.wait()
211
212 #call HMCan , hmcan_log_report
213 hmcan_proc= subprocess.Popen(args= "%s %s >> %s 2>&1" % (HMCAN, " ".join([input_chip_file, input_control_file, hmcan_config_file , project_name]), hmcan_log_report ), shell=True, stderr=subprocess.PIPE)
214 hmcan_stderr_data=hmcan_proc.communicate()[1]
215 if (hmcan_proc.returncode !=0):
216 raise Exception ("Error occured, HMCan returncode = %i\n HMCan stderr : %s\n" % ( hmcan_proc.returncode, hmcan_stderr_data ) )
217
218 #deal with outputs : copy outputs from tmp_dir to dataset_XXX :
219
220 #shutil.move( os.path.join (tmp_dir,"%s_peaks.bed" % project_name ), output_peaks_file)
221 shutil.move( os.path.join (tmp_dir,"%s_peaks.narrowPeak" % project_name ), output_peaks_file)
222 shutil.move( os.path.join( tmp_dir, "%s_regions.bed" % project_name ) ,output_regions_file )
223 if (output_posterior_proba_file !='None') :
224 shutil.move( os.path.join( tmp_dir, "%s_posterior.wig" % project_name ) ,output_posterior_proba_file )
225 if (output_density_file != 'None'):
226 shutil.move( os.path.join( tmp_dir, "%s.wig" % project_name ) ,output_density_file )
227
228 #Handle exceptions
229 except OSError:
230 sys.stderr.write ( "Error occured while runing HMCan : trying to execute a non existing file.\n" )
231 #remove tmp_dir created:
232 if os.path.exists( tmp_dir ):
233 shutil.rmtree(tmp_dir)
234 raise
235
236 except ValueError:
237 sys.stderr.write ( "Error occured while runing HMCan : Popen is called with invalid arguments.\n" )
238 if os.path.exists( tmp_dir ):
239 shutil.rmtree(tmp_dir)
240 raise
241
242 #Handle other
243 except Exception:
244 sys.stderr.write ( "Error occured while runing HMCan\n" )
245 if os.path.exists( tmp_dir ):
246 shutil.rmtree(tmp_dir)
247 raise
248
249 #clean up my mess
250 if os.path.exists( tmp_dir ):
251 shutil.rmtree(tmp_dir)
252
253
254 #main
255
256 if __name__=='__main__':
257 main()
258