0
|
1 #!/usr/bin/env python
|
|
2
|
|
3 import sys, subprocess, tempfile, os, shutil, glob
|
|
4
|
|
5 # samtools is added to PATH here
|
|
6
|
|
7 def calculate_window_size ( chr_len_file, input_file, format ):
|
|
8 ''' this function calculates the window size used to evaluate copy number variation.
|
|
9
|
|
10 W = L/(T*CV*CV)
|
|
11
|
|
12 L=genome length
|
|
13 T=total read count (mapped) present in the input file (bed, bam, sam)
|
|
14 CV=coefficient of variation (0.05)
|
|
15 W=window size '''
|
|
16
|
|
17 cv=0.05
|
|
18 #float b/c calculs is done in 'virgule flottante'
|
|
19 proc1=subprocess.Popen ("awk '{sum=sum+$2} END {print sum}' %s" % chr_len_file, shell=True, stdout=subprocess.PIPE)
|
|
20 genome_len = float (proc1.communicate()[0])
|
|
21 if (proc1.returncode !=0):
|
|
22 raise Exception ("In calculate_window_size(), returncode of proc1 (genome_len) = i%" % proc1.returncode)
|
|
23
|
|
24 if format == "bed" :
|
|
25
|
|
26 #proc_RC_bed=subprocess.Popen ("wc -l < %s > lines;read nbr_reads < lines;echo $nbr_reads" % input_file, shell=True, stdout=subprocess.PIPE)
|
|
27 proc_RC_bed=subprocess.Popen ("grep -c 'chr' < %s" % input_file, shell=True, stdout=subprocess.PIPE)
|
|
28 read_count = int (proc_RC_bed.communicate()[0] )
|
|
29
|
|
30 #print ("read_count = %i\n" % read_count)
|
|
31 if (proc_RC_bed.returncode !=0):
|
|
32 raise Exception ("In calculate_window_size(), returncode of proc_RC_bed (read_count) = i%" % proc_RC_bed.returncode)
|
|
33
|
|
34 elif format == "bam":
|
|
35 # get index, get read count (can't pipe into index)
|
|
36 proc_index=subprocess.Popen ("samtools index %s" % input_file, shell=True)
|
|
37 #print "samtools index "+str(input_file)
|
|
38 proc_index.wait()
|
|
39 proc_RC_bam=subprocess.Popen ("samtools idxstats %s | awk '{s=s+$3} END {print s}'" % input_file, shell=True, stdout=subprocess.PIPE )
|
|
40 read_count=int(proc_RC_bam.communicate()[0])
|
|
41
|
|
42 else :
|
|
43 # format = sam : convert to bam, get index, get read count
|
|
44 proc_SamToBam=subprocess.Popen ("samtools view -bSh %s > %s.bam" % (input_file,input_file), shell=True)
|
|
45 proc_SamToBam.wait()
|
|
46 proc_index=subprocess.Popen ("samtools index %s.bam" % input_file, shell=True)
|
|
47 proc_index.wait()
|
|
48 # get read count
|
|
49 proc_RC_bam=subprocess.Popen ("samtools idxstats %s.bam | awk '{s+=$3} END {print s}'" % input_file, shell=True, stdout=subprocess.PIPE )
|
|
50 read_count=int(proc_RC_bam.communicate()[0])
|
|
51
|
|
52
|
|
53 # w's type is float, & round returns float.0
|
|
54 return int(round(genome_len/(read_count *cv*cv )))
|
|
55
|
|
56
|
|
57 def correct_hg19 (hg19Len,corrected_hg19Len ):
|
|
58 ''' this function corrects hg19.len file format : adds tab between the columns instead of space '''
|
|
59
|
|
60 i=open(hg19Len)
|
|
61 o=open(corrected_hg19Len, "w")
|
|
62 for line in i.readlines():
|
|
63 col=line.split(" ")
|
|
64 col.insert(1, "\t")
|
|
65 o.write("".join(col))
|
|
66
|
|
67 i.close()
|
|
68 o.close()
|
|
69
|
|
70 return
|
|
71
|
|
72
|
|
73 def main():
|
|
74
|
|
75 input_chip_file = sys.argv[1]
|
|
76 input_control_file = sys.argv[2]
|
|
77 hmcan_config_file = sys.argv[3]
|
|
78 gccount_config_file = sys.argv[4]
|
|
79 project_name = sys.argv[5]
|
|
80 output_peaks_file = sys.argv[6]
|
|
81 output_regions_file = sys.argv[7]
|
|
82 output_density_file = sys.argv[8]
|
|
83 output_posterior_proba_file = sys.argv[9]
|
|
84 hmcan_log_report = sys.argv[10]
|
|
85 chr_len_file = sys.argv[11]
|
|
86 format = sys.argv[12]
|
|
87 genome = sys.argv[13]
|
|
88 root_dir = sys.argv[14]
|
|
89
|
|
90 #binary files
|
|
91 GCCOUNT="/usr/bin/HMCan/HMCanV1.20/Utils/GCCount/gccount"
|
|
92 HMCAN="/usr/bin/HMCan/HMCanV1.20/src/HMCan"
|
|
93
|
|
94 ###### CREATE ANNOTATION FILES #########
|
|
95 databasePath = root_dir+"/database/files"
|
|
96
|
|
97 subprocess.Popen('mkdir -p '+databasePath+'/nebulaAnnotations', shell=True)
|
|
98 subprocess.Popen('mkdir -p '+databasePath+'/nebulaAnnotations/'+genome, shell=True)
|
|
99
|
|
100 nebulaGenomePath=databasePath+"/nebulaAnnotations/"+genome
|
|
101
|
|
102 FAIFILE='n'
|
|
103 LENFILE='n'
|
|
104 DICTFILE='n'
|
|
105 CHROFILE='n'
|
|
106 MAPFILE='n'
|
|
107
|
|
108 if not os.path.isdir(nebulaGenomePath+"/chromosomes"):
|
|
109 subprocess.Popen('mkdir -p '+nebulaGenomePath+'/chromosomes', shell=True)
|
|
110 CHROFILE='y'
|
|
111
|
|
112 if not os.path.isfile(nebulaGenomePath+"/"+genome+".len"):
|
|
113 FAIFILE='y'
|
|
114 LENFILE='y'
|
|
115
|
|
116 if not os.path.isfile(nebulaGenomePath+"/out50m2_"+genome+".gem.mappability"):
|
|
117 MAPFILE='y'
|
|
118
|
|
119 FILEPATH=databasePath.replace("database/files","tool-data")
|
|
120
|
|
121 cmd='bash /usr/bin/create_annotation_files.sh '+FAIFILE+" "+LENFILE+" "+DICTFILE+" "+CHROFILE+" "+FILEPATH+" "+genome+" "+MAPFILE+" "+nebulaGenomePath
|
|
122 process=subprocess.Popen(cmd, shell=True)
|
|
123 process.wait()
|
|
124 ############### END ##############
|
|
125
|
|
126 # create tmp dir.Returns absolute path to a created tmp_dir
|
|
127 tmp_dir=tempfile.mkdtemp()
|
|
128 #tmp_dir="/data/tmp/HMcanTest"
|
|
129
|
|
130 try:
|
|
131
|
|
132 ## GCCOUNT
|
|
133
|
|
134 os.chdir(tmp_dir)
|
|
135
|
|
136 #add 'window size' and 'step' to gccount config file
|
|
137 window_size=calculate_window_size (nebulaGenomePath+"/"+genome+".len", input_control_file, format)
|
|
138
|
|
139 #window_size=10000
|
|
140 #print ("window_size= %i\n" % window_size)
|
|
141
|
|
142
|
|
143 #************** EDIT config files (improved)*****************
|
|
144
|
|
145 chr_len_file=nebulaGenomePath+"/"+genome+".len"
|
|
146
|
|
147 #Frist, correct hg19.len file, in case HMCan is run for hg19 (because hg19.len is not tabular on the server, better change it here)
|
|
148 hg19="hg19"
|
|
149 if (hg19 in genome):
|
|
150 correct_hg19 (chr_len_file,nebulaGenomePath+"/corrected_hg19Len_file.len")
|
|
151 param_len=nebulaGenomePath+"/corrected_hg19Len_file.len"
|
|
152
|
|
153 else:
|
|
154 #to make my life easier..
|
|
155 param_len= chr_len_file
|
|
156
|
|
157 #Edit gc count config file
|
|
158 cmd_step ="sed -i \"/step/c\step = %s\" %s" % ( window_size , gccount_config_file )
|
|
159 cmd_window ="sed -i \"/window/c\window = %s\" %s" % ( window_size , gccount_config_file )
|
|
160 cmd_chrLen = "sed -i \"s~chrLenFile =.*~chrLenFile = "+param_len+"~g\" "+gccount_config_file
|
|
161 cmd_chrFiles = "sed -i \"s~chrFiles =.*~chrFiles = "+nebulaGenomePath+"/chromosomes~g\" "+gccount_config_file
|
|
162 cmd_gemMappabilityFile = "sed -i \"s~gemMappabilityFile =.*~gemMappabilityFile = "+nebulaGenomePath+"/out50m2_"+genome+".gem.mappability~g\" "+gccount_config_file
|
|
163
|
|
164
|
|
165 s=subprocess.Popen(args=cmd_step, shell=True)
|
|
166 s.wait()
|
|
167
|
|
168 w=subprocess.Popen(args=cmd_window, shell=True)
|
|
169 w.wait()
|
|
170
|
|
171 cL=subprocess.Popen(args=cmd_chrLen, shell=True)
|
|
172 cL.wait()
|
|
173
|
|
174 cF=subprocess.Popen(args=cmd_chrFiles, shell=True)
|
|
175 cF.wait()
|
|
176
|
|
177 cM=subprocess.Popen(args=cmd_gemMappabilityFile, shell=True)
|
|
178 cM.wait()
|
|
179
|
|
180 #*********** END edit config files ****************************
|
|
181
|
|
182
|
|
183 ##call gccount /dev/null
|
|
184 gc_proc=subprocess.Popen(args= "%s %s >> %s 2>&1" % (GCCOUNT, " ".join(['-conf', gccount_config_file]), hmcan_log_report), shell=True, stderr=subprocess.PIPE)
|
|
185 gc_stderr_data = gc_proc.communicate()[1] # waits for process to terminate, returns (stdoutdata, stderrdata)
|
|
186 #if (gc_proc.returncode !=0):
|
|
187 # raise Exception("GCcount returncode = %i\n GCcount stderr : %s\n" % (gc_proc.returncode,gc_stderr_data ))
|
|
188
|
|
189 # Find outputs: .cnp files
|
|
190 # glob returns list of absolute paths of files.cnp in tmp_dir
|
|
191 cnp_list=glob.glob('*.cnp')
|
|
192 if (len(cnp_list) == 0):
|
|
193 raise Exception("Error while running Gccount : program did not generate outputs for HMCan (file.cnp) !\n GCCOUNT stderr : %s\n" % gc_stderr_data)
|
|
194 cnp_file=cnp_list[0]
|
|
195 #print "GC PROFIL file :%s\n" % cnp_file
|
|
196
|
|
197 ## HMCAN
|
|
198 # EDIT hmcan config file : add 'GCIndex' and 'largeBinLength'
|
|
199 cmd_gc="sed -i \"/GCIndex/c\GCIndex %s\" %s" % ( cnp_file , hmcan_config_file )
|
|
200 cmd_bin="sed -i \"/largeBinLength/c\largeBinLength %s\" %s" % ( window_size , hmcan_config_file )
|
|
201 cmd_chrFiles = "sed -i \"s~genomePath.*~genomePath "+nebulaGenomePath+"/chromosomes~g\" "+hmcan_config_file
|
|
202
|
|
203 g=subprocess.Popen(args=cmd_gc, shell=True)
|
|
204 g.wait()
|
|
205
|
|
206 b=subprocess.Popen(args=cmd_bin, shell=True)
|
|
207 b.wait()
|
|
208
|
|
209 cF=subprocess.Popen(args=cmd_chrFiles, shell=True)
|
|
210 cF.wait()
|
|
211
|
|
212 #call HMCan , hmcan_log_report
|
|
213 hmcan_proc= subprocess.Popen(args= "%s %s >> %s 2>&1" % (HMCAN, " ".join([input_chip_file, input_control_file, hmcan_config_file , project_name]), hmcan_log_report ), shell=True, stderr=subprocess.PIPE)
|
|
214 hmcan_stderr_data=hmcan_proc.communicate()[1]
|
|
215 if (hmcan_proc.returncode !=0):
|
|
216 raise Exception ("Error occured, HMCan returncode = %i\n HMCan stderr : %s\n" % ( hmcan_proc.returncode, hmcan_stderr_data ) )
|
|
217
|
|
218 #deal with outputs : copy outputs from tmp_dir to dataset_XXX :
|
|
219
|
|
220 #shutil.move( os.path.join (tmp_dir,"%s_peaks.bed" % project_name ), output_peaks_file)
|
|
221 shutil.move( os.path.join (tmp_dir,"%s_peaks.narrowPeak" % project_name ), output_peaks_file)
|
|
222 shutil.move( os.path.join( tmp_dir, "%s_regions.bed" % project_name ) ,output_regions_file )
|
|
223 if (output_posterior_proba_file !='None') :
|
|
224 shutil.move( os.path.join( tmp_dir, "%s_posterior.wig" % project_name ) ,output_posterior_proba_file )
|
|
225 if (output_density_file != 'None'):
|
|
226 shutil.move( os.path.join( tmp_dir, "%s.wig" % project_name ) ,output_density_file )
|
|
227
|
|
228 #Handle exceptions
|
|
229 except OSError:
|
|
230 sys.stderr.write ( "Error occured while runing HMCan : trying to execute a non existing file.\n" )
|
|
231 #remove tmp_dir created:
|
|
232 if os.path.exists( tmp_dir ):
|
|
233 shutil.rmtree(tmp_dir)
|
|
234 raise
|
|
235
|
|
236 except ValueError:
|
|
237 sys.stderr.write ( "Error occured while runing HMCan : Popen is called with invalid arguments.\n" )
|
|
238 if os.path.exists( tmp_dir ):
|
|
239 shutil.rmtree(tmp_dir)
|
|
240 raise
|
|
241
|
|
242 #Handle other
|
|
243 except Exception:
|
|
244 sys.stderr.write ( "Error occured while runing HMCan\n" )
|
|
245 if os.path.exists( tmp_dir ):
|
|
246 shutil.rmtree(tmp_dir)
|
|
247 raise
|
|
248
|
|
249 #clean up my mess
|
|
250 if os.path.exists( tmp_dir ):
|
|
251 shutil.rmtree(tmp_dir)
|
|
252
|
|
253
|
|
254 #main
|
|
255
|
|
256 if __name__=='__main__':
|
|
257 main()
|
|
258
|