Mercurial > repos > jbrayet > hmcan_1_0_docker
comparison hmcan_wrapper.py @ 0:c7901290c849 draft
Uploaded
author | jbrayet |
---|---|
date | Wed, 13 Jan 2016 10:19:39 -0500 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:c7901290c849 |
---|---|
1 #!/usr/bin/env python | |
2 | |
3 import sys, subprocess, tempfile, os, shutil, glob | |
4 | |
5 # samtools is added to PATH here | |
6 | |
7 def calculate_window_size ( chr_len_file, input_file, format ): | |
8 ''' this function calculates the window size used to evaluate copy number variation. | |
9 | |
10 W = L/(T*CV*CV) | |
11 | |
12 L=genome length | |
13 T=total read count (mapped) present in the input file (bed, bam, sam) | |
14 CV=coefficient of variation (0.05) | |
15 W=window size ''' | |
16 | |
17 cv=0.05 | |
18 #float b/c calculs is done in 'virgule flottante' | |
19 proc1=subprocess.Popen ("awk '{sum=sum+$2} END {print sum}' %s" % chr_len_file, shell=True, stdout=subprocess.PIPE) | |
20 genome_len = float (proc1.communicate()[0]) | |
21 if (proc1.returncode !=0): | |
22 raise Exception ("In calculate_window_size(), returncode of proc1 (genome_len) = i%" % proc1.returncode) | |
23 | |
24 if format == "bed" : | |
25 | |
26 #proc_RC_bed=subprocess.Popen ("wc -l < %s > lines;read nbr_reads < lines;echo $nbr_reads" % input_file, shell=True, stdout=subprocess.PIPE) | |
27 proc_RC_bed=subprocess.Popen ("grep -c 'chr' < %s" % input_file, shell=True, stdout=subprocess.PIPE) | |
28 read_count = int (proc_RC_bed.communicate()[0] ) | |
29 | |
30 #print ("read_count = %i\n" % read_count) | |
31 if (proc_RC_bed.returncode !=0): | |
32 raise Exception ("In calculate_window_size(), returncode of proc_RC_bed (read_count) = i%" % proc_RC_bed.returncode) | |
33 | |
34 elif format == "bam": | |
35 # get index, get read count (can't pipe into index) | |
36 proc_index=subprocess.Popen ("samtools index %s" % input_file, shell=True) | |
37 #print "samtools index "+str(input_file) | |
38 proc_index.wait() | |
39 proc_RC_bam=subprocess.Popen ("samtools idxstats %s | awk '{s=s+$3} END {print s}'" % input_file, shell=True, stdout=subprocess.PIPE ) | |
40 read_count=int(proc_RC_bam.communicate()[0]) | |
41 | |
42 else : | |
43 # format = sam : convert to bam, get index, get read count | |
44 proc_SamToBam=subprocess.Popen ("samtools view -bSh %s > %s.bam" % (input_file,input_file), shell=True) | |
45 proc_SamToBam.wait() | |
46 proc_index=subprocess.Popen ("samtools index %s.bam" % input_file, shell=True) | |
47 proc_index.wait() | |
48 # get read count | |
49 proc_RC_bam=subprocess.Popen ("samtools idxstats %s.bam | awk '{s+=$3} END {print s}'" % input_file, shell=True, stdout=subprocess.PIPE ) | |
50 read_count=int(proc_RC_bam.communicate()[0]) | |
51 | |
52 | |
53 # w's type is float, & round returns float.0 | |
54 return int(round(genome_len/(read_count *cv*cv ))) | |
55 | |
56 | |
57 def correct_hg19 (hg19Len,corrected_hg19Len ): | |
58 ''' this function corrects hg19.len file format : adds tab between the columns instead of space ''' | |
59 | |
60 i=open(hg19Len) | |
61 o=open(corrected_hg19Len, "w") | |
62 for line in i.readlines(): | |
63 col=line.split(" ") | |
64 col.insert(1, "\t") | |
65 o.write("".join(col)) | |
66 | |
67 i.close() | |
68 o.close() | |
69 | |
70 return | |
71 | |
72 | |
73 def main(): | |
74 | |
75 input_chip_file = sys.argv[1] | |
76 input_control_file = sys.argv[2] | |
77 hmcan_config_file = sys.argv[3] | |
78 gccount_config_file = sys.argv[4] | |
79 project_name = sys.argv[5] | |
80 output_peaks_file = sys.argv[6] | |
81 output_regions_file = sys.argv[7] | |
82 output_density_file = sys.argv[8] | |
83 output_posterior_proba_file = sys.argv[9] | |
84 hmcan_log_report = sys.argv[10] | |
85 chr_len_file = sys.argv[11] | |
86 format = sys.argv[12] | |
87 genome = sys.argv[13] | |
88 root_dir = sys.argv[14] | |
89 | |
90 #binary files | |
91 GCCOUNT="/usr/bin/HMCan/HMCanV1.20/Utils/GCCount/gccount" | |
92 HMCAN="/usr/bin/HMCan/HMCanV1.20/src/HMCan" | |
93 | |
94 ###### CREATE ANNOTATION FILES ######### | |
95 databasePath = root_dir+"/database/files" | |
96 | |
97 subprocess.Popen('mkdir -p '+databasePath+'/nebulaAnnotations', shell=True) | |
98 subprocess.Popen('mkdir -p '+databasePath+'/nebulaAnnotations/'+genome, shell=True) | |
99 | |
100 nebulaGenomePath=databasePath+"/nebulaAnnotations/"+genome | |
101 | |
102 FAIFILE='n' | |
103 LENFILE='n' | |
104 DICTFILE='n' | |
105 CHROFILE='n' | |
106 MAPFILE='n' | |
107 | |
108 if not os.path.isdir(nebulaGenomePath+"/chromosomes"): | |
109 subprocess.Popen('mkdir -p '+nebulaGenomePath+'/chromosomes', shell=True) | |
110 CHROFILE='y' | |
111 | |
112 if not os.path.isfile(nebulaGenomePath+"/"+genome+".len"): | |
113 FAIFILE='y' | |
114 LENFILE='y' | |
115 | |
116 if not os.path.isfile(nebulaGenomePath+"/out50m2_"+genome+".gem.mappability"): | |
117 MAPFILE='y' | |
118 | |
119 FILEPATH=databasePath.replace("database/files","tool-data") | |
120 | |
121 cmd='bash /usr/bin/create_annotation_files.sh '+FAIFILE+" "+LENFILE+" "+DICTFILE+" "+CHROFILE+" "+FILEPATH+" "+genome+" "+MAPFILE+" "+nebulaGenomePath | |
122 process=subprocess.Popen(cmd, shell=True) | |
123 process.wait() | |
124 ############### END ############## | |
125 | |
126 # create tmp dir.Returns absolute path to a created tmp_dir | |
127 tmp_dir=tempfile.mkdtemp() | |
128 #tmp_dir="/data/tmp/HMcanTest" | |
129 | |
130 try: | |
131 | |
132 ## GCCOUNT | |
133 | |
134 os.chdir(tmp_dir) | |
135 | |
136 #add 'window size' and 'step' to gccount config file | |
137 window_size=calculate_window_size (nebulaGenomePath+"/"+genome+".len", input_control_file, format) | |
138 | |
139 #window_size=10000 | |
140 #print ("window_size= %i\n" % window_size) | |
141 | |
142 | |
143 #************** EDIT config files (improved)***************** | |
144 | |
145 chr_len_file=nebulaGenomePath+"/"+genome+".len" | |
146 | |
147 #Frist, correct hg19.len file, in case HMCan is run for hg19 (because hg19.len is not tabular on the server, better change it here) | |
148 hg19="hg19" | |
149 if (hg19 in genome): | |
150 correct_hg19 (chr_len_file,nebulaGenomePath+"/corrected_hg19Len_file.len") | |
151 param_len=nebulaGenomePath+"/corrected_hg19Len_file.len" | |
152 | |
153 else: | |
154 #to make my life easier.. | |
155 param_len= chr_len_file | |
156 | |
157 #Edit gc count config file | |
158 cmd_step ="sed -i \"/step/c\step = %s\" %s" % ( window_size , gccount_config_file ) | |
159 cmd_window ="sed -i \"/window/c\window = %s\" %s" % ( window_size , gccount_config_file ) | |
160 cmd_chrLen = "sed -i \"s~chrLenFile =.*~chrLenFile = "+param_len+"~g\" "+gccount_config_file | |
161 cmd_chrFiles = "sed -i \"s~chrFiles =.*~chrFiles = "+nebulaGenomePath+"/chromosomes~g\" "+gccount_config_file | |
162 cmd_gemMappabilityFile = "sed -i \"s~gemMappabilityFile =.*~gemMappabilityFile = "+nebulaGenomePath+"/out50m2_"+genome+".gem.mappability~g\" "+gccount_config_file | |
163 | |
164 | |
165 s=subprocess.Popen(args=cmd_step, shell=True) | |
166 s.wait() | |
167 | |
168 w=subprocess.Popen(args=cmd_window, shell=True) | |
169 w.wait() | |
170 | |
171 cL=subprocess.Popen(args=cmd_chrLen, shell=True) | |
172 cL.wait() | |
173 | |
174 cF=subprocess.Popen(args=cmd_chrFiles, shell=True) | |
175 cF.wait() | |
176 | |
177 cM=subprocess.Popen(args=cmd_gemMappabilityFile, shell=True) | |
178 cM.wait() | |
179 | |
180 #*********** END edit config files **************************** | |
181 | |
182 | |
183 ##call gccount /dev/null | |
184 gc_proc=subprocess.Popen(args= "%s %s >> %s 2>&1" % (GCCOUNT, " ".join(['-conf', gccount_config_file]), hmcan_log_report), shell=True, stderr=subprocess.PIPE) | |
185 gc_stderr_data = gc_proc.communicate()[1] # waits for process to terminate, returns (stdoutdata, stderrdata) | |
186 #if (gc_proc.returncode !=0): | |
187 # raise Exception("GCcount returncode = %i\n GCcount stderr : %s\n" % (gc_proc.returncode,gc_stderr_data )) | |
188 | |
189 # Find outputs: .cnp files | |
190 # glob returns list of absolute paths of files.cnp in tmp_dir | |
191 cnp_list=glob.glob('*.cnp') | |
192 if (len(cnp_list) == 0): | |
193 raise Exception("Error while running Gccount : program did not generate outputs for HMCan (file.cnp) !\n GCCOUNT stderr : %s\n" % gc_stderr_data) | |
194 cnp_file=cnp_list[0] | |
195 #print "GC PROFIL file :%s\n" % cnp_file | |
196 | |
197 ## HMCAN | |
198 # EDIT hmcan config file : add 'GCIndex' and 'largeBinLength' | |
199 cmd_gc="sed -i \"/GCIndex/c\GCIndex %s\" %s" % ( cnp_file , hmcan_config_file ) | |
200 cmd_bin="sed -i \"/largeBinLength/c\largeBinLength %s\" %s" % ( window_size , hmcan_config_file ) | |
201 cmd_chrFiles = "sed -i \"s~genomePath.*~genomePath "+nebulaGenomePath+"/chromosomes~g\" "+hmcan_config_file | |
202 | |
203 g=subprocess.Popen(args=cmd_gc, shell=True) | |
204 g.wait() | |
205 | |
206 b=subprocess.Popen(args=cmd_bin, shell=True) | |
207 b.wait() | |
208 | |
209 cF=subprocess.Popen(args=cmd_chrFiles, shell=True) | |
210 cF.wait() | |
211 | |
212 #call HMCan , hmcan_log_report | |
213 hmcan_proc= subprocess.Popen(args= "%s %s >> %s 2>&1" % (HMCAN, " ".join([input_chip_file, input_control_file, hmcan_config_file , project_name]), hmcan_log_report ), shell=True, stderr=subprocess.PIPE) | |
214 hmcan_stderr_data=hmcan_proc.communicate()[1] | |
215 if (hmcan_proc.returncode !=0): | |
216 raise Exception ("Error occured, HMCan returncode = %i\n HMCan stderr : %s\n" % ( hmcan_proc.returncode, hmcan_stderr_data ) ) | |
217 | |
218 #deal with outputs : copy outputs from tmp_dir to dataset_XXX : | |
219 | |
220 #shutil.move( os.path.join (tmp_dir,"%s_peaks.bed" % project_name ), output_peaks_file) | |
221 shutil.move( os.path.join (tmp_dir,"%s_peaks.narrowPeak" % project_name ), output_peaks_file) | |
222 shutil.move( os.path.join( tmp_dir, "%s_regions.bed" % project_name ) ,output_regions_file ) | |
223 if (output_posterior_proba_file !='None') : | |
224 shutil.move( os.path.join( tmp_dir, "%s_posterior.wig" % project_name ) ,output_posterior_proba_file ) | |
225 if (output_density_file != 'None'): | |
226 shutil.move( os.path.join( tmp_dir, "%s.wig" % project_name ) ,output_density_file ) | |
227 | |
228 #Handle exceptions | |
229 except OSError: | |
230 sys.stderr.write ( "Error occured while runing HMCan : trying to execute a non existing file.\n" ) | |
231 #remove tmp_dir created: | |
232 if os.path.exists( tmp_dir ): | |
233 shutil.rmtree(tmp_dir) | |
234 raise | |
235 | |
236 except ValueError: | |
237 sys.stderr.write ( "Error occured while runing HMCan : Popen is called with invalid arguments.\n" ) | |
238 if os.path.exists( tmp_dir ): | |
239 shutil.rmtree(tmp_dir) | |
240 raise | |
241 | |
242 #Handle other | |
243 except Exception: | |
244 sys.stderr.write ( "Error occured while runing HMCan\n" ) | |
245 if os.path.exists( tmp_dir ): | |
246 shutil.rmtree(tmp_dir) | |
247 raise | |
248 | |
249 #clean up my mess | |
250 if os.path.exists( tmp_dir ): | |
251 shutil.rmtree(tmp_dir) | |
252 | |
253 | |
254 #main | |
255 | |
256 if __name__=='__main__': | |
257 main() | |
258 |