comparison mirbase_ultra_v2.py @ 22:d766563ab56d draft

Uploaded
author glogobyte
date Thu, 22 Oct 2020 08:25:23 +0000
parents
children
comparison
equal deleted inserted replaced
21:9c1e59bf8f6e 22:d766563ab56d
1 from mirbase_functions import *
2 from mirbase_graphs import *
3 import itertools
4 import time
5 import sys
6 import os
7 import urllib.request
8 import gzip
9 from multiprocessing import Process, Queue, Lock, Pool, Manager, Value
10 import subprocess
11 import argparse
12 from collections import OrderedDict
13 from matplotlib.backends.backend_pdf import PdfPages
14 import pandas as pd
15 from math import pi
16 import numpy as np
17 import matplotlib.pyplot as plt
18 from matplotlib.ticker import PercentFormatter
19 import seaborn as sns
20 import scipy.stats as stats
21 from plotnine import *
22 import math
23 import re
24 import matplotlib.ticker as mtick
25 import copy
26
27 subprocess.call(['mkdir','-p', 'split1','split2','split3','split4','split11','split12','Counts','Diff/temp_con','Diff/temp_tre','Diff/n_temp_con','Diff/n_temp_tre'])
28
29 parser = argparse.ArgumentParser()
30 parser.add_argument("-analysis", "--anal", help="choose type of analysis", action="store")
31 parser.add_argument("-con", "--control", help="input fastq file", nargs='+', default=[])
32 parser.add_argument("-tre", "--treated", help="input fastq file", nargs='+', default=[] )
33 parser.add_argument("-tool_dir", "--tool_directory", help="tool directory path", action="store")
34 parser.add_argument("-gen", "--org_name", help="tool directory path", action="store")
35 parser.add_argument("-program", "--pro", help="choose type of analysis", action="store")
36 parser.add_argument("-f", "--flag", help="choose the database", action="store")
37 parser.add_argument("-umis", "--umi", help="choose the database", action="store")
38 parser.add_argument("-percentage", "--per", help="choose the database", action="store")
39 parser.add_argument("-counts", "--count", help="choose the database", action="store")
40 parser.add_argument("-name1", "--n1", help="choose the database", action="store")
41 parser.add_argument("-name2", "--n2", help="choose the database", action="store")
42 args = parser.parse_args()
43
44
45 ###############################################################################################################################################################################################
46
47 if __name__ == '__main__':
48
49 starttime = time.time()
50
51 q1 = Queue()
52 q2 = Queue()
53 lock = Lock()
54 manager = Manager()
55
56 mature_mirnas=manager.list()
57 ps_mature=Process(target=download_matures,args=(mature_mirnas,args.org_name))
58 ps_mature.start()
59
60 args.control[0]=args.control[0][1:]
61 args.control[len(args.control)-1][:-1]
62 control = [(args.control[i:i+2]) for i in range(0, len(args.control), 2)]
63
64 args.treated[0]=args.treated[0][1:]
65 args.treated[len(args.treated)-1][:-1]
66 treated = [(args.treated[i:i+2]) for i in range(0, len(args.treated), 2)]
67
68
69 ############## Detection of templated isoforms ################
70
71 radar = manager.list([0,0,0,0])
72 samples = manager.list()
73 data= manager.list()
74 names_con=manager.list()
75 samples_mirna_names=manager.list()
76 deseq=manager.list()
77 unmap_seq=manager.Value('i',0)
78 unmap_counts=manager.Value('i',0)
79 LH2E_names=manager.list()
80 ini_c_samples = manager.list()
81
82
83 radar1 = manager.list([0,0,0,0])
84 samples1 = manager.list()
85 data1 = manager.list()
86 names_tre = manager.list()
87 samples_mirna_names1=manager.list()
88 deseq1=manager.list()
89 unmap1_seq = manager.Value('i',0)
90 unmap1_counts = manager.Value('i',0)
91 LH8E_names=manager.list()
92 ini_t_samples = manager.list()
93 ps_mature.join()
94
95
96 mature_mirnas=list(mature_mirnas)
97
98
99 starttime1 = time.time()
100 ps_sam = [Process(target=sam,args=(mature_mirnas,path[1][:-1],path[0].split(",")[0],"c",lock,samples,data,names_con,unmap_seq,samples_mirna_names,deseq,LH2E_names,"0",ini_c_samples,unmap_counts)) for path in control]
101 ps_sam.extend([Process(target=sam,args=(mature_mirnas,path[1][:-1],path[0].split(",")[0],"t",lock,samples1,data1,names_tre,unmap1_seq,samples_mirna_names1,deseq1,LH8E_names,"0",ini_t_samples,unmap1_counts)) for path in treated])
102
103 [p.start() for p in ps_sam]
104 [p.join() for p in ps_sam]
105 print('SAM took {} seconds'.format(time.time() - starttime1))
106
107 ps_hist=[Process(target=hist_red,args=(ini_c_samples,'c'))]
108 ps_hist.extend([Process(target=hist_red,args=(ini_t_samples,'t'))])
109 [x.start() for x in ps_hist]
110
111 starttime200=time.time()
112
113 sc = list(samples)
114 st = list(samples1)
115
116 names_con=list(names_con)
117 names_tre=list(names_tre)
118 samples_mirna_names=list(samples_mirna_names)
119 samples_mirna_names.sort()
120 samples_mirna_names=list(samples_mirna_names for samples_mirna_names,_ in itertools.groupby(samples_mirna_names))
121
122 samples_mirna_names1=list(samples_mirna_names1)
123 samples_mirna_names1.sort()
124 samples_mirna_names1=list(samples_mirna_names1 for samples_mirna_names1,_ in itertools.groupby(samples_mirna_names1))
125
126 deseq=list(deseq)
127 deseq1=list(deseq1)
128
129 new_names_con=manager.list()
130 new_names_tre=manager.list()
131 new_deseq=manager.list()
132 new_deseq1=manager.list()
133 ps_deseq=[Process(target=deseqe2,args=(sampp,samples_mirna_names,lock,new_deseq,names_con[i],new_names_con)) for i,sampp in enumerate(deseq)]
134 ps_deseq.extend([Process(target=deseqe2,args=(sampp,samples_mirna_names1,lock,new_deseq1,names_tre[i],new_names_tre)) for i,sampp in enumerate(deseq1)])
135
136 [z.start() for z in ps_deseq]
137 [z.join() for z in ps_deseq]
138 new_deseq=list(new_deseq)
139 new_deseq1=list(new_deseq1)
140
141 LH2E=[[x[0],x[2]] for x in new_deseq[0]]
142 [LH2E[i].append(y[i][1]) for i,_ in enumerate(LH2E) for y in new_deseq]
143
144 LH8E=[[x[0],x[2]] for x in new_deseq1[0]]
145 [LH8E[i].append(y[i][1]) for i,_ in enumerate(LH8E) for y in new_deseq1]
146
147 print('Deseq took {} seconds'.format(time.time() - starttime200))
148
149 merg_nam_LH2E=manager.list()
150 merg_nam_LH8E=manager.list()
151
152 LH2E_copy=copy.deepcopy(list(LH2E))
153 LH8E_copy=copy.deepcopy(list(LH8E))
154
155 fil_sort_tre=manager.list()
156 fil_sort_con=manager.list()
157 raw_sort_tre=manager.list()
158 raw_sort_con=manager.list()
159
160 ps_main = Process(target=main_temp,args=(list(LH2E), samples_mirna_names, list(LH8E), samples_mirna_names1,1,list(names_con),list(names_tre),fil_sort_tre,fil_sort_con,raw_sort_tre,raw_sort_con,args.per,args.count))
161 ps_main.start()
162
163 if args.anal=="2":
164 all_iso = manager.list()
165 ps_non_iso = Process(target=non_template_ref,args=(sc,st,all_iso))
166 ps_non_iso.start()
167
168 ps_merge = [Process(target=merging_names,args=(LH2E_copy,merg_nam_LH2E))]
169 ps_merge.extend([Process(target=merging_names,args=(LH8E_copy,merg_nam_LH8E))])
170 [x.start() for x in ps_merge]
171 [x.join() for x in ps_merge]
172
173 merg_nam_LH2E=list(merg_nam_LH2E)
174 merg_nam_LH8E=list(merg_nam_LH8E)
175
176 starttime2 = time.time()
177 procs = [Process(target=DB_write,args=(x[0],x[1],x[2],x[3],1)) for x in data]
178 procs.extend([Process(target=DB_write,args=(x[0],x[1],x[2],x[3],1)) for x in data1])
179 procs.extend([Process(target=make_spider,args=(merg_nam_LH2E,merg_nam_LH8E))])
180 if args.anal == "1":
181 procs.extend([Process(target=pie_temp,args=(merg_nam_LH2E,unmap_seq.value,unmap_counts.value,merg_nam_LH8E,unmap1_seq.value,unmap1_counts.value))])
182
183 [p.start() for p in procs]
184
185
186 if args.anal=="1":
187 [x.join() for x in ps_hist]
188 [p.join() for p in procs]
189 ps_pdf = Process(target=pdf_before_DE,args=(args.anal))
190 ps_pdf.start()
191
192 print('Graphs took {} seconds'.format(time.time() - starttime2))
193
194 ps_main.join()
195
196 fil_sort_con=list(fil_sort_con)
197 fil_sort_tre=list(fil_sort_tre)
198 if fil_sort_con==[]:
199 fil_sort_con=raw_sort_con
200 fil_sort_tre=raw_sort_tre
201
202 raw_sort_con=list(raw_sort_con)
203 raw_sort_tre=list(raw_sort_tre)
204 names_con=list(new_names_con)
205 names_tre=list(new_names_tre)
206
207 ps_write = Process(target=write_main,args=(raw_sort_con, raw_sort_tre, fil_sort_con, fil_sort_tre, names_con,names_tre,1,args.per,args.n1,args.n2))
208 ps_write.start()
209
210 ps1_matrix = [Process(target=ssamples,args=(names_con,fil_sort_con,"Diff/temp_con/",0))]
211 ps1_matrix.extend([Process(target=ssamples,args=(names_tre,fil_sort_tre,"Diff/temp_tre/",0))])
212 [p.start() for p in ps1_matrix]
213
214 if args.anal=="1":
215 ps_pdf.join()
216 if args.anal=="2":
217 [p.join() for p in procs]
218 [x.join() for x in ps_hist]
219
220 ps_write.join()
221 [p.join() for p in ps1_matrix]
222
223
224 ############################## Detection of Both #######################################
225
226 starttime10 = time.time()
227
228 if args.anal == "2":
229
230 n_data= manager.list()
231 n_names_con=manager.list()
232 n_samples_mirna_names=manager.list()
233 n_deseq=manager.list()
234 n_LH2E_names=manager.list()
235
236 n_data1 = manager.list()
237 n_names_tre = manager.list()
238 n_samples_mirna_names1=manager.list()
239 n_deseq1=manager.list()
240 n_LH8E_names=manager.list()
241
242
243 new_mat_mirnas = list(mature_mirnas)
244 ps_non_iso.join()
245
246 all_iso=list(all_iso)
247 new_mat_mirnas.extend(all_iso)
248
249 starttime11=time.time()
250
251 ps_sam = [Process(target=non_sam,args=(new_mat_mirnas,path[1][:-1],path[0].split(",")[0],"c",lock,n_data,n_names_con,n_deseq,n_samples_mirna_names,n_LH2E_names)) for path in control]
252 ps_sam.extend([Process(target=non_sam,args=(new_mat_mirnas,path[1][:-1],path[0].split(",")[0],"t",lock,n_data1,n_names_tre,n_deseq1,n_samples_mirna_names1,n_LH8E_names)) for path in treated])
253
254 [p.start() for p in ps_sam]
255 [p.join() for p in ps_sam]
256
257 print('Non-sam took {} seconds'.format(time.time() - starttime11))
258
259 starttime12=time.time()
260
261 n_names_con=list(n_names_con)
262 n_names_tre=list(n_names_tre)
263 n_samples_mirna_names=list(n_samples_mirna_names)
264 n_samples_mirna_names.sort()
265 n_samples_mirna_names=list(n_samples_mirna_names for n_samples_mirna_names,_ in itertools.groupby(n_samples_mirna_names))
266
267 n_samples_mirna_names1=list(n_samples_mirna_names1)
268 n_samples_mirna_names1.sort()
269 n_samples_mirna_names1=list(n_samples_mirna_names1 for n_samples_mirna_names1,_ in itertools.groupby(n_samples_mirna_names1))
270
271 n_deseq=list(n_deseq)
272 n_deseq1=list(n_deseq1)
273
274 new_n_names_con=manager.list()
275 new_n_names_tre=manager.list()
276 n_new_deseq=manager.list()
277 n_new_deseq1=manager.list()
278 ps_deseq=[Process(target=deseqe2,args=(sampp,n_samples_mirna_names,lock,n_new_deseq,n_names_con[i],new_n_names_con)) for i,sampp in enumerate(n_deseq)]
279 ps_deseq.extend([Process(target=deseqe2,args=(sampp,n_samples_mirna_names1,lock,n_new_deseq1,n_names_tre[i],new_n_names_tre)) for i,sampp in enumerate(n_deseq1)])
280
281 [x.start() for x in ps_deseq]
282 [x.join() for x in ps_deseq]
283 n_new_deseq=list(n_new_deseq)
284 n_new_deseq1=list(n_new_deseq1)
285
286 n_LH2E=[[x[0],x[2]] for x in n_new_deseq[0]]
287 [n_LH2E[i].append(y[i][1]) for i,_ in enumerate(n_LH2E) for y in n_new_deseq]
288
289 n_LH8E=[[x[0],x[2]] for x in n_new_deseq1[0]]
290 [n_LH8E[i].append(y[i][1]) for i,_ in enumerate(n_LH8E) for y in n_new_deseq1]
291
292 print('Non-deseq took {} seconds'.format(time.time() - starttime12))
293
294 merg_nam_n_LH2E=manager.list()
295 merg_nam_n_LH8E=manager.list()
296
297 n_LH2E_copy=copy.deepcopy(list(n_LH2E))
298 n_LH8E_copy=copy.deepcopy(list(n_LH8E))
299
300 n_sort_tre=manager.list()
301 n_sort_con=manager.list()
302
303 n_fil_sort_con=manager.list()
304 n_fil_sort_tre=manager.list()
305 n_raw_sort_con=manager.list()
306 n_raw_sort_tre=manager.list()
307
308 ps_main = Process(target=main_temp,args=(list(n_LH2E), n_samples_mirna_names, list(n_LH8E), n_samples_mirna_names1,1,list(n_names_con),list(n_names_tre),n_fil_sort_tre,n_fil_sort_con,n_raw_sort_tre,n_raw_sort_con,args.per,args.count))
309 ps_main.start()
310
311 ps_merge = [Process(target=merging_names,args=(n_LH2E_copy,merg_nam_n_LH2E))]
312 ps_merge.extend([Process(target=merging_names,args=(n_LH8E_copy,merg_nam_n_LH8E))])
313 [p.start() for p in ps_merge]
314 [p.join() for p in ps_merge]
315
316 merg_nam_n_LH2E=list(merg_nam_n_LH2E)
317 merg_nam_n_LH8E=list(merg_nam_n_LH8E)
318
319 procs = [Process(target=DB_write,args=(x[0],x[1],x[2],x[3],2)) for x in n_data]
320 procs.extend([Process(target=DB_write,args=(x[0],x[1],x[2],x[3],2)) for x in n_data1])
321 procs.extend([Process(target=logo_seq_red,args=(merg_nam_n_LH2E,'c'))])
322 procs.extend([Process(target=logo_seq_red,args=(merg_nam_n_LH8E,'t'))])
323 procs.extend([Process(target=pie_non_temp,args=(merg_nam_LH2E,merg_nam_n_LH2E,merg_nam_LH8E,merg_nam_n_LH8E,unmap_seq.value,unmap1_seq.value,unmap_counts.value,unmap1_counts.value))])
324
325 starttime13=time.time()
326 [p.start() for p in procs]
327 [p.join() for p in procs]
328
329 print('Graphs took {} seconds'.format(time.time() - starttime13))
330
331 procs1 = Process(target=pdf_before_DE,args=(args.anal))
332 procs1.start()
333
334 starttime14=time.time()
335 ps_main.join()
336
337 n_fil_sort_con=list(n_fil_sort_con)
338 n_fil_sort_tre=list(n_fil_sort_tre)
339 if n_fil_sort_con==[]:
340 n_fil_sort_con=n_raw_sort_con
341 n_fil_sort_tre=n_raw_sort_tre
342
343 n_raw_sort_con=list(n_raw_sort_con)
344 n_raw_sort_tre=list(n_raw_sort_tre)
345 n_names_con=list(new_n_names_con)
346 n_names_tre=list(new_n_names_tre)
347
348 ps_write = Process(target=write_main,args=(n_raw_sort_con, n_raw_sort_tre,n_fil_sort_con, n_fil_sort_tre, n_names_con, n_names_tre,2,args.per,args.n1,args.n2))
349 ps_write.start()
350
351 ps1_matrix = [Process(target=ssamples1,args=(n_names_con,n_fil_sort_con,names_con,fil_sort_con,"Diff/n_temp_con/",0))]
352 ps1_matrix.extend([Process(target=ssamples1,args=(n_names_tre,n_fil_sort_tre,names_tre,fil_sort_tre,"Diff/n_temp_tre/",0))])
353 [p.start() for p in ps1_matrix]
354
355 ps_write.join()
356 [p.join() for p in ps1_matrix]
357 procs1.join()
358 print('That took {} seconds'.format(time.time() - starttime10))
359 print('That took {} seconds'.format(time.time() - starttime))
360
361
362
363
364
365
366
367