0
|
1 from mirbase_functions import *
|
|
2 from mirbase_graphs import *
|
|
3 import itertools
|
|
4 import time
|
|
5 import sys
|
|
6 import os
|
|
7 import urllib.request
|
|
8 import gzip
|
|
9 from multiprocessing import Process, Queue, Lock, Pool, Manager, Value
|
|
10 import subprocess
|
|
11 import argparse
|
|
12 from collections import OrderedDict
|
|
13 from matplotlib.backends.backend_pdf import PdfPages
|
|
14 import pandas as pd
|
|
15 from math import pi
|
|
16 import numpy as np
|
|
17 import matplotlib.pyplot as plt
|
|
18 from matplotlib.ticker import PercentFormatter
|
|
19 import seaborn as sns
|
|
20 import scipy.stats as stats
|
|
21 from plotnine import *
|
|
22 import math
|
|
23 import re
|
|
24 import matplotlib.ticker as mtick
|
|
25 import copy
|
|
26
|
|
27 subprocess.call(['mkdir','-p', 'split1','split2','split3','split4','split11','split12','Counts','Diff/temp_con','Diff/temp_tre','Diff/n_temp_con','Diff/n_temp_tre'])
|
|
28
|
|
29 parser = argparse.ArgumentParser()
|
|
30 parser.add_argument("-analysis", "--anal", help="choose type of analysis", action="store")
|
|
31 parser.add_argument("-con", "--control", help="input fastq file", nargs='+', default=[])
|
|
32 parser.add_argument("-tre", "--treated", help="input fastq file", nargs='+', default=[] )
|
|
33 parser.add_argument("-tool_dir", "--tool_directory", help="tool directory path", action="store")
|
|
34 parser.add_argument("-gen", "--org_name", help="tool directory path", action="store")
|
|
35 parser.add_argument("-program", "--pro", help="choose type of analysis", action="store")
|
|
36 parser.add_argument("-f", "--flag", help="choose the database", action="store")
|
|
37 parser.add_argument("-umis", "--umi", help="choose the database", action="store")
|
|
38 parser.add_argument("-percentage", "--per", help="choose the database", action="store")
|
|
39 parser.add_argument("-counts", "--count", help="choose the database", action="store")
|
|
40 parser.add_argument("-name1", "--n1", help="choose the database", action="store")
|
|
41 parser.add_argument("-name2", "--n2", help="choose the database", action="store")
|
|
42 args = parser.parse_args()
|
|
43
|
|
44 #########################################################################################################################################
|
|
45
|
|
46 def collapse_sam(path):
|
|
47
|
|
48 ini_sam=read(path,0)
|
|
49 main_sam = [x.rstrip("\n").split("\t") for x in ini_sam if "@" not in x.split("\t")[0]]
|
|
50 intro_sam = [x.rstrip("\n").split("\t") for x in ini_sam if "@" in x.split("\t")[0]]
|
|
51
|
|
52 uni_seq = []
|
|
53 for x in main_sam:
|
|
54
|
|
55 if [x[2], x[9]] not in uni_seq:
|
|
56 uni_seq.append([x[2], x[9]])
|
|
57
|
|
58 new_main_sam=[]
|
|
59 incr_num=0
|
|
60 for i in range(len(uni_seq)):
|
|
61 count=0
|
|
62 incr_num+=1
|
|
63 for y in main_sam:
|
|
64 if uni_seq[i][1]==y[9] and uni_seq[i][0]==y[2]:
|
|
65 count+=1
|
|
66 temp=y
|
|
67 temp[10]="~~~~~~~~~~~~~~~~~~~~~~~~~~~~~"
|
|
68 temp[0]=str(incr_num)+"-"+str(count)
|
|
69 new_main_sam.append(temp)
|
|
70
|
|
71 new_sam=intro_sam+new_main_sam
|
|
72
|
|
73 return new_sam
|
|
74
|
|
75 #################################################################################################################################################################################################################
|
|
76
|
|
77 def duplicate_chroms_isoforms(List):
|
|
78
|
|
79 dupes=[]
|
|
80
|
|
81 for num in range(len(List)):
|
|
82
|
|
83 if [List[num][9],List[num][0],List[num][2]] not in dupes :
|
|
84 dupes.append([List[num][9],List[num][0],List[num][2]])
|
|
85
|
|
86 for x in List:
|
|
87 for y in dupes:
|
|
88 if x[9]==y[0] and x[0]==y[1] and x[2].split("_")[0]==y[2].split("_")[0] and x[2]!=y[2]:
|
|
89 y.append(x[2])
|
|
90
|
|
91
|
|
92 double_List = [x[:] for x in List]
|
|
93
|
|
94 chr_order=[]
|
|
95 for x in dupes:
|
|
96 temp = []
|
|
97 for i in range(2,len(x)):
|
|
98 if x[i].split("chr")[1].split("(")[0].isdigit():
|
|
99 temp.append(int(x[i].split("chr")[1].split("(")[1][0]+x[i].split("chr")[1].split("(")[0]))
|
|
100 else:
|
|
101 temp.append(x[i].split("chr")[1][0:4])
|
|
102
|
|
103 for z in temp:
|
|
104 if 'X(-)'==z or 'Y(-)'==z or 'X(+)'==z or 'Y(+)'==z:
|
|
105 temp = [str(j) for j in temp]
|
|
106 temp=list(set(temp))
|
|
107 temp.sort()
|
|
108 chr_order.append(temp)
|
|
109
|
|
110 final_dupes=[]
|
|
111 for i in range(len(dupes)):
|
|
112 final_dupes.append([dupes[i][0],dupes[i][2].split("_")[0],dupes[i][1]])
|
|
113 for x in chr_order[i]:
|
|
114 result = re.match("[-+]?\d+$", str(x))
|
|
115 if len(chr_order[i]) == len(set(chr_order[i])):
|
|
116 if result is not None:
|
|
117
|
|
118 if int(x)<0:
|
|
119 final_dupes[i][1]=final_dupes[i][1]+"_chr"+str(abs(int(x)))+"(-)"
|
|
120 else:
|
|
121 final_dupes[i][1] = final_dupes[i][1] + "_chr" + str(abs(int(x)))+"(+)"
|
|
122 else:
|
|
123 final_dupes[i][1] = final_dupes[i][1] + "_chr" + str(x)
|
|
124 else:
|
|
125 if result is not None:
|
|
126 if int(x) < 0:
|
|
127 final_dupes[i][1] = final_dupes[i][1] +dupes[i][2].split("_")[1]+ "_chr" + str(abs(int(x))) + "(-)"
|
|
128 else:
|
|
129 final_dupes[i][1] = final_dupes[i][1] +dupes[i][2].split("_")[1]+ "_chr" + str(abs(int(x))) + "(+)"
|
|
130 else:
|
|
131 final_dupes[i][1] = final_dupes[i][1] +dupes[i][2].split("_")[1]+ "_chr" + str(x)
|
|
132
|
|
133 final_dupes.sort()
|
|
134 final_dupes=list(final_dupes for final_dupes,_ in itertools.groupby(final_dupes))
|
|
135
|
|
136 for i in range(len(double_List)):
|
|
137 for x in final_dupes:
|
|
138
|
|
139 if double_List[i][9] == x[0] and double_List[i][0] == x[2] and len(double_List[i][2].split("_")) >3 and double_List[i][2].split("_")[0]==x[1].split("_")[0]:
|
|
140 gg=str("_"+double_List[i][2].split("_")[-2]+"_"+double_List[i][2].split("_")[-1])
|
|
141 double_List[i][2] = x[1]+gg
|
|
142
|
|
143 if double_List[i][9]==x[0] and double_List[i][0]== x[2] and len(double_List[i][2].split("_"))==3 and double_List[i][2].split("_")[0]==x[1].split("_")[0]:
|
|
144 double_List[i][2]=x[1]
|
|
145 List[i][2] = x[1]
|
|
146
|
|
147 List.sort()
|
|
148 new_list=list(List for List,_ in itertools.groupby(List))
|
|
149
|
|
150 double_List.sort()
|
|
151 new_double_List = list(double_List for double_List, _ in itertools.groupby(double_List))
|
|
152
|
|
153 return new_list, new_double_List
|
|
154
|
|
155
|
|
156 #############################################################################################################################################################################################################
|
|
157
|
|
158 def sam(mature_mirnas,path,name,con,l,samples,data,names,unmap_seq,samples_mirna_names,deseq,LHE_names,umi,ini_sample,unmap_counts):
|
|
159
|
|
160 # read the sam file
|
|
161 ini_sam=read(path,0)
|
|
162 new_main_sam = [x.rstrip("\n").split("\t") for x in ini_sam if "@" not in x.split("\t")[0]]
|
|
163 unique_seq = [x for x in new_main_sam if x[1] == '0' and len(x[9])>=18 and len(x[9])<=26]
|
|
164
|
|
165 sorted_uni_arms = []
|
|
166
|
|
167 for i in range(len(mature_mirnas)):
|
|
168 tmp_count_reads = 0 # calculate the total number of reads
|
|
169 tmp_count_seq = 0 # calculate the total number of sequences
|
|
170 for j in range(len(unique_seq)):
|
|
171
|
|
172 if "{" in unique_seq[j][2].split("_")[0]:
|
|
173 official=unique_seq[j][2].split("_")[0][:-4]
|
|
174 else:
|
|
175 official=unique_seq[j][2].split("_")[0]
|
|
176
|
|
177 if mature_mirnas[i].split(" ")[0][1:] == official:
|
|
178
|
|
179 temp_mature = mature_mirnas[i+1].strip().replace("U", "T")
|
|
180 off_part = longestSubstring(temp_mature, unique_seq[j][9])
|
|
181
|
|
182 mat_diff = temp_mature.split(off_part)
|
|
183 mat_diff = [len(mat_diff[0]), len(mat_diff[1])]
|
|
184
|
|
185 unique_diff = unique_seq[j][9].split(off_part)
|
|
186 unique_diff = [len(unique_diff[0]), len(unique_diff[1])]
|
|
187
|
|
188 # Problem with hsa-miR-8485
|
|
189 if mat_diff[1]!=0 and unique_diff[1]!=0:
|
|
190 unique_seq[j]=1
|
|
191 pre_pos = 0
|
|
192 post_pos = 0
|
|
193
|
|
194 elif mat_diff[0]!=0 and unique_diff[0]!=0:
|
|
195 unique_seq[j]=1
|
|
196 pre_pos = 0
|
|
197 post_pos = 0
|
|
198
|
|
199 else:
|
|
200 pre_pos = mat_diff[0]-unique_diff[0]
|
|
201 post_pos = unique_diff[1]-mat_diff[1]
|
|
202 tmp_count_reads = tmp_count_reads + int(unique_seq[j][0].split("-")[1])
|
|
203 tmp_count_seq = tmp_count_seq+1
|
|
204
|
|
205 if pre_pos != 0 or post_pos != 0:
|
|
206 if pre_pos == 0:
|
|
207 unique_seq[j][2] = unique_seq[j][2] + "_" +str(pre_pos) + "_" + '{:+d}'.format(post_pos)
|
|
208 elif post_pos == 0:
|
|
209 unique_seq[j][2] = unique_seq[j][2] + "_" + '{:+d}'.format(pre_pos) + "_" + str(post_pos)
|
|
210 else:
|
|
211 unique_seq[j][2] = unique_seq[j][2]+"_"+'{:+d}'.format(pre_pos)+"_"+'{:+d}'.format(post_pos)
|
|
212
|
|
213 for x in range(unique_seq.count(1)):
|
|
214 unique_seq.remove(1)
|
|
215 if tmp_count_reads != 0 and tmp_count_seq != 0:
|
|
216 sorted_uni_arms.append([mature_mirnas[i].split(" ")[0][1:], tmp_count_seq, tmp_count_reads])
|
|
217 sorted_uni_arms = sorted(sorted_uni_arms, key=lambda x: x[1], reverse=True)
|
|
218 dedup_unique_seq,double_fil_uni_seq=duplicate_chroms_isoforms(unique_seq)
|
|
219
|
|
220 for y in sorted_uni_arms:
|
|
221 counts=0
|
|
222 seqs=0
|
|
223 for x in double_fil_uni_seq:
|
|
224 if y[0]==x[2].split("_")[0]:
|
|
225 counts+=int(x[0].split("-")[1])
|
|
226 seqs+=1
|
|
227
|
|
228 y[1]=seqs
|
|
229 y[2]=counts
|
|
230
|
|
231 LHE=[]
|
|
232 l.acquire()
|
|
233 if con=="c":
|
|
234 LHE.extend(z[2] for z in double_fil_uni_seq)
|
|
235 for y in double_fil_uni_seq:
|
|
236 samples_mirna_names.append([y[2],y[9]])
|
|
237 deseq.append([[x[2], x[0].split('-')[1], x[9]] for x in double_fil_uni_seq])
|
|
238 LHE_names.extend(LHE)
|
|
239 unmap_seq.value += sum([1 for x in new_main_sam if x[1] == '4'])
|
|
240 unmap_counts.value += sum([int(x[0].split("-")[1]) for x in new_main_sam if x[1] == '4'])
|
|
241 names.append(name)
|
|
242 samples.append(dedup_unique_seq)
|
|
243 data.append([con,name,double_fil_uni_seq,sorted_uni_arms])
|
|
244 ini_sample.append(new_main_sam)
|
|
245
|
|
246 if con=="t":
|
|
247 LHE.extend(z[2] for z in double_fil_uni_seq)
|
|
248 for y in double_fil_uni_seq:
|
|
249 samples_mirna_names.append([y[2],y[9]])
|
|
250 deseq.append([[x[2], x[0].split('-')[1], x[9]] for x in double_fil_uni_seq])
|
|
251 LHE_names.extend(LHE)
|
|
252 unmap_seq.value += sum([1 for x in new_main_sam if x[1] == '4'])
|
|
253 unmap_counts.value += sum([int(x[0].split("-")[1]) for x in new_main_sam if x[1] == '4'])
|
|
254 names.append(name)
|
|
255 samples.append(dedup_unique_seq)
|
|
256 data.append([con,name,double_fil_uni_seq,sorted_uni_arms])
|
|
257 ini_sample.append(new_main_sam)
|
|
258 l.release()
|
|
259
|
|
260
|
|
261 ######################################################################################################################################
|
|
262 """
|
|
263
|
|
264 Read a sam file from Bowtie and do the followings:
|
|
265
|
|
266 1) Remove reverse stranded mapped reads
|
|
267 2) Remove unmapped reads
|
|
268 3) Remove all sequences with reads less than 11 reads
|
|
269 4) Sort the arms with the most sequences in decreading rate
|
|
270 5) Sort the sequences of every arm with the most reads in decreasing rate
|
|
271 6) Calculate total number of sequences of every arm
|
|
272 7) Calculate total number of reads of sequences of every arm.
|
|
273 8) Store all the informations in a txt file
|
|
274
|
|
275 """
|
|
276
|
|
277 def non_sam(mature_mirnas,path,name,con,l,data,names,n_deseq,n_samples_mirna_names,n_LHE_names):
|
|
278
|
|
279 ini_sam=read(path,0)
|
|
280 new_main_sam = [x.rstrip("\n").split("\t") for x in ini_sam if "@" not in x.split("\t")[0]]
|
|
281 unique_seq=[]
|
|
282 unique_seq = [x for x in new_main_sam if x[1] == '4' and len(x[9])>=18 and len(x[9])<=26]
|
|
283
|
|
284 uni_seq=[]
|
|
285 # Calculate the shifted positions for every isomir and add them to the name of it
|
|
286 sorted_uni_arms = []
|
|
287 for i in range(1,len(mature_mirnas),2):
|
|
288 tmp_count_reads = 0 # calculate the total number of reads
|
|
289 tmp_count_seq = 0 # calculate the total number of sequences
|
|
290
|
|
291 for j in range(len(unique_seq)):
|
|
292
|
|
293 temp_mature = mature_mirnas[i].strip().replace("U", "T")
|
|
294
|
|
295 if temp_mature in unique_seq[j][9]:
|
|
296
|
|
297 off_part = longestSubstring(temp_mature, unique_seq[j][9])
|
|
298
|
|
299 mat_diff = temp_mature.split(off_part)
|
|
300 mat_diff = [len(mat_diff[0]), len(mat_diff[1])]
|
|
301
|
|
302 unique_diff = unique_seq[j][9].split(off_part)
|
|
303 if len(unique_diff)<=2:
|
|
304 unique_diff = [len(unique_diff[0]), len(unique_diff[1])]
|
|
305
|
|
306 pre_pos = mat_diff[0]-unique_diff[0]
|
|
307 post_pos = unique_diff[1]-mat_diff[1]
|
|
308
|
|
309 lengthofmir = len(off_part) + post_pos
|
|
310 if pre_pos == 0:
|
|
311 tmp_count_reads = tmp_count_reads + int(unique_seq[j][0].split("-")[1])
|
|
312 tmp_count_seq = tmp_count_seq + 1
|
|
313
|
|
314 if pre_pos == 0:
|
|
315
|
|
316 t_name=unique_seq[j].copy()
|
|
317 t_name[2]=mature_mirnas[i - 1].split(" ")[0][1:] + "__" + str(pre_pos) + "_" + '{:+d}'.format(post_pos) + "_" + str(unique_seq[j][9][len(off_part):])
|
|
318 uni_seq.append(t_name)
|
|
319
|
|
320
|
|
321 if tmp_count_reads != 0 and tmp_count_seq != 0:
|
|
322 sorted_uni_arms.append([mature_mirnas[i-1].split(" ")[0][1:], tmp_count_seq, tmp_count_reads])
|
|
323
|
|
324
|
|
325 sorted_uni_arms = sorted(sorted_uni_arms, key=lambda x: x[1], reverse=True)
|
|
326 unique_seq = list(map(list, OrderedDict.fromkeys(map(tuple,uni_seq))))
|
|
327
|
|
328 LHE=[]
|
|
329
|
|
330 l.acquire()
|
|
331 if con=="c":
|
|
332 LHE.extend(x[2] for x in unique_seq if x[2]!="*")
|
|
333 for x in unique_seq:
|
|
334 if x[2]!="*":
|
|
335 n_samples_mirna_names.append([x[2],x[9]])
|
|
336 n_deseq.append([[x[2], x[0].split('-')[1], x[9]] for x in unique_seq if x[2]!="*"])
|
|
337 n_LHE_names.extend(LHE)
|
|
338 names.append(name)
|
|
339 data.append([con,name,unique_seq,sorted_uni_arms])
|
|
340
|
|
341
|
|
342 if con=="t":
|
|
343 LHE.extend(x[2] for x in unique_seq if x[2]!="*")
|
|
344 for x in unique_seq:
|
|
345 if x[2]!="*":
|
|
346 n_samples_mirna_names.append([x[2],x[9]])
|
|
347 n_deseq.append([[x[2], x[0].split('-')[1], x[9]] for x in unique_seq if x[2]!="*"])
|
|
348 n_LHE_names.extend(LHE)
|
|
349 names.append(name)
|
|
350 data.append([con,name,unique_seq,sorted_uni_arms])
|
|
351 l.release()
|
|
352
|
|
353 #####################################################################################################################################################################################################################
|
|
354 def deseq2_temp(samples_mirna_names,deseq,con,l):
|
|
355
|
|
356 samples_mirna_names.sort(key=lambda x:[0])
|
|
357 for i in range(len(deseq)):
|
|
358 for y in samples_mirna_names:
|
|
359 flag = 0
|
|
360 for x in deseq[i]:
|
|
361 if y[0] == x[0]:
|
|
362 flag = 1
|
|
363 break
|
|
364
|
|
365 if flag == 0:
|
|
366 deseq[i].append([y[0], "0", y[1]])
|
|
367
|
|
368 [deseq[i].sort(key=lambda x: x[0]) for i, _ in enumerate(deseq)]
|
|
369 deseq_final = [[x[0],x[2]] for x in deseq[0]]
|
|
370 [deseq_final[z].append(deseq[i][j][1]) for z,_ in enumerate(deseq_final) for i, _ in enumerate(deseq) for j,_ in enumerate(deseq[i]) if deseq_final[z][0] == deseq[i][j][0]]
|
|
371
|
|
372 l.acquire()
|
|
373 if con=="c":
|
|
374 q1.put(deseq_final)
|
|
375
|
|
376 if con=="t":
|
|
377 q2.put(deseq_final)
|
|
378 l.release()
|
|
379
|
|
380
|
|
381 ####################################################################################################################################################################################################################
|
|
382
|
|
383 def main_temp(LH2E, LH2E_names, LH8E, LH8E_names,flag,names_con,names_tre,filter_LH8E,filter_LH2E,raw_LH8E,raw_LH2E):
|
|
384
|
|
385 LH8E_add_names = [x for x in LH2E_names if x not in LH8E_names]
|
|
386 LH2E_add_names = [x for x in LH8E_names if x not in LH2E_names]
|
|
387
|
|
388 LH8E_add_names.sort()
|
|
389 LH2E_add_names.sort()
|
|
390 LH8E_add_names = list(LH8E_add_names for LH8E_add_names,_ in itertools.groupby(LH8E_add_names))
|
|
391 LH2E_add_names = list(LH2E_add_names for LH2E_add_names,_ in itertools.groupby(LH2E_add_names))
|
|
392
|
|
393 LH2E.sort()
|
|
394 LH8E.sort()
|
|
395 LH2E = list(LH2E for LH2E,_ in itertools.groupby(LH2E))
|
|
396 LH8E = list(LH8E for LH8E,_ in itertools.groupby(LH8E))
|
|
397
|
|
398 print("LHE_names")
|
|
399 print([len(LH8E_add_names),len(LH2E_add_names)])
|
|
400 print([len(LH8E),len(LH2E)])
|
|
401
|
|
402 zeros=["0"]*(len(LH8E[0])-2)
|
|
403 [LH8E_add_names[i].extend(zeros) for i,_ in enumerate(LH8E_add_names)]
|
|
404 LH8E=LH8E+LH8E_add_names
|
|
405
|
|
406 zeros=["0"]*(len(LH2E[0])-2)
|
|
407 [LH2E_add_names[i].extend(zeros) for i,_ in enumerate(LH2E_add_names)]
|
|
408 LH2E=LH2E+LH2E_add_names
|
|
409
|
|
410 dupes=[]
|
|
411 final_LH2E =[]
|
|
412
|
|
413 for num,_ in enumerate(LH2E):
|
|
414
|
|
415 if LH2E[num][1] not in final_LH2E and LH2E[num][0] not in final_LH2E:
|
|
416 final_LH2E.append(LH2E[num][1])
|
|
417 final_LH2E.append(LH2E[num][0])
|
|
418 else:
|
|
419 dupes.append(LH2E[num][1])
|
|
420
|
|
421
|
|
422 dupes=list(set(dupes))
|
|
423
|
|
424 dupes=[[x] for x in dupes]
|
|
425
|
|
426 for x in LH2E:
|
|
427 for y in dupes:
|
|
428 if x[1]==y[0]:
|
|
429 fl=0
|
|
430 if len(y)==1:
|
|
431 y.append(x[0])
|
|
432 else:
|
|
433 for i in range(1,len(y)):
|
|
434 if y[i].split("_")[0]==x[0].split("_")[0]:
|
|
435 fl=1
|
|
436 if len(x[0])<len(y[i]):
|
|
437 del y[i]
|
|
438 y.append(x[0])
|
|
439 break
|
|
440
|
|
441 if fl==0:
|
|
442 y.append((x[0]))
|
|
443
|
|
444 for y in dupes:
|
|
445 if len(y)>2:
|
|
446 for i in range(len(y)-1,1,-1):
|
|
447 y[1]=y[1]+"/"+y[i]
|
|
448 del y[i]
|
|
449
|
|
450 for x in LH2E:
|
|
451 for y in dupes:
|
|
452 if x[1]==y[0]:
|
|
453 x[0]=y[1]
|
|
454
|
|
455 for x in LH8E:
|
|
456 for y in dupes:
|
|
457 if x[1]==y[0]:
|
|
458 x[0]=y[1]
|
|
459
|
|
460
|
|
461 LH2E.sort()
|
|
462 LH2E=list(LH2E for LH2E,_ in itertools.groupby(LH2E))
|
|
463
|
|
464 LH8E.sort()
|
|
465 LH8E=list(LH8E for LH8E,_ in itertools.groupby(LH8E))
|
|
466
|
|
467 LH8E_new=[]
|
|
468 LH2E_new=[]
|
|
469
|
|
470 if int(args.per)!=-1:
|
|
471 percent=int(args.per)/100
|
|
472 print(percent)
|
|
473 print(args.count)
|
|
474
|
|
475 c_col_filter=round(percent*(len(LH2E[1])-2))
|
|
476 t_col_filter=round(percent*(len(LH8E[1])-2))
|
|
477
|
|
478 for i, _ in enumerate(LH2E):
|
|
479 c_cols=0
|
|
480 t_cols=0
|
|
481
|
|
482 c_cols=sum([1 for j in range(len(LH2E[i])-2) if int(LH2E[i][j+2])>=int(args.count)])
|
|
483 t_cols=sum([1 for j in range(len(LH8E[i])-2) if int(LH8E[i][j+2])>=int(args.count)])
|
|
484
|
|
485 if c_cols>=c_col_filter or t_cols>=t_col_filter:
|
|
486 LH8E_new.append(LH8E[i])
|
|
487 LH2E_new.append(LH2E[i])
|
|
488
|
|
489 filter_LH2E.extend(LH2E_new)
|
|
490 filter_LH8E.extend(LH8E_new)
|
|
491 raw_LH2E.extend(LH2E)
|
|
492 raw_LH8E.extend(LH8E)
|
|
493
|
|
494 ##################################################################################################################################################################################################################
|
|
495
|
|
496 def write_main(raw_LH2E, raw_LH8E, fil_LH2E, fil_LH8E, names_con, names_tre, flag):
|
|
497
|
|
498 if flag == 1 and int(args.per)!=-1:
|
|
499 fp = open('Counts/Filtered '+args.n2 +' Templated Counts', 'w')
|
|
500 fp.write("Name\t")
|
|
501 fp.write("Sequence")
|
|
502 for y in names_tre:
|
|
503 fp.write("\t"+y)
|
|
504
|
|
505 for x in fil_LH8E:
|
|
506 fp.write("\n%s" % "\t".join(x))
|
|
507 fp.close()
|
|
508
|
|
509 fp = open('Counts/Filtered '+args.n1+' Templated Counts', 'w')
|
|
510 fp.write("Name\t")
|
|
511 fp.write("Sequence")
|
|
512 for y in names_con:
|
|
513 fp.write("\t"+y)
|
|
514
|
|
515 for x in fil_LH2E:
|
|
516 fp.write("\n%s" % "\t".join(x))
|
|
517 fp.close()
|
|
518
|
|
519
|
|
520 if flag == 2 and int(args.per)!=-1:
|
|
521 fp = open('Counts/Filtered '+args.n2+' Non-Templated Counts', 'w')
|
|
522 fp.write("Name\t")
|
|
523 fp.write("Sequence")
|
|
524 for y in names_tre:
|
|
525 fp.write("\t"+y)
|
|
526
|
|
527
|
|
528 for x in fil_LH8E:
|
|
529 fp.write("\n%s" % "\t".join(x))
|
|
530 fp.close()
|
|
531
|
|
532 fp = open('Counts/Filtered '+args.n1+' Non-Templated Counts', 'w')
|
|
533 fp.write("Name\t")
|
|
534 fp.write("Sequence")
|
|
535 for y in names_con:
|
|
536 fp.write("\t"+y)
|
|
537
|
|
538 for x in fil_LH2E:
|
|
539 fp.write("\n%s" % "\t".join(x))
|
|
540 fp.close()
|
|
541
|
|
542
|
|
543 if flag == 1:
|
|
544 fp = open('Counts/Raw '+args.n2+' Templated Counts', 'w')
|
|
545 fp.write("Name\t")
|
|
546 fp.write("Sequence")
|
|
547 for y in names_tre:
|
|
548 fp.write("\t"+y)
|
|
549
|
|
550 for x in raw_LH8E:
|
|
551 fp.write("\n%s" % "\t".join(x))
|
|
552 fp.close()
|
|
553
|
|
554 fp = open('Counts/Raw '+args.n1+' Templated Counts', 'w')
|
|
555 fp.write("Name\t")
|
|
556 fp.write("Sequence")
|
|
557 for y in names_con:
|
|
558 fp.write("\t"+y)
|
|
559
|
|
560 for x in raw_LH2E:
|
|
561 fp.write("\n%s" % "\t".join(x))
|
|
562 fp.close()
|
|
563
|
|
564
|
|
565 if flag == 2:
|
|
566 fp = open('Counts/Raw '+args.n2+' Non-Templated Counts', 'w')
|
|
567 fp.write("Name\t")
|
|
568 fp.write("Sequence")
|
|
569 for y in names_tre:
|
|
570 fp.write("\t"+y)
|
|
571
|
|
572
|
|
573 for x in raw_LH8E:
|
|
574 fp.write("\n%s" % "\t".join(x))
|
|
575 fp.close()
|
|
576
|
|
577 fp = open('Counts/Raw '+args.n1+' Non-Templated Counts', 'w')
|
|
578 fp.write("Name\t")
|
|
579 fp.write("Sequence")
|
|
580 for y in names_con:
|
|
581 fp.write("\t"+y)
|
|
582
|
|
583 for x in raw_LH2E:
|
|
584 fp.write("\n%s" % "\t".join(x))
|
|
585 fp.close()
|
|
586
|
|
587
|
|
588 #########################################################################################################################################
|
|
589
|
|
590 def ssamples(names,samp,folder,pro):
|
|
591
|
|
592 for i in range(2,len(samp[0])):
|
|
593
|
|
594 fp = open(folder+names[i-2]+'.txt','w')
|
|
595 fp.write("miRNA id"+"\t"+names[i-2]+"\n")
|
|
596
|
|
597 for x in samp:
|
|
598 fp.write("%s" % "\t".join([x[0],x[i]])+"\n")
|
|
599 fp.close()
|
|
600
|
|
601 ##################################################################################################################
|
|
602
|
|
603 def DB_write(con,name,unique_seq,sorted_uni_arms,f):
|
|
604
|
|
605 if f==1:
|
|
606 # Write a txt file with all the information
|
|
607 if con=="c":
|
|
608 fp = open('split1/'+name, 'w')
|
|
609
|
|
610 fp.write("%s\t%-42s\t%s\n\n" % ("Number of Reads","Name of isomir","Sequence"))
|
|
611 if con=="t":
|
|
612 fp = open('split2/'+name, 'w')
|
|
613 fp.write("%s\t%-42s\t%s\n\n" % ("Number of Reads","Name of isomir","Sequence"))
|
|
614
|
|
615
|
|
616 for i in range(len(sorted_uni_arms)):
|
|
617 temp = []
|
|
618 for j in range(len(unique_seq)):
|
|
619
|
|
620 if sorted_uni_arms[i][0] in unique_seq[j][2].split("_")[0]:
|
|
621
|
|
622 temp.append(unique_seq[j])
|
|
623
|
|
624 temp = sorted(temp, key=lambda x: int(x[0].split('-')[1]), reverse=True)
|
|
625 fp.write("*********************************************************************************************************\n")
|
|
626 fp.write("%-8s\t%-22s\t%-25s\t%-30s\t%s\n" % ("|",str(sorted_uni_arms[i][0]),"Sequence count = "+str(sorted_uni_arms[i][1]),"Total reads = "+str(sorted_uni_arms[i][2]),"|"))
|
|
627 fp.write("*********************************************************************************************************\n\n")
|
|
628 [fp.write("%-8s\t%-40s\t%s\n" % (x[0].split("-")[1], x[2],x[9])) for x in temp]
|
|
629 fp.write("\n" + "\n")
|
|
630 fp.close()
|
|
631
|
|
632 if f==2:
|
|
633
|
|
634 if con=="c":
|
|
635 fp = open('split3/'+name, 'w')
|
|
636 fp.write("%s\t%-42s\t%s\n\n" % ("Number of Reads","Name of isomir","Sequence"))
|
|
637 if con=="t":
|
|
638 fp = open('split4/'+name, 'w')
|
|
639 fp.write("%s\t%-42s\t%s\n\n" % ("Number of Reads","Name of isomir","Sequence"))
|
|
640
|
|
641
|
|
642 for i in range(len(sorted_uni_arms)):
|
|
643 temp = []
|
|
644 for j in range(len(unique_seq)):
|
|
645 if sorted_uni_arms[i][0]==unique_seq[j][2].split("__")[0]:
|
|
646 temp.append(unique_seq[j])
|
|
647 if temp!=[]:
|
|
648 temp = sorted(temp, key=lambda x: int(x[0].split('-')[1]), reverse=True)
|
|
649 fp.write("*********************************************************************************************************\n")
|
|
650 fp.write("%-8s\t%-22s\t%-25s\t%-30s\t%s\n" % ("|",str(sorted_uni_arms[i][0]),"Sequence count = "+str(sorted_uni_arms[i][1]),"Total reads = "+str(sorted_uni_arms[i][2]),"|"))
|
|
651 fp.write("*********************************************************************************************************\n\n")
|
|
652 [fp.write("%-8s\t%-40s\t%s\n" % (x[0].split("-")[1], x[2],x[9])) for x in temp]
|
|
653 fp.write("\n" + "\n")
|
|
654 fp.close()
|
|
655
|
|
656
|
|
657 ##########################################################################################################################
|
|
658
|
|
659 def new_mat_seq(pre_unique_seq,mat_mirnas,l):
|
|
660
|
|
661 unique_iso = []
|
|
662 for x in pre_unique_seq:
|
|
663 if len(x[2].split("_"))==3:
|
|
664 for y in pre_unique_seq:
|
|
665 if x[2] in y[2] and int(x[0].split("-")[1])<int(y[0].split("-")[1]):
|
|
666 if any(y[2] in lst2 for lst2 in unique_iso)==False:
|
|
667 y[2]=">"+y[2]
|
|
668 unique_iso.append(y)
|
|
669 l.acquire()
|
|
670 for x in unique_iso:
|
|
671 mat_mirnas.append(x[2])
|
|
672 mat_mirnas.append(x[9])
|
|
673 l.release()
|
|
674
|
|
675 #########################################################################################################################
|
|
676 def pie_non_temp(merge_LH2E,merge_non_LH2E,merge_LH8E,merge_non_LH8E,c_unmap,t_unmap,c_unmap_counts,t_unmap_counts):
|
|
677
|
|
678 c_samples=[[x[0],x[1],sum(int(i) for i in x[2:])] for x in merge_LH2E]
|
|
679 t_samples=[[x[0],x[1],sum(int(i) for i in x[2:])] for x in merge_LH8E]
|
|
680 c_non_samples=[[x[0],x[1],sum(int(i) for i in x[2:])] for x in merge_non_LH2E]
|
|
681 t_non_samples=[[x[0],x[1],sum(int(i) for i in x[2:])] for x in merge_non_LH8E]
|
|
682
|
|
683 c_templ = 0
|
|
684 c_tem_counts = 0
|
|
685 c_mature = 0
|
|
686 c_mat_counts = 0
|
|
687 t_templ = 0
|
|
688 t_tem_counts = 0
|
|
689 t_mature = 0
|
|
690 t_mat_counts = 0
|
|
691
|
|
692 c_non = len(c_non_samples)
|
|
693 c_non_counts = sum(x[2] for x in c_non_samples)
|
|
694 t_non = len(t_non_samples)
|
|
695 t_non_counts = sum(x[2] for x in t_non_samples)
|
|
696
|
|
697 c_unmap = c_unmap - c_non
|
|
698 t_unmap = c_unmap - t_non
|
|
699
|
|
700 c_unmap_counts=c_unmap_counts - c_non_counts
|
|
701 t_unmap_counts=t_unmap_counts - t_non_counts
|
|
702
|
|
703
|
|
704 for x in c_samples:
|
|
705
|
|
706 if "/" not in x[0]:
|
|
707 if "chr" in x[0].split("_")[-1]:
|
|
708 c_mature+=1
|
|
709 c_mat_counts += x[2]
|
|
710 else:
|
|
711 c_templ+=1
|
|
712 c_tem_counts += x[2]
|
|
713 else:
|
|
714 f=0
|
|
715 for y in x[0].split("/"):
|
|
716 if "chr" in y.split("_")[-1]:
|
|
717 c_mature+=1
|
|
718 c_mat_counts += x[2]
|
|
719 f=1
|
|
720 break
|
|
721 if f==0:
|
|
722 c_templ+=1
|
|
723 c_tem_counts += x[2]
|
|
724
|
|
725 for x in t_samples:
|
|
726
|
|
727 if "/" not in x[0]:
|
|
728 if "chr" in x[0].split("_")[-1]:
|
|
729 t_mature+=1
|
|
730 t_mat_counts += x[2]
|
|
731 else:
|
|
732 t_templ+=1
|
|
733 t_tem_counts += x[2]
|
|
734 else:
|
|
735 f=0
|
|
736 for y in x[0].split("/"):
|
|
737 if "chr" in y.split("_")[-1]:
|
|
738 t_mature+=1
|
|
739 t_mat_counts += x[2]
|
|
740 f=1
|
|
741 break
|
|
742 if f==0:
|
|
743 t_templ+=1
|
|
744 t_tem_counts += x[2]
|
|
745
|
|
746 fig = plt.figure(figsize=(7,5))
|
|
747 labels = 'miRNA RefSeq','Template', 'Unmapped','Non-template'
|
|
748 sizes = [c_mat_counts, c_tem_counts, c_unmap_counts,c_non_counts]
|
|
749 colors = ['gold', 'yellowgreen', 'lightcoral', 'lightskyblue']
|
|
750 ax1 = plt.subplot2grid((1,2),(0,0))
|
|
751 patches, texts, autotexts=plt.pie(sizes, labels=labels, colors=colors, startangle=140,autopct='%1.1f%%',radius=0.8)
|
|
752 [x.set_fontsize(8) for x in texts]
|
|
753 plt.title('Control Group (reads)',fontsize=12)
|
|
754 labels = 'miRNA RefSeq','Template', 'Unmapped','non-template'
|
|
755 sizes = [t_mat_counts, t_tem_counts, t_unmap_counts, t_non_counts]
|
|
756 colors = ['gold', 'yellowgreen', 'lightcoral', 'lightskyblue']
|
|
757 ax2 = plt.subplot2grid((1,2),(0,1))
|
|
758 patches, texts, autotexts=plt.pie(sizes, labels=labels, colors=colors, startangle=140,autopct='%1.1f%%',radius=0.8)
|
|
759 [x.set_fontsize(8) for x in texts]
|
|
760 plt.title('Treated Group (reads)', fontsize=12)
|
|
761 plt.savefig('pie_non.png',dpi=300)
|
|
762
|
|
763 ######################################################################################################################################################
|
|
764
|
|
765 def merging_names(LH2E_copy,new):
|
|
766
|
|
767 dupes=[]
|
|
768 final_LH2E =[]
|
|
769
|
|
770 for num in range(len(LH2E_copy)):
|
|
771
|
|
772 if LH2E_copy[num][1] not in final_LH2E and LH2E_copy[num][0] not in final_LH2E:
|
|
773 final_LH2E.append(LH2E_copy[num][1])
|
|
774 final_LH2E.append(LH2E_copy[num][0])
|
|
775 else:
|
|
776 dupes.append(LH2E_copy[num][1])
|
|
777
|
|
778 dupes=list(set(dupes))
|
|
779
|
|
780 for i in range(len(dupes)):
|
|
781 dupes[i]=[dupes[i]]
|
|
782
|
|
783 for x in LH2E_copy:
|
|
784 for y in dupes:
|
|
785 if x[1]==y[0]:
|
|
786 fl=0
|
|
787 if len(y)==1:
|
|
788 y.append(x[0])
|
|
789 else:
|
|
790 for i in range(1,len(y)):
|
|
791 if y[i].split("_")[0]==x[0].split("_")[0]:
|
|
792 fl=1
|
|
793 if len(x[0])<len(y[i]):
|
|
794 del y[i]
|
|
795 y.append(x[0])
|
|
796 break
|
|
797
|
|
798 if fl==0:
|
|
799 y.append((x[0]))
|
|
800
|
|
801 for y in dupes:
|
|
802 if len(y)>2:
|
|
803 for i in range(len(y)-1,1,-1):
|
|
804 y[1]=y[1]+"/"+y[i]
|
|
805 del y[i]
|
|
806
|
|
807
|
|
808 for x in LH2E_copy:
|
|
809 for y in dupes:
|
|
810 if x[1]==y[0]:
|
|
811 x[0]=y[1]
|
|
812
|
|
813 LH2E_copy.sort()
|
|
814 LH2E_copy=list(LH2E_copy for LH2E_copy,_ in itertools.groupby(LH2E_copy))
|
|
815
|
|
816 new.extend(LH2E_copy)
|
|
817
|
|
818
|
|
819 ######################################################################################################################################################
|
|
820 def pie_temp(merge_LH2E,c_unmap,c_unmap_counts,merge_LH8E,t_unmap,t_unmap_counts):
|
|
821
|
|
822 c_samples=[[x[0],x[1],sum(int(i) for i in x[2:])] for x in merge_LH2E]
|
|
823 t_samples=[[x[0],x[1],sum(int(i) for i in x[2:])] for x in merge_LH8E]
|
|
824
|
|
825 c_templ = 0
|
|
826 c_tem_counts = 0
|
|
827 c_mature = 0
|
|
828 c_mat_counts = 0
|
|
829 t_templ = 0
|
|
830 t_tem_counts = 0
|
|
831 t_mature = 0
|
|
832 t_mat_counts = 0
|
|
833
|
|
834 for x in c_samples:
|
|
835
|
|
836 if "/" not in x[0]:
|
|
837 if "chr" in x[0].split("_")[-1]:
|
|
838 c_mature+=1
|
|
839 c_mat_counts += x[2]
|
|
840 else:
|
|
841 c_templ+=1
|
|
842 c_tem_counts += x[2]
|
|
843 else:
|
|
844 f=0
|
|
845 for y in x[0].split("/"):
|
|
846 if "chr" in y.split("_")[-1]:
|
|
847 c_mature+=1
|
|
848 c_mat_counts += x[2]
|
|
849 f=1
|
|
850 break
|
|
851 if f==0:
|
|
852 c_templ+=1
|
|
853 c_tem_counts += x[2]
|
|
854
|
|
855 for x in t_samples:
|
|
856
|
|
857 if "/" not in x[0]:
|
|
858 if "chr" in x[0].split("_")[-1]:
|
|
859 t_mature+=1
|
|
860 t_mat_counts += x[2]
|
|
861 else:
|
|
862 t_templ+=1
|
|
863 t_tem_counts += x[2]
|
|
864 else:
|
|
865 f=0
|
|
866 for y in x[0].split("/"):
|
|
867 if "chr" in y.split("_")[-1]:
|
|
868 t_mature+=1
|
|
869 t_mat_counts += x[2]
|
|
870 f=1
|
|
871 break
|
|
872 if f==0:
|
|
873 t_templ+=1
|
|
874 t_tem_counts += x[2]
|
|
875
|
|
876
|
|
877 fig = plt.figure()
|
|
878 labels = 'miRNA RefSeq','Template', 'Unmapped'
|
|
879 sizes = [c_mat_counts, c_tem_counts, c_unmap_counts]
|
|
880 colors = ['gold', 'yellowgreen', 'lightskyblue']
|
|
881 explode = (0.2, 0.05, 0.1)
|
|
882 ax1 = plt.subplot2grid((1,2),(0,0))
|
|
883 patches, texts, autotexts=plt.pie(sizes, labels=labels, colors=colors, startangle=140,autopct='%1.1f%%',radius=0.8)
|
|
884 [x.set_fontsize(8) for x in texts]
|
|
885 plt.title('Control group (reads)', fontsize=12)
|
|
886 labels = 'miRNA RefSeq','Template', 'Unmapped'
|
|
887 sizes = [t_mat_counts, t_tem_counts, t_unmap_counts]
|
|
888 colors = ['gold', 'yellowgreen', 'lightskyblue']
|
|
889 explode = (0.2, 0.05, 0.1)
|
|
890 ax2 = plt.subplot2grid((1,2),(0,1))
|
|
891 patches, texts, autotexts=plt.pie(sizes, labels=labels, colors=colors, startangle=140,autopct='%1.1f%%',radius=0.8)
|
|
892 [x.set_fontsize(8) for x in texts]
|
|
893 plt.title('Treated group (reads)',fontsize = 12)
|
|
894 plt.savefig('pie_tem.png',dpi=300)
|
|
895
|
|
896 ###################################################################################################################################################################################################################
|
|
897
|
|
898 def make_spider(merge_LH2E,merge_LH8E):
|
|
899
|
|
900 c_samples=[[x[0],x[1],sum(int(i) for i in x[2:])] for x in merge_LH2E]
|
|
901 t_samples=[[x[0],x[1],sum(int(i) for i in x[2:])] for x in merge_LH8E]
|
|
902
|
|
903 c_5 = 0
|
|
904 c_5_counts = 0
|
|
905 c_3 = 0
|
|
906 c_3_counts = 0
|
|
907 c_both =0
|
|
908 c_both_counts=0
|
|
909 c_mature = 0
|
|
910 c_mat_counts = 0
|
|
911 c_exception=0
|
|
912 c_exception_counts=0
|
|
913
|
|
914
|
|
915 t_5 = 0
|
|
916 t_5_counts = 0
|
|
917 t_3 = 0
|
|
918 t_3_counts = 0
|
|
919 t_both = 0
|
|
920 t_both_counts = 0
|
|
921 t_mature = 0
|
|
922 t_mat_counts = 0
|
|
923 t_exception = 0
|
|
924 t_exception_counts=0
|
|
925
|
|
926 for x in c_samples:
|
|
927
|
|
928 if "/" not in x[0]:
|
|
929 if "chr" in x[0].split("_")[-1]:
|
|
930 c_mature+=1
|
|
931 c_mat_counts += x[2]
|
|
932 elif 0 == int(x[0].split("_")[-1]):
|
|
933 c_5+=1
|
|
934 c_5_counts += x[2]
|
|
935 elif 0 == int(x[0].split("_")[-2]):
|
|
936 c_3+=1
|
|
937 c_3_counts += x[2]
|
|
938 else:
|
|
939 c_both+=1
|
|
940 c_both_counts+=x[2]
|
|
941
|
|
942 else:
|
|
943 f=0
|
|
944 for y in x[0].split("/"):
|
|
945 if "chr" in y.split("_")[-1]:
|
|
946 c_mature+=1
|
|
947 c_mat_counts += x[2]
|
|
948 f=1
|
|
949 break
|
|
950 if f==0:
|
|
951 for y in x[0].split("/"):
|
|
952 c_exception+=1
|
|
953 c_exception_counts += x[2]
|
|
954
|
|
955
|
|
956 for x in t_samples:
|
|
957
|
|
958 if "/" not in x[0]:
|
|
959 if "chr" in x[0].split("_")[-1]:
|
|
960 t_mature+=1
|
|
961 t_mat_counts += x[2]
|
|
962 elif 0 == int(x[0].split("_")[-1]):
|
|
963 t_5+=1
|
|
964 t_5_counts += x[2]
|
|
965 elif 0 == int(x[0].split("_")[-2]):
|
|
966 t_3+=1
|
|
967 t_3_counts += x[2]
|
|
968 else:
|
|
969 t_both+=1
|
|
970 t_both_counts+=x[2]
|
|
971
|
|
972 else:
|
|
973 f=0
|
|
974 for y in x[0].split("/"):
|
|
975 if "chr" in y.split("_")[-1]:
|
|
976 t_mature+=1
|
|
977 t_mat_counts += x[2]
|
|
978 f=1
|
|
979 break
|
|
980 if f==0:
|
|
981 for y in x[0].split("/"):
|
|
982 t_exception+=1
|
|
983 t_exception_counts += x[2]
|
|
984
|
|
985
|
|
986 c_all = c_5+c_3+c_both+c_mature+c_exception
|
|
987 c_all_counts = c_5_counts + c_3_counts + c_both_counts + c_mat_counts + c_exception_counts
|
|
988
|
|
989 t_all = t_5+t_3+t_both+t_mature + t_exception
|
|
990 t_all_counts = t_5_counts + t_3_counts + t_both_counts + t_mat_counts + t_exception_counts
|
|
991
|
|
992 c_5 = round(c_5/c_all*100,2)
|
|
993 c_3 = round(c_3/c_all*100,2)
|
|
994 c_both = round(c_both/c_all*100,2)
|
|
995 c_mature = round(c_mature/c_all*100,2)
|
|
996 c_exception = round(c_exception/c_all*100,2)
|
|
997
|
|
998 c_5_counts = round(c_5_counts/c_all_counts*100,2)
|
|
999 c_3_counts = round(c_3_counts/c_all_counts*100,2)
|
|
1000 c_both_counts = round(c_both_counts/c_all_counts*100,2)
|
|
1001 c_mat_counts = round(c_mat_counts/c_all_counts*100,2)
|
|
1002 c_exception_counts = round(c_exception_counts/c_all_counts*100,2)
|
|
1003
|
|
1004 t_5 = round(t_5/t_all*100,2)
|
|
1005 t_3 = round(t_3/t_all*100,2)
|
|
1006 t_both = round(t_both/t_all*100,2)
|
|
1007 t_mature = round(t_mature/t_all*100,2)
|
|
1008 t_exception = round(t_exception/t_all*100,2)
|
|
1009
|
|
1010 t_5_counts = round(t_5_counts/t_all_counts*100,2)
|
|
1011 t_3_counts = round(t_3_counts/t_all_counts*100,2)
|
|
1012 t_both_counts = round(t_both_counts/t_all_counts*100,2)
|
|
1013 t_mat_counts = round(t_mat_counts/t_all_counts*100,2)
|
|
1014 t_exception_counts = round(t_exception_counts/t_all_counts*100,2)
|
|
1015
|
|
1016 radar_max = max(c_5, c_3, c_both,c_mature,c_exception,t_5,t_3,t_both,t_mature,t_exception)
|
|
1017 radar_max_counts = max(c_5_counts,c_3_counts,c_both_counts,c_mat_counts,c_exception_counts,t_5_counts,t_3_counts,t_both_counts,t_mat_counts,t_exception_counts)
|
|
1018
|
|
1019 df=pd.DataFrame({
|
|
1020 'group':['Controls','Treated'],
|
|
1021 """5' and 3' isomiRs""":[c_both,t_both],
|
|
1022 """3' isomiRs""":[c_3,t_3],
|
|
1023 'miRNA RefSeq':[c_mature,t_mature],
|
|
1024 """5' isomiRs""":[c_5,t_5],
|
|
1025 'Others*':[c_exception,t_exception]})
|
|
1026
|
|
1027 df1=pd.DataFrame({
|
|
1028 'group':['Controls','Treated'],
|
|
1029 """5' and 3' isomiRs""":[c_both_counts,t_both_counts],
|
|
1030 """3' isomiRs""":[c_3_counts,t_3_counts],
|
|
1031 'miRNA RefSeq':[c_mat_counts,t_mat_counts],
|
|
1032 """5' isomiRs""":[c_5_counts,t_5_counts],
|
|
1033 'Others*':[c_exception_counts,t_exception_counts]})
|
|
1034
|
|
1035 spider_last(df,radar_max,1)
|
|
1036 spider_last(df1,radar_max_counts,2)
|
|
1037
|
|
1038
|
|
1039 def spider_last(df,radar_max,flag):
|
|
1040 # ------- PART 1: Create background
|
|
1041 fig = plt.figure()
|
|
1042 # number of variable
|
|
1043 categories=list(df)[1:]
|
|
1044 N = len(categories)
|
|
1045
|
|
1046 # What will be the angle of each axis in the plot? (we divide the plot / number of variable)
|
|
1047 angles = [n / float(N) * 2 * pi for n in range(N)]
|
|
1048 angles += angles[:1]
|
|
1049
|
|
1050 # Initialise the spider plot
|
|
1051 ax = plt.subplot(111, polar=True)
|
|
1052
|
|
1053 # If you want the first axis to be on top:
|
|
1054 ax.set_theta_offset(pi/2)
|
|
1055 ax.set_theta_direction(-1)
|
|
1056
|
|
1057 # Draw one axe per variable + add labels labels yet
|
|
1058 plt.xticks(angles[:-1], categories, fontsize=11)
|
|
1059
|
|
1060 # Draw ylabels
|
|
1061 radar_max=round(radar_max+radar_max*0.1)
|
|
1062 mul=len(str(radar_max))-1
|
|
1063 maxi=int(math.ceil(radar_max / pow(10,mul))) * pow(10,mul)
|
|
1064 sep = round(maxi/4)
|
|
1065 plt.yticks([sep, 2*sep, 3*sep, 4*sep, 5*sep], [str(sep)+'%', str(2*sep)+'%', str(3*sep)+'%', str(4*sep)+'%', str(5*sep)+'%'], color="grey", size=10)
|
|
1066 plt.ylim(0, maxi)
|
|
1067
|
|
1068 # ------- PART 2: Add plots
|
|
1069
|
|
1070 # Plot each individual = each line of the data
|
|
1071 # I don't do a loop, because plotting more than 3 groups makes the chart unreadable
|
|
1072
|
|
1073 # Ind1
|
|
1074 values=df.loc[0].drop('group').values.flatten().tolist()
|
|
1075 values += values[:1]
|
|
1076 ax.plot(angles, values,'-o', linewidth=1, linestyle='solid', label="Controls")
|
|
1077 ax.fill(angles, values, 'b', alpha=0.1)
|
|
1078
|
|
1079 # Ind2
|
|
1080 values=df.loc[1].drop('group').values.flatten().tolist()
|
|
1081 values += values[:1]
|
|
1082 ax.plot(angles, values, '-o' ,linewidth=1, linestyle='solid', label="Treated")
|
|
1083 ax.fill(angles, values, 'r', alpha=0.1)
|
|
1084
|
|
1085 # Add legend
|
|
1086 if flag==1:
|
|
1087 plt.legend(loc='upper right', bbox_to_anchor=(0.0, 0.1))
|
|
1088 plt.savefig('spider_non_red.png',dpi=300)
|
|
1089 else:
|
|
1090 plt.legend(loc='upper right', bbox_to_anchor=(0.0, 0.1))
|
|
1091 plt.savefig('spider_red.png',dpi=300)
|
|
1092
|
|
1093
|
|
1094 #############################################################################################################################################################################################################
|
|
1095 def hist_red(samples,flag):
|
|
1096 lengths=[]
|
|
1097 cat=[]
|
|
1098 total_reads=0
|
|
1099 seq=[]
|
|
1100
|
|
1101 if flag == "c":
|
|
1102 title = "Length Distribution of Control group (Redudant reads)"
|
|
1103 if flag == "t":
|
|
1104 title = "Length Distribution of Treated group (Redudant reads)"
|
|
1105
|
|
1106 for i in samples:
|
|
1107 for x in i:
|
|
1108 lengths.append(len(x[9]))
|
|
1109 if x[1]=="0":
|
|
1110 seq.append([x[9],x[0].split("-")[1],"Mapped"])
|
|
1111 cat.append("Mapped")
|
|
1112 if x[1] == "4":
|
|
1113 seq.append([x[9],x[0].split("-")[1],"Unmapped"])
|
|
1114 cat.append("Unmapped")
|
|
1115
|
|
1116 uni_len=list(set(lengths))
|
|
1117 uni_len=[x for x in uni_len if x<=35]
|
|
1118 low=min(lengths)
|
|
1119 up=max(lengths)
|
|
1120 seq.sort()
|
|
1121 uni_seq=list(seq for seq,_ in itertools.groupby(seq))
|
|
1122 dim=up-low
|
|
1123
|
|
1124 if dim>20:
|
|
1125 s=5
|
|
1126 else:
|
|
1127 s=8
|
|
1128
|
|
1129 total_reads+=sum([int(x[1]) for x in uni_seq])
|
|
1130
|
|
1131 map_reads=[]
|
|
1132 unmap_reads=[]
|
|
1133 length=[]
|
|
1134 for y in uni_len:
|
|
1135 map_temp=0
|
|
1136 unmap_temp=0
|
|
1137 for x in uni_seq:
|
|
1138 if len(x[0])==y and x[2]=="Mapped":
|
|
1139 map_temp+=int(x[1])
|
|
1140 if len(x[0])==y and x[2]=="Unmapped":
|
|
1141 unmap_temp+=int(x[1])
|
|
1142 if y<=35:
|
|
1143 length.append(y)
|
|
1144 map_reads.append(round(map_temp/total_reads*100,2))
|
|
1145 unmap_reads.append(round(unmap_temp/total_reads*100,2))
|
|
1146
|
|
1147 ylim=max([sum(x) for x in zip(unmap_reads, map_reads)])
|
|
1148 ylim=ylim+ylim*20/100
|
|
1149 fig, ax = plt.subplots()
|
|
1150 width=0.8
|
|
1151 ax.bar(length, unmap_reads, width, label='Unmapped')
|
|
1152 h=ax.bar(length, map_reads, width, bottom=unmap_reads, label='Mapped')
|
|
1153 plt.xticks(np.arange(length[0], length[-1]+1, 1))
|
|
1154 plt.xlabel('Length (nt)',fontsize=14)
|
|
1155 plt.ylabel('Percentage',fontsize=14)
|
|
1156 plt.title(title,fontsize=14)
|
|
1157 ax.legend()
|
|
1158 plt.ylim([0, ylim])
|
|
1159 ax.grid(axis='y',linewidth=0.2)
|
|
1160
|
|
1161 if flag=='c':
|
|
1162 plt.savefig('c_hist_red.png',dpi=300)
|
|
1163
|
|
1164 if flag=='t':
|
|
1165 plt.savefig('t_hist_red.png',dpi=300)
|
|
1166
|
|
1167 #################################################################################################################
|
|
1168
|
|
1169
|
|
1170 def logo_seq_red(merge, flag):
|
|
1171
|
|
1172 if flag=="c":
|
|
1173 titlos="Control group (Redundant)"
|
|
1174 file_logo="c_logo.png"
|
|
1175 file_bar="c_bar.png"
|
|
1176 if flag=="t":
|
|
1177 titlos="Treated group (Redundant)"
|
|
1178 file_logo="t_logo.png"
|
|
1179 file_bar="t_bar.png"
|
|
1180
|
|
1181 c_samples=[[x[0],x[1],sum(int(i) for i in x[2:])] for x in merge]
|
|
1182
|
|
1183 A=[0]*5
|
|
1184 C=[0]*5
|
|
1185 G=[0]*5
|
|
1186 T=[0]*5
|
|
1187 total_reads=0
|
|
1188
|
|
1189 for y in c_samples:
|
|
1190 if "/" in y[0]:
|
|
1191 length=[]
|
|
1192 for x in y[0].split("/"):
|
|
1193 length.append([len(x.split("_")[-1]),x.split("_")[-1],y[2]])
|
|
1194
|
|
1195 best=min(length)
|
|
1196 total_reads+=best[2]
|
|
1197 for i in range(5):
|
|
1198 if i<len(best[1]):
|
|
1199 if best[1][i] == "A":
|
|
1200 A[i]+=best[2]
|
|
1201 elif best[1][i] == "C":
|
|
1202 C[i]+=best[2]
|
|
1203 elif best[1][i] == "G":
|
|
1204 G[i]+=best[2]
|
|
1205 else:
|
|
1206 T[i]+=best[2]
|
|
1207 else:
|
|
1208 total_reads+=y[2]
|
|
1209 for i in range(5):
|
|
1210 if i<len(y[0].split("_")[-1]):
|
|
1211 if y[0].split("_")[-1][i] == "A":
|
|
1212 A[i]+=(y[2])
|
|
1213 elif y[0].split("_")[-1][i] == "C":
|
|
1214 C[i]+=(y[2])
|
|
1215 elif y[0].split("_")[-1][i] == "G":
|
|
1216 G[i]+=(y[2])
|
|
1217 else:
|
|
1218 T[i]+=y[2]
|
|
1219
|
|
1220 A[:] = [round(x*100,1) / total_reads for x in A]
|
|
1221 C[:] = [round(x*100,1) / total_reads for x in C]
|
|
1222 G[:] = [round(x*100,1) / total_reads for x in G]
|
|
1223 T[:] = [round(x*100,1) / total_reads for x in T]
|
|
1224
|
|
1225
|
|
1226
|
|
1227 data = {'A':A,'C':C,'G':G,'T':T}
|
|
1228 df = pd.DataFrame(data, index=[1,2,3,4,5])
|
|
1229 h=df.plot.bar(color=tuple(["g", "b","gold","r"]) )
|
|
1230 h.grid(axis='y',linewidth=0.2)
|
|
1231 plt.xticks(rotation=0, ha="right")
|
|
1232 plt.ylabel("Counts (%)",fontsize=18)
|
|
1233 plt.xlabel("Positions (nt)",fontsize=18)
|
|
1234 plt.title(titlos,fontsize=20)
|
|
1235 plt.tight_layout()
|
|
1236 plt.savefig(file_bar, dpi=300)
|
|
1237
|
|
1238 import logomaker as lm
|
|
1239 crp_logo = lm.Logo(df, font_name = 'DejaVu Sans')
|
|
1240 crp_logo.style_spines(visible=False)
|
|
1241 crp_logo.style_spines(spines=['left', 'bottom'], visible=True)
|
|
1242 crp_logo.style_xticks(rotation=0, fmt='%d', anchor=0)
|
|
1243
|
|
1244 # style using Axes methods
|
|
1245 crp_logo.ax.set_title(titlos,fontsize=18)
|
|
1246 crp_logo.ax.set_ylabel("Counts (%)", fontsize=16,labelpad=5)
|
|
1247 crp_logo.ax.set_xlabel("Positions (nt)",fontsize=16, labelpad=5)
|
|
1248 crp_logo.ax.xaxis.set_ticks_position('none')
|
|
1249 crp_logo.ax.xaxis.set_tick_params(pad=-1)
|
|
1250 figure = plt.gcf()
|
|
1251 figure.set_size_inches(6, 4)
|
|
1252 crp_logo.fig.savefig(file_logo,dpi=300)
|
|
1253
|
|
1254 ##########################################################################################################################################################################################################
|
|
1255
|
|
1256
|
|
1257
|
|
1258 def logo_seq_non_red(merge_LH2E):
|
|
1259
|
|
1260 c_samples=[[x[0],x[1],sum(int(i) for i in x[2:])] for x in merge_LH2E]
|
|
1261
|
|
1262 A=[0]*5
|
|
1263 C=[0]*5
|
|
1264 G=[0]*5
|
|
1265 T=[0]*5
|
|
1266
|
|
1267 for y in c_samples:
|
|
1268 if "/" in y[0]:
|
|
1269 length=[]
|
|
1270 for x in y[0].split("/"):
|
|
1271 length.append([len(x.split("_")[-1]),x.split("_")[-1],y[2]])
|
|
1272
|
|
1273 best=min(length)
|
|
1274 for i in range(5):
|
|
1275 if i<len(best[1]):
|
|
1276 if best[1][i] == "A":
|
|
1277 A[i]+=1
|
|
1278 elif best[1][i] == "C":
|
|
1279 C[i]+=1
|
|
1280 elif best[1][i] == "G":
|
|
1281 G[i]+=1
|
|
1282 else:
|
|
1283 T[i]+=1
|
|
1284 else:
|
|
1285 for i in range(5):
|
|
1286 if i<len(y[0].split("_")[-1]):
|
|
1287 if y[0].split("_")[-1][i] == "A":
|
|
1288 A[i]+=1
|
|
1289 elif y[0].split("_")[-1][i] == "C":
|
|
1290 C[i]+=1
|
|
1291 elif y[0].split("_")[-1][i] == "G":
|
|
1292 G[i]+=1
|
|
1293 else:
|
|
1294 T[i]+=1
|
|
1295
|
|
1296 data = {'A':A,'C':C,'G':G,'T':T}
|
|
1297 df = pd.DataFrame(data, index=[1,2,3,4,5])
|
|
1298 h=df.plot.bar(title="Non-templated nucleotides after templated sequence",color=tuple(["g", "b","gold","r"]))
|
|
1299 h.set_xlabel("Positions (nt)")
|
|
1300 h.set_ylabel("Unique sequences")
|
|
1301 plt.xticks(rotation=0, ha="right")
|
|
1302 plt.tight_layout()
|
|
1303 plt.savefig("bar2.png", dpi=300)
|
|
1304
|
|
1305
|
|
1306 import logomaker as lm
|
|
1307 crp_logo = lm.Logo(df, font_name = 'DejaVu Sans')
|
|
1308
|
|
1309 # style using Logo methods
|
|
1310 crp_logo.style_spines(visible=False)
|
|
1311 crp_logo.style_spines(spines=['left', 'bottom'], visible=True)
|
|
1312 crp_logo.style_xticks(rotation=0, fmt='%d', anchor=0)
|
|
1313
|
|
1314 # style using Axes methods
|
|
1315 crp_logo.ax.set_ylabel("Unique sequences", labelpad=5)
|
|
1316 crp_logo.ax.set_xlabel("Positions (nt)", labelpad=5)
|
|
1317 crp_logo.ax.xaxis.set_ticks_position('none')
|
|
1318 crp_logo.ax.xaxis.set_tick_params(pad=-1)
|
|
1319 crp_logo.ax.set_title("Non-redundant")
|
|
1320 figure = plt.gcf()
|
|
1321 crp_logo.fig.savefig('logo2.png', dpi=300)
|
|
1322
|
|
1323
|
|
1324 ###################################################################################################################################################################################################################
|
|
1325
|
|
1326 def ssamples1(tem_names,tem_samp,non_names,non_samp,folder,pro):
|
|
1327
|
|
1328 for i in range(2,len(tem_samp[0])):
|
|
1329
|
|
1330 fp = open(folder+tem_names[i-2]+'.txt','w')
|
|
1331 fp.write("miRNA id"+"\t"+tem_names[i-2]+"\n")
|
|
1332
|
|
1333 for x in tem_samp:
|
|
1334 fp.write("%s" % "\t".join([x[0],x[i]])+"\n")
|
|
1335
|
|
1336 for j in range(len(non_names)):
|
|
1337 if non_names[j]==tem_names[i-2]:
|
|
1338 for x in non_samp:
|
|
1339 fp.write("%s" % "\t".join([x[0],x[j+2]])+"\n")
|
|
1340 fp.close()
|
|
1341
|
|
1342 ###################################################################################################################################################################################################################
|
|
1343
|
|
1344 def download_matures(matures,org_name):
|
|
1345
|
|
1346 #url = 'ftp://mirbase.org/pub/mirbase/21/mature.fa.gz'
|
|
1347 url = 'ftp://mirbase.org/pub/mirbase/CURRENT/mature.fa.gz'
|
|
1348 data = urllib.request.urlopen(url).read()
|
|
1349 file_mirna = gzip.decompress(data).decode('utf-8')
|
|
1350 file_mirna = file_mirna.split("\n")
|
|
1351
|
|
1352 for i in range(0,len(file_mirna)-1,2):
|
|
1353
|
|
1354 if org_name in file_mirna[i]:
|
|
1355 matures.append(file_mirna[i])
|
|
1356 matures.append(file_mirna[i+1])
|
|
1357
|
|
1358 ###################################################################################################################################################################################################################
|
|
1359 def non_template_ref(sc,st,all_isoforms):
|
|
1360
|
|
1361 pre_uni_seq_con = list(sc)
|
|
1362 pre_uni_seq_tre = list(st)
|
|
1363
|
|
1364 for x in pre_uni_seq_con:
|
|
1365 for y in x:
|
|
1366 if ">"+y[2] not in all_isoforms and ")_" in y[2] :
|
|
1367 all_isoforms.append(">"+y[2])
|
|
1368 all_isoforms.append(y[9])
|
|
1369
|
|
1370
|
|
1371 for x in pre_uni_seq_tre:
|
|
1372 for y in x:
|
|
1373 if ">"+y[2] not in all_isoforms and ")_" in y[2]:
|
|
1374 all_isoforms.append(">"+y[2])
|
|
1375 all_isoforms.append(y[9])
|
|
1376
|
|
1377 ################################################################################################################################################################################################
|
|
1378
|
|
1379 def deseqe2(sample,mir_names,l,new_d,sample_name,sample_order):
|
|
1380
|
|
1381 for y in mir_names:
|
|
1382 flag=0
|
|
1383 for x in sample:
|
|
1384 if y[0]==x[0]:
|
|
1385 flag=1
|
|
1386 break
|
|
1387 if flag==0:
|
|
1388 sample.append([y[0],"0",y[1]])
|
|
1389
|
|
1390 sample.sort(key=lambda x: x[0])
|
|
1391 sample=list(sample for sample,_ in itertools.groupby(sample))
|
|
1392
|
|
1393 l.acquire()
|
|
1394 new_d.append(sample)
|
|
1395 sample_order.append(sample_name)
|
|
1396 l.release()
|
|
1397
|
|
1398 ###############################################################################################################################################################################################
|
|
1399
|
|
1400 if __name__ == '__main__':
|
|
1401
|
|
1402 starttime = time.time()
|
|
1403
|
|
1404 q1 = Queue()
|
|
1405 q2 = Queue()
|
|
1406 lock = Lock()
|
|
1407 manager = Manager()
|
|
1408
|
|
1409 mature_mirnas=manager.list()
|
|
1410 ps_mature=Process(target=download_matures,args=(mature_mirnas,args.org_name))
|
|
1411 ps_mature.start()
|
|
1412
|
|
1413 args.control[0]=args.control[0][1:]
|
|
1414 args.control[len(args.control)-1][:-1]
|
|
1415 control = [(args.control[i:i+2]) for i in range(0, len(args.control), 2)]
|
|
1416
|
|
1417 args.treated[0]=args.treated[0][1:]
|
|
1418 args.treated[len(args.treated)-1][:-1]
|
|
1419 treated = [(args.treated[i:i+2]) for i in range(0, len(args.treated), 2)]
|
|
1420
|
|
1421
|
|
1422 ############## Detection of templated isoforms ################
|
|
1423
|
|
1424 radar = manager.list([0,0,0,0])
|
|
1425 samples = manager.list()
|
|
1426 data= manager.list()
|
|
1427 names_con=manager.list()
|
|
1428 samples_mirna_names=manager.list()
|
|
1429 deseq=manager.list()
|
|
1430 unmap_seq=manager.Value('i',0)
|
|
1431 unmap_counts=manager.Value('i',0)
|
|
1432 LH2E_names=manager.list()
|
|
1433 ini_c_samples = manager.list()
|
|
1434
|
|
1435
|
|
1436 radar1 = manager.list([0,0,0,0])
|
|
1437 samples1 = manager.list()
|
|
1438 data1 = manager.list()
|
|
1439 names_tre = manager.list()
|
|
1440 samples_mirna_names1=manager.list()
|
|
1441 deseq1=manager.list()
|
|
1442 unmap1_seq = manager.Value('i',0)
|
|
1443 unmap1_counts = manager.Value('i',0)
|
|
1444 LH8E_names=manager.list()
|
|
1445 ini_t_samples = manager.list()
|
|
1446 ps_mature.join()
|
|
1447
|
|
1448
|
|
1449 mature_mirnas=list(mature_mirnas)
|
|
1450
|
|
1451
|
|
1452 starttime1 = time.time()
|
|
1453 ps_sam = [Process(target=sam,args=(mature_mirnas,path[1][:-1],path[0].split(",")[0],"c",lock,samples,data,names_con,unmap_seq,samples_mirna_names,deseq,LH2E_names,"0",ini_c_samples,unmap_counts)) for path in control]
|
|
1454 ps_sam.extend([Process(target=sam,args=(mature_mirnas,path[1][:-1],path[0].split(",")[0],"t",lock,samples1,data1,names_tre,unmap1_seq,samples_mirna_names1,deseq1,LH8E_names,"0",ini_t_samples,unmap1_counts)) for path in treated])
|
|
1455
|
|
1456 [p.start() for p in ps_sam]
|
|
1457 [p.join() for p in ps_sam]
|
|
1458 print('SAM took {} seconds'.format(time.time() - starttime1))
|
|
1459
|
|
1460 ps_hist=[Process(target=hist_red,args=(ini_c_samples,'c'))]
|
|
1461 ps_hist.extend([Process(target=hist_red,args=(ini_t_samples,'t'))])
|
|
1462 [x.start() for x in ps_hist]
|
|
1463
|
|
1464 starttime200=time.time()
|
|
1465
|
|
1466 sc = list(samples)
|
|
1467 st = list(samples1)
|
|
1468
|
|
1469 names_con=list(names_con)
|
|
1470 names_tre=list(names_tre)
|
|
1471 samples_mirna_names=list(samples_mirna_names)
|
|
1472 samples_mirna_names.sort()
|
|
1473 samples_mirna_names=list(samples_mirna_names for samples_mirna_names,_ in itertools.groupby(samples_mirna_names))
|
|
1474
|
|
1475 samples_mirna_names1=list(samples_mirna_names1)
|
|
1476 samples_mirna_names1.sort()
|
|
1477 samples_mirna_names1=list(samples_mirna_names1 for samples_mirna_names1,_ in itertools.groupby(samples_mirna_names1))
|
|
1478
|
|
1479 deseq=list(deseq)
|
|
1480 deseq1=list(deseq1)
|
|
1481
|
|
1482 new_names_con=manager.list()
|
|
1483 new_names_tre=manager.list()
|
|
1484 new_deseq=manager.list()
|
|
1485 new_deseq1=manager.list()
|
|
1486 ps_deseq=[Process(target=deseqe2,args=(sampp,samples_mirna_names,lock,new_deseq,names_con[i],new_names_con)) for i,sampp in enumerate(deseq)]
|
|
1487 ps_deseq.extend([Process(target=deseqe2,args=(sampp,samples_mirna_names1,lock,new_deseq1,names_tre[i],new_names_tre)) for i,sampp in enumerate(deseq1)])
|
|
1488
|
|
1489 [z.start() for z in ps_deseq]
|
|
1490 [z.join() for z in ps_deseq]
|
|
1491 new_deseq=list(new_deseq)
|
|
1492 new_deseq1=list(new_deseq1)
|
|
1493
|
|
1494 LH2E=[[x[0],x[2]] for x in new_deseq[0]]
|
|
1495 [LH2E[i].append(y[i][1]) for i,_ in enumerate(LH2E) for y in new_deseq]
|
|
1496
|
|
1497 LH8E=[[x[0],x[2]] for x in new_deseq1[0]]
|
|
1498 [LH8E[i].append(y[i][1]) for i,_ in enumerate(LH8E) for y in new_deseq1]
|
|
1499
|
|
1500 print('Deseq took {} seconds'.format(time.time() - starttime200))
|
|
1501
|
|
1502 merg_nam_LH2E=manager.list()
|
|
1503 merg_nam_LH8E=manager.list()
|
|
1504
|
|
1505 LH2E_copy=copy.deepcopy(list(LH2E))
|
|
1506 LH8E_copy=copy.deepcopy(list(LH8E))
|
|
1507
|
|
1508 fil_sort_tre=manager.list()
|
|
1509 fil_sort_con=manager.list()
|
|
1510 raw_sort_tre=manager.list()
|
|
1511 raw_sort_con=manager.list()
|
|
1512
|
|
1513 ps_main = Process(target=main_temp,args=(list(LH2E), samples_mirna_names, list(LH8E), samples_mirna_names1,1,list(names_con),list(names_tre),fil_sort_tre,fil_sort_con,raw_sort_tre,raw_sort_con))
|
|
1514 ps_main.start()
|
|
1515
|
|
1516 if args.anal=="2":
|
|
1517 all_iso = manager.list()
|
|
1518 ps_non_iso = Process(target=non_template_ref,args=(sc,st,all_iso))
|
|
1519 ps_non_iso.start()
|
|
1520
|
|
1521 ps_merge = [Process(target=merging_names,args=(LH2E_copy,merg_nam_LH2E))]
|
|
1522 ps_merge.extend([Process(target=merging_names,args=(LH8E_copy,merg_nam_LH8E))])
|
|
1523 [x.start() for x in ps_merge]
|
|
1524 [x.join() for x in ps_merge]
|
|
1525
|
|
1526 merg_nam_LH2E=list(merg_nam_LH2E)
|
|
1527 merg_nam_LH8E=list(merg_nam_LH8E)
|
|
1528
|
|
1529 starttime2 = time.time()
|
|
1530 procs = [Process(target=DB_write,args=(x[0],x[1],x[2],x[3],1)) for x in data]
|
|
1531 procs.extend([Process(target=DB_write,args=(x[0],x[1],x[2],x[3],1)) for x in data1])
|
|
1532 procs.extend([Process(target=make_spider,args=(merg_nam_LH2E,merg_nam_LH8E))])
|
|
1533 if args.anal == "1":
|
|
1534 procs.extend([Process(target=pie_temp,args=(merg_nam_LH2E,unmap_seq.value,unmap_counts.value,merg_nam_LH8E,unmap1_seq.value,unmap1_counts.value))])
|
|
1535
|
|
1536 [p.start() for p in procs]
|
|
1537
|
|
1538
|
|
1539 if args.anal=="1":
|
|
1540 [x.join() for x in ps_hist]
|
|
1541 [p.join() for p in procs]
|
|
1542 ps_pdf = Process(target=pdf_before_DE,args=(args.anal))
|
|
1543 ps_pdf.start()
|
|
1544
|
|
1545 print('Graphs took {} seconds'.format(time.time() - starttime2))
|
|
1546
|
|
1547 ps_main.join()
|
|
1548
|
|
1549 fil_sort_con=list(fil_sort_con)
|
|
1550 fil_sort_tre=list(fil_sort_tre)
|
|
1551 if fil_sort_con==[]:
|
|
1552 fil_sort_con=raw_sort_con
|
|
1553 fil_sort_tre=raw_sort_tre
|
|
1554
|
|
1555 raw_sort_con=list(raw_sort_con)
|
|
1556 raw_sort_tre=list(raw_sort_tre)
|
|
1557 names_con=list(new_names_con)
|
|
1558 names_tre=list(new_names_tre)
|
|
1559
|
|
1560 ps_write = Process(target=write_main,args=(raw_sort_con, raw_sort_tre, fil_sort_con, fil_sort_tre, names_con,names_tre,1))
|
|
1561 ps_write.start()
|
|
1562
|
|
1563 ps1_matrix = [Process(target=ssamples,args=(names_con,fil_sort_con,"Diff/temp_con/",0))]
|
|
1564 ps1_matrix.extend([Process(target=ssamples,args=(names_tre,fil_sort_tre,"Diff/temp_tre/",0))])
|
|
1565 [p.start() for p in ps1_matrix]
|
|
1566
|
|
1567 if args.anal=="1":
|
|
1568 ps_pdf.join()
|
|
1569 if args.anal=="2":
|
|
1570 [p.join() for p in procs]
|
|
1571 [x.join() for x in ps_hist]
|
|
1572
|
|
1573 ps_write.join()
|
|
1574 [p.join() for p in ps1_matrix]
|
|
1575
|
|
1576
|
|
1577 ############################## Detection of Both #######################################
|
|
1578
|
|
1579 starttime10 = time.time()
|
|
1580
|
|
1581 if args.anal == "2":
|
|
1582
|
|
1583 n_data= manager.list()
|
|
1584 n_names_con=manager.list()
|
|
1585 n_samples_mirna_names=manager.list()
|
|
1586 n_deseq=manager.list()
|
|
1587 n_LH2E_names=manager.list()
|
|
1588
|
|
1589 n_data1 = manager.list()
|
|
1590 n_names_tre = manager.list()
|
|
1591 n_samples_mirna_names1=manager.list()
|
|
1592 n_deseq1=manager.list()
|
|
1593 n_LH8E_names=manager.list()
|
|
1594
|
|
1595
|
|
1596 new_mat_mirnas = list(mature_mirnas)
|
|
1597 ps_non_iso.join()
|
|
1598
|
|
1599 all_iso=list(all_iso)
|
|
1600 new_mat_mirnas.extend(all_iso)
|
|
1601
|
|
1602 starttime11=time.time()
|
|
1603
|
|
1604 ps_sam = [Process(target=non_sam,args=(new_mat_mirnas,path[1][:-1],path[0].split(",")[0],"c",lock,n_data,n_names_con,n_deseq,n_samples_mirna_names,n_LH2E_names)) for path in control]
|
|
1605 ps_sam.extend([Process(target=non_sam,args=(new_mat_mirnas,path[1][:-1],path[0].split(",")[0],"t",lock,n_data1,n_names_tre,n_deseq1,n_samples_mirna_names1,n_LH8E_names)) for path in treated])
|
|
1606
|
|
1607 [p.start() for p in ps_sam]
|
|
1608 [p.join() for p in ps_sam]
|
|
1609
|
|
1610 print('Non-sam took {} seconds'.format(time.time() - starttime11))
|
|
1611
|
|
1612 starttime12=time.time()
|
|
1613
|
|
1614 n_names_con=list(n_names_con)
|
|
1615 n_names_tre=list(n_names_tre)
|
|
1616 n_samples_mirna_names=list(n_samples_mirna_names)
|
|
1617 n_samples_mirna_names.sort()
|
|
1618 n_samples_mirna_names=list(n_samples_mirna_names for n_samples_mirna_names,_ in itertools.groupby(n_samples_mirna_names))
|
|
1619
|
|
1620 n_samples_mirna_names1=list(n_samples_mirna_names1)
|
|
1621 n_samples_mirna_names1.sort()
|
|
1622 n_samples_mirna_names1=list(n_samples_mirna_names1 for n_samples_mirna_names1,_ in itertools.groupby(n_samples_mirna_names1))
|
|
1623
|
|
1624 n_deseq=list(n_deseq)
|
|
1625 n_deseq1=list(n_deseq1)
|
|
1626
|
|
1627 new_n_names_con=manager.list()
|
|
1628 new_n_names_tre=manager.list()
|
|
1629 n_new_deseq=manager.list()
|
|
1630 n_new_deseq1=manager.list()
|
|
1631 ps_deseq=[Process(target=deseqe2,args=(sampp,n_samples_mirna_names,lock,n_new_deseq,n_names_con[i],new_n_names_con)) for i,sampp in enumerate(n_deseq)]
|
|
1632 ps_deseq.extend([Process(target=deseqe2,args=(sampp,n_samples_mirna_names1,lock,n_new_deseq1,n_names_tre[i],new_n_names_tre)) for i,sampp in enumerate(n_deseq1)])
|
|
1633
|
|
1634 [x.start() for x in ps_deseq]
|
|
1635 [x.join() for x in ps_deseq]
|
|
1636 n_new_deseq=list(n_new_deseq)
|
|
1637 n_new_deseq1=list(n_new_deseq1)
|
|
1638
|
|
1639 print([len(n_new_deseq[0]),len(n_new_deseq[1])])
|
|
1640 print([len(n_new_deseq1[0]),len(n_new_deseq1[1])])
|
|
1641
|
|
1642 n_LH2E=[[x[0],x[2]] for x in n_new_deseq[0]]
|
|
1643 [n_LH2E[i].append(y[i][1]) for i,_ in enumerate(n_LH2E) for y in n_new_deseq]
|
|
1644
|
|
1645 n_LH8E=[[x[0],x[2]] for x in n_new_deseq1[0]]
|
|
1646 [n_LH8E[i].append(y[i][1]) for i,_ in enumerate(n_LH8E) for y in n_new_deseq1]
|
|
1647
|
|
1648 print('Non-deseq took {} seconds'.format(time.time() - starttime12))
|
|
1649
|
|
1650 merg_nam_n_LH2E=manager.list()
|
|
1651 merg_nam_n_LH8E=manager.list()
|
|
1652
|
|
1653 n_LH2E_copy=copy.deepcopy(list(n_LH2E))
|
|
1654 n_LH8E_copy=copy.deepcopy(list(n_LH8E))
|
|
1655
|
|
1656 n_sort_tre=manager.list()
|
|
1657 n_sort_con=manager.list()
|
|
1658
|
|
1659 n_fil_sort_con=manager.list()
|
|
1660 n_fil_sort_tre=manager.list()
|
|
1661 n_raw_sort_con=manager.list()
|
|
1662 n_raw_sort_tre=manager.list()
|
|
1663
|
|
1664 ps_main = Process(target=main_temp,args=(list(n_LH2E), n_samples_mirna_names, list(n_LH8E), n_samples_mirna_names1,1,list(n_names_con),list(n_names_tre),n_fil_sort_tre,n_fil_sort_con,n_raw_sort_tre,n_raw_sort_con))
|
|
1665 ps_main.start()
|
|
1666
|
|
1667 ps_merge = [Process(target=merging_names,args=(n_LH2E_copy,merg_nam_n_LH2E))]
|
|
1668 ps_merge.extend([Process(target=merging_names,args=(n_LH8E_copy,merg_nam_n_LH8E))])
|
|
1669 [p.start() for p in ps_merge]
|
|
1670 [p.join() for p in ps_merge]
|
|
1671
|
|
1672 merg_nam_n_LH2E=list(merg_nam_n_LH2E)
|
|
1673 merg_nam_n_LH8E=list(merg_nam_n_LH8E)
|
|
1674
|
|
1675 procs = [Process(target=DB_write,args=(x[0],x[1],x[2],x[3],2)) for x in n_data]
|
|
1676 procs.extend([Process(target=DB_write,args=(x[0],x[1],x[2],x[3],2)) for x in n_data1])
|
|
1677 procs.extend([Process(target=logo_seq_red,args=(merg_nam_n_LH2E,'c'))])
|
|
1678 procs.extend([Process(target=logo_seq_red,args=(merg_nam_n_LH8E,'t'))])
|
|
1679 procs.extend([Process(target=pie_non_temp,args=(merg_nam_LH2E,merg_nam_n_LH2E,merg_nam_LH8E,merg_nam_n_LH8E,unmap_seq.value,unmap1_seq.value,unmap_counts.value,unmap1_counts.value))])
|
|
1680
|
|
1681 starttime13=time.time()
|
|
1682 [p.start() for p in procs]
|
|
1683 [p.join() for p in procs]
|
|
1684
|
|
1685 print('Graphs took {} seconds'.format(time.time() - starttime13))
|
|
1686
|
|
1687 procs1 = Process(target=pdf_before_DE,args=(args.anal))
|
|
1688 procs1.start()
|
|
1689
|
|
1690 starttime14=time.time()
|
|
1691 ps_main.join()
|
|
1692
|
|
1693 n_fil_sort_con=list(n_fil_sort_con)
|
|
1694 n_fil_sort_tre=list(n_fil_sort_tre)
|
|
1695 if n_fil_sort_con==[]:
|
|
1696 n_fil_sort_con=n_raw_sort_con
|
|
1697 n_fil_sort_tre=n_raw_sort_tre
|
|
1698
|
|
1699 n_raw_sort_con=list(n_raw_sort_con)
|
|
1700 n_raw_sort_tre=list(n_raw_sort_tre)
|
|
1701 n_names_con=list(new_n_names_con)
|
|
1702 n_names_tre=list(new_n_names_tre)
|
|
1703
|
|
1704 ps_write = Process(target=write_main,args=(n_raw_sort_con, n_raw_sort_tre,n_fil_sort_con, n_fil_sort_tre, n_names_con, n_names_tre,2))
|
|
1705 ps_write.start()
|
|
1706
|
|
1707 ps1_matrix = [Process(target=ssamples1,args=(n_names_con,n_fil_sort_con,names_con,fil_sort_con,"Diff/n_temp_con/",0))]
|
|
1708 ps1_matrix.extend([Process(target=ssamples1,args=(n_names_tre,n_fil_sort_tre,names_tre,fil_sort_tre,"Diff/n_temp_tre/",0))])
|
|
1709 [p.start() for p in ps1_matrix]
|
|
1710
|
|
1711 ps_write.join()
|
|
1712 [p.join() for p in ps1_matrix]
|
|
1713 procs1.join()
|
|
1714 print('That took {} seconds'.format(time.time() - starttime10))
|
|
1715 print('That took {} seconds'.format(time.time() - starttime))
|
|
1716
|
|
1717
|
|
1718
|
|
1719
|
|
1720
|
|
1721
|
|
1722
|
|
1723
|