comparison mirbase_ultra_v2.py @ 0:99e8a03f8802 draft

Uploaded
author glogobyte
date Fri, 16 Oct 2020 18:53:05 +0000
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:99e8a03f8802
1 from mirbase_functions import *
2 from mirbase_graphs import *
3 import itertools
4 import time
5 import sys
6 import os
7 import urllib.request
8 import gzip
9 from multiprocessing import Process, Queue, Lock, Pool, Manager, Value
10 import subprocess
11 import argparse
12 from collections import OrderedDict
13 from matplotlib.backends.backend_pdf import PdfPages
14 import pandas as pd
15 from math import pi
16 import numpy as np
17 import matplotlib.pyplot as plt
18 from matplotlib.ticker import PercentFormatter
19 import seaborn as sns
20 import scipy.stats as stats
21 from plotnine import *
22 import math
23 import re
24 import matplotlib.ticker as mtick
25 import copy
26
27 subprocess.call(['mkdir','-p', 'split1','split2','split3','split4','split11','split12','Counts','Diff/temp_con','Diff/temp_tre','Diff/n_temp_con','Diff/n_temp_tre'])
28
29 parser = argparse.ArgumentParser()
30 parser.add_argument("-analysis", "--anal", help="choose type of analysis", action="store")
31 parser.add_argument("-con", "--control", help="input fastq file", nargs='+', default=[])
32 parser.add_argument("-tre", "--treated", help="input fastq file", nargs='+', default=[] )
33 parser.add_argument("-tool_dir", "--tool_directory", help="tool directory path", action="store")
34 parser.add_argument("-gen", "--org_name", help="tool directory path", action="store")
35 parser.add_argument("-program", "--pro", help="choose type of analysis", action="store")
36 parser.add_argument("-f", "--flag", help="choose the database", action="store")
37 parser.add_argument("-umis", "--umi", help="choose the database", action="store")
38 parser.add_argument("-percentage", "--per", help="choose the database", action="store")
39 parser.add_argument("-counts", "--count", help="choose the database", action="store")
40 parser.add_argument("-name1", "--n1", help="choose the database", action="store")
41 parser.add_argument("-name2", "--n2", help="choose the database", action="store")
42 args = parser.parse_args()
43
44 #########################################################################################################################################
45
46 def collapse_sam(path):
47
48 ini_sam=read(path,0)
49 main_sam = [x.rstrip("\n").split("\t") for x in ini_sam if "@" not in x.split("\t")[0]]
50 intro_sam = [x.rstrip("\n").split("\t") for x in ini_sam if "@" in x.split("\t")[0]]
51
52 uni_seq = []
53 for x in main_sam:
54
55 if [x[2], x[9]] not in uni_seq:
56 uni_seq.append([x[2], x[9]])
57
58 new_main_sam=[]
59 incr_num=0
60 for i in range(len(uni_seq)):
61 count=0
62 incr_num+=1
63 for y in main_sam:
64 if uni_seq[i][1]==y[9] and uni_seq[i][0]==y[2]:
65 count+=1
66 temp=y
67 temp[10]="~~~~~~~~~~~~~~~~~~~~~~~~~~~~~"
68 temp[0]=str(incr_num)+"-"+str(count)
69 new_main_sam.append(temp)
70
71 new_sam=intro_sam+new_main_sam
72
73 return new_sam
74
75 #################################################################################################################################################################################################################
76
77 def duplicate_chroms_isoforms(List):
78
79 dupes=[]
80
81 for num in range(len(List)):
82
83 if [List[num][9],List[num][0],List[num][2]] not in dupes :
84 dupes.append([List[num][9],List[num][0],List[num][2]])
85
86 for x in List:
87 for y in dupes:
88 if x[9]==y[0] and x[0]==y[1] and x[2].split("_")[0]==y[2].split("_")[0] and x[2]!=y[2]:
89 y.append(x[2])
90
91
92 double_List = [x[:] for x in List]
93
94 chr_order=[]
95 for x in dupes:
96 temp = []
97 for i in range(2,len(x)):
98 if x[i].split("chr")[1].split("(")[0].isdigit():
99 temp.append(int(x[i].split("chr")[1].split("(")[1][0]+x[i].split("chr")[1].split("(")[0]))
100 else:
101 temp.append(x[i].split("chr")[1][0:4])
102
103 for z in temp:
104 if 'X(-)'==z or 'Y(-)'==z or 'X(+)'==z or 'Y(+)'==z:
105 temp = [str(j) for j in temp]
106 temp=list(set(temp))
107 temp.sort()
108 chr_order.append(temp)
109
110 final_dupes=[]
111 for i in range(len(dupes)):
112 final_dupes.append([dupes[i][0],dupes[i][2].split("_")[0],dupes[i][1]])
113 for x in chr_order[i]:
114 result = re.match("[-+]?\d+$", str(x))
115 if len(chr_order[i]) == len(set(chr_order[i])):
116 if result is not None:
117
118 if int(x)<0:
119 final_dupes[i][1]=final_dupes[i][1]+"_chr"+str(abs(int(x)))+"(-)"
120 else:
121 final_dupes[i][1] = final_dupes[i][1] + "_chr" + str(abs(int(x)))+"(+)"
122 else:
123 final_dupes[i][1] = final_dupes[i][1] + "_chr" + str(x)
124 else:
125 if result is not None:
126 if int(x) < 0:
127 final_dupes[i][1] = final_dupes[i][1] +dupes[i][2].split("_")[1]+ "_chr" + str(abs(int(x))) + "(-)"
128 else:
129 final_dupes[i][1] = final_dupes[i][1] +dupes[i][2].split("_")[1]+ "_chr" + str(abs(int(x))) + "(+)"
130 else:
131 final_dupes[i][1] = final_dupes[i][1] +dupes[i][2].split("_")[1]+ "_chr" + str(x)
132
133 final_dupes.sort()
134 final_dupes=list(final_dupes for final_dupes,_ in itertools.groupby(final_dupes))
135
136 for i in range(len(double_List)):
137 for x in final_dupes:
138
139 if double_List[i][9] == x[0] and double_List[i][0] == x[2] and len(double_List[i][2].split("_")) >3 and double_List[i][2].split("_")[0]==x[1].split("_")[0]:
140 gg=str("_"+double_List[i][2].split("_")[-2]+"_"+double_List[i][2].split("_")[-1])
141 double_List[i][2] = x[1]+gg
142
143 if double_List[i][9]==x[0] and double_List[i][0]== x[2] and len(double_List[i][2].split("_"))==3 and double_List[i][2].split("_")[0]==x[1].split("_")[0]:
144 double_List[i][2]=x[1]
145 List[i][2] = x[1]
146
147 List.sort()
148 new_list=list(List for List,_ in itertools.groupby(List))
149
150 double_List.sort()
151 new_double_List = list(double_List for double_List, _ in itertools.groupby(double_List))
152
153 return new_list, new_double_List
154
155
156 #############################################################################################################################################################################################################
157
158 def sam(mature_mirnas,path,name,con,l,samples,data,names,unmap_seq,samples_mirna_names,deseq,LHE_names,umi,ini_sample,unmap_counts):
159
160 # read the sam file
161 ini_sam=read(path,0)
162 new_main_sam = [x.rstrip("\n").split("\t") for x in ini_sam if "@" not in x.split("\t")[0]]
163 unique_seq = [x for x in new_main_sam if x[1] == '0' and len(x[9])>=18 and len(x[9])<=26]
164
165 sorted_uni_arms = []
166
167 for i in range(len(mature_mirnas)):
168 tmp_count_reads = 0 # calculate the total number of reads
169 tmp_count_seq = 0 # calculate the total number of sequences
170 for j in range(len(unique_seq)):
171
172 if "{" in unique_seq[j][2].split("_")[0]:
173 official=unique_seq[j][2].split("_")[0][:-4]
174 else:
175 official=unique_seq[j][2].split("_")[0]
176
177 if mature_mirnas[i].split(" ")[0][1:] == official:
178
179 temp_mature = mature_mirnas[i+1].strip().replace("U", "T")
180 off_part = longestSubstring(temp_mature, unique_seq[j][9])
181
182 mat_diff = temp_mature.split(off_part)
183 mat_diff = [len(mat_diff[0]), len(mat_diff[1])]
184
185 unique_diff = unique_seq[j][9].split(off_part)
186 unique_diff = [len(unique_diff[0]), len(unique_diff[1])]
187
188 # Problem with hsa-miR-8485
189 if mat_diff[1]!=0 and unique_diff[1]!=0:
190 unique_seq[j]=1
191 pre_pos = 0
192 post_pos = 0
193
194 elif mat_diff[0]!=0 and unique_diff[0]!=0:
195 unique_seq[j]=1
196 pre_pos = 0
197 post_pos = 0
198
199 else:
200 pre_pos = mat_diff[0]-unique_diff[0]
201 post_pos = unique_diff[1]-mat_diff[1]
202 tmp_count_reads = tmp_count_reads + int(unique_seq[j][0].split("-")[1])
203 tmp_count_seq = tmp_count_seq+1
204
205 if pre_pos != 0 or post_pos != 0:
206 if pre_pos == 0:
207 unique_seq[j][2] = unique_seq[j][2] + "_" +str(pre_pos) + "_" + '{:+d}'.format(post_pos)
208 elif post_pos == 0:
209 unique_seq[j][2] = unique_seq[j][2] + "_" + '{:+d}'.format(pre_pos) + "_" + str(post_pos)
210 else:
211 unique_seq[j][2] = unique_seq[j][2]+"_"+'{:+d}'.format(pre_pos)+"_"+'{:+d}'.format(post_pos)
212
213 for x in range(unique_seq.count(1)):
214 unique_seq.remove(1)
215 if tmp_count_reads != 0 and tmp_count_seq != 0:
216 sorted_uni_arms.append([mature_mirnas[i].split(" ")[0][1:], tmp_count_seq, tmp_count_reads])
217 sorted_uni_arms = sorted(sorted_uni_arms, key=lambda x: x[1], reverse=True)
218 dedup_unique_seq,double_fil_uni_seq=duplicate_chroms_isoforms(unique_seq)
219
220 for y in sorted_uni_arms:
221 counts=0
222 seqs=0
223 for x in double_fil_uni_seq:
224 if y[0]==x[2].split("_")[0]:
225 counts+=int(x[0].split("-")[1])
226 seqs+=1
227
228 y[1]=seqs
229 y[2]=counts
230
231 LHE=[]
232 l.acquire()
233 if con=="c":
234 LHE.extend(z[2] for z in double_fil_uni_seq)
235 for y in double_fil_uni_seq:
236 samples_mirna_names.append([y[2],y[9]])
237 deseq.append([[x[2], x[0].split('-')[1], x[9]] for x in double_fil_uni_seq])
238 LHE_names.extend(LHE)
239 unmap_seq.value += sum([1 for x in new_main_sam if x[1] == '4'])
240 unmap_counts.value += sum([int(x[0].split("-")[1]) for x in new_main_sam if x[1] == '4'])
241 names.append(name)
242 samples.append(dedup_unique_seq)
243 data.append([con,name,double_fil_uni_seq,sorted_uni_arms])
244 ini_sample.append(new_main_sam)
245
246 if con=="t":
247 LHE.extend(z[2] for z in double_fil_uni_seq)
248 for y in double_fil_uni_seq:
249 samples_mirna_names.append([y[2],y[9]])
250 deseq.append([[x[2], x[0].split('-')[1], x[9]] for x in double_fil_uni_seq])
251 LHE_names.extend(LHE)
252 unmap_seq.value += sum([1 for x in new_main_sam if x[1] == '4'])
253 unmap_counts.value += sum([int(x[0].split("-")[1]) for x in new_main_sam if x[1] == '4'])
254 names.append(name)
255 samples.append(dedup_unique_seq)
256 data.append([con,name,double_fil_uni_seq,sorted_uni_arms])
257 ini_sample.append(new_main_sam)
258 l.release()
259
260
261 ######################################################################################################################################
262 """
263
264 Read a sam file from Bowtie and do the followings:
265
266 1) Remove reverse stranded mapped reads
267 2) Remove unmapped reads
268 3) Remove all sequences with reads less than 11 reads
269 4) Sort the arms with the most sequences in decreading rate
270 5) Sort the sequences of every arm with the most reads in decreasing rate
271 6) Calculate total number of sequences of every arm
272 7) Calculate total number of reads of sequences of every arm.
273 8) Store all the informations in a txt file
274
275 """
276
277 def non_sam(mature_mirnas,path,name,con,l,data,names,n_deseq,n_samples_mirna_names,n_LHE_names):
278
279 ini_sam=read(path,0)
280 new_main_sam = [x.rstrip("\n").split("\t") for x in ini_sam if "@" not in x.split("\t")[0]]
281 unique_seq=[]
282 unique_seq = [x for x in new_main_sam if x[1] == '4' and len(x[9])>=18 and len(x[9])<=26]
283
284 uni_seq=[]
285 # Calculate the shifted positions for every isomir and add them to the name of it
286 sorted_uni_arms = []
287 for i in range(1,len(mature_mirnas),2):
288 tmp_count_reads = 0 # calculate the total number of reads
289 tmp_count_seq = 0 # calculate the total number of sequences
290
291 for j in range(len(unique_seq)):
292
293 temp_mature = mature_mirnas[i].strip().replace("U", "T")
294
295 if temp_mature in unique_seq[j][9]:
296
297 off_part = longestSubstring(temp_mature, unique_seq[j][9])
298
299 mat_diff = temp_mature.split(off_part)
300 mat_diff = [len(mat_diff[0]), len(mat_diff[1])]
301
302 unique_diff = unique_seq[j][9].split(off_part)
303 if len(unique_diff)<=2:
304 unique_diff = [len(unique_diff[0]), len(unique_diff[1])]
305
306 pre_pos = mat_diff[0]-unique_diff[0]
307 post_pos = unique_diff[1]-mat_diff[1]
308
309 lengthofmir = len(off_part) + post_pos
310 if pre_pos == 0:
311 tmp_count_reads = tmp_count_reads + int(unique_seq[j][0].split("-")[1])
312 tmp_count_seq = tmp_count_seq + 1
313
314 if pre_pos == 0:
315
316 t_name=unique_seq[j].copy()
317 t_name[2]=mature_mirnas[i - 1].split(" ")[0][1:] + "__" + str(pre_pos) + "_" + '{:+d}'.format(post_pos) + "_" + str(unique_seq[j][9][len(off_part):])
318 uni_seq.append(t_name)
319
320
321 if tmp_count_reads != 0 and tmp_count_seq != 0:
322 sorted_uni_arms.append([mature_mirnas[i-1].split(" ")[0][1:], tmp_count_seq, tmp_count_reads])
323
324
325 sorted_uni_arms = sorted(sorted_uni_arms, key=lambda x: x[1], reverse=True)
326 unique_seq = list(map(list, OrderedDict.fromkeys(map(tuple,uni_seq))))
327
328 LHE=[]
329
330 l.acquire()
331 if con=="c":
332 LHE.extend(x[2] for x in unique_seq if x[2]!="*")
333 for x in unique_seq:
334 if x[2]!="*":
335 n_samples_mirna_names.append([x[2],x[9]])
336 n_deseq.append([[x[2], x[0].split('-')[1], x[9]] for x in unique_seq if x[2]!="*"])
337 n_LHE_names.extend(LHE)
338 names.append(name)
339 data.append([con,name,unique_seq,sorted_uni_arms])
340
341
342 if con=="t":
343 LHE.extend(x[2] for x in unique_seq if x[2]!="*")
344 for x in unique_seq:
345 if x[2]!="*":
346 n_samples_mirna_names.append([x[2],x[9]])
347 n_deseq.append([[x[2], x[0].split('-')[1], x[9]] for x in unique_seq if x[2]!="*"])
348 n_LHE_names.extend(LHE)
349 names.append(name)
350 data.append([con,name,unique_seq,sorted_uni_arms])
351 l.release()
352
353 #####################################################################################################################################################################################################################
354 def deseq2_temp(samples_mirna_names,deseq,con,l):
355
356 samples_mirna_names.sort(key=lambda x:[0])
357 for i in range(len(deseq)):
358 for y in samples_mirna_names:
359 flag = 0
360 for x in deseq[i]:
361 if y[0] == x[0]:
362 flag = 1
363 break
364
365 if flag == 0:
366 deseq[i].append([y[0], "0", y[1]])
367
368 [deseq[i].sort(key=lambda x: x[0]) for i, _ in enumerate(deseq)]
369 deseq_final = [[x[0],x[2]] for x in deseq[0]]
370 [deseq_final[z].append(deseq[i][j][1]) for z,_ in enumerate(deseq_final) for i, _ in enumerate(deseq) for j,_ in enumerate(deseq[i]) if deseq_final[z][0] == deseq[i][j][0]]
371
372 l.acquire()
373 if con=="c":
374 q1.put(deseq_final)
375
376 if con=="t":
377 q2.put(deseq_final)
378 l.release()
379
380
381 ####################################################################################################################################################################################################################
382
383 def main_temp(LH2E, LH2E_names, LH8E, LH8E_names,flag,names_con,names_tre,filter_LH8E,filter_LH2E,raw_LH8E,raw_LH2E):
384
385 LH8E_add_names = [x for x in LH2E_names if x not in LH8E_names]
386 LH2E_add_names = [x for x in LH8E_names if x not in LH2E_names]
387
388 LH8E_add_names.sort()
389 LH2E_add_names.sort()
390 LH8E_add_names = list(LH8E_add_names for LH8E_add_names,_ in itertools.groupby(LH8E_add_names))
391 LH2E_add_names = list(LH2E_add_names for LH2E_add_names,_ in itertools.groupby(LH2E_add_names))
392
393 LH2E.sort()
394 LH8E.sort()
395 LH2E = list(LH2E for LH2E,_ in itertools.groupby(LH2E))
396 LH8E = list(LH8E for LH8E,_ in itertools.groupby(LH8E))
397
398 print("LHE_names")
399 print([len(LH8E_add_names),len(LH2E_add_names)])
400 print([len(LH8E),len(LH2E)])
401
402 zeros=["0"]*(len(LH8E[0])-2)
403 [LH8E_add_names[i].extend(zeros) for i,_ in enumerate(LH8E_add_names)]
404 LH8E=LH8E+LH8E_add_names
405
406 zeros=["0"]*(len(LH2E[0])-2)
407 [LH2E_add_names[i].extend(zeros) for i,_ in enumerate(LH2E_add_names)]
408 LH2E=LH2E+LH2E_add_names
409
410 dupes=[]
411 final_LH2E =[]
412
413 for num,_ in enumerate(LH2E):
414
415 if LH2E[num][1] not in final_LH2E and LH2E[num][0] not in final_LH2E:
416 final_LH2E.append(LH2E[num][1])
417 final_LH2E.append(LH2E[num][0])
418 else:
419 dupes.append(LH2E[num][1])
420
421
422 dupes=list(set(dupes))
423
424 dupes=[[x] for x in dupes]
425
426 for x in LH2E:
427 for y in dupes:
428 if x[1]==y[0]:
429 fl=0
430 if len(y)==1:
431 y.append(x[0])
432 else:
433 for i in range(1,len(y)):
434 if y[i].split("_")[0]==x[0].split("_")[0]:
435 fl=1
436 if len(x[0])<len(y[i]):
437 del y[i]
438 y.append(x[0])
439 break
440
441 if fl==0:
442 y.append((x[0]))
443
444 for y in dupes:
445 if len(y)>2:
446 for i in range(len(y)-1,1,-1):
447 y[1]=y[1]+"/"+y[i]
448 del y[i]
449
450 for x in LH2E:
451 for y in dupes:
452 if x[1]==y[0]:
453 x[0]=y[1]
454
455 for x in LH8E:
456 for y in dupes:
457 if x[1]==y[0]:
458 x[0]=y[1]
459
460
461 LH2E.sort()
462 LH2E=list(LH2E for LH2E,_ in itertools.groupby(LH2E))
463
464 LH8E.sort()
465 LH8E=list(LH8E for LH8E,_ in itertools.groupby(LH8E))
466
467 LH8E_new=[]
468 LH2E_new=[]
469
470 if int(args.per)!=-1:
471 percent=int(args.per)/100
472 print(percent)
473 print(args.count)
474
475 c_col_filter=round(percent*(len(LH2E[1])-2))
476 t_col_filter=round(percent*(len(LH8E[1])-2))
477
478 for i, _ in enumerate(LH2E):
479 c_cols=0
480 t_cols=0
481
482 c_cols=sum([1 for j in range(len(LH2E[i])-2) if int(LH2E[i][j+2])>=int(args.count)])
483 t_cols=sum([1 for j in range(len(LH8E[i])-2) if int(LH8E[i][j+2])>=int(args.count)])
484
485 if c_cols>=c_col_filter or t_cols>=t_col_filter:
486 LH8E_new.append(LH8E[i])
487 LH2E_new.append(LH2E[i])
488
489 filter_LH2E.extend(LH2E_new)
490 filter_LH8E.extend(LH8E_new)
491 raw_LH2E.extend(LH2E)
492 raw_LH8E.extend(LH8E)
493
494 ##################################################################################################################################################################################################################
495
496 def write_main(raw_LH2E, raw_LH8E, fil_LH2E, fil_LH8E, names_con, names_tre, flag):
497
498 if flag == 1 and int(args.per)!=-1:
499 fp = open('Counts/Filtered '+args.n2 +' Templated Counts', 'w')
500 fp.write("Name\t")
501 fp.write("Sequence")
502 for y in names_tre:
503 fp.write("\t"+y)
504
505 for x in fil_LH8E:
506 fp.write("\n%s" % "\t".join(x))
507 fp.close()
508
509 fp = open('Counts/Filtered '+args.n1+' Templated Counts', 'w')
510 fp.write("Name\t")
511 fp.write("Sequence")
512 for y in names_con:
513 fp.write("\t"+y)
514
515 for x in fil_LH2E:
516 fp.write("\n%s" % "\t".join(x))
517 fp.close()
518
519
520 if flag == 2 and int(args.per)!=-1:
521 fp = open('Counts/Filtered '+args.n2+' Non-Templated Counts', 'w')
522 fp.write("Name\t")
523 fp.write("Sequence")
524 for y in names_tre:
525 fp.write("\t"+y)
526
527
528 for x in fil_LH8E:
529 fp.write("\n%s" % "\t".join(x))
530 fp.close()
531
532 fp = open('Counts/Filtered '+args.n1+' Non-Templated Counts', 'w')
533 fp.write("Name\t")
534 fp.write("Sequence")
535 for y in names_con:
536 fp.write("\t"+y)
537
538 for x in fil_LH2E:
539 fp.write("\n%s" % "\t".join(x))
540 fp.close()
541
542
543 if flag == 1:
544 fp = open('Counts/Raw '+args.n2+' Templated Counts', 'w')
545 fp.write("Name\t")
546 fp.write("Sequence")
547 for y in names_tre:
548 fp.write("\t"+y)
549
550 for x in raw_LH8E:
551 fp.write("\n%s" % "\t".join(x))
552 fp.close()
553
554 fp = open('Counts/Raw '+args.n1+' Templated Counts', 'w')
555 fp.write("Name\t")
556 fp.write("Sequence")
557 for y in names_con:
558 fp.write("\t"+y)
559
560 for x in raw_LH2E:
561 fp.write("\n%s" % "\t".join(x))
562 fp.close()
563
564
565 if flag == 2:
566 fp = open('Counts/Raw '+args.n2+' Non-Templated Counts', 'w')
567 fp.write("Name\t")
568 fp.write("Sequence")
569 for y in names_tre:
570 fp.write("\t"+y)
571
572
573 for x in raw_LH8E:
574 fp.write("\n%s" % "\t".join(x))
575 fp.close()
576
577 fp = open('Counts/Raw '+args.n1+' Non-Templated Counts', 'w')
578 fp.write("Name\t")
579 fp.write("Sequence")
580 for y in names_con:
581 fp.write("\t"+y)
582
583 for x in raw_LH2E:
584 fp.write("\n%s" % "\t".join(x))
585 fp.close()
586
587
588 #########################################################################################################################################
589
590 def ssamples(names,samp,folder,pro):
591
592 for i in range(2,len(samp[0])):
593
594 fp = open(folder+names[i-2]+'.txt','w')
595 fp.write("miRNA id"+"\t"+names[i-2]+"\n")
596
597 for x in samp:
598 fp.write("%s" % "\t".join([x[0],x[i]])+"\n")
599 fp.close()
600
601 ##################################################################################################################
602
603 def DB_write(con,name,unique_seq,sorted_uni_arms,f):
604
605 if f==1:
606 # Write a txt file with all the information
607 if con=="c":
608 fp = open('split1/'+name, 'w')
609
610 fp.write("%s\t%-42s\t%s\n\n" % ("Number of Reads","Name of isomir","Sequence"))
611 if con=="t":
612 fp = open('split2/'+name, 'w')
613 fp.write("%s\t%-42s\t%s\n\n" % ("Number of Reads","Name of isomir","Sequence"))
614
615
616 for i in range(len(sorted_uni_arms)):
617 temp = []
618 for j in range(len(unique_seq)):
619
620 if sorted_uni_arms[i][0] in unique_seq[j][2].split("_")[0]:
621
622 temp.append(unique_seq[j])
623
624 temp = sorted(temp, key=lambda x: int(x[0].split('-')[1]), reverse=True)
625 fp.write("*********************************************************************************************************\n")
626 fp.write("%-8s\t%-22s\t%-25s\t%-30s\t%s\n" % ("|",str(sorted_uni_arms[i][0]),"Sequence count = "+str(sorted_uni_arms[i][1]),"Total reads = "+str(sorted_uni_arms[i][2]),"|"))
627 fp.write("*********************************************************************************************************\n\n")
628 [fp.write("%-8s\t%-40s\t%s\n" % (x[0].split("-")[1], x[2],x[9])) for x in temp]
629 fp.write("\n" + "\n")
630 fp.close()
631
632 if f==2:
633
634 if con=="c":
635 fp = open('split3/'+name, 'w')
636 fp.write("%s\t%-42s\t%s\n\n" % ("Number of Reads","Name of isomir","Sequence"))
637 if con=="t":
638 fp = open('split4/'+name, 'w')
639 fp.write("%s\t%-42s\t%s\n\n" % ("Number of Reads","Name of isomir","Sequence"))
640
641
642 for i in range(len(sorted_uni_arms)):
643 temp = []
644 for j in range(len(unique_seq)):
645 if sorted_uni_arms[i][0]==unique_seq[j][2].split("__")[0]:
646 temp.append(unique_seq[j])
647 if temp!=[]:
648 temp = sorted(temp, key=lambda x: int(x[0].split('-')[1]), reverse=True)
649 fp.write("*********************************************************************************************************\n")
650 fp.write("%-8s\t%-22s\t%-25s\t%-30s\t%s\n" % ("|",str(sorted_uni_arms[i][0]),"Sequence count = "+str(sorted_uni_arms[i][1]),"Total reads = "+str(sorted_uni_arms[i][2]),"|"))
651 fp.write("*********************************************************************************************************\n\n")
652 [fp.write("%-8s\t%-40s\t%s\n" % (x[0].split("-")[1], x[2],x[9])) for x in temp]
653 fp.write("\n" + "\n")
654 fp.close()
655
656
657 ##########################################################################################################################
658
659 def new_mat_seq(pre_unique_seq,mat_mirnas,l):
660
661 unique_iso = []
662 for x in pre_unique_seq:
663 if len(x[2].split("_"))==3:
664 for y in pre_unique_seq:
665 if x[2] in y[2] and int(x[0].split("-")[1])<int(y[0].split("-")[1]):
666 if any(y[2] in lst2 for lst2 in unique_iso)==False:
667 y[2]=">"+y[2]
668 unique_iso.append(y)
669 l.acquire()
670 for x in unique_iso:
671 mat_mirnas.append(x[2])
672 mat_mirnas.append(x[9])
673 l.release()
674
675 #########################################################################################################################
676 def pie_non_temp(merge_LH2E,merge_non_LH2E,merge_LH8E,merge_non_LH8E,c_unmap,t_unmap,c_unmap_counts,t_unmap_counts):
677
678 c_samples=[[x[0],x[1],sum(int(i) for i in x[2:])] for x in merge_LH2E]
679 t_samples=[[x[0],x[1],sum(int(i) for i in x[2:])] for x in merge_LH8E]
680 c_non_samples=[[x[0],x[1],sum(int(i) for i in x[2:])] for x in merge_non_LH2E]
681 t_non_samples=[[x[0],x[1],sum(int(i) for i in x[2:])] for x in merge_non_LH8E]
682
683 c_templ = 0
684 c_tem_counts = 0
685 c_mature = 0
686 c_mat_counts = 0
687 t_templ = 0
688 t_tem_counts = 0
689 t_mature = 0
690 t_mat_counts = 0
691
692 c_non = len(c_non_samples)
693 c_non_counts = sum(x[2] for x in c_non_samples)
694 t_non = len(t_non_samples)
695 t_non_counts = sum(x[2] for x in t_non_samples)
696
697 c_unmap = c_unmap - c_non
698 t_unmap = c_unmap - t_non
699
700 c_unmap_counts=c_unmap_counts - c_non_counts
701 t_unmap_counts=t_unmap_counts - t_non_counts
702
703
704 for x in c_samples:
705
706 if "/" not in x[0]:
707 if "chr" in x[0].split("_")[-1]:
708 c_mature+=1
709 c_mat_counts += x[2]
710 else:
711 c_templ+=1
712 c_tem_counts += x[2]
713 else:
714 f=0
715 for y in x[0].split("/"):
716 if "chr" in y.split("_")[-1]:
717 c_mature+=1
718 c_mat_counts += x[2]
719 f=1
720 break
721 if f==0:
722 c_templ+=1
723 c_tem_counts += x[2]
724
725 for x in t_samples:
726
727 if "/" not in x[0]:
728 if "chr" in x[0].split("_")[-1]:
729 t_mature+=1
730 t_mat_counts += x[2]
731 else:
732 t_templ+=1
733 t_tem_counts += x[2]
734 else:
735 f=0
736 for y in x[0].split("/"):
737 if "chr" in y.split("_")[-1]:
738 t_mature+=1
739 t_mat_counts += x[2]
740 f=1
741 break
742 if f==0:
743 t_templ+=1
744 t_tem_counts += x[2]
745
746 fig = plt.figure(figsize=(7,5))
747 labels = 'miRNA RefSeq','Template', 'Unmapped','Non-template'
748 sizes = [c_mat_counts, c_tem_counts, c_unmap_counts,c_non_counts]
749 colors = ['gold', 'yellowgreen', 'lightcoral', 'lightskyblue']
750 ax1 = plt.subplot2grid((1,2),(0,0))
751 patches, texts, autotexts=plt.pie(sizes, labels=labels, colors=colors, startangle=140,autopct='%1.1f%%',radius=0.8)
752 [x.set_fontsize(8) for x in texts]
753 plt.title('Control Group (reads)',fontsize=12)
754 labels = 'miRNA RefSeq','Template', 'Unmapped','non-template'
755 sizes = [t_mat_counts, t_tem_counts, t_unmap_counts, t_non_counts]
756 colors = ['gold', 'yellowgreen', 'lightcoral', 'lightskyblue']
757 ax2 = plt.subplot2grid((1,2),(0,1))
758 patches, texts, autotexts=plt.pie(sizes, labels=labels, colors=colors, startangle=140,autopct='%1.1f%%',radius=0.8)
759 [x.set_fontsize(8) for x in texts]
760 plt.title('Treated Group (reads)', fontsize=12)
761 plt.savefig('pie_non.png',dpi=300)
762
763 ######################################################################################################################################################
764
765 def merging_names(LH2E_copy,new):
766
767 dupes=[]
768 final_LH2E =[]
769
770 for num in range(len(LH2E_copy)):
771
772 if LH2E_copy[num][1] not in final_LH2E and LH2E_copy[num][0] not in final_LH2E:
773 final_LH2E.append(LH2E_copy[num][1])
774 final_LH2E.append(LH2E_copy[num][0])
775 else:
776 dupes.append(LH2E_copy[num][1])
777
778 dupes=list(set(dupes))
779
780 for i in range(len(dupes)):
781 dupes[i]=[dupes[i]]
782
783 for x in LH2E_copy:
784 for y in dupes:
785 if x[1]==y[0]:
786 fl=0
787 if len(y)==1:
788 y.append(x[0])
789 else:
790 for i in range(1,len(y)):
791 if y[i].split("_")[0]==x[0].split("_")[0]:
792 fl=1
793 if len(x[0])<len(y[i]):
794 del y[i]
795 y.append(x[0])
796 break
797
798 if fl==0:
799 y.append((x[0]))
800
801 for y in dupes:
802 if len(y)>2:
803 for i in range(len(y)-1,1,-1):
804 y[1]=y[1]+"/"+y[i]
805 del y[i]
806
807
808 for x in LH2E_copy:
809 for y in dupes:
810 if x[1]==y[0]:
811 x[0]=y[1]
812
813 LH2E_copy.sort()
814 LH2E_copy=list(LH2E_copy for LH2E_copy,_ in itertools.groupby(LH2E_copy))
815
816 new.extend(LH2E_copy)
817
818
819 ######################################################################################################################################################
820 def pie_temp(merge_LH2E,c_unmap,c_unmap_counts,merge_LH8E,t_unmap,t_unmap_counts):
821
822 c_samples=[[x[0],x[1],sum(int(i) for i in x[2:])] for x in merge_LH2E]
823 t_samples=[[x[0],x[1],sum(int(i) for i in x[2:])] for x in merge_LH8E]
824
825 c_templ = 0
826 c_tem_counts = 0
827 c_mature = 0
828 c_mat_counts = 0
829 t_templ = 0
830 t_tem_counts = 0
831 t_mature = 0
832 t_mat_counts = 0
833
834 for x in c_samples:
835
836 if "/" not in x[0]:
837 if "chr" in x[0].split("_")[-1]:
838 c_mature+=1
839 c_mat_counts += x[2]
840 else:
841 c_templ+=1
842 c_tem_counts += x[2]
843 else:
844 f=0
845 for y in x[0].split("/"):
846 if "chr" in y.split("_")[-1]:
847 c_mature+=1
848 c_mat_counts += x[2]
849 f=1
850 break
851 if f==0:
852 c_templ+=1
853 c_tem_counts += x[2]
854
855 for x in t_samples:
856
857 if "/" not in x[0]:
858 if "chr" in x[0].split("_")[-1]:
859 t_mature+=1
860 t_mat_counts += x[2]
861 else:
862 t_templ+=1
863 t_tem_counts += x[2]
864 else:
865 f=0
866 for y in x[0].split("/"):
867 if "chr" in y.split("_")[-1]:
868 t_mature+=1
869 t_mat_counts += x[2]
870 f=1
871 break
872 if f==0:
873 t_templ+=1
874 t_tem_counts += x[2]
875
876
877 fig = plt.figure()
878 labels = 'miRNA RefSeq','Template', 'Unmapped'
879 sizes = [c_mat_counts, c_tem_counts, c_unmap_counts]
880 colors = ['gold', 'yellowgreen', 'lightskyblue']
881 explode = (0.2, 0.05, 0.1)
882 ax1 = plt.subplot2grid((1,2),(0,0))
883 patches, texts, autotexts=plt.pie(sizes, labels=labels, colors=colors, startangle=140,autopct='%1.1f%%',radius=0.8)
884 [x.set_fontsize(8) for x in texts]
885 plt.title('Control group (reads)', fontsize=12)
886 labels = 'miRNA RefSeq','Template', 'Unmapped'
887 sizes = [t_mat_counts, t_tem_counts, t_unmap_counts]
888 colors = ['gold', 'yellowgreen', 'lightskyblue']
889 explode = (0.2, 0.05, 0.1)
890 ax2 = plt.subplot2grid((1,2),(0,1))
891 patches, texts, autotexts=plt.pie(sizes, labels=labels, colors=colors, startangle=140,autopct='%1.1f%%',radius=0.8)
892 [x.set_fontsize(8) for x in texts]
893 plt.title('Treated group (reads)',fontsize = 12)
894 plt.savefig('pie_tem.png',dpi=300)
895
896 ###################################################################################################################################################################################################################
897
898 def make_spider(merge_LH2E,merge_LH8E):
899
900 c_samples=[[x[0],x[1],sum(int(i) for i in x[2:])] for x in merge_LH2E]
901 t_samples=[[x[0],x[1],sum(int(i) for i in x[2:])] for x in merge_LH8E]
902
903 c_5 = 0
904 c_5_counts = 0
905 c_3 = 0
906 c_3_counts = 0
907 c_both =0
908 c_both_counts=0
909 c_mature = 0
910 c_mat_counts = 0
911 c_exception=0
912 c_exception_counts=0
913
914
915 t_5 = 0
916 t_5_counts = 0
917 t_3 = 0
918 t_3_counts = 0
919 t_both = 0
920 t_both_counts = 0
921 t_mature = 0
922 t_mat_counts = 0
923 t_exception = 0
924 t_exception_counts=0
925
926 for x in c_samples:
927
928 if "/" not in x[0]:
929 if "chr" in x[0].split("_")[-1]:
930 c_mature+=1
931 c_mat_counts += x[2]
932 elif 0 == int(x[0].split("_")[-1]):
933 c_5+=1
934 c_5_counts += x[2]
935 elif 0 == int(x[0].split("_")[-2]):
936 c_3+=1
937 c_3_counts += x[2]
938 else:
939 c_both+=1
940 c_both_counts+=x[2]
941
942 else:
943 f=0
944 for y in x[0].split("/"):
945 if "chr" in y.split("_")[-1]:
946 c_mature+=1
947 c_mat_counts += x[2]
948 f=1
949 break
950 if f==0:
951 for y in x[0].split("/"):
952 c_exception+=1
953 c_exception_counts += x[2]
954
955
956 for x in t_samples:
957
958 if "/" not in x[0]:
959 if "chr" in x[0].split("_")[-1]:
960 t_mature+=1
961 t_mat_counts += x[2]
962 elif 0 == int(x[0].split("_")[-1]):
963 t_5+=1
964 t_5_counts += x[2]
965 elif 0 == int(x[0].split("_")[-2]):
966 t_3+=1
967 t_3_counts += x[2]
968 else:
969 t_both+=1
970 t_both_counts+=x[2]
971
972 else:
973 f=0
974 for y in x[0].split("/"):
975 if "chr" in y.split("_")[-1]:
976 t_mature+=1
977 t_mat_counts += x[2]
978 f=1
979 break
980 if f==0:
981 for y in x[0].split("/"):
982 t_exception+=1
983 t_exception_counts += x[2]
984
985
986 c_all = c_5+c_3+c_both+c_mature+c_exception
987 c_all_counts = c_5_counts + c_3_counts + c_both_counts + c_mat_counts + c_exception_counts
988
989 t_all = t_5+t_3+t_both+t_mature + t_exception
990 t_all_counts = t_5_counts + t_3_counts + t_both_counts + t_mat_counts + t_exception_counts
991
992 c_5 = round(c_5/c_all*100,2)
993 c_3 = round(c_3/c_all*100,2)
994 c_both = round(c_both/c_all*100,2)
995 c_mature = round(c_mature/c_all*100,2)
996 c_exception = round(c_exception/c_all*100,2)
997
998 c_5_counts = round(c_5_counts/c_all_counts*100,2)
999 c_3_counts = round(c_3_counts/c_all_counts*100,2)
1000 c_both_counts = round(c_both_counts/c_all_counts*100,2)
1001 c_mat_counts = round(c_mat_counts/c_all_counts*100,2)
1002 c_exception_counts = round(c_exception_counts/c_all_counts*100,2)
1003
1004 t_5 = round(t_5/t_all*100,2)
1005 t_3 = round(t_3/t_all*100,2)
1006 t_both = round(t_both/t_all*100,2)
1007 t_mature = round(t_mature/t_all*100,2)
1008 t_exception = round(t_exception/t_all*100,2)
1009
1010 t_5_counts = round(t_5_counts/t_all_counts*100,2)
1011 t_3_counts = round(t_3_counts/t_all_counts*100,2)
1012 t_both_counts = round(t_both_counts/t_all_counts*100,2)
1013 t_mat_counts = round(t_mat_counts/t_all_counts*100,2)
1014 t_exception_counts = round(t_exception_counts/t_all_counts*100,2)
1015
1016 radar_max = max(c_5, c_3, c_both,c_mature,c_exception,t_5,t_3,t_both,t_mature,t_exception)
1017 radar_max_counts = max(c_5_counts,c_3_counts,c_both_counts,c_mat_counts,c_exception_counts,t_5_counts,t_3_counts,t_both_counts,t_mat_counts,t_exception_counts)
1018
1019 df=pd.DataFrame({
1020 'group':['Controls','Treated'],
1021 """5' and 3' isomiRs""":[c_both,t_both],
1022 """3' isomiRs""":[c_3,t_3],
1023 'miRNA RefSeq':[c_mature,t_mature],
1024 """5' isomiRs""":[c_5,t_5],
1025 'Others*':[c_exception,t_exception]})
1026
1027 df1=pd.DataFrame({
1028 'group':['Controls','Treated'],
1029 """5' and 3' isomiRs""":[c_both_counts,t_both_counts],
1030 """3' isomiRs""":[c_3_counts,t_3_counts],
1031 'miRNA RefSeq':[c_mat_counts,t_mat_counts],
1032 """5' isomiRs""":[c_5_counts,t_5_counts],
1033 'Others*':[c_exception_counts,t_exception_counts]})
1034
1035 spider_last(df,radar_max,1)
1036 spider_last(df1,radar_max_counts,2)
1037
1038
1039 def spider_last(df,radar_max,flag):
1040 # ------- PART 1: Create background
1041 fig = plt.figure()
1042 # number of variable
1043 categories=list(df)[1:]
1044 N = len(categories)
1045
1046 # What will be the angle of each axis in the plot? (we divide the plot / number of variable)
1047 angles = [n / float(N) * 2 * pi for n in range(N)]
1048 angles += angles[:1]
1049
1050 # Initialise the spider plot
1051 ax = plt.subplot(111, polar=True)
1052
1053 # If you want the first axis to be on top:
1054 ax.set_theta_offset(pi/2)
1055 ax.set_theta_direction(-1)
1056
1057 # Draw one axe per variable + add labels labels yet
1058 plt.xticks(angles[:-1], categories, fontsize=11)
1059
1060 # Draw ylabels
1061 radar_max=round(radar_max+radar_max*0.1)
1062 mul=len(str(radar_max))-1
1063 maxi=int(math.ceil(radar_max / pow(10,mul))) * pow(10,mul)
1064 sep = round(maxi/4)
1065 plt.yticks([sep, 2*sep, 3*sep, 4*sep, 5*sep], [str(sep)+'%', str(2*sep)+'%', str(3*sep)+'%', str(4*sep)+'%', str(5*sep)+'%'], color="grey", size=10)
1066 plt.ylim(0, maxi)
1067
1068 # ------- PART 2: Add plots
1069
1070 # Plot each individual = each line of the data
1071 # I don't do a loop, because plotting more than 3 groups makes the chart unreadable
1072
1073 # Ind1
1074 values=df.loc[0].drop('group').values.flatten().tolist()
1075 values += values[:1]
1076 ax.plot(angles, values,'-o', linewidth=1, linestyle='solid', label="Controls")
1077 ax.fill(angles, values, 'b', alpha=0.1)
1078
1079 # Ind2
1080 values=df.loc[1].drop('group').values.flatten().tolist()
1081 values += values[:1]
1082 ax.plot(angles, values, '-o' ,linewidth=1, linestyle='solid', label="Treated")
1083 ax.fill(angles, values, 'r', alpha=0.1)
1084
1085 # Add legend
1086 if flag==1:
1087 plt.legend(loc='upper right', bbox_to_anchor=(0.0, 0.1))
1088 plt.savefig('spider_non_red.png',dpi=300)
1089 else:
1090 plt.legend(loc='upper right', bbox_to_anchor=(0.0, 0.1))
1091 plt.savefig('spider_red.png',dpi=300)
1092
1093
1094 #############################################################################################################################################################################################################
1095 def hist_red(samples,flag):
1096 lengths=[]
1097 cat=[]
1098 total_reads=0
1099 seq=[]
1100
1101 if flag == "c":
1102 title = "Length Distribution of Control group (Redudant reads)"
1103 if flag == "t":
1104 title = "Length Distribution of Treated group (Redudant reads)"
1105
1106 for i in samples:
1107 for x in i:
1108 lengths.append(len(x[9]))
1109 if x[1]=="0":
1110 seq.append([x[9],x[0].split("-")[1],"Mapped"])
1111 cat.append("Mapped")
1112 if x[1] == "4":
1113 seq.append([x[9],x[0].split("-")[1],"Unmapped"])
1114 cat.append("Unmapped")
1115
1116 uni_len=list(set(lengths))
1117 uni_len=[x for x in uni_len if x<=35]
1118 low=min(lengths)
1119 up=max(lengths)
1120 seq.sort()
1121 uni_seq=list(seq for seq,_ in itertools.groupby(seq))
1122 dim=up-low
1123
1124 if dim>20:
1125 s=5
1126 else:
1127 s=8
1128
1129 total_reads+=sum([int(x[1]) for x in uni_seq])
1130
1131 map_reads=[]
1132 unmap_reads=[]
1133 length=[]
1134 for y in uni_len:
1135 map_temp=0
1136 unmap_temp=0
1137 for x in uni_seq:
1138 if len(x[0])==y and x[2]=="Mapped":
1139 map_temp+=int(x[1])
1140 if len(x[0])==y and x[2]=="Unmapped":
1141 unmap_temp+=int(x[1])
1142 if y<=35:
1143 length.append(y)
1144 map_reads.append(round(map_temp/total_reads*100,2))
1145 unmap_reads.append(round(unmap_temp/total_reads*100,2))
1146
1147 ylim=max([sum(x) for x in zip(unmap_reads, map_reads)])
1148 ylim=ylim+ylim*20/100
1149 fig, ax = plt.subplots()
1150 width=0.8
1151 ax.bar(length, unmap_reads, width, label='Unmapped')
1152 h=ax.bar(length, map_reads, width, bottom=unmap_reads, label='Mapped')
1153 plt.xticks(np.arange(length[0], length[-1]+1, 1))
1154 plt.xlabel('Length (nt)',fontsize=14)
1155 plt.ylabel('Percentage',fontsize=14)
1156 plt.title(title,fontsize=14)
1157 ax.legend()
1158 plt.ylim([0, ylim])
1159 ax.grid(axis='y',linewidth=0.2)
1160
1161 if flag=='c':
1162 plt.savefig('c_hist_red.png',dpi=300)
1163
1164 if flag=='t':
1165 plt.savefig('t_hist_red.png',dpi=300)
1166
1167 #################################################################################################################
1168
1169
1170 def logo_seq_red(merge, flag):
1171
1172 if flag=="c":
1173 titlos="Control group (Redundant)"
1174 file_logo="c_logo.png"
1175 file_bar="c_bar.png"
1176 if flag=="t":
1177 titlos="Treated group (Redundant)"
1178 file_logo="t_logo.png"
1179 file_bar="t_bar.png"
1180
1181 c_samples=[[x[0],x[1],sum(int(i) for i in x[2:])] for x in merge]
1182
1183 A=[0]*5
1184 C=[0]*5
1185 G=[0]*5
1186 T=[0]*5
1187 total_reads=0
1188
1189 for y in c_samples:
1190 if "/" in y[0]:
1191 length=[]
1192 for x in y[0].split("/"):
1193 length.append([len(x.split("_")[-1]),x.split("_")[-1],y[2]])
1194
1195 best=min(length)
1196 total_reads+=best[2]
1197 for i in range(5):
1198 if i<len(best[1]):
1199 if best[1][i] == "A":
1200 A[i]+=best[2]
1201 elif best[1][i] == "C":
1202 C[i]+=best[2]
1203 elif best[1][i] == "G":
1204 G[i]+=best[2]
1205 else:
1206 T[i]+=best[2]
1207 else:
1208 total_reads+=y[2]
1209 for i in range(5):
1210 if i<len(y[0].split("_")[-1]):
1211 if y[0].split("_")[-1][i] == "A":
1212 A[i]+=(y[2])
1213 elif y[0].split("_")[-1][i] == "C":
1214 C[i]+=(y[2])
1215 elif y[0].split("_")[-1][i] == "G":
1216 G[i]+=(y[2])
1217 else:
1218 T[i]+=y[2]
1219
1220 A[:] = [round(x*100,1) / total_reads for x in A]
1221 C[:] = [round(x*100,1) / total_reads for x in C]
1222 G[:] = [round(x*100,1) / total_reads for x in G]
1223 T[:] = [round(x*100,1) / total_reads for x in T]
1224
1225
1226
1227 data = {'A':A,'C':C,'G':G,'T':T}
1228 df = pd.DataFrame(data, index=[1,2,3,4,5])
1229 h=df.plot.bar(color=tuple(["g", "b","gold","r"]) )
1230 h.grid(axis='y',linewidth=0.2)
1231 plt.xticks(rotation=0, ha="right")
1232 plt.ylabel("Counts (%)",fontsize=18)
1233 plt.xlabel("Positions (nt)",fontsize=18)
1234 plt.title(titlos,fontsize=20)
1235 plt.tight_layout()
1236 plt.savefig(file_bar, dpi=300)
1237
1238 import logomaker as lm
1239 crp_logo = lm.Logo(df, font_name = 'DejaVu Sans')
1240 crp_logo.style_spines(visible=False)
1241 crp_logo.style_spines(spines=['left', 'bottom'], visible=True)
1242 crp_logo.style_xticks(rotation=0, fmt='%d', anchor=0)
1243
1244 # style using Axes methods
1245 crp_logo.ax.set_title(titlos,fontsize=18)
1246 crp_logo.ax.set_ylabel("Counts (%)", fontsize=16,labelpad=5)
1247 crp_logo.ax.set_xlabel("Positions (nt)",fontsize=16, labelpad=5)
1248 crp_logo.ax.xaxis.set_ticks_position('none')
1249 crp_logo.ax.xaxis.set_tick_params(pad=-1)
1250 figure = plt.gcf()
1251 figure.set_size_inches(6, 4)
1252 crp_logo.fig.savefig(file_logo,dpi=300)
1253
1254 ##########################################################################################################################################################################################################
1255
1256
1257
1258 def logo_seq_non_red(merge_LH2E):
1259
1260 c_samples=[[x[0],x[1],sum(int(i) for i in x[2:])] for x in merge_LH2E]
1261
1262 A=[0]*5
1263 C=[0]*5
1264 G=[0]*5
1265 T=[0]*5
1266
1267 for y in c_samples:
1268 if "/" in y[0]:
1269 length=[]
1270 for x in y[0].split("/"):
1271 length.append([len(x.split("_")[-1]),x.split("_")[-1],y[2]])
1272
1273 best=min(length)
1274 for i in range(5):
1275 if i<len(best[1]):
1276 if best[1][i] == "A":
1277 A[i]+=1
1278 elif best[1][i] == "C":
1279 C[i]+=1
1280 elif best[1][i] == "G":
1281 G[i]+=1
1282 else:
1283 T[i]+=1
1284 else:
1285 for i in range(5):
1286 if i<len(y[0].split("_")[-1]):
1287 if y[0].split("_")[-1][i] == "A":
1288 A[i]+=1
1289 elif y[0].split("_")[-1][i] == "C":
1290 C[i]+=1
1291 elif y[0].split("_")[-1][i] == "G":
1292 G[i]+=1
1293 else:
1294 T[i]+=1
1295
1296 data = {'A':A,'C':C,'G':G,'T':T}
1297 df = pd.DataFrame(data, index=[1,2,3,4,5])
1298 h=df.plot.bar(title="Non-templated nucleotides after templated sequence",color=tuple(["g", "b","gold","r"]))
1299 h.set_xlabel("Positions (nt)")
1300 h.set_ylabel("Unique sequences")
1301 plt.xticks(rotation=0, ha="right")
1302 plt.tight_layout()
1303 plt.savefig("bar2.png", dpi=300)
1304
1305
1306 import logomaker as lm
1307 crp_logo = lm.Logo(df, font_name = 'DejaVu Sans')
1308
1309 # style using Logo methods
1310 crp_logo.style_spines(visible=False)
1311 crp_logo.style_spines(spines=['left', 'bottom'], visible=True)
1312 crp_logo.style_xticks(rotation=0, fmt='%d', anchor=0)
1313
1314 # style using Axes methods
1315 crp_logo.ax.set_ylabel("Unique sequences", labelpad=5)
1316 crp_logo.ax.set_xlabel("Positions (nt)", labelpad=5)
1317 crp_logo.ax.xaxis.set_ticks_position('none')
1318 crp_logo.ax.xaxis.set_tick_params(pad=-1)
1319 crp_logo.ax.set_title("Non-redundant")
1320 figure = plt.gcf()
1321 crp_logo.fig.savefig('logo2.png', dpi=300)
1322
1323
1324 ###################################################################################################################################################################################################################
1325
1326 def ssamples1(tem_names,tem_samp,non_names,non_samp,folder,pro):
1327
1328 for i in range(2,len(tem_samp[0])):
1329
1330 fp = open(folder+tem_names[i-2]+'.txt','w')
1331 fp.write("miRNA id"+"\t"+tem_names[i-2]+"\n")
1332
1333 for x in tem_samp:
1334 fp.write("%s" % "\t".join([x[0],x[i]])+"\n")
1335
1336 for j in range(len(non_names)):
1337 if non_names[j]==tem_names[i-2]:
1338 for x in non_samp:
1339 fp.write("%s" % "\t".join([x[0],x[j+2]])+"\n")
1340 fp.close()
1341
1342 ###################################################################################################################################################################################################################
1343
1344 def download_matures(matures,org_name):
1345
1346 #url = 'ftp://mirbase.org/pub/mirbase/21/mature.fa.gz'
1347 url = 'ftp://mirbase.org/pub/mirbase/CURRENT/mature.fa.gz'
1348 data = urllib.request.urlopen(url).read()
1349 file_mirna = gzip.decompress(data).decode('utf-8')
1350 file_mirna = file_mirna.split("\n")
1351
1352 for i in range(0,len(file_mirna)-1,2):
1353
1354 if org_name in file_mirna[i]:
1355 matures.append(file_mirna[i])
1356 matures.append(file_mirna[i+1])
1357
1358 ###################################################################################################################################################################################################################
1359 def non_template_ref(sc,st,all_isoforms):
1360
1361 pre_uni_seq_con = list(sc)
1362 pre_uni_seq_tre = list(st)
1363
1364 for x in pre_uni_seq_con:
1365 for y in x:
1366 if ">"+y[2] not in all_isoforms and ")_" in y[2] :
1367 all_isoforms.append(">"+y[2])
1368 all_isoforms.append(y[9])
1369
1370
1371 for x in pre_uni_seq_tre:
1372 for y in x:
1373 if ">"+y[2] not in all_isoforms and ")_" in y[2]:
1374 all_isoforms.append(">"+y[2])
1375 all_isoforms.append(y[9])
1376
1377 ################################################################################################################################################################################################
1378
1379 def deseqe2(sample,mir_names,l,new_d,sample_name,sample_order):
1380
1381 for y in mir_names:
1382 flag=0
1383 for x in sample:
1384 if y[0]==x[0]:
1385 flag=1
1386 break
1387 if flag==0:
1388 sample.append([y[0],"0",y[1]])
1389
1390 sample.sort(key=lambda x: x[0])
1391 sample=list(sample for sample,_ in itertools.groupby(sample))
1392
1393 l.acquire()
1394 new_d.append(sample)
1395 sample_order.append(sample_name)
1396 l.release()
1397
1398 ###############################################################################################################################################################################################
1399
1400 if __name__ == '__main__':
1401
1402 starttime = time.time()
1403
1404 q1 = Queue()
1405 q2 = Queue()
1406 lock = Lock()
1407 manager = Manager()
1408
1409 mature_mirnas=manager.list()
1410 ps_mature=Process(target=download_matures,args=(mature_mirnas,args.org_name))
1411 ps_mature.start()
1412
1413 args.control[0]=args.control[0][1:]
1414 args.control[len(args.control)-1][:-1]
1415 control = [(args.control[i:i+2]) for i in range(0, len(args.control), 2)]
1416
1417 args.treated[0]=args.treated[0][1:]
1418 args.treated[len(args.treated)-1][:-1]
1419 treated = [(args.treated[i:i+2]) for i in range(0, len(args.treated), 2)]
1420
1421
1422 ############## Detection of templated isoforms ################
1423
1424 radar = manager.list([0,0,0,0])
1425 samples = manager.list()
1426 data= manager.list()
1427 names_con=manager.list()
1428 samples_mirna_names=manager.list()
1429 deseq=manager.list()
1430 unmap_seq=manager.Value('i',0)
1431 unmap_counts=manager.Value('i',0)
1432 LH2E_names=manager.list()
1433 ini_c_samples = manager.list()
1434
1435
1436 radar1 = manager.list([0,0,0,0])
1437 samples1 = manager.list()
1438 data1 = manager.list()
1439 names_tre = manager.list()
1440 samples_mirna_names1=manager.list()
1441 deseq1=manager.list()
1442 unmap1_seq = manager.Value('i',0)
1443 unmap1_counts = manager.Value('i',0)
1444 LH8E_names=manager.list()
1445 ini_t_samples = manager.list()
1446 ps_mature.join()
1447
1448
1449 mature_mirnas=list(mature_mirnas)
1450
1451
1452 starttime1 = time.time()
1453 ps_sam = [Process(target=sam,args=(mature_mirnas,path[1][:-1],path[0].split(",")[0],"c",lock,samples,data,names_con,unmap_seq,samples_mirna_names,deseq,LH2E_names,"0",ini_c_samples,unmap_counts)) for path in control]
1454 ps_sam.extend([Process(target=sam,args=(mature_mirnas,path[1][:-1],path[0].split(",")[0],"t",lock,samples1,data1,names_tre,unmap1_seq,samples_mirna_names1,deseq1,LH8E_names,"0",ini_t_samples,unmap1_counts)) for path in treated])
1455
1456 [p.start() for p in ps_sam]
1457 [p.join() for p in ps_sam]
1458 print('SAM took {} seconds'.format(time.time() - starttime1))
1459
1460 ps_hist=[Process(target=hist_red,args=(ini_c_samples,'c'))]
1461 ps_hist.extend([Process(target=hist_red,args=(ini_t_samples,'t'))])
1462 [x.start() for x in ps_hist]
1463
1464 starttime200=time.time()
1465
1466 sc = list(samples)
1467 st = list(samples1)
1468
1469 names_con=list(names_con)
1470 names_tre=list(names_tre)
1471 samples_mirna_names=list(samples_mirna_names)
1472 samples_mirna_names.sort()
1473 samples_mirna_names=list(samples_mirna_names for samples_mirna_names,_ in itertools.groupby(samples_mirna_names))
1474
1475 samples_mirna_names1=list(samples_mirna_names1)
1476 samples_mirna_names1.sort()
1477 samples_mirna_names1=list(samples_mirna_names1 for samples_mirna_names1,_ in itertools.groupby(samples_mirna_names1))
1478
1479 deseq=list(deseq)
1480 deseq1=list(deseq1)
1481
1482 new_names_con=manager.list()
1483 new_names_tre=manager.list()
1484 new_deseq=manager.list()
1485 new_deseq1=manager.list()
1486 ps_deseq=[Process(target=deseqe2,args=(sampp,samples_mirna_names,lock,new_deseq,names_con[i],new_names_con)) for i,sampp in enumerate(deseq)]
1487 ps_deseq.extend([Process(target=deseqe2,args=(sampp,samples_mirna_names1,lock,new_deseq1,names_tre[i],new_names_tre)) for i,sampp in enumerate(deseq1)])
1488
1489 [z.start() for z in ps_deseq]
1490 [z.join() for z in ps_deseq]
1491 new_deseq=list(new_deseq)
1492 new_deseq1=list(new_deseq1)
1493
1494 LH2E=[[x[0],x[2]] for x in new_deseq[0]]
1495 [LH2E[i].append(y[i][1]) for i,_ in enumerate(LH2E) for y in new_deseq]
1496
1497 LH8E=[[x[0],x[2]] for x in new_deseq1[0]]
1498 [LH8E[i].append(y[i][1]) for i,_ in enumerate(LH8E) for y in new_deseq1]
1499
1500 print('Deseq took {} seconds'.format(time.time() - starttime200))
1501
1502 merg_nam_LH2E=manager.list()
1503 merg_nam_LH8E=manager.list()
1504
1505 LH2E_copy=copy.deepcopy(list(LH2E))
1506 LH8E_copy=copy.deepcopy(list(LH8E))
1507
1508 fil_sort_tre=manager.list()
1509 fil_sort_con=manager.list()
1510 raw_sort_tre=manager.list()
1511 raw_sort_con=manager.list()
1512
1513 ps_main = Process(target=main_temp,args=(list(LH2E), samples_mirna_names, list(LH8E), samples_mirna_names1,1,list(names_con),list(names_tre),fil_sort_tre,fil_sort_con,raw_sort_tre,raw_sort_con))
1514 ps_main.start()
1515
1516 if args.anal=="2":
1517 all_iso = manager.list()
1518 ps_non_iso = Process(target=non_template_ref,args=(sc,st,all_iso))
1519 ps_non_iso.start()
1520
1521 ps_merge = [Process(target=merging_names,args=(LH2E_copy,merg_nam_LH2E))]
1522 ps_merge.extend([Process(target=merging_names,args=(LH8E_copy,merg_nam_LH8E))])
1523 [x.start() for x in ps_merge]
1524 [x.join() for x in ps_merge]
1525
1526 merg_nam_LH2E=list(merg_nam_LH2E)
1527 merg_nam_LH8E=list(merg_nam_LH8E)
1528
1529 starttime2 = time.time()
1530 procs = [Process(target=DB_write,args=(x[0],x[1],x[2],x[3],1)) for x in data]
1531 procs.extend([Process(target=DB_write,args=(x[0],x[1],x[2],x[3],1)) for x in data1])
1532 procs.extend([Process(target=make_spider,args=(merg_nam_LH2E,merg_nam_LH8E))])
1533 if args.anal == "1":
1534 procs.extend([Process(target=pie_temp,args=(merg_nam_LH2E,unmap_seq.value,unmap_counts.value,merg_nam_LH8E,unmap1_seq.value,unmap1_counts.value))])
1535
1536 [p.start() for p in procs]
1537
1538
1539 if args.anal=="1":
1540 [x.join() for x in ps_hist]
1541 [p.join() for p in procs]
1542 ps_pdf = Process(target=pdf_before_DE,args=(args.anal))
1543 ps_pdf.start()
1544
1545 print('Graphs took {} seconds'.format(time.time() - starttime2))
1546
1547 ps_main.join()
1548
1549 fil_sort_con=list(fil_sort_con)
1550 fil_sort_tre=list(fil_sort_tre)
1551 if fil_sort_con==[]:
1552 fil_sort_con=raw_sort_con
1553 fil_sort_tre=raw_sort_tre
1554
1555 raw_sort_con=list(raw_sort_con)
1556 raw_sort_tre=list(raw_sort_tre)
1557 names_con=list(new_names_con)
1558 names_tre=list(new_names_tre)
1559
1560 ps_write = Process(target=write_main,args=(raw_sort_con, raw_sort_tre, fil_sort_con, fil_sort_tre, names_con,names_tre,1))
1561 ps_write.start()
1562
1563 ps1_matrix = [Process(target=ssamples,args=(names_con,fil_sort_con,"Diff/temp_con/",0))]
1564 ps1_matrix.extend([Process(target=ssamples,args=(names_tre,fil_sort_tre,"Diff/temp_tre/",0))])
1565 [p.start() for p in ps1_matrix]
1566
1567 if args.anal=="1":
1568 ps_pdf.join()
1569 if args.anal=="2":
1570 [p.join() for p in procs]
1571 [x.join() for x in ps_hist]
1572
1573 ps_write.join()
1574 [p.join() for p in ps1_matrix]
1575
1576
1577 ############################## Detection of Both #######################################
1578
1579 starttime10 = time.time()
1580
1581 if args.anal == "2":
1582
1583 n_data= manager.list()
1584 n_names_con=manager.list()
1585 n_samples_mirna_names=manager.list()
1586 n_deseq=manager.list()
1587 n_LH2E_names=manager.list()
1588
1589 n_data1 = manager.list()
1590 n_names_tre = manager.list()
1591 n_samples_mirna_names1=manager.list()
1592 n_deseq1=manager.list()
1593 n_LH8E_names=manager.list()
1594
1595
1596 new_mat_mirnas = list(mature_mirnas)
1597 ps_non_iso.join()
1598
1599 all_iso=list(all_iso)
1600 new_mat_mirnas.extend(all_iso)
1601
1602 starttime11=time.time()
1603
1604 ps_sam = [Process(target=non_sam,args=(new_mat_mirnas,path[1][:-1],path[0].split(",")[0],"c",lock,n_data,n_names_con,n_deseq,n_samples_mirna_names,n_LH2E_names)) for path in control]
1605 ps_sam.extend([Process(target=non_sam,args=(new_mat_mirnas,path[1][:-1],path[0].split(",")[0],"t",lock,n_data1,n_names_tre,n_deseq1,n_samples_mirna_names1,n_LH8E_names)) for path in treated])
1606
1607 [p.start() for p in ps_sam]
1608 [p.join() for p in ps_sam]
1609
1610 print('Non-sam took {} seconds'.format(time.time() - starttime11))
1611
1612 starttime12=time.time()
1613
1614 n_names_con=list(n_names_con)
1615 n_names_tre=list(n_names_tre)
1616 n_samples_mirna_names=list(n_samples_mirna_names)
1617 n_samples_mirna_names.sort()
1618 n_samples_mirna_names=list(n_samples_mirna_names for n_samples_mirna_names,_ in itertools.groupby(n_samples_mirna_names))
1619
1620 n_samples_mirna_names1=list(n_samples_mirna_names1)
1621 n_samples_mirna_names1.sort()
1622 n_samples_mirna_names1=list(n_samples_mirna_names1 for n_samples_mirna_names1,_ in itertools.groupby(n_samples_mirna_names1))
1623
1624 n_deseq=list(n_deseq)
1625 n_deseq1=list(n_deseq1)
1626
1627 new_n_names_con=manager.list()
1628 new_n_names_tre=manager.list()
1629 n_new_deseq=manager.list()
1630 n_new_deseq1=manager.list()
1631 ps_deseq=[Process(target=deseqe2,args=(sampp,n_samples_mirna_names,lock,n_new_deseq,n_names_con[i],new_n_names_con)) for i,sampp in enumerate(n_deseq)]
1632 ps_deseq.extend([Process(target=deseqe2,args=(sampp,n_samples_mirna_names1,lock,n_new_deseq1,n_names_tre[i],new_n_names_tre)) for i,sampp in enumerate(n_deseq1)])
1633
1634 [x.start() for x in ps_deseq]
1635 [x.join() for x in ps_deseq]
1636 n_new_deseq=list(n_new_deseq)
1637 n_new_deseq1=list(n_new_deseq1)
1638
1639 print([len(n_new_deseq[0]),len(n_new_deseq[1])])
1640 print([len(n_new_deseq1[0]),len(n_new_deseq1[1])])
1641
1642 n_LH2E=[[x[0],x[2]] for x in n_new_deseq[0]]
1643 [n_LH2E[i].append(y[i][1]) for i,_ in enumerate(n_LH2E) for y in n_new_deseq]
1644
1645 n_LH8E=[[x[0],x[2]] for x in n_new_deseq1[0]]
1646 [n_LH8E[i].append(y[i][1]) for i,_ in enumerate(n_LH8E) for y in n_new_deseq1]
1647
1648 print('Non-deseq took {} seconds'.format(time.time() - starttime12))
1649
1650 merg_nam_n_LH2E=manager.list()
1651 merg_nam_n_LH8E=manager.list()
1652
1653 n_LH2E_copy=copy.deepcopy(list(n_LH2E))
1654 n_LH8E_copy=copy.deepcopy(list(n_LH8E))
1655
1656 n_sort_tre=manager.list()
1657 n_sort_con=manager.list()
1658
1659 n_fil_sort_con=manager.list()
1660 n_fil_sort_tre=manager.list()
1661 n_raw_sort_con=manager.list()
1662 n_raw_sort_tre=manager.list()
1663
1664 ps_main = Process(target=main_temp,args=(list(n_LH2E), n_samples_mirna_names, list(n_LH8E), n_samples_mirna_names1,1,list(n_names_con),list(n_names_tre),n_fil_sort_tre,n_fil_sort_con,n_raw_sort_tre,n_raw_sort_con))
1665 ps_main.start()
1666
1667 ps_merge = [Process(target=merging_names,args=(n_LH2E_copy,merg_nam_n_LH2E))]
1668 ps_merge.extend([Process(target=merging_names,args=(n_LH8E_copy,merg_nam_n_LH8E))])
1669 [p.start() for p in ps_merge]
1670 [p.join() for p in ps_merge]
1671
1672 merg_nam_n_LH2E=list(merg_nam_n_LH2E)
1673 merg_nam_n_LH8E=list(merg_nam_n_LH8E)
1674
1675 procs = [Process(target=DB_write,args=(x[0],x[1],x[2],x[3],2)) for x in n_data]
1676 procs.extend([Process(target=DB_write,args=(x[0],x[1],x[2],x[3],2)) for x in n_data1])
1677 procs.extend([Process(target=logo_seq_red,args=(merg_nam_n_LH2E,'c'))])
1678 procs.extend([Process(target=logo_seq_red,args=(merg_nam_n_LH8E,'t'))])
1679 procs.extend([Process(target=pie_non_temp,args=(merg_nam_LH2E,merg_nam_n_LH2E,merg_nam_LH8E,merg_nam_n_LH8E,unmap_seq.value,unmap1_seq.value,unmap_counts.value,unmap1_counts.value))])
1680
1681 starttime13=time.time()
1682 [p.start() for p in procs]
1683 [p.join() for p in procs]
1684
1685 print('Graphs took {} seconds'.format(time.time() - starttime13))
1686
1687 procs1 = Process(target=pdf_before_DE,args=(args.anal))
1688 procs1.start()
1689
1690 starttime14=time.time()
1691 ps_main.join()
1692
1693 n_fil_sort_con=list(n_fil_sort_con)
1694 n_fil_sort_tre=list(n_fil_sort_tre)
1695 if n_fil_sort_con==[]:
1696 n_fil_sort_con=n_raw_sort_con
1697 n_fil_sort_tre=n_raw_sort_tre
1698
1699 n_raw_sort_con=list(n_raw_sort_con)
1700 n_raw_sort_tre=list(n_raw_sort_tre)
1701 n_names_con=list(new_n_names_con)
1702 n_names_tre=list(new_n_names_tre)
1703
1704 ps_write = Process(target=write_main,args=(n_raw_sort_con, n_raw_sort_tre,n_fil_sort_con, n_fil_sort_tre, n_names_con, n_names_tre,2))
1705 ps_write.start()
1706
1707 ps1_matrix = [Process(target=ssamples1,args=(n_names_con,n_fil_sort_con,names_con,fil_sort_con,"Diff/n_temp_con/",0))]
1708 ps1_matrix.extend([Process(target=ssamples1,args=(n_names_tre,n_fil_sort_tre,names_tre,fil_sort_tre,"Diff/n_temp_tre/",0))])
1709 [p.start() for p in ps1_matrix]
1710
1711 ps_write.join()
1712 [p.join() for p in ps1_matrix]
1713 procs1.join()
1714 print('That took {} seconds'.format(time.time() - starttime10))
1715 print('That took {} seconds'.format(time.time() - starttime))
1716
1717
1718
1719
1720
1721
1722
1723