comparison mirbase_ultra_v2.py @ 14:3ad9701c7749 draft

Uploaded
author glogobyte
date Thu, 22 Oct 2020 07:30:47 +0000
parents
children
comparison
equal deleted inserted replaced
13:f1b31ee97e91 14:3ad9701c7749
1 from mirbase_functions import *
2 from mirbase_graphs import *
3 import itertools
4 import time
5 import sys
6 import os
7 import urllib.request
8 import gzip
9 from multiprocessing import Process, Queue, Lock, Pool, Manager, Value
10 import subprocess
11 import argparse
12 from collections import OrderedDict
13 from matplotlib.backends.backend_pdf import PdfPages
14 import pandas as pd
15 from math import pi
16 import numpy as np
17 import matplotlib.pyplot as plt
18 from matplotlib.ticker import PercentFormatter
19 import seaborn as sns
20 import scipy.stats as stats
21 from plotnine import *
22 import math
23 import re
24 import matplotlib.ticker as mtick
25 import copy
26
27 subprocess.call(['mkdir','-p', 'split1','split2','split3','split4','split11','split12','Counts','Diff/temp_con','Diff/temp_tre','Diff/n_temp_con','Diff/n_temp_tre'])
28
29 parser = argparse.ArgumentParser()
30 parser.add_argument("-analysis", "--anal", help="choose type of analysis", action="store")
31 parser.add_argument("-con", "--control", help="input fastq file", nargs='+', default=[])
32 parser.add_argument("-tre", "--treated", help="input fastq file", nargs='+', default=[] )
33 parser.add_argument("-tool_dir", "--tool_directory", help="tool directory path", action="store")
34 parser.add_argument("-gen", "--org_name", help="tool directory path", action="store")
35 parser.add_argument("-program", "--pro", help="choose type of analysis", action="store")
36 parser.add_argument("-f", "--flag", help="choose the database", action="store")
37 parser.add_argument("-umis", "--umi", help="choose the database", action="store")
38 parser.add_argument("-percentage", "--per", help="choose the database", action="store")
39 parser.add_argument("-counts", "--count", help="choose the database", action="store")
40 parser.add_argument("-name1", "--n1", help="choose the database", action="store")
41 parser.add_argument("-name2", "--n2", help="choose the database", action="store")
42 args = parser.parse_args()
43
44 #########################################################################################################################################
45
46 def collapse_sam(path):
47
48 ini_sam=read(path,0)
49 main_sam = [x.rstrip("\n").split("\t") for x in ini_sam if "@" not in x.split("\t")[0]]
50 intro_sam = [x.rstrip("\n").split("\t") for x in ini_sam if "@" in x.split("\t")[0]]
51
52 uni_seq = []
53 for x in main_sam:
54
55 if [x[2], x[9]] not in uni_seq:
56 uni_seq.append([x[2], x[9]])
57
58 new_main_sam=[]
59 incr_num=0
60 for i in range(len(uni_seq)):
61 count=0
62 incr_num+=1
63 for y in main_sam:
64 if uni_seq[i][1]==y[9] and uni_seq[i][0]==y[2]:
65 count+=1
66 temp=y
67 temp[10]="~~~~~~~~~~~~~~~~~~~~~~~~~~~~~"
68 temp[0]=str(incr_num)+"-"+str(count)
69 new_main_sam.append(temp)
70
71 new_sam=intro_sam+new_main_sam
72
73 return new_sam
74
75 #################################################################################################################################################################################################################
76
77 def duplicate_chroms_isoforms(List):
78
79 dupes=[]
80
81 for num in range(len(List)):
82
83 if [List[num][9],List[num][0],List[num][2]] not in dupes :
84 dupes.append([List[num][9],List[num][0],List[num][2]])
85
86 for x in List:
87 for y in dupes:
88 if x[9]==y[0] and x[0]==y[1] and x[2].split("_")[0]==y[2].split("_")[0] and x[2]!=y[2]:
89 y.append(x[2])
90
91
92 double_List = [x[:] for x in List]
93
94 chr_order=[]
95 for x in dupes:
96 temp = []
97 for i in range(2,len(x)):
98 if x[i].split("chr")[1].split("(")[0].isdigit():
99 temp.append(int(x[i].split("chr")[1].split("(")[1][0]+x[i].split("chr")[1].split("(")[0]))
100 else:
101 temp.append(x[i].split("chr")[1][0:4])
102
103 for z in temp:
104 if 'X(-)'==z or 'Y(-)'==z or 'X(+)'==z or 'Y(+)'==z:
105 temp = [str(j) for j in temp]
106 temp=list(set(temp))
107 temp.sort()
108 chr_order.append(temp)
109
110 final_dupes=[]
111 for i in range(len(dupes)):
112 final_dupes.append([dupes[i][0],dupes[i][2].split("_")[0],dupes[i][1]])
113 for x in chr_order[i]:
114 result = re.match("[-+]?\d+$", str(x))
115 if len(chr_order[i]) == len(set(chr_order[i])):
116 if result is not None:
117
118 if int(x)<0:
119 final_dupes[i][1]=final_dupes[i][1]+"_chr"+str(abs(int(x)))+"(-)"
120 else:
121 final_dupes[i][1] = final_dupes[i][1] + "_chr" + str(abs(int(x)))+"(+)"
122 else:
123 final_dupes[i][1] = final_dupes[i][1] + "_chr" + str(x)
124 else:
125 if result is not None:
126 if int(x) < 0:
127 final_dupes[i][1] = final_dupes[i][1] +dupes[i][2].split("_")[1]+ "_chr" + str(abs(int(x))) + "(-)"
128 else:
129 final_dupes[i][1] = final_dupes[i][1] +dupes[i][2].split("_")[1]+ "_chr" + str(abs(int(x))) + "(+)"
130 else:
131 final_dupes[i][1] = final_dupes[i][1] +dupes[i][2].split("_")[1]+ "_chr" + str(x)
132
133 final_dupes.sort()
134 final_dupes=list(final_dupes for final_dupes,_ in itertools.groupby(final_dupes))
135
136 for i in range(len(double_List)):
137 for x in final_dupes:
138
139 if double_List[i][9] == x[0] and double_List[i][0] == x[2] and len(double_List[i][2].split("_")) >3 and double_List[i][2].split("_")[0]==x[1].split("_")[0]:
140 gg=str("_"+double_List[i][2].split("_")[-2]+"_"+double_List[i][2].split("_")[-1])
141 double_List[i][2] = x[1]+gg
142
143 if double_List[i][9]==x[0] and double_List[i][0]== x[2] and len(double_List[i][2].split("_"))==3 and double_List[i][2].split("_")[0]==x[1].split("_")[0]:
144 double_List[i][2]=x[1]
145 List[i][2] = x[1]
146
147 List.sort()
148 new_list=list(List for List,_ in itertools.groupby(List))
149
150 double_List.sort()
151 new_double_List = list(double_List for double_List, _ in itertools.groupby(double_List))
152
153 return new_list, new_double_List
154
155
156 #############################################################################################################################################################################################################
157
158 def sam(mature_mirnas,path,name,con,l,samples,data,names,unmap_seq,samples_mirna_names,deseq,LHE_names,umi,ini_sample,unmap_counts):
159
160 # read the sam file
161 ini_sam=read(path,0)
162 new_main_sam = [x.rstrip("\n").split("\t") for x in ini_sam if "@" not in x.split("\t")[0]]
163 unique_seq = [x for x in new_main_sam if x[1] == '0' and len(x[9])>=18 and len(x[9])<=26]
164
165 sorted_uni_arms = []
166
167 for i in range(len(mature_mirnas)):
168 tmp_count_reads = 0 # calculate the total number of reads
169 tmp_count_seq = 0 # calculate the total number of sequences
170 for j in range(len(unique_seq)):
171
172 if "{" in unique_seq[j][2].split("_")[0]:
173 official=unique_seq[j][2].split("_")[0][:-4]
174 else:
175 official=unique_seq[j][2].split("_")[0]
176
177 if mature_mirnas[i].split(" ")[0][1:] == official:
178
179 temp_mature = mature_mirnas[i+1].strip().replace("U", "T")
180 off_part = longestSubstring(temp_mature, unique_seq[j][9])
181
182 mat_diff = temp_mature.split(off_part)
183 mat_diff = [len(mat_diff[0]), len(mat_diff[1])]
184
185 unique_diff = unique_seq[j][9].split(off_part)
186 unique_diff = [len(unique_diff[0]), len(unique_diff[1])]
187
188 # Problem with hsa-miR-8485
189 if mat_diff[1]!=0 and unique_diff[1]!=0:
190 unique_seq[j]=1
191 pre_pos = 0
192 post_pos = 0
193
194 elif mat_diff[0]!=0 and unique_diff[0]!=0:
195 unique_seq[j]=1
196 pre_pos = 0
197 post_pos = 0
198
199 else:
200 pre_pos = mat_diff[0]-unique_diff[0]
201 post_pos = unique_diff[1]-mat_diff[1]
202 tmp_count_reads = tmp_count_reads + int(unique_seq[j][0].split("-")[1])
203 tmp_count_seq = tmp_count_seq+1
204
205 if pre_pos != 0 or post_pos != 0:
206 if pre_pos == 0:
207 unique_seq[j][2] = unique_seq[j][2] + "_" +str(pre_pos) + "_" + '{:+d}'.format(post_pos)
208 elif post_pos == 0:
209 unique_seq[j][2] = unique_seq[j][2] + "_" + '{:+d}'.format(pre_pos) + "_" + str(post_pos)
210 else:
211 unique_seq[j][2] = unique_seq[j][2]+"_"+'{:+d}'.format(pre_pos)+"_"+'{:+d}'.format(post_pos)
212
213 for x in range(unique_seq.count(1)):
214 unique_seq.remove(1)
215 if tmp_count_reads != 0 and tmp_count_seq != 0:
216 sorted_uni_arms.append([mature_mirnas[i].split(" ")[0][1:], tmp_count_seq, tmp_count_reads])
217 sorted_uni_arms = sorted(sorted_uni_arms, key=lambda x: x[1], reverse=True)
218 dedup_unique_seq,double_fil_uni_seq=duplicate_chroms_isoforms(unique_seq)
219
220 for y in sorted_uni_arms:
221 counts=0
222 seqs=0
223 for x in double_fil_uni_seq:
224 if y[0]==x[2].split("_")[0]:
225 counts+=int(x[0].split("-")[1])
226 seqs+=1
227
228 y[1]=seqs
229 y[2]=counts
230
231 LHE=[]
232 l.acquire()
233 if con=="c":
234 LHE.extend(z[2] for z in double_fil_uni_seq)
235 for y in double_fil_uni_seq:
236 samples_mirna_names.append([y[2],y[9]])
237 deseq.append([[x[2], x[0].split('-')[1], x[9]] for x in double_fil_uni_seq])
238 LHE_names.extend(LHE)
239 unmap_seq.value += sum([1 for x in new_main_sam if x[1] == '4'])
240 unmap_counts.value += sum([int(x[0].split("-")[1]) for x in new_main_sam if x[1] == '4'])
241 names.append(name)
242 samples.append(dedup_unique_seq)
243 data.append([con,name,double_fil_uni_seq,sorted_uni_arms])
244 ini_sample.append(new_main_sam)
245
246 if con=="t":
247 LHE.extend(z[2] for z in double_fil_uni_seq)
248 for y in double_fil_uni_seq:
249 samples_mirna_names.append([y[2],y[9]])
250 deseq.append([[x[2], x[0].split('-')[1], x[9]] for x in double_fil_uni_seq])
251 LHE_names.extend(LHE)
252 unmap_seq.value += sum([1 for x in new_main_sam if x[1] == '4'])
253 unmap_counts.value += sum([int(x[0].split("-")[1]) for x in new_main_sam if x[1] == '4'])
254 names.append(name)
255 samples.append(dedup_unique_seq)
256 data.append([con,name,double_fil_uni_seq,sorted_uni_arms])
257 ini_sample.append(new_main_sam)
258 l.release()
259
260
261 ######################################################################################################################################
262 """
263
264 Read a sam file from Bowtie and do the followings:
265
266 1) Remove reverse stranded mapped reads
267 2) Remove unmapped reads
268 3) Remove all sequences with reads less than 11 reads
269 4) Sort the arms with the most sequences in decreading rate
270 5) Sort the sequences of every arm with the most reads in decreasing rate
271 6) Calculate total number of sequences of every arm
272 7) Calculate total number of reads of sequences of every arm.
273 8) Store all the informations in a txt file
274
275 """
276
277 def non_sam(mature_mirnas,path,name,con,l,data,names,n_deseq,n_samples_mirna_names,n_LHE_names):
278
279 ini_sam=read(path,0)
280 new_main_sam = [x.rstrip("\n").split("\t") for x in ini_sam if "@" not in x.split("\t")[0]]
281 unique_seq=[]
282 unique_seq = [x for x in new_main_sam if x[1] == '4' and len(x[9])>=18 and len(x[9])<=26]
283
284 uni_seq=[]
285 # Calculate the shifted positions for every isomir and add them to the name of it
286 sorted_uni_arms = []
287 for i in range(1,len(mature_mirnas),2):
288 tmp_count_reads = 0 # calculate the total number of reads
289 tmp_count_seq = 0 # calculate the total number of sequences
290
291 for j in range(len(unique_seq)):
292
293 temp_mature = mature_mirnas[i].strip().replace("U", "T")
294
295 if temp_mature in unique_seq[j][9]:
296
297 off_part = longestSubstring(temp_mature, unique_seq[j][9])
298
299 mat_diff = temp_mature.split(off_part)
300 mat_diff = [len(mat_diff[0]), len(mat_diff[1])]
301
302 unique_diff = unique_seq[j][9].split(off_part)
303 if len(unique_diff)<=2:
304 unique_diff = [len(unique_diff[0]), len(unique_diff[1])]
305
306 pre_pos = mat_diff[0]-unique_diff[0]
307 post_pos = unique_diff[1]-mat_diff[1]
308
309 lengthofmir = len(off_part) + post_pos
310 if pre_pos == 0:
311 tmp_count_reads = tmp_count_reads + int(unique_seq[j][0].split("-")[1])
312 tmp_count_seq = tmp_count_seq + 1
313
314 if pre_pos == 0:
315
316 t_name=unique_seq[j].copy()
317 t_name[2]=mature_mirnas[i - 1].split(" ")[0][1:] + "__" + str(pre_pos) + "_" + '{:+d}'.format(post_pos) + "_" + str(unique_seq[j][9][len(off_part):])
318 uni_seq.append(t_name)
319
320
321 if tmp_count_reads != 0 and tmp_count_seq != 0:
322 sorted_uni_arms.append([mature_mirnas[i-1].split(" ")[0][1:], tmp_count_seq, tmp_count_reads])
323
324
325 sorted_uni_arms = sorted(sorted_uni_arms, key=lambda x: x[1], reverse=True)
326 unique_seq = list(map(list, OrderedDict.fromkeys(map(tuple,uni_seq))))
327
328 LHE=[]
329
330 l.acquire()
331 if con=="c":
332 LHE.extend(x[2] for x in unique_seq if x[2]!="*")
333 for x in unique_seq:
334 if x[2]!="*":
335 n_samples_mirna_names.append([x[2],x[9]])
336 n_deseq.append([[x[2], x[0].split('-')[1], x[9]] for x in unique_seq if x[2]!="*"])
337 n_LHE_names.extend(LHE)
338 names.append(name)
339 data.append([con,name,unique_seq,sorted_uni_arms])
340
341
342 if con=="t":
343 LHE.extend(x[2] for x in unique_seq if x[2]!="*")
344 for x in unique_seq:
345 if x[2]!="*":
346 n_samples_mirna_names.append([x[2],x[9]])
347 n_deseq.append([[x[2], x[0].split('-')[1], x[9]] for x in unique_seq if x[2]!="*"])
348 n_LHE_names.extend(LHE)
349 names.append(name)
350 data.append([con,name,unique_seq,sorted_uni_arms])
351 l.release()
352
353 #####################################################################################################################################################################################################################
354 def deseq2_temp(samples_mirna_names,deseq,con,l):
355
356 samples_mirna_names.sort(key=lambda x:[0])
357 for i in range(len(deseq)):
358 for y in samples_mirna_names:
359 flag = 0
360 for x in deseq[i]:
361 if y[0] == x[0]:
362 flag = 1
363 break
364
365 if flag == 0:
366 deseq[i].append([y[0], "0", y[1]])
367
368 [deseq[i].sort(key=lambda x: x[0]) for i, _ in enumerate(deseq)]
369 deseq_final = [[x[0],x[2]] for x in deseq[0]]
370 [deseq_final[z].append(deseq[i][j][1]) for z,_ in enumerate(deseq_final) for i, _ in enumerate(deseq) for j,_ in enumerate(deseq[i]) if deseq_final[z][0] == deseq[i][j][0]]
371
372 l.acquire()
373 if con=="c":
374 q1.put(deseq_final)
375
376 if con=="t":
377 q2.put(deseq_final)
378 l.release()
379
380
381 ####################################################################################################################################################################################################################
382
383 def main_temp(LH2E, LH2E_names, LH8E, LH8E_names,flag,names_con,names_tre,filter_LH8E,filter_LH2E,raw_LH8E,raw_LH2E):
384
385 LH8E_add_names = [x for x in LH2E_names if x not in LH8E_names]
386 LH2E_add_names = [x for x in LH8E_names if x not in LH2E_names]
387
388 LH8E_add_names.sort()
389 LH2E_add_names.sort()
390 LH8E_add_names = list(LH8E_add_names for LH8E_add_names,_ in itertools.groupby(LH8E_add_names))
391 LH2E_add_names = list(LH2E_add_names for LH2E_add_names,_ in itertools.groupby(LH2E_add_names))
392
393 LH2E.sort()
394 LH8E.sort()
395 LH2E = list(LH2E for LH2E,_ in itertools.groupby(LH2E))
396 LH8E = list(LH8E for LH8E,_ in itertools.groupby(LH8E))
397
398 zeros=["0"]*(len(LH8E[0])-2)
399 [LH8E_add_names[i].extend(zeros) for i,_ in enumerate(LH8E_add_names)]
400 LH8E=LH8E+LH8E_add_names
401
402 zeros=["0"]*(len(LH2E[0])-2)
403 [LH2E_add_names[i].extend(zeros) for i,_ in enumerate(LH2E_add_names)]
404 LH2E=LH2E+LH2E_add_names
405
406 dupes=[]
407 final_LH2E =[]
408
409 for num,_ in enumerate(LH2E):
410
411 if LH2E[num][1] not in final_LH2E and LH2E[num][0] not in final_LH2E:
412 final_LH2E.append(LH2E[num][1])
413 final_LH2E.append(LH2E[num][0])
414 else:
415 dupes.append(LH2E[num][1])
416
417
418 dupes=list(set(dupes))
419
420 dupes=[[x] for x in dupes]
421
422 for x in LH2E:
423 for y in dupes:
424 if x[1]==y[0]:
425 fl=0
426 if len(y)==1:
427 y.append(x[0])
428 else:
429 for i in range(1,len(y)):
430 if y[i].split("_")[0]==x[0].split("_")[0]:
431 fl=1
432 if len(x[0])<len(y[i]):
433 del y[i]
434 y.append(x[0])
435 break
436
437 if fl==0:
438 y.append((x[0]))
439
440 for y in dupes:
441 if len(y)>2:
442 for i in range(len(y)-1,1,-1):
443 y[1]=y[1]+"/"+y[i]
444 del y[i]
445
446 for x in LH2E:
447 for y in dupes:
448 if x[1]==y[0]:
449 x[0]=y[1]
450
451 for x in LH8E:
452 for y in dupes:
453 if x[1]==y[0]:
454 x[0]=y[1]
455
456
457 LH2E.sort()
458 LH2E=list(LH2E for LH2E,_ in itertools.groupby(LH2E))
459
460 LH8E.sort()
461 LH8E=list(LH8E for LH8E,_ in itertools.groupby(LH8E))
462
463 LH8E_new=[]
464 LH2E_new=[]
465
466 if int(args.per)!=-1:
467 percent=int(args.per)/100
468
469 c_col_filter=round(percent*(len(LH2E[1])-2))
470 t_col_filter=round(percent*(len(LH8E[1])-2))
471
472 for i, _ in enumerate(LH2E):
473 c_cols=0
474 t_cols=0
475
476 c_cols=sum([1 for j in range(len(LH2E[i])-2) if int(LH2E[i][j+2])>=int(args.count)])
477 t_cols=sum([1 for j in range(len(LH8E[i])-2) if int(LH8E[i][j+2])>=int(args.count)])
478
479 if c_cols>=c_col_filter or t_cols>=t_col_filter:
480 LH8E_new.append(LH8E[i])
481 LH2E_new.append(LH2E[i])
482
483 filter_LH2E.extend(LH2E_new)
484 filter_LH8E.extend(LH8E_new)
485 raw_LH2E.extend(LH2E)
486 raw_LH8E.extend(LH8E)
487
488 ##################################################################################################################################################################################################################
489
490 def write_main(raw_LH2E, raw_LH8E, fil_LH2E, fil_LH8E, names_con, names_tre, flag):
491
492 if flag == 1 and int(args.per)!=-1:
493 fp = open('Counts/Filtered '+args.n2 +' Templated Counts', 'w')
494 fp.write("Name\t")
495 fp.write("Sequence")
496 for y in names_tre:
497 fp.write("\t"+y)
498
499 for x in fil_LH8E:
500 fp.write("\n%s" % "\t".join(x))
501 fp.close()
502
503 fp = open('Counts/Filtered '+args.n1+' Templated Counts', 'w')
504 fp.write("Name\t")
505 fp.write("Sequence")
506 for y in names_con:
507 fp.write("\t"+y)
508
509 for x in fil_LH2E:
510 fp.write("\n%s" % "\t".join(x))
511 fp.close()
512
513
514 if flag == 2 and int(args.per)!=-1:
515 fp = open('Counts/Filtered '+args.n2+' Non-Templated Counts', 'w')
516 fp.write("Name\t")
517 fp.write("Sequence")
518 for y in names_tre:
519 fp.write("\t"+y)
520
521
522 for x in fil_LH8E:
523 fp.write("\n%s" % "\t".join(x))
524 fp.close()
525
526 fp = open('Counts/Filtered '+args.n1+' Non-Templated Counts', 'w')
527 fp.write("Name\t")
528 fp.write("Sequence")
529 for y in names_con:
530 fp.write("\t"+y)
531
532 for x in fil_LH2E:
533 fp.write("\n%s" % "\t".join(x))
534 fp.close()
535
536
537 if flag == 1:
538 fp = open('Counts/Raw '+args.n2+' Templated Counts', 'w')
539 fp.write("Name\t")
540 fp.write("Sequence")
541 for y in names_tre:
542 fp.write("\t"+y)
543
544 for x in raw_LH8E:
545 fp.write("\n%s" % "\t".join(x))
546 fp.close()
547
548 fp = open('Counts/Raw '+args.n1+' Templated Counts', 'w')
549 fp.write("Name\t")
550 fp.write("Sequence")
551 for y in names_con:
552 fp.write("\t"+y)
553
554 for x in raw_LH2E:
555 fp.write("\n%s" % "\t".join(x))
556 fp.close()
557
558
559 if flag == 2:
560 fp = open('Counts/Raw '+args.n2+' Non-Templated Counts', 'w')
561 fp.write("Name\t")
562 fp.write("Sequence")
563 for y in names_tre:
564 fp.write("\t"+y)
565
566
567 for x in raw_LH8E:
568 fp.write("\n%s" % "\t".join(x))
569 fp.close()
570
571 fp = open('Counts/Raw '+args.n1+' Non-Templated Counts', 'w')
572 fp.write("Name\t")
573 fp.write("Sequence")
574 for y in names_con:
575 fp.write("\t"+y)
576
577 for x in raw_LH2E:
578 fp.write("\n%s" % "\t".join(x))
579 fp.close()
580
581
582 #########################################################################################################################################
583
584 def ssamples(names,samp,folder,pro):
585
586 for i in range(2,len(samp[0])):
587
588 fp = open(folder+names[i-2]+'.txt','w')
589 fp.write("miRNA id"+"\t"+names[i-2]+"\n")
590
591 for x in samp:
592 fp.write("%s" % "\t".join([x[0],x[i]])+"\n")
593 fp.close()
594
595 ##################################################################################################################
596
597 def DB_write(con,name,unique_seq,sorted_uni_arms,f):
598
599 if f==1:
600 # Write a txt file with all the information
601 if con=="c":
602 fp = open('split1/'+name, 'w')
603
604 fp.write("%s\t%-42s\t%s\n\n" % ("Number of Reads","Name of isomir","Sequence"))
605 if con=="t":
606 fp = open('split2/'+name, 'w')
607 fp.write("%s\t%-42s\t%s\n\n" % ("Number of Reads","Name of isomir","Sequence"))
608
609
610 for i in range(len(sorted_uni_arms)):
611 temp = []
612 for j in range(len(unique_seq)):
613
614 if sorted_uni_arms[i][0] in unique_seq[j][2].split("_")[0]:
615
616 temp.append(unique_seq[j])
617
618 temp = sorted(temp, key=lambda x: int(x[0].split('-')[1]), reverse=True)
619 fp.write("*********************************************************************************************************\n")
620 fp.write("%-8s\t%-22s\t%-25s\t%-30s\t%s\n" % ("|",str(sorted_uni_arms[i][0]),"Sequence count = "+str(sorted_uni_arms[i][1]),"Total reads = "+str(sorted_uni_arms[i][2]),"|"))
621 fp.write("*********************************************************************************************************\n\n")
622 [fp.write("%-8s\t%-40s\t%s\n" % (x[0].split("-")[1], x[2],x[9])) for x in temp]
623 fp.write("\n" + "\n")
624 fp.close()
625
626 if f==2:
627
628 if con=="c":
629 fp = open('split3/'+name, 'w')
630 fp.write("%s\t%-42s\t%s\n\n" % ("Number of Reads","Name of isomir","Sequence"))
631 if con=="t":
632 fp = open('split4/'+name, 'w')
633 fp.write("%s\t%-42s\t%s\n\n" % ("Number of Reads","Name of isomir","Sequence"))
634
635
636 for i in range(len(sorted_uni_arms)):
637 temp = []
638 for j in range(len(unique_seq)):
639 if sorted_uni_arms[i][0]==unique_seq[j][2].split("__")[0]:
640 temp.append(unique_seq[j])
641 if temp!=[]:
642 temp = sorted(temp, key=lambda x: int(x[0].split('-')[1]), reverse=True)
643 fp.write("*********************************************************************************************************\n")
644 fp.write("%-8s\t%-22s\t%-25s\t%-30s\t%s\n" % ("|",str(sorted_uni_arms[i][0]),"Sequence count = "+str(sorted_uni_arms[i][1]),"Total reads = "+str(sorted_uni_arms[i][2]),"|"))
645 fp.write("*********************************************************************************************************\n\n")
646 [fp.write("%-8s\t%-40s\t%s\n" % (x[0].split("-")[1], x[2],x[9])) for x in temp]
647 fp.write("\n" + "\n")
648 fp.close()
649
650
651 ##########################################################################################################################
652
653 def new_mat_seq(pre_unique_seq,mat_mirnas,l):
654
655 unique_iso = []
656 for x in pre_unique_seq:
657 if len(x[2].split("_"))==3:
658 for y in pre_unique_seq:
659 if x[2] in y[2] and int(x[0].split("-")[1])<int(y[0].split("-")[1]):
660 if any(y[2] in lst2 for lst2 in unique_iso)==False:
661 y[2]=">"+y[2]
662 unique_iso.append(y)
663 l.acquire()
664 for x in unique_iso:
665 mat_mirnas.append(x[2])
666 mat_mirnas.append(x[9])
667 l.release()
668
669 #########################################################################################################################
670 def pie_non_temp(merge_LH2E,merge_non_LH2E,merge_LH8E,merge_non_LH8E,c_unmap,t_unmap,c_unmap_counts,t_unmap_counts):
671
672 c_samples=[[x[0],x[1],sum(int(i) for i in x[2:])] for x in merge_LH2E]
673 t_samples=[[x[0],x[1],sum(int(i) for i in x[2:])] for x in merge_LH8E]
674 c_non_samples=[[x[0],x[1],sum(int(i) for i in x[2:])] for x in merge_non_LH2E]
675 t_non_samples=[[x[0],x[1],sum(int(i) for i in x[2:])] for x in merge_non_LH8E]
676
677 c_templ = 0
678 c_tem_counts = 0
679 c_mature = 0
680 c_mat_counts = 0
681 t_templ = 0
682 t_tem_counts = 0
683 t_mature = 0
684 t_mat_counts = 0
685
686 c_non = len(c_non_samples)
687 c_non_counts = sum(x[2] for x in c_non_samples)
688 t_non = len(t_non_samples)
689 t_non_counts = sum(x[2] for x in t_non_samples)
690
691 c_unmap = c_unmap - c_non
692 t_unmap = c_unmap - t_non
693
694 c_unmap_counts=c_unmap_counts - c_non_counts
695 t_unmap_counts=t_unmap_counts - t_non_counts
696
697
698 for x in c_samples:
699
700 if "/" not in x[0]:
701 if "chr" in x[0].split("_")[-1]:
702 c_mature+=1
703 c_mat_counts += x[2]
704 else:
705 c_templ+=1
706 c_tem_counts += x[2]
707 else:
708 f=0
709 for y in x[0].split("/"):
710 if "chr" in y.split("_")[-1]:
711 c_mature+=1
712 c_mat_counts += x[2]
713 f=1
714 break
715 if f==0:
716 c_templ+=1
717 c_tem_counts += x[2]
718
719 for x in t_samples:
720
721 if "/" not in x[0]:
722 if "chr" in x[0].split("_")[-1]:
723 t_mature+=1
724 t_mat_counts += x[2]
725 else:
726 t_templ+=1
727 t_tem_counts += x[2]
728 else:
729 f=0
730 for y in x[0].split("/"):
731 if "chr" in y.split("_")[-1]:
732 t_mature+=1
733 t_mat_counts += x[2]
734 f=1
735 break
736 if f==0:
737 t_templ+=1
738 t_tem_counts += x[2]
739
740 fig = plt.figure(figsize=(7,5))
741 labels = 'miRNA RefSeq','Template', 'Unmapped','Non-template'
742 sizes = [c_mat_counts, c_tem_counts, c_unmap_counts,c_non_counts]
743 colors = ['gold', 'yellowgreen', 'lightcoral', 'lightskyblue']
744 ax1 = plt.subplot2grid((1,2),(0,0))
745 patches, texts, autotexts=plt.pie(sizes, labels=labels, colors=colors, startangle=140,autopct='%1.1f%%',radius=0.8)
746 [x.set_fontsize(8) for x in texts]
747 plt.title('Control Group (reads)',fontsize=12)
748 labels = 'miRNA RefSeq','Template', 'Unmapped','non-template'
749 sizes = [t_mat_counts, t_tem_counts, t_unmap_counts, t_non_counts]
750 colors = ['gold', 'yellowgreen', 'lightcoral', 'lightskyblue']
751 ax2 = plt.subplot2grid((1,2),(0,1))
752 patches, texts, autotexts=plt.pie(sizes, labels=labels, colors=colors, startangle=140,autopct='%1.1f%%',radius=0.8)
753 [x.set_fontsize(8) for x in texts]
754 plt.title('Treated Group (reads)', fontsize=12)
755 plt.savefig('pie_non.png',dpi=300)
756
757 ######################################################################################################################################################
758
759 def merging_names(LH2E_copy,new):
760
761 dupes=[]
762 final_LH2E =[]
763
764 for num in range(len(LH2E_copy)):
765
766 if LH2E_copy[num][1] not in final_LH2E and LH2E_copy[num][0] not in final_LH2E:
767 final_LH2E.append(LH2E_copy[num][1])
768 final_LH2E.append(LH2E_copy[num][0])
769 else:
770 dupes.append(LH2E_copy[num][1])
771
772 dupes=list(set(dupes))
773
774 for i in range(len(dupes)):
775 dupes[i]=[dupes[i]]
776
777 for x in LH2E_copy:
778 for y in dupes:
779 if x[1]==y[0]:
780 fl=0
781 if len(y)==1:
782 y.append(x[0])
783 else:
784 for i in range(1,len(y)):
785 if y[i].split("_")[0]==x[0].split("_")[0]:
786 fl=1
787 if len(x[0])<len(y[i]):
788 del y[i]
789 y.append(x[0])
790 break
791
792 if fl==0:
793 y.append((x[0]))
794
795 for y in dupes:
796 if len(y)>2:
797 for i in range(len(y)-1,1,-1):
798 y[1]=y[1]+"/"+y[i]
799 del y[i]
800
801
802 for x in LH2E_copy:
803 for y in dupes:
804 if x[1]==y[0]:
805 x[0]=y[1]
806
807 LH2E_copy.sort()
808 LH2E_copy=list(LH2E_copy for LH2E_copy,_ in itertools.groupby(LH2E_copy))
809
810 new.extend(LH2E_copy)
811
812
813 ######################################################################################################################################################
814 def pie_temp(merge_LH2E,c_unmap,c_unmap_counts,merge_LH8E,t_unmap,t_unmap_counts):
815
816 c_samples=[[x[0],x[1],sum(int(i) for i in x[2:])] for x in merge_LH2E]
817 t_samples=[[x[0],x[1],sum(int(i) for i in x[2:])] for x in merge_LH8E]
818
819 c_templ = 0
820 c_tem_counts = 0
821 c_mature = 0
822 c_mat_counts = 0
823 t_templ = 0
824 t_tem_counts = 0
825 t_mature = 0
826 t_mat_counts = 0
827
828 for x in c_samples:
829
830 if "/" not in x[0]:
831 if "chr" in x[0].split("_")[-1]:
832 c_mature+=1
833 c_mat_counts += x[2]
834 else:
835 c_templ+=1
836 c_tem_counts += x[2]
837 else:
838 f=0
839 for y in x[0].split("/"):
840 if "chr" in y.split("_")[-1]:
841 c_mature+=1
842 c_mat_counts += x[2]
843 f=1
844 break
845 if f==0:
846 c_templ+=1
847 c_tem_counts += x[2]
848
849 for x in t_samples:
850
851 if "/" not in x[0]:
852 if "chr" in x[0].split("_")[-1]:
853 t_mature+=1
854 t_mat_counts += x[2]
855 else:
856 t_templ+=1
857 t_tem_counts += x[2]
858 else:
859 f=0
860 for y in x[0].split("/"):
861 if "chr" in y.split("_")[-1]:
862 t_mature+=1
863 t_mat_counts += x[2]
864 f=1
865 break
866 if f==0:
867 t_templ+=1
868 t_tem_counts += x[2]
869
870
871 fig = plt.figure()
872 labels = 'miRNA RefSeq','Template', 'Unmapped'
873 sizes = [c_mat_counts, c_tem_counts, c_unmap_counts]
874 colors = ['gold', 'yellowgreen', 'lightskyblue']
875 explode = (0.2, 0.05, 0.1)
876 ax1 = plt.subplot2grid((1,2),(0,0))
877 patches, texts, autotexts=plt.pie(sizes, labels=labels, colors=colors, startangle=140,autopct='%1.1f%%',radius=0.8)
878 [x.set_fontsize(8) for x in texts]
879 plt.title('Control group (reads)', fontsize=12)
880 labels = 'miRNA RefSeq','Template', 'Unmapped'
881 sizes = [t_mat_counts, t_tem_counts, t_unmap_counts]
882 colors = ['gold', 'yellowgreen', 'lightskyblue']
883 explode = (0.2, 0.05, 0.1)
884 ax2 = plt.subplot2grid((1,2),(0,1))
885 patches, texts, autotexts=plt.pie(sizes, labels=labels, colors=colors, startangle=140,autopct='%1.1f%%',radius=0.8)
886 [x.set_fontsize(8) for x in texts]
887 plt.title('Treated group (reads)',fontsize = 12)
888 plt.savefig('pie_tem.png',dpi=300)
889
890 ###################################################################################################################################################################################################################
891
892 def make_spider(merge_LH2E,merge_LH8E):
893
894 c_samples=[[x[0],x[1],sum(int(i) for i in x[2:])] for x in merge_LH2E]
895 t_samples=[[x[0],x[1],sum(int(i) for i in x[2:])] for x in merge_LH8E]
896
897 c_5 = 0
898 c_5_counts = 0
899 c_3 = 0
900 c_3_counts = 0
901 c_both =0
902 c_both_counts=0
903 c_mature = 0
904 c_mat_counts = 0
905 c_exception=0
906 c_exception_counts=0
907
908
909 t_5 = 0
910 t_5_counts = 0
911 t_3 = 0
912 t_3_counts = 0
913 t_both = 0
914 t_both_counts = 0
915 t_mature = 0
916 t_mat_counts = 0
917 t_exception = 0
918 t_exception_counts=0
919
920 for x in c_samples:
921
922 if "/" not in x[0]:
923 if "chr" in x[0].split("_")[-1]:
924 c_mature+=1
925 c_mat_counts += x[2]
926 elif 0 == int(x[0].split("_")[-1]):
927 c_5+=1
928 c_5_counts += x[2]
929 elif 0 == int(x[0].split("_")[-2]):
930 c_3+=1
931 c_3_counts += x[2]
932 else:
933 c_both+=1
934 c_both_counts+=x[2]
935
936 else:
937 f=0
938 for y in x[0].split("/"):
939 if "chr" in y.split("_")[-1]:
940 c_mature+=1
941 c_mat_counts += x[2]
942 f=1
943 break
944 if f==0:
945 for y in x[0].split("/"):
946 c_exception+=1
947 c_exception_counts += x[2]
948
949
950 for x in t_samples:
951
952 if "/" not in x[0]:
953 if "chr" in x[0].split("_")[-1]:
954 t_mature+=1
955 t_mat_counts += x[2]
956 elif 0 == int(x[0].split("_")[-1]):
957 t_5+=1
958 t_5_counts += x[2]
959 elif 0 == int(x[0].split("_")[-2]):
960 t_3+=1
961 t_3_counts += x[2]
962 else:
963 t_both+=1
964 t_both_counts+=x[2]
965
966 else:
967 f=0
968 for y in x[0].split("/"):
969 if "chr" in y.split("_")[-1]:
970 t_mature+=1
971 t_mat_counts += x[2]
972 f=1
973 break
974 if f==0:
975 for y in x[0].split("/"):
976 t_exception+=1
977 t_exception_counts += x[2]
978
979
980 c_all = c_5+c_3+c_both+c_mature+c_exception
981 c_all_counts = c_5_counts + c_3_counts + c_both_counts + c_mat_counts + c_exception_counts
982
983 t_all = t_5+t_3+t_both+t_mature + t_exception
984 t_all_counts = t_5_counts + t_3_counts + t_both_counts + t_mat_counts + t_exception_counts
985
986 c_5 = round(c_5/c_all*100,2)
987 c_3 = round(c_3/c_all*100,2)
988 c_both = round(c_both/c_all*100,2)
989 c_mature = round(c_mature/c_all*100,2)
990 c_exception = round(c_exception/c_all*100,2)
991
992 c_5_counts = round(c_5_counts/c_all_counts*100,2)
993 c_3_counts = round(c_3_counts/c_all_counts*100,2)
994 c_both_counts = round(c_both_counts/c_all_counts*100,2)
995 c_mat_counts = round(c_mat_counts/c_all_counts*100,2)
996 c_exception_counts = round(c_exception_counts/c_all_counts*100,2)
997
998 t_5 = round(t_5/t_all*100,2)
999 t_3 = round(t_3/t_all*100,2)
1000 t_both = round(t_both/t_all*100,2)
1001 t_mature = round(t_mature/t_all*100,2)
1002 t_exception = round(t_exception/t_all*100,2)
1003
1004 t_5_counts = round(t_5_counts/t_all_counts*100,2)
1005 t_3_counts = round(t_3_counts/t_all_counts*100,2)
1006 t_both_counts = round(t_both_counts/t_all_counts*100,2)
1007 t_mat_counts = round(t_mat_counts/t_all_counts*100,2)
1008 t_exception_counts = round(t_exception_counts/t_all_counts*100,2)
1009
1010 radar_max = max(c_5, c_3, c_both,c_mature,c_exception,t_5,t_3,t_both,t_mature,t_exception)
1011 radar_max_counts = max(c_5_counts,c_3_counts,c_both_counts,c_mat_counts,c_exception_counts,t_5_counts,t_3_counts,t_both_counts,t_mat_counts,t_exception_counts)
1012
1013 df=pd.DataFrame({
1014 'group':['Controls','Treated'],
1015 """5' and 3' isomiRs""":[c_both,t_both],
1016 """3' isomiRs""":[c_3,t_3],
1017 'miRNA RefSeq':[c_mature,t_mature],
1018 """5' isomiRs""":[c_5,t_5],
1019 'Others*':[c_exception,t_exception]})
1020
1021 df1=pd.DataFrame({
1022 'group':['Controls','Treated'],
1023 """5' and 3' isomiRs""":[c_both_counts,t_both_counts],
1024 """3' isomiRs""":[c_3_counts,t_3_counts],
1025 'miRNA RefSeq':[c_mat_counts,t_mat_counts],
1026 """5' isomiRs""":[c_5_counts,t_5_counts],
1027 'Others*':[c_exception_counts,t_exception_counts]})
1028
1029 spider_last(df,radar_max,1)
1030 spider_last(df1,radar_max_counts,2)
1031
1032
1033 def spider_last(df,radar_max,flag):
1034 # ------- PART 1: Create background
1035 fig = plt.figure()
1036 # number of variable
1037 categories=list(df)[1:]
1038 N = len(categories)
1039
1040 # What will be the angle of each axis in the plot? (we divide the plot / number of variable)
1041 angles = [n / float(N) * 2 * pi for n in range(N)]
1042 angles += angles[:1]
1043
1044 # Initialise the spider plot
1045 ax = plt.subplot(111, polar=True)
1046
1047 # If you want the first axis to be on top:
1048 ax.set_theta_offset(pi/2)
1049 ax.set_theta_direction(-1)
1050
1051 # Draw one axe per variable + add labels labels yet
1052 plt.xticks(angles[:-1], categories, fontsize=11)
1053
1054 # Draw ylabels
1055 radar_max=round(radar_max+radar_max*0.1)
1056 mul=len(str(radar_max))-1
1057 maxi=int(math.ceil(radar_max / pow(10,mul))) * pow(10,mul)
1058 sep = round(maxi/4)
1059 plt.yticks([sep, 2*sep, 3*sep, 4*sep, 5*sep], [str(sep)+'%', str(2*sep)+'%', str(3*sep)+'%', str(4*sep)+'%', str(5*sep)+'%'], color="grey", size=10)
1060 plt.ylim(0, maxi)
1061
1062 # ------- PART 2: Add plots
1063
1064 # Plot each individual = each line of the data
1065 # I don't do a loop, because plotting more than 3 groups makes the chart unreadable
1066
1067 # Ind1
1068 values=df.loc[0].drop('group').values.flatten().tolist()
1069 values += values[:1]
1070 ax.plot(angles, values,'-o', linewidth=1, linestyle='solid', label="Controls")
1071 ax.fill(angles, values, 'b', alpha=0.1)
1072
1073 # Ind2
1074 values=df.loc[1].drop('group').values.flatten().tolist()
1075 values += values[:1]
1076 ax.plot(angles, values, '-o' ,linewidth=1, linestyle='solid', label="Treated")
1077 ax.fill(angles, values, 'r', alpha=0.1)
1078
1079 # Add legend
1080 if flag==1:
1081 plt.legend(loc='upper right', bbox_to_anchor=(0.0, 0.1))
1082 plt.savefig('spider_non_red.png',dpi=300)
1083 else:
1084 plt.legend(loc='upper right', bbox_to_anchor=(0.0, 0.1))
1085 plt.savefig('spider_red.png',dpi=300)
1086
1087
1088 #############################################################################################################################################################################################################
1089 def hist_red(samples,flag):
1090 lengths=[]
1091 cat=[]
1092 total_reads=0
1093 seq=[]
1094
1095 if flag == "c":
1096 title = "Length Distribution of Control group (Redudant reads)"
1097 if flag == "t":
1098 title = "Length Distribution of Treated group (Redudant reads)"
1099
1100 for i in samples:
1101 for x in i:
1102 lengths.append(len(x[9]))
1103 if x[1]=="0":
1104 seq.append([x[9],x[0].split("-")[1],"Mapped"])
1105 cat.append("Mapped")
1106 if x[1] == "4":
1107 seq.append([x[9],x[0].split("-")[1],"Unmapped"])
1108 cat.append("Unmapped")
1109
1110 uni_len=list(set(lengths))
1111 uni_len=[x for x in uni_len if x<=35]
1112 low=min(lengths)
1113 up=max(lengths)
1114 seq.sort()
1115 uni_seq=list(seq for seq,_ in itertools.groupby(seq))
1116 dim=up-low
1117
1118 if dim>20:
1119 s=5
1120 else:
1121 s=8
1122
1123 total_reads+=sum([int(x[1]) for x in uni_seq])
1124
1125 map_reads=[]
1126 unmap_reads=[]
1127 length=[]
1128 for y in uni_len:
1129 map_temp=0
1130 unmap_temp=0
1131 for x in uni_seq:
1132 if len(x[0])==y and x[2]=="Mapped":
1133 map_temp+=int(x[1])
1134 if len(x[0])==y and x[2]=="Unmapped":
1135 unmap_temp+=int(x[1])
1136 if y<=35:
1137 length.append(y)
1138 map_reads.append(round(map_temp/total_reads*100,2))
1139 unmap_reads.append(round(unmap_temp/total_reads*100,2))
1140
1141 ylim=max([sum(x) for x in zip(unmap_reads, map_reads)])
1142 ylim=ylim+ylim*20/100
1143 fig, ax = plt.subplots()
1144 width=0.8
1145 ax.bar(length, unmap_reads, width, label='Unmapped')
1146 h=ax.bar(length, map_reads, width, bottom=unmap_reads, label='Mapped')
1147 plt.xticks(np.arange(length[0], length[-1]+1, 1))
1148 plt.xlabel('Length (nt)',fontsize=14)
1149 plt.ylabel('Percentage',fontsize=14)
1150 plt.title(title,fontsize=14)
1151 ax.legend()
1152 plt.ylim([0, ylim])
1153 ax.grid(axis='y',linewidth=0.2)
1154
1155 if flag=='c':
1156 plt.savefig('c_hist_red.png',dpi=300)
1157
1158 if flag=='t':
1159 plt.savefig('t_hist_red.png',dpi=300)
1160
1161 #################################################################################################################
1162
1163
1164 def logo_seq_red(merge, flag):
1165
1166 if flag=="c":
1167 titlos="Control group (Redundant)"
1168 file_logo="c_logo.png"
1169 file_bar="c_bar.png"
1170 if flag=="t":
1171 titlos="Treated group (Redundant)"
1172 file_logo="t_logo.png"
1173 file_bar="t_bar.png"
1174
1175 c_samples=[[x[0],x[1],sum(int(i) for i in x[2:])] for x in merge]
1176
1177 A=[0]*5
1178 C=[0]*5
1179 G=[0]*5
1180 T=[0]*5
1181 total_reads=0
1182
1183 for y in c_samples:
1184 if "/" in y[0]:
1185 length=[]
1186 for x in y[0].split("/"):
1187 length.append([len(x.split("_")[-1]),x.split("_")[-1],y[2]])
1188
1189 best=min(length)
1190 total_reads+=best[2]
1191 for i in range(5):
1192 if i<len(best[1]):
1193 if best[1][i] == "A":
1194 A[i]+=best[2]
1195 elif best[1][i] == "C":
1196 C[i]+=best[2]
1197 elif best[1][i] == "G":
1198 G[i]+=best[2]
1199 else:
1200 T[i]+=best[2]
1201 else:
1202 total_reads+=y[2]
1203 for i in range(5):
1204 if i<len(y[0].split("_")[-1]):
1205 if y[0].split("_")[-1][i] == "A":
1206 A[i]+=(y[2])
1207 elif y[0].split("_")[-1][i] == "C":
1208 C[i]+=(y[2])
1209 elif y[0].split("_")[-1][i] == "G":
1210 G[i]+=(y[2])
1211 else:
1212 T[i]+=y[2]
1213
1214 A[:] = [round(x*100,1) / total_reads for x in A]
1215 C[:] = [round(x*100,1) / total_reads for x in C]
1216 G[:] = [round(x*100,1) / total_reads for x in G]
1217 T[:] = [round(x*100,1) / total_reads for x in T]
1218
1219
1220
1221 data = {'A':A,'C':C,'G':G,'T':T}
1222 df = pd.DataFrame(data, index=[1,2,3,4,5])
1223 h=df.plot.bar(color=tuple(["g", "b","gold","r"]) )
1224 h.grid(axis='y',linewidth=0.2)
1225 plt.xticks(rotation=0, ha="right")
1226 plt.ylabel("Counts (%)",fontsize=18)
1227 plt.xlabel("Positions (nt)",fontsize=18)
1228 plt.title(titlos,fontsize=20)
1229 plt.tight_layout()
1230 plt.savefig(file_bar, dpi=300)
1231
1232 import logomaker as lm
1233 crp_logo = lm.Logo(df, font_name = 'DejaVu Sans')
1234 crp_logo.style_spines(visible=False)
1235 crp_logo.style_spines(spines=['left', 'bottom'], visible=True)
1236 crp_logo.style_xticks(rotation=0, fmt='%d', anchor=0)
1237
1238 # style using Axes methods
1239 crp_logo.ax.set_title(titlos,fontsize=18)
1240 crp_logo.ax.set_ylabel("Counts (%)", fontsize=16,labelpad=5)
1241 crp_logo.ax.set_xlabel("Positions (nt)",fontsize=16, labelpad=5)
1242 crp_logo.ax.xaxis.set_ticks_position('none')
1243 crp_logo.ax.xaxis.set_tick_params(pad=-1)
1244 figure = plt.gcf()
1245 figure.set_size_inches(6, 4)
1246 crp_logo.fig.savefig(file_logo,dpi=300)
1247
1248 ##########################################################################################################################################################################################################
1249
1250
1251
1252 def logo_seq_non_red(merge_LH2E):
1253
1254 c_samples=[[x[0],x[1],sum(int(i) for i in x[2:])] for x in merge_LH2E]
1255
1256 A=[0]*5
1257 C=[0]*5
1258 G=[0]*5
1259 T=[0]*5
1260
1261 for y in c_samples:
1262 if "/" in y[0]:
1263 length=[]
1264 for x in y[0].split("/"):
1265 length.append([len(x.split("_")[-1]),x.split("_")[-1],y[2]])
1266
1267 best=min(length)
1268 for i in range(5):
1269 if i<len(best[1]):
1270 if best[1][i] == "A":
1271 A[i]+=1
1272 elif best[1][i] == "C":
1273 C[i]+=1
1274 elif best[1][i] == "G":
1275 G[i]+=1
1276 else:
1277 T[i]+=1
1278 else:
1279 for i in range(5):
1280 if i<len(y[0].split("_")[-1]):
1281 if y[0].split("_")[-1][i] == "A":
1282 A[i]+=1
1283 elif y[0].split("_")[-1][i] == "C":
1284 C[i]+=1
1285 elif y[0].split("_")[-1][i] == "G":
1286 G[i]+=1
1287 else:
1288 T[i]+=1
1289
1290 data = {'A':A,'C':C,'G':G,'T':T}
1291 df = pd.DataFrame(data, index=[1,2,3,4,5])
1292 h=df.plot.bar(title="Non-templated nucleotides after templated sequence",color=tuple(["g", "b","gold","r"]))
1293 h.set_xlabel("Positions (nt)")
1294 h.set_ylabel("Unique sequences")
1295 plt.xticks(rotation=0, ha="right")
1296 plt.tight_layout()
1297 plt.savefig("bar2.png", dpi=300)
1298
1299
1300 import logomaker as lm
1301 crp_logo = lm.Logo(df, font_name = 'DejaVu Sans')
1302
1303 # style using Logo methods
1304 crp_logo.style_spines(visible=False)
1305 crp_logo.style_spines(spines=['left', 'bottom'], visible=True)
1306 crp_logo.style_xticks(rotation=0, fmt='%d', anchor=0)
1307
1308 # style using Axes methods
1309 crp_logo.ax.set_ylabel("Unique sequences", labelpad=5)
1310 crp_logo.ax.set_xlabel("Positions (nt)", labelpad=5)
1311 crp_logo.ax.xaxis.set_ticks_position('none')
1312 crp_logo.ax.xaxis.set_tick_params(pad=-1)
1313 crp_logo.ax.set_title("Non-redundant")
1314 figure = plt.gcf()
1315 crp_logo.fig.savefig('logo2.png', dpi=300)
1316
1317
1318 ###################################################################################################################################################################################################################
1319
1320 def ssamples1(tem_names,tem_samp,non_names,non_samp,folder,pro):
1321
1322 for i in range(2,len(tem_samp[0])):
1323
1324 fp = open(folder+tem_names[i-2]+'.txt','w')
1325 fp.write("miRNA id"+"\t"+tem_names[i-2]+"\n")
1326
1327 for x in tem_samp:
1328 fp.write("%s" % "\t".join([x[0],x[i]])+"\n")
1329
1330 for j in range(len(non_names)):
1331 if non_names[j]==tem_names[i-2]:
1332 for x in non_samp:
1333 fp.write("%s" % "\t".join([x[0],x[j+2]])+"\n")
1334 fp.close()
1335
1336 ###################################################################################################################################################################################################################
1337
1338 def download_matures(matures,org_name):
1339
1340 #url = 'ftp://mirbase.org/pub/mirbase/21/mature.fa.gz'
1341 url = 'ftp://mirbase.org/pub/mirbase/CURRENT/mature.fa.gz'
1342 data = urllib.request.urlopen(url).read()
1343 file_mirna = gzip.decompress(data).decode('utf-8')
1344 file_mirna = file_mirna.split("\n")
1345
1346 for i in range(0,len(file_mirna)-1,2):
1347
1348 if org_name in file_mirna[i]:
1349 matures.append(file_mirna[i])
1350 matures.append(file_mirna[i+1])
1351
1352 ###################################################################################################################################################################################################################
1353 def non_template_ref(sc,st,all_isoforms):
1354
1355 pre_uni_seq_con = list(sc)
1356 pre_uni_seq_tre = list(st)
1357
1358 for x in pre_uni_seq_con:
1359 for y in x:
1360 if ">"+y[2] not in all_isoforms and ")_" in y[2] :
1361 all_isoforms.append(">"+y[2])
1362 all_isoforms.append(y[9])
1363
1364
1365 for x in pre_uni_seq_tre:
1366 for y in x:
1367 if ">"+y[2] not in all_isoforms and ")_" in y[2]:
1368 all_isoforms.append(">"+y[2])
1369 all_isoforms.append(y[9])
1370
1371 ################################################################################################################################################################################################
1372
1373 def deseqe2(sample,mir_names,l,new_d,sample_name,sample_order):
1374
1375 for y in mir_names:
1376 flag=0
1377 for x in sample:
1378 if y[0]==x[0]:
1379 flag=1
1380 break
1381 if flag==0:
1382 sample.append([y[0],"0",y[1]])
1383
1384 sample.sort(key=lambda x: x[0])
1385 sample=list(sample for sample,_ in itertools.groupby(sample))
1386
1387 l.acquire()
1388 new_d.append(sample)
1389 sample_order.append(sample_name)
1390 l.release()
1391
1392 ###############################################################################################################################################################################################
1393
1394 if __name__ == '__main__':
1395
1396 starttime = time.time()
1397
1398 q1 = Queue()
1399 q2 = Queue()
1400 lock = Lock()
1401 manager = Manager()
1402
1403 mature_mirnas=manager.list()
1404 ps_mature=Process(target=download_matures,args=(mature_mirnas,args.org_name))
1405 ps_mature.start()
1406
1407 args.control[0]=args.control[0][1:]
1408 args.control[len(args.control)-1][:-1]
1409 control = [(args.control[i:i+2]) for i in range(0, len(args.control), 2)]
1410
1411 args.treated[0]=args.treated[0][1:]
1412 args.treated[len(args.treated)-1][:-1]
1413 treated = [(args.treated[i:i+2]) for i in range(0, len(args.treated), 2)]
1414
1415
1416 ############## Detection of templated isoforms ################
1417
1418 radar = manager.list([0,0,0,0])
1419 samples = manager.list()
1420 data= manager.list()
1421 names_con=manager.list()
1422 samples_mirna_names=manager.list()
1423 deseq=manager.list()
1424 unmap_seq=manager.Value('i',0)
1425 unmap_counts=manager.Value('i',0)
1426 LH2E_names=manager.list()
1427 ini_c_samples = manager.list()
1428
1429
1430 radar1 = manager.list([0,0,0,0])
1431 samples1 = manager.list()
1432 data1 = manager.list()
1433 names_tre = manager.list()
1434 samples_mirna_names1=manager.list()
1435 deseq1=manager.list()
1436 unmap1_seq = manager.Value('i',0)
1437 unmap1_counts = manager.Value('i',0)
1438 LH8E_names=manager.list()
1439 ini_t_samples = manager.list()
1440 ps_mature.join()
1441
1442
1443 mature_mirnas=list(mature_mirnas)
1444
1445
1446 starttime1 = time.time()
1447 ps_sam = [Process(target=sam,args=(mature_mirnas,path[1][:-1],path[0].split(",")[0],"c",lock,samples,data,names_con,unmap_seq,samples_mirna_names,deseq,LH2E_names,"0",ini_c_samples,unmap_counts)) for path in control]
1448 ps_sam.extend([Process(target=sam,args=(mature_mirnas,path[1][:-1],path[0].split(",")[0],"t",lock,samples1,data1,names_tre,unmap1_seq,samples_mirna_names1,deseq1,LH8E_names,"0",ini_t_samples,unmap1_counts)) for path in treated])
1449
1450 [p.start() for p in ps_sam]
1451 [p.join() for p in ps_sam]
1452 print('SAM took {} seconds'.format(time.time() - starttime1))
1453
1454 ps_hist=[Process(target=hist_red,args=(ini_c_samples,'c'))]
1455 ps_hist.extend([Process(target=hist_red,args=(ini_t_samples,'t'))])
1456 [x.start() for x in ps_hist]
1457
1458 starttime200=time.time()
1459
1460 sc = list(samples)
1461 st = list(samples1)
1462
1463 names_con=list(names_con)
1464 names_tre=list(names_tre)
1465 samples_mirna_names=list(samples_mirna_names)
1466 samples_mirna_names.sort()
1467 samples_mirna_names=list(samples_mirna_names for samples_mirna_names,_ in itertools.groupby(samples_mirna_names))
1468
1469 samples_mirna_names1=list(samples_mirna_names1)
1470 samples_mirna_names1.sort()
1471 samples_mirna_names1=list(samples_mirna_names1 for samples_mirna_names1,_ in itertools.groupby(samples_mirna_names1))
1472
1473 deseq=list(deseq)
1474 deseq1=list(deseq1)
1475
1476 new_names_con=manager.list()
1477 new_names_tre=manager.list()
1478 new_deseq=manager.list()
1479 new_deseq1=manager.list()
1480 ps_deseq=[Process(target=deseqe2,args=(sampp,samples_mirna_names,lock,new_deseq,names_con[i],new_names_con)) for i,sampp in enumerate(deseq)]
1481 ps_deseq.extend([Process(target=deseqe2,args=(sampp,samples_mirna_names1,lock,new_deseq1,names_tre[i],new_names_tre)) for i,sampp in enumerate(deseq1)])
1482
1483 [z.start() for z in ps_deseq]
1484 [z.join() for z in ps_deseq]
1485 new_deseq=list(new_deseq)
1486 new_deseq1=list(new_deseq1)
1487
1488 LH2E=[[x[0],x[2]] for x in new_deseq[0]]
1489 [LH2E[i].append(y[i][1]) for i,_ in enumerate(LH2E) for y in new_deseq]
1490
1491 LH8E=[[x[0],x[2]] for x in new_deseq1[0]]
1492 [LH8E[i].append(y[i][1]) for i,_ in enumerate(LH8E) for y in new_deseq1]
1493
1494 print('Deseq took {} seconds'.format(time.time() - starttime200))
1495
1496 merg_nam_LH2E=manager.list()
1497 merg_nam_LH8E=manager.list()
1498
1499 LH2E_copy=copy.deepcopy(list(LH2E))
1500 LH8E_copy=copy.deepcopy(list(LH8E))
1501
1502 fil_sort_tre=manager.list()
1503 fil_sort_con=manager.list()
1504 raw_sort_tre=manager.list()
1505 raw_sort_con=manager.list()
1506
1507 ps_main = Process(target=main_temp,args=(list(LH2E), samples_mirna_names, list(LH8E), samples_mirna_names1,1,list(names_con),list(names_tre),fil_sort_tre,fil_sort_con,raw_sort_tre,raw_sort_con))
1508 ps_main.start()
1509
1510 if args.anal=="2":
1511 all_iso = manager.list()
1512 ps_non_iso = Process(target=non_template_ref,args=(sc,st,all_iso))
1513 ps_non_iso.start()
1514
1515 ps_merge = [Process(target=merging_names,args=(LH2E_copy,merg_nam_LH2E))]
1516 ps_merge.extend([Process(target=merging_names,args=(LH8E_copy,merg_nam_LH8E))])
1517 [x.start() for x in ps_merge]
1518 [x.join() for x in ps_merge]
1519
1520 merg_nam_LH2E=list(merg_nam_LH2E)
1521 merg_nam_LH8E=list(merg_nam_LH8E)
1522
1523 starttime2 = time.time()
1524 procs = [Process(target=DB_write,args=(x[0],x[1],x[2],x[3],1)) for x in data]
1525 procs.extend([Process(target=DB_write,args=(x[0],x[1],x[2],x[3],1)) for x in data1])
1526 procs.extend([Process(target=make_spider,args=(merg_nam_LH2E,merg_nam_LH8E))])
1527 if args.anal == "1":
1528 procs.extend([Process(target=pie_temp,args=(merg_nam_LH2E,unmap_seq.value,unmap_counts.value,merg_nam_LH8E,unmap1_seq.value,unmap1_counts.value))])
1529
1530 [p.start() for p in procs]
1531
1532
1533 if args.anal=="1":
1534 [x.join() for x in ps_hist]
1535 [p.join() for p in procs]
1536 ps_pdf = Process(target=pdf_before_DE,args=(args.anal))
1537 ps_pdf.start()
1538
1539 print('Graphs took {} seconds'.format(time.time() - starttime2))
1540
1541 ps_main.join()
1542
1543 fil_sort_con=list(fil_sort_con)
1544 fil_sort_tre=list(fil_sort_tre)
1545 if fil_sort_con==[]:
1546 fil_sort_con=raw_sort_con
1547 fil_sort_tre=raw_sort_tre
1548
1549 raw_sort_con=list(raw_sort_con)
1550 raw_sort_tre=list(raw_sort_tre)
1551 names_con=list(new_names_con)
1552 names_tre=list(new_names_tre)
1553
1554 ps_write = Process(target=write_main,args=(raw_sort_con, raw_sort_tre, fil_sort_con, fil_sort_tre, names_con,names_tre,1))
1555 ps_write.start()
1556
1557 ps1_matrix = [Process(target=ssamples,args=(names_con,fil_sort_con,"Diff/temp_con/",0))]
1558 ps1_matrix.extend([Process(target=ssamples,args=(names_tre,fil_sort_tre,"Diff/temp_tre/",0))])
1559 [p.start() for p in ps1_matrix]
1560
1561 if args.anal=="1":
1562 ps_pdf.join()
1563 if args.anal=="2":
1564 [p.join() for p in procs]
1565 [x.join() for x in ps_hist]
1566
1567 ps_write.join()
1568 [p.join() for p in ps1_matrix]
1569
1570
1571 ############################## Detection of Both #######################################
1572
1573 starttime10 = time.time()
1574
1575 if args.anal == "2":
1576
1577 n_data= manager.list()
1578 n_names_con=manager.list()
1579 n_samples_mirna_names=manager.list()
1580 n_deseq=manager.list()
1581 n_LH2E_names=manager.list()
1582
1583 n_data1 = manager.list()
1584 n_names_tre = manager.list()
1585 n_samples_mirna_names1=manager.list()
1586 n_deseq1=manager.list()
1587 n_LH8E_names=manager.list()
1588
1589
1590 new_mat_mirnas = list(mature_mirnas)
1591 ps_non_iso.join()
1592
1593 all_iso=list(all_iso)
1594 new_mat_mirnas.extend(all_iso)
1595
1596 starttime11=time.time()
1597
1598 ps_sam = [Process(target=non_sam,args=(new_mat_mirnas,path[1][:-1],path[0].split(",")[0],"c",lock,n_data,n_names_con,n_deseq,n_samples_mirna_names,n_LH2E_names)) for path in control]
1599 ps_sam.extend([Process(target=non_sam,args=(new_mat_mirnas,path[1][:-1],path[0].split(",")[0],"t",lock,n_data1,n_names_tre,n_deseq1,n_samples_mirna_names1,n_LH8E_names)) for path in treated])
1600
1601 [p.start() for p in ps_sam]
1602 [p.join() for p in ps_sam]
1603
1604 print('Non-sam took {} seconds'.format(time.time() - starttime11))
1605
1606 starttime12=time.time()
1607
1608 n_names_con=list(n_names_con)
1609 n_names_tre=list(n_names_tre)
1610 n_samples_mirna_names=list(n_samples_mirna_names)
1611 n_samples_mirna_names.sort()
1612 n_samples_mirna_names=list(n_samples_mirna_names for n_samples_mirna_names,_ in itertools.groupby(n_samples_mirna_names))
1613
1614 n_samples_mirna_names1=list(n_samples_mirna_names1)
1615 n_samples_mirna_names1.sort()
1616 n_samples_mirna_names1=list(n_samples_mirna_names1 for n_samples_mirna_names1,_ in itertools.groupby(n_samples_mirna_names1))
1617
1618 n_deseq=list(n_deseq)
1619 n_deseq1=list(n_deseq1)
1620
1621 new_n_names_con=manager.list()
1622 new_n_names_tre=manager.list()
1623 n_new_deseq=manager.list()
1624 n_new_deseq1=manager.list()
1625 ps_deseq=[Process(target=deseqe2,args=(sampp,n_samples_mirna_names,lock,n_new_deseq,n_names_con[i],new_n_names_con)) for i,sampp in enumerate(n_deseq)]
1626 ps_deseq.extend([Process(target=deseqe2,args=(sampp,n_samples_mirna_names1,lock,n_new_deseq1,n_names_tre[i],new_n_names_tre)) for i,sampp in enumerate(n_deseq1)])
1627
1628 [x.start() for x in ps_deseq]
1629 [x.join() for x in ps_deseq]
1630 n_new_deseq=list(n_new_deseq)
1631 n_new_deseq1=list(n_new_deseq1)
1632
1633 n_LH2E=[[x[0],x[2]] for x in n_new_deseq[0]]
1634 [n_LH2E[i].append(y[i][1]) for i,_ in enumerate(n_LH2E) for y in n_new_deseq]
1635
1636 n_LH8E=[[x[0],x[2]] for x in n_new_deseq1[0]]
1637 [n_LH8E[i].append(y[i][1]) for i,_ in enumerate(n_LH8E) for y in n_new_deseq1]
1638
1639 print('Non-deseq took {} seconds'.format(time.time() - starttime12))
1640
1641 merg_nam_n_LH2E=manager.list()
1642 merg_nam_n_LH8E=manager.list()
1643
1644 n_LH2E_copy=copy.deepcopy(list(n_LH2E))
1645 n_LH8E_copy=copy.deepcopy(list(n_LH8E))
1646
1647 n_sort_tre=manager.list()
1648 n_sort_con=manager.list()
1649
1650 n_fil_sort_con=manager.list()
1651 n_fil_sort_tre=manager.list()
1652 n_raw_sort_con=manager.list()
1653 n_raw_sort_tre=manager.list()
1654
1655 ps_main = Process(target=main_temp,args=(list(n_LH2E), n_samples_mirna_names, list(n_LH8E), n_samples_mirna_names1,1,list(n_names_con),list(n_names_tre),n_fil_sort_tre,n_fil_sort_con,n_raw_sort_tre,n_raw_sort_con))
1656 ps_main.start()
1657
1658 ps_merge = [Process(target=merging_names,args=(n_LH2E_copy,merg_nam_n_LH2E))]
1659 ps_merge.extend([Process(target=merging_names,args=(n_LH8E_copy,merg_nam_n_LH8E))])
1660 [p.start() for p in ps_merge]
1661 [p.join() for p in ps_merge]
1662
1663 merg_nam_n_LH2E=list(merg_nam_n_LH2E)
1664 merg_nam_n_LH8E=list(merg_nam_n_LH8E)
1665
1666 procs = [Process(target=DB_write,args=(x[0],x[1],x[2],x[3],2)) for x in n_data]
1667 procs.extend([Process(target=DB_write,args=(x[0],x[1],x[2],x[3],2)) for x in n_data1])
1668 procs.extend([Process(target=logo_seq_red,args=(merg_nam_n_LH2E,'c'))])
1669 procs.extend([Process(target=logo_seq_red,args=(merg_nam_n_LH8E,'t'))])
1670 procs.extend([Process(target=pie_non_temp,args=(merg_nam_LH2E,merg_nam_n_LH2E,merg_nam_LH8E,merg_nam_n_LH8E,unmap_seq.value,unmap1_seq.value,unmap_counts.value,unmap1_counts.value))])
1671
1672 starttime13=time.time()
1673 [p.start() for p in procs]
1674 [p.join() for p in procs]
1675
1676 print('Graphs took {} seconds'.format(time.time() - starttime13))
1677
1678 procs1 = Process(target=pdf_before_DE,args=(args.anal))
1679 procs1.start()
1680
1681 starttime14=time.time()
1682 ps_main.join()
1683
1684 n_fil_sort_con=list(n_fil_sort_con)
1685 n_fil_sort_tre=list(n_fil_sort_tre)
1686 if n_fil_sort_con==[]:
1687 n_fil_sort_con=n_raw_sort_con
1688 n_fil_sort_tre=n_raw_sort_tre
1689
1690 n_raw_sort_con=list(n_raw_sort_con)
1691 n_raw_sort_tre=list(n_raw_sort_tre)
1692 n_names_con=list(new_n_names_con)
1693 n_names_tre=list(new_n_names_tre)
1694
1695 ps_write = Process(target=write_main,args=(n_raw_sort_con, n_raw_sort_tre,n_fil_sort_con, n_fil_sort_tre, n_names_con, n_names_tre,2))
1696 ps_write.start()
1697
1698 ps1_matrix = [Process(target=ssamples1,args=(n_names_con,n_fil_sort_con,names_con,fil_sort_con,"Diff/n_temp_con/",0))]
1699 ps1_matrix.extend([Process(target=ssamples1,args=(n_names_tre,n_fil_sort_tre,names_tre,fil_sort_tre,"Diff/n_temp_tre/",0))])
1700 [p.start() for p in ps1_matrix]
1701
1702 ps_write.join()
1703 [p.join() for p in ps1_matrix]
1704 procs1.join()
1705 print('That took {} seconds'.format(time.time() - starttime10))
1706 print('That took {} seconds'.format(time.time() - starttime))
1707
1708
1709
1710
1711
1712
1713
1714