comparison mirbase_functions.py @ 0:258aaaa465f3 draft

Uploaded
author glogobyte
date Fri, 16 Oct 2020 10:48:17 +0000
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:258aaaa465f3
1 import itertools
2 import time
3 import sys
4 import os
5 import urllib.request
6 import gzip
7 from multiprocessing import Process, Queue, Lock, Pool, Manager, Value
8 import subprocess
9 import argparse
10 from collections import OrderedDict
11 from matplotlib.backends.backend_pdf import PdfPages
12 import pandas as pd
13 from math import pi
14 import numpy as np
15 import matplotlib.pyplot as plt
16 from matplotlib.ticker import PercentFormatter
17 import seaborn as sns
18 import scipy.stats as stats
19 from plotnine import *
20 import math
21 import re
22 import matplotlib.ticker as mtick
23 import copy
24
25
26 """---------------------- Simple Functions -----------------------"""
27
28 # Read a file and return it as a list
29 def read(path, flag):
30 if flag == 0:
31 with open(path) as fp:
32 file=fp.readlines()
33 fp.close()
34 return file
35
36 if flag == 1:
37 with open(path) as fp:
38 file = fp.read().splitlines()
39 fp.close()
40 return file
41
42 # Write a list to a txt file
43 def write(path, list):
44 with open(path,'w') as fp:
45 for x in list:
46 fp.write(str("\t".join(x[1:-1])))
47 fp.close()
48
49 """---------------------- RNA-seq Functions ----------------------"""
50
51 # Detect the longest common substring sequence between two mirnas
52 def longestSubstring(str1, str2):
53
54 from difflib import SequenceMatcher
55 # initialize SequenceMatcher object with
56 # input string
57 seqMatch = SequenceMatcher(None, str1, str2)
58
59 # find match of longest sub-string
60 # output will be like Match(a=0, b=0, size=5)
61 match = seqMatch.find_longest_match(0, len(str1), 0, len(str2))
62
63 # print longest substring
64 if (match.size != 0):
65 return str1[match.a: match.a + match.size]
66 else:
67 print('No longest common sub-string found')
68
69
70
71 ########################################################################################################################################################
72
73 def collapse_sam(path):
74
75 ini_sam=read(path,0)
76 main_sam = [x.rstrip("\n").split("\t") for x in ini_sam if "@" not in x.split("\t")[0]]
77 intro_sam = [x.rstrip("\n").split("\t") for x in ini_sam if "@" in x.split("\t")[0]]
78
79 uni_seq = []
80 for x in main_sam:
81
82 if [x[2], x[9]] not in uni_seq:
83 uni_seq.append([x[2], x[9]])
84
85 new_main_sam=[]
86 incr_num=0
87 for i in range(len(uni_seq)):
88 count=0
89 incr_num+=1
90 for y in main_sam:
91 if uni_seq[i][1]==y[9] and uni_seq[i][0]==y[2]:
92 count+=1
93 temp=y
94 temp[10]="~~~~~~~~~~~~~~~~~~~~~~~~~~~~~"
95 temp[0]=str(incr_num)+"-"+str(count)
96 new_main_sam.append(temp)
97
98 new_sam=intro_sam+new_main_sam
99
100 return new_sam
101
102 #################################################################################################################################################################################################################
103
104 def duplicate_chroms_isoforms(List):
105
106 dupes=[]
107
108 for num in range(len(List)):
109
110 if [List[num][9],List[num][0],List[num][2]] not in dupes :
111 dupes.append([List[num][9],List[num][0],List[num][2]])
112
113 for x in List:
114 for y in dupes:
115 if x[9]==y[0] and x[0]==y[1] and x[2].split("_")[0]==y[2].split("_")[0] and x[2]!=y[2]:
116 y.append(x[2])
117
118
119 double_List = [x[:] for x in List]
120
121 chr_order=[]
122 for x in dupes:
123 temp = []
124 for i in range(2,len(x)):
125 if x[i].split("chr")[1].split("(")[0].isdigit():
126 temp.append(int(x[i].split("chr")[1].split("(")[1][0]+x[i].split("chr")[1].split("(")[0]))
127 else:
128 temp.append(x[i].split("chr")[1][0:4])
129
130 for z in temp:
131 if 'X(-)'==z or 'Y(-)'==z or 'X(+)'==z or 'Y(+)'==z:
132 temp = [str(j) for j in temp]
133 temp=list(set(temp))
134 temp.sort()
135 chr_order.append(temp)
136
137 final_dupes=[]
138 for i in range(len(dupes)):
139 final_dupes.append([dupes[i][0],dupes[i][2].split("_")[0],dupes[i][1]])
140 for x in chr_order[i]:
141 result = re.match("[-+]?\d+$", str(x))
142 if len(chr_order[i]) == len(set(chr_order[i])):
143 if result is not None:
144
145 if int(x)<0:
146 final_dupes[i][1]=final_dupes[i][1]+"_chr"+str(abs(int(x)))+"(-)"
147 else:
148 final_dupes[i][1] = final_dupes[i][1] + "_chr" + str(abs(int(x)))+"(+)"
149 else:
150 final_dupes[i][1] = final_dupes[i][1] + "_chr" + str(x)
151 else:
152 if result is not None:
153 if int(x) < 0:
154 final_dupes[i][1] = final_dupes[i][1] +dupes[i][2].split("_")[1]+ "_chr" + str(abs(int(x))) + "(-)"
155 else:
156 final_dupes[i][1] = final_dupes[i][1] +dupes[i][2].split("_")[1]+ "_chr" + str(abs(int(x))) + "(+)"
157 else:
158 final_dupes[i][1] = final_dupes[i][1] +dupes[i][2].split("_")[1]+ "_chr" + str(x)
159
160 final_dupes.sort()
161 final_dupes=list(final_dupes for final_dupes,_ in itertools.groupby(final_dupes))
162
163 for i in range(len(double_List)):
164 for x in final_dupes:
165
166 if double_List[i][9] == x[0] and double_List[i][0] == x[2] and len(double_List[i][2].split("_")) >3 and double_List[i][2].split("_")[0]==x[1].split("_")[0]:
167 gg=str("_"+double_List[i][2].split("_")[-2]+"_"+double_List[i][2].split("_")[-1])
168 double_List[i][2] = x[1]+gg
169
170 if double_List[i][9]==x[0] and double_List[i][0]== x[2] and len(double_List[i][2].split("_"))==3 and double_List[i][2].split("_")[0]==x[1].split("_")[0]:
171 double_List[i][2]=x[1]
172 List[i][2] = x[1]
173
174 List.sort()
175 new_list=list(List for List,_ in itertools.groupby(List))
176
177 double_List.sort()
178 new_double_List = list(double_List for double_List, _ in itertools.groupby(double_List))
179
180 return new_list, new_double_List
181
182
183 #############################################################################################################################################################################################################
184
185 def sam(mature_mirnas,path,name,con,l,samples,data,names,unmap_seq,samples_mirna_names,deseq,LHE_names,umi,ini_sample,unmap_counts):
186
187 # read the sam file
188 ini_sam=read(path,0)
189 new_main_sam = [x.rstrip("\n").split("\t") for x in ini_sam if "@" not in x.split("\t")[0]]
190 unique_seq = [x for x in new_main_sam if x[1] == '0' and len(x[9])>=18 and len(x[9])<=26]
191
192 sorted_uni_arms = []
193
194 for i in range(len(mature_mirnas)):
195 tmp_count_reads = 0 # calculate the total number of reads
196 tmp_count_seq = 0 # calculate the total number of sequences
197 for j in range(len(unique_seq)):
198
199 if "{" in unique_seq[j][2].split("_")[0]:
200 official=unique_seq[j][2].split("_")[0][:-4]
201 else:
202 official=unique_seq[j][2].split("_")[0]
203
204 if mature_mirnas[i].split(" ")[0][1:] == official:
205
206 temp_mature = mature_mirnas[i+1].strip().replace("U", "T")
207 off_part = longestSubstring(temp_mature, unique_seq[j][9])
208
209 mat_diff = temp_mature.split(off_part)
210 mat_diff = [len(mat_diff[0]), len(mat_diff[1])]
211
212 unique_diff = unique_seq[j][9].split(off_part)
213 unique_diff = [len(unique_diff[0]), len(unique_diff[1])]
214
215 # Problem with hsa-miR-8485
216 if mat_diff[1]!=0 and unique_diff[1]!=0:
217 unique_seq[j]=1
218 pre_pos = 0
219 post_pos = 0
220
221 elif mat_diff[0]!=0 and unique_diff[0]!=0:
222 unique_seq[j]=1
223 pre_pos = 0
224 post_pos = 0
225
226 else:
227 pre_pos = mat_diff[0]-unique_diff[0]
228 post_pos = unique_diff[1]-mat_diff[1]
229 tmp_count_reads = tmp_count_reads + int(unique_seq[j][0].split("-")[1])
230 tmp_count_seq = tmp_count_seq+1
231
232 if pre_pos != 0 or post_pos != 0:
233 if pre_pos == 0:
234 unique_seq[j][2] = unique_seq[j][2] + "_" +str(pre_pos) + "_" + '{:+d}'.format(post_pos)
235 elif post_pos == 0:
236 unique_seq[j][2] = unique_seq[j][2] + "_" + '{:+d}'.format(pre_pos) + "_" + str(post_pos)
237 else:
238 unique_seq[j][2] = unique_seq[j][2]+"_"+'{:+d}'.format(pre_pos)+"_"+'{:+d}'.format(post_pos)
239
240 for x in range(unique_seq.count(1)):
241 unique_seq.remove(1)
242 if tmp_count_reads != 0 and tmp_count_seq != 0:
243 sorted_uni_arms.append([mature_mirnas[i].split(" ")[0][1:], tmp_count_seq, tmp_count_reads])
244 sorted_uni_arms = sorted(sorted_uni_arms, key=lambda x: x[1], reverse=True)
245 dedup_unique_seq,double_fil_uni_seq=duplicate_chroms_isoforms(unique_seq)
246
247 for y in sorted_uni_arms:
248 counts=0
249 seqs=0
250 for x in double_fil_uni_seq:
251 if y[0]==x[2].split("_")[0]:
252 counts+=int(x[0].split("-")[1])
253 seqs+=1
254
255 y[1]=seqs
256 y[2]=counts
257
258 LHE=[]
259 l.acquire()
260 if con=="c":
261 LHE.extend(z[2] for z in double_fil_uni_seq)
262 for y in double_fil_uni_seq:
263 samples_mirna_names.append([y[2],y[9]])
264 deseq.append([[x[2], x[0].split('-')[1], x[9]] for x in double_fil_uni_seq])
265 LHE_names.extend(LHE)
266 unmap_seq.value += sum([1 for x in new_main_sam if x[1] == '4'])
267 unmap_counts.value += sum([int(x[0].split("-")[1]) for x in new_main_sam if x[1] == '4'])
268 names.append(name)
269 samples.append(dedup_unique_seq)
270 data.append([con,name,double_fil_uni_seq,sorted_uni_arms])
271 ini_sample.append(new_main_sam)
272
273 if con=="t":
274 LHE.extend(z[2] for z in double_fil_uni_seq)
275 for y in double_fil_uni_seq:
276 samples_mirna_names.append([y[2],y[9]])
277 deseq.append([[x[2], x[0].split('-')[1], x[9]] for x in double_fil_uni_seq])
278 LHE_names.extend(LHE)
279 unmap_seq.value += sum([1 for x in new_main_sam if x[1] == '4'])
280 unmap_counts.value += sum([int(x[0].split("-")[1]) for x in new_main_sam if x[1] == '4'])
281 names.append(name)
282 samples.append(dedup_unique_seq)
283 data.append([con,name,double_fil_uni_seq,sorted_uni_arms])
284 ini_sample.append(new_main_sam)
285 l.release()
286
287
288 ######################################################################################################################################
289
290 """
291
292 Read a sam file from Bowtie and do the followings:
293
294 1) Remove reverse stranded mapped reads
295 2) Remove unmapped reads
296 3) Remove all sequences with reads less than 11 reads
297 4) Sort the arms with the most sequences in decreading rate
298 5) Sort the sequences of every arm with the most reads in decreasing rate
299 6) Calculate total number of sequences of every arm
300 7) Calculate total number of reads of sequences of every arm.
301 8) Store all the informations in a txt file
302
303 """
304
305 def non_sam(mature_mirnas,path,name,con,l,data,names,n_deseq,n_samples_mirna_names,n_LHE_names):
306
307 ini_sam=read(path,0)
308 new_main_sam = [x.rstrip("\n").split("\t") for x in ini_sam if "@" not in x.split("\t")[0]]
309 unique_seq=[]
310 unique_seq = [x for x in new_main_sam if x[1] == '4' and len(x[9])>=18 and len(x[9])<=26]
311
312 uni_seq=[]
313 # Calculate the shifted positions for every isomir and add them to the name of it
314 sorted_uni_arms = []
315 for i in range(1,len(mature_mirnas),2):
316 tmp_count_reads = 0 # calculate the total number of reads
317 tmp_count_seq = 0 # calculate the total number of sequences
318
319 for j in range(len(unique_seq)):
320
321 temp_mature = mature_mirnas[i].strip().replace("U", "T")
322
323 if temp_mature in unique_seq[j][9]:
324
325 off_part = longestSubstring(temp_mature, unique_seq[j][9])
326
327 mat_diff = temp_mature.split(off_part)
328 mat_diff = [len(mat_diff[0]), len(mat_diff[1])]
329
330 unique_diff = unique_seq[j][9].split(off_part)
331 if len(unique_diff)<=2:
332 unique_diff = [len(unique_diff[0]), len(unique_diff[1])]
333
334 pre_pos = mat_diff[0]-unique_diff[0]
335 post_pos = unique_diff[1]-mat_diff[1]
336
337 lengthofmir = len(off_part) + post_pos
338 if pre_pos == 0:
339 tmp_count_reads = tmp_count_reads + int(unique_seq[j][0].split("-")[1])
340 tmp_count_seq = tmp_count_seq + 1
341
342 if pre_pos == 0:
343
344 t_name=unique_seq[j].copy()
345 t_name[2]=mature_mirnas[i - 1].split(" ")[0][1:] + "__" + str(pre_pos) + "_" + '{:+d}'.format(post_pos) + "_" + str(unique_seq[j][9][len(off_part):])
346 uni_seq.append(t_name)
347
348
349 if tmp_count_reads != 0 and tmp_count_seq != 0:
350 sorted_uni_arms.append([mature_mirnas[i-1].split(" ")[0][1:], tmp_count_seq, tmp_count_reads])
351
352
353 sorted_uni_arms = sorted(sorted_uni_arms, key=lambda x: x[1], reverse=True)
354 unique_seq = list(map(list, OrderedDict.fromkeys(map(tuple,uni_seq))))
355
356 LHE=[]
357
358 l.acquire()
359 if con=="c":
360 LHE.extend(x[2] for x in unique_seq if x[2]!="*")
361 for x in unique_seq:
362 if x[2]!="*":
363 n_samples_mirna_names.append([x[2],x[9]])
364 n_deseq.append([[x[2], x[0].split('-')[1], x[9]] for x in unique_seq if x[2]!="*"])
365 n_LHE_names.extend(LHE)
366 names.append(name)
367 data.append([con,name,unique_seq,sorted_uni_arms])
368
369
370 if con=="t":
371 LHE.extend(x[2] for x in unique_seq if x[2]!="*")
372 for x in unique_seq:
373 if x[2]!="*":
374 n_samples_mirna_names.append([x[2],x[9]])
375 n_deseq.append([[x[2], x[0].split('-')[1], x[9]] for x in unique_seq if x[2]!="*"])
376 n_LHE_names.extend(LHE)
377 names.append(name)
378 data.append([con,name,unique_seq,sorted_uni_arms])
379 l.release()
380
381 #####################################################################################################################################################################################################################
382 def deseq2_temp(samples_mirna_names,deseq,con,l):
383
384 samples_mirna_names.sort(key=lambda x:[0])
385 for i in range(len(deseq)):
386 for y in samples_mirna_names:
387 flag = 0
388 for x in deseq[i]:
389 if y[0] == x[0]:
390 flag = 1
391 break
392
393 if flag == 0:
394 deseq[i].append([y[0], "0", y[1]])
395
396 [deseq[i].sort(key=lambda x: x[0]) for i, _ in enumerate(deseq)]
397 deseq_final = [[x[0],x[2]] for x in deseq[0]]
398 [deseq_final[z].append(deseq[i][j][1]) for z,_ in enumerate(deseq_final) for i, _ in enumerate(deseq) for j,_ in enumerate(deseq[i]) if deseq_final[z][0] == deseq[i][j][0]]
399
400 l.acquire()
401 if con=="c":
402 q1.put(deseq_final)
403
404 if con=="t":
405 q2.put(deseq_final)
406 l.release()
407
408
409 ####################################################################################################################################################################################################################
410
411 def main_temp(LH2E, LH2E_names, LH8E, LH8E_names,flag,names_con,names_tre,filter_LH8E,filter_LH2E,raw_LH8E,raw_LH2E,per,count):
412
413 LH8E_add_names = [x for x in LH2E_names if x not in LH8E_names]
414 LH2E_add_names = [x for x in LH8E_names if x not in LH2E_names]
415
416 LH8E_add_names.sort()
417 LH2E_add_names.sort()
418 LH8E_add_names = list(LH8E_add_names for LH8E_add_names,_ in itertools.groupby(LH8E_add_names))
419 LH2E_add_names = list(LH2E_add_names for LH2E_add_names,_ in itertools.groupby(LH2E_add_names))
420
421 LH2E.sort()
422 LH8E.sort()
423 LH2E = list(LH2E for LH2E,_ in itertools.groupby(LH2E))
424 LH8E = list(LH8E for LH8E,_ in itertools.groupby(LH8E))
425
426 print("LHE_names")
427 print([len(LH8E_add_names),len(LH2E_add_names)])
428 print([len(LH8E),len(LH2E)])
429
430 zeros=["0"]*(len(LH8E[0])-2)
431 [LH8E_add_names[i].extend(zeros) for i,_ in enumerate(LH8E_add_names)]
432 LH8E=LH8E+LH8E_add_names
433
434 zeros=["0"]*(len(LH2E[0])-2)
435 [LH2E_add_names[i].extend(zeros) for i,_ in enumerate(LH2E_add_names)]
436 LH2E=LH2E+LH2E_add_names
437
438 dupes=[]
439 final_LH2E =[]
440
441 for num,_ in enumerate(LH2E):
442
443 if LH2E[num][1] not in final_LH2E and LH2E[num][0] not in final_LH2E:
444 final_LH2E.append(LH2E[num][1])
445 final_LH2E.append(LH2E[num][0])
446 else:
447 dupes.append(LH2E[num][1])
448
449
450 dupes=list(set(dupes))
451
452 dupes=[[x] for x in dupes]
453
454 for x in LH2E:
455 for y in dupes:
456 if x[1]==y[0]:
457 fl=0
458 if len(y)==1:
459 y.append(x[0])
460 else:
461 for i in range(1,len(y)):
462 if y[i].split("_")[0]==x[0].split("_")[0]:
463 fl=1
464 if len(x[0])<len(y[i]):
465 del y[i]
466 y.append(x[0])
467 break
468
469 if fl==0:
470 y.append((x[0]))
471
472 for y in dupes:
473 if len(y)>2:
474 for i in range(len(y)-1,1,-1):
475 y[1]=y[1]+"/"+y[i]
476 del y[i]
477
478 for x in LH2E:
479 for y in dupes:
480 if x[1]==y[0]:
481 x[0]=y[1]
482
483 for x in LH8E:
484 for y in dupes:
485 if x[1]==y[0]:
486 x[0]=y[1]
487
488
489 LH2E.sort()
490 LH2E=list(LH2E for LH2E,_ in itertools.groupby(LH2E))
491
492 LH8E.sort()
493 LH8E=list(LH8E for LH8E,_ in itertools.groupby(LH8E))
494
495 if int(per)!=-1:
496 percent=int(per)/100
497 print(percent)
498 print(count)
499
500 c_col_filter=round(percent*(len(LH2E[1])-2))
501 t_col_filter=round(percent*(len(LH8E[1])-2))
502
503 for i, _ in enumerate(LH2E):
504 c_cols=0
505 t_cols=0
506
507 c_cols=sum([1 for j in range(len(LH2E[i])-2) if int(LH2E[i][j+2])>=int(count)])
508 t_cols=sum([1 for j in range(len(LH8E[i])-2) if int(LH8E[i][j+2])>=int(count)])
509
510 if c_cols>=c_col_filter or t_cols>=t_col_filter:
511 filter_LH8E.append(LH8E[i])
512 filter_LH2E.append(LH2E[i])
513
514 raw_LH2E.extend(LH2E)
515 raw_LH8E.extend(LH8E)
516
517 ##################################################################################################################################################################################################################
518
519 def write_main(raw_LH2E, raw_LH8E, fil_LH2E, fil_LH8E, names_con, names_tre, flag, per, n1, n2):
520
521 if flag == 1 and int(per)!=-1:
522 fp = open('Counts/Filtered '+n2 +' Templated Counts', 'w')
523 fp.write("Name\t")
524 fp.write("Sequence")
525 for y in names_tre:
526 fp.write("\t"+y)
527
528 for x in fil_LH8E:
529 fp.write("\n%s" % "\t".join(x))
530 fp.close()
531
532 fp = open('Counts/Filtered '+n1+' Templated Counts', 'w')
533 fp.write("Name\t")
534 fp.write("Sequence")
535 for y in names_con:
536 fp.write("\t"+y)
537
538 for x in fil_LH2E:
539 fp.write("\n%s" % "\t".join(x))
540 fp.close()
541
542
543 if flag == 2 and int(per)!=-1:
544 fp = open('Counts/Filtered '+n2+' Non-Templated Counts', 'w')
545 fp.write("Name\t")
546 fp.write("Sequence")
547 for y in names_tre:
548 fp.write("\t"+y)
549
550
551 for x in fil_LH8E:
552 fp.write("\n%s" % "\t".join(x))
553 fp.close()
554
555 fp = open('Counts/Filtered '+n1+' Non-Templated Counts', 'w')
556 fp.write("Name\t")
557 fp.write("Sequence")
558 for y in names_con:
559 fp.write("\t"+y)
560
561 for x in fil_LH2E:
562 fp.write("\n%s" % "\t".join(x))
563 fp.close()
564
565
566 if flag == 1:
567 fp = open('Counts/Raw '+n2+' Templated Counts', 'w')
568 fp.write("Name\t")
569 fp.write("Sequence")
570 for y in names_tre:
571 fp.write("\t"+y)
572
573 for x in raw_LH8E:
574 fp.write("\n%s" % "\t".join(x))
575 fp.close()
576
577 fp = open('Counts/Raw '+n1+' Templated Counts', 'w')
578 fp.write("Name\t")
579 fp.write("Sequence")
580 for y in names_con:
581 fp.write("\t"+y)
582
583 for x in raw_LH2E:
584 fp.write("\n%s" % "\t".join(x))
585 fp.close()
586
587
588 if flag == 2:
589 fp = open('Counts/Raw '+n2+' Non-Templated Counts', 'w')
590 fp.write("Name\t")
591 fp.write("Sequence")
592 for y in names_tre:
593 fp.write("\t"+y)
594
595
596 for x in raw_LH8E:
597 fp.write("\n%s" % "\t".join(x))
598 fp.close()
599
600 fp = open('Counts/Raw '+n1+' Non-Templated Counts', 'w')
601 fp.write("Name\t")
602 fp.write("Sequence")
603 for y in names_con:
604 fp.write("\t"+y)
605
606 for x in raw_LH2E:
607 fp.write("\n%s" % "\t".join(x))
608 fp.close()
609
610
611 #########################################################################################################################################
612
613 def ssamples(names,samp,folder,pro):
614
615 for i in range(2,len(samp[0])):
616
617 fp = open(folder+names[i-2]+'.txt','w')
618 fp.write("miRNA id"+"\t"+names[i-2]+"\n")
619
620 for x in samp:
621 fp.write("%s" % "\t".join([x[0],x[i]])+"\n")
622 fp.close()
623
624 ##################################################################################################################
625
626 def DB_write(con,name,unique_seq,sorted_uni_arms,f):
627
628 if f==1:
629 # Write a txt file with all the information
630 if con=="c":
631 fp = open('split1/'+name, 'w')
632
633 fp.write("%s\t%-42s\t%s\n\n" % ("Number of Reads","Name of isomir","Sequence"))
634 if con=="t":
635 fp = open('split2/'+name, 'w')
636 fp.write("%s\t%-42s\t%s\n\n" % ("Number of Reads","Name of isomir","Sequence"))
637
638
639 for i in range(len(sorted_uni_arms)):
640 temp = []
641 for j in range(len(unique_seq)):
642
643 if sorted_uni_arms[i][0] in unique_seq[j][2].split("_")[0]:
644
645 temp.append(unique_seq[j])
646
647 temp = sorted(temp, key=lambda x: int(x[0].split('-')[1]), reverse=True)
648 fp.write("*********************************************************************************************************\n")
649 fp.write("%-8s\t%-22s\t%-25s\t%-30s\t%s\n" % ("|",str(sorted_uni_arms[i][0]),"Sequence count = "+str(sorted_uni_arms[i][1]),"Total reads = "+str(sorted_uni_arms[i][2]),"|"))
650 fp.write("*********************************************************************************************************\n\n")
651 [fp.write("%-8s\t%-40s\t%s\n" % (x[0].split("-")[1], x[2],x[9])) for x in temp]
652 fp.write("\n" + "\n")
653 fp.close()
654
655 if f==2:
656
657 if con=="c":
658 fp = open('split3/'+name, 'w')
659 fp.write("%s\t%-42s\t%s\n\n" % ("Number of Reads","Name of isomir","Sequence"))
660 if con=="t":
661 fp = open('split4/'+name, 'w')
662 fp.write("%s\t%-42s\t%s\n\n" % ("Number of Reads","Name of isomir","Sequence"))
663
664
665 for i in range(len(sorted_uni_arms)):
666 temp = []
667 for j in range(len(unique_seq)):
668 if sorted_uni_arms[i][0]==unique_seq[j][2].split("__")[0]:
669 temp.append(unique_seq[j])
670 if temp!=[]:
671 temp = sorted(temp, key=lambda x: int(x[0].split('-')[1]), reverse=True)
672 fp.write("*********************************************************************************************************\n")
673 fp.write("%-8s\t%-22s\t%-25s\t%-30s\t%s\n" % ("|",str(sorted_uni_arms[i][0]),"Sequence count = "+str(sorted_uni_arms[i][1]),"Total reads = "+str(sorted_uni_arms[i][2]),"|"))
674 fp.write("*********************************************************************************************************\n\n")
675 [fp.write("%-8s\t%-40s\t%s\n" % (x[0].split("-")[1], x[2],x[9])) for x in temp]
676 fp.write("\n" + "\n")
677 fp.close()
678
679
680 ##########################################################################################################################
681
682 def new_mat_seq(pre_unique_seq,mat_mirnas,l):
683
684 unique_iso = []
685 for x in pre_unique_seq:
686 if len(x[2].split("_"))==3:
687 for y in pre_unique_seq:
688 if x[2] in y[2] and int(x[0].split("-")[1])<int(y[0].split("-")[1]):
689 if any(y[2] in lst2 for lst2 in unique_iso)==False:
690 y[2]=">"+y[2]
691 unique_iso.append(y)
692 l.acquire()
693 for x in unique_iso:
694 mat_mirnas.append(x[2])
695 mat_mirnas.append(x[9])
696 l.release()
697
698 #########################################################################################################################
699
700 def merging_names(LH2E_copy,new):
701
702 dupes=[]
703 final_LH2E =[]
704
705 for num in range(len(LH2E_copy)):
706
707 if LH2E_copy[num][1] not in final_LH2E and LH2E_copy[num][0] not in final_LH2E:
708 final_LH2E.append(LH2E_copy[num][1])
709 final_LH2E.append(LH2E_copy[num][0])
710 else:
711 dupes.append(LH2E_copy[num][1])
712
713 dupes=list(set(dupes))
714
715 for i in range(len(dupes)):
716 dupes[i]=[dupes[i]]
717
718 for x in LH2E_copy:
719 for y in dupes:
720 if x[1]==y[0]:
721 fl=0
722 if len(y)==1:
723 y.append(x[0])
724 else:
725 for i in range(1,len(y)):
726 if y[i].split("_")[0]==x[0].split("_")[0]:
727 fl=1
728 if len(x[0])<len(y[i]):
729 del y[i]
730 y.append(x[0])
731 break
732
733 if fl==0:
734 y.append((x[0]))
735
736 for y in dupes:
737 if len(y)>2:
738 for i in range(len(y)-1,1,-1):
739 y[1]=y[1]+"/"+y[i]
740 del y[i]
741
742
743 for x in LH2E_copy:
744 for y in dupes:
745 if x[1]==y[0]:
746 x[0]=y[1]
747
748 LH2E_copy.sort()
749 LH2E_copy=list(LH2E_copy for LH2E_copy,_ in itertools.groupby(LH2E_copy))
750
751 new.extend(LH2E_copy)
752
753
754 ######################################################################################################################################################
755
756 def ssamples1(tem_names,tem_samp,non_names,non_samp,folder,pro):
757
758 for i in range(2,len(tem_samp[0])):
759
760 fp = open(folder+tem_names[i-2]+'.txt','w')
761 fp.write("miRNA id"+"\t"+tem_names[i-2]+"\n")
762
763 for x in tem_samp:
764 fp.write("%s" % "\t".join([x[0],x[i]])+"\n")
765
766 for j in range(len(non_names)):
767 if non_names[j]==tem_names[i-2]:
768 for x in non_samp:
769 fp.write("%s" % "\t".join([x[0],x[j+2]])+"\n")
770 fp.close()
771
772 ###################################################################################################################################################################################################################
773
774 def download_matures(matures,org_name):
775
776 #url = 'ftp://mirbase.org/pub/mirbase/21/mature.fa.gz'
777 url = 'ftp://mirbase.org/pub/mirbase/CURRENT/mature.fa.gz'
778 data = urllib.request.urlopen(url).read()
779 file_mirna = gzip.decompress(data).decode('utf-8')
780 file_mirna = file_mirna.split("\n")
781
782 for i in range(0,len(file_mirna)-1,2):
783
784 if org_name in file_mirna[i]:
785 matures.append(file_mirna[i])
786 matures.append(file_mirna[i+1])
787
788 ###################################################################################################################################################################################################################
789 def non_template_ref(sc,st,all_isoforms):
790
791 pre_uni_seq_con = list(sc)
792 pre_uni_seq_tre = list(st)
793
794 for x in pre_uni_seq_con:
795 for y in x:
796 if ">"+y[2] not in all_isoforms and ")_" in y[2] :
797 all_isoforms.append(">"+y[2])
798 all_isoforms.append(y[9])
799
800
801 for x in pre_uni_seq_tre:
802 for y in x:
803 if ">"+y[2] not in all_isoforms and ")_" in y[2]:
804 all_isoforms.append(">"+y[2])
805 all_isoforms.append(y[9])
806
807 ################################################################################################################################################################################################
808
809 def deseqe2(sample,mir_names,l,new_d,sample_name,sample_order):
810
811 for y in mir_names:
812 flag=0
813 for x in sample:
814 if y[0]==x[0]:
815 flag=1
816 break
817 if flag==0:
818 sample.append([y[0],"0",y[1]])
819
820 sample.sort(key=lambda x: x[0])
821 sample=list(sample for sample,_ in itertools.groupby(sample))
822
823 l.acquire()
824 new_d.append(sample)
825 sample_order.append(sample_name)
826 l.release()
827
828 ###############################################################################################################################################################################################
829