comparison mirbase_functions.py @ 16:e19c832c5368 draft

Uploaded
author glogobyte
date Thu, 22 Oct 2020 07:47:58 +0000
parents
children
comparison
equal deleted inserted replaced
15:f06ec007c578 16:e19c832c5368
1 import itertools
2 import time
3 import sys
4 import os
5 import urllib.request
6 import gzip
7 from multiprocessing import Process, Queue, Lock, Pool, Manager, Value
8 import subprocess
9 import argparse
10 from collections import OrderedDict
11 from matplotlib.backends.backend_pdf import PdfPages
12 import pandas as pd
13 from math import pi
14 import numpy as np
15 import matplotlib.pyplot as plt
16 from matplotlib.ticker import PercentFormatter
17 import seaborn as sns
18 import scipy.stats as stats
19 from plotnine import *
20 import math
21 import re
22 import matplotlib.ticker as mtick
23 import copy
24
25
26 """---------------------- Simple Functions -----------------------"""
27
28 # Read a file and return it as a list
29 def read(path, flag):
30 if flag == 0:
31 with open(path) as fp:
32 file=fp.readlines()
33 fp.close()
34 return file
35
36 if flag == 1:
37 with open(path) as fp:
38 file = fp.read().splitlines()
39 fp.close()
40 return file
41
42 # Write a list to a txt file
43 def write(path, list):
44 with open(path,'w') as fp:
45 for x in list:
46 fp.write(str("\t".join(x[1:-1])))
47 fp.close()
48
49 """---------------------- RNA-seq Functions ----------------------"""
50
51 # Detect the longest common substring sequence between two mirnas
52 def longestSubstring(str1, str2):
53
54 from difflib import SequenceMatcher
55 # initialize SequenceMatcher object with
56 # input string
57 seqMatch = SequenceMatcher(None, str1, str2)
58
59 # find match of longest sub-string
60 # output will be like Match(a=0, b=0, size=5)
61 match = seqMatch.find_longest_match(0, len(str1), 0, len(str2))
62
63 # print longest substring
64 if (match.size != 0):
65 return str1[match.a: match.a + match.size]
66 else:
67 print('No longest common sub-string found')
68
69
70
71 ########################################################################################################################################################
72
73 def collapse_sam(path):
74
75 ini_sam=read(path,0)
76 main_sam = [x.rstrip("\n").split("\t") for x in ini_sam if "@" not in x.split("\t")[0]]
77 intro_sam = [x.rstrip("\n").split("\t") for x in ini_sam if "@" in x.split("\t")[0]]
78
79 uni_seq = []
80 for x in main_sam:
81
82 if [x[2], x[9]] not in uni_seq:
83 uni_seq.append([x[2], x[9]])
84
85 new_main_sam=[]
86 incr_num=0
87 for i in range(len(uni_seq)):
88 count=0
89 incr_num+=1
90 for y in main_sam:
91 if uni_seq[i][1]==y[9] and uni_seq[i][0]==y[2]:
92 count+=1
93 temp=y
94 temp[10]="~~~~~~~~~~~~~~~~~~~~~~~~~~~~~"
95 temp[0]=str(incr_num)+"-"+str(count)
96 new_main_sam.append(temp)
97
98 new_sam=intro_sam+new_main_sam
99
100 return new_sam
101
102 #################################################################################################################################################################################################################
103
104 def duplicate_chroms_isoforms(List):
105
106 dupes=[]
107
108 for num in range(len(List)):
109
110 if [List[num][9],List[num][0],List[num][2]] not in dupes :
111 dupes.append([List[num][9],List[num][0],List[num][2]])
112
113 for x in List:
114 for y in dupes:
115 if x[9]==y[0] and x[0]==y[1] and x[2].split("_")[0]==y[2].split("_")[0] and x[2]!=y[2]:
116 y.append(x[2])
117
118
119 double_List = [x[:] for x in List]
120
121 chr_order=[]
122 for x in dupes:
123 temp = []
124 for i in range(2,len(x)):
125 if x[i].split("chr")[1].split("(")[0].isdigit():
126 temp.append(int(x[i].split("chr")[1].split("(")[1][0]+x[i].split("chr")[1].split("(")[0]))
127 else:
128 temp.append(x[i].split("chr")[1][0:4])
129
130 for z in temp:
131 if 'X(-)'==z or 'Y(-)'==z or 'X(+)'==z or 'Y(+)'==z:
132 temp = [str(j) for j in temp]
133 temp=list(set(temp))
134 temp.sort()
135 chr_order.append(temp)
136
137 final_dupes=[]
138 for i in range(len(dupes)):
139 final_dupes.append([dupes[i][0],dupes[i][2].split("_")[0],dupes[i][1]])
140 for x in chr_order[i]:
141 result = re.match("[-+]?\d+$", str(x))
142 if len(chr_order[i]) == len(set(chr_order[i])):
143 if result is not None:
144
145 if int(x)<0:
146 final_dupes[i][1]=final_dupes[i][1]+"_chr"+str(abs(int(x)))+"(-)"
147 else:
148 final_dupes[i][1] = final_dupes[i][1] + "_chr" + str(abs(int(x)))+"(+)"
149 else:
150 final_dupes[i][1] = final_dupes[i][1] + "_chr" + str(x)
151 else:
152 if result is not None:
153 if int(x) < 0:
154 final_dupes[i][1] = final_dupes[i][1] +dupes[i][2].split("_")[1]+ "_chr" + str(abs(int(x))) + "(-)"
155 else:
156 final_dupes[i][1] = final_dupes[i][1] +dupes[i][2].split("_")[1]+ "_chr" + str(abs(int(x))) + "(+)"
157 else:
158 final_dupes[i][1] = final_dupes[i][1] +dupes[i][2].split("_")[1]+ "_chr" + str(x)
159
160 final_dupes.sort()
161 final_dupes=list(final_dupes for final_dupes,_ in itertools.groupby(final_dupes))
162
163 for i in range(len(double_List)):
164 for x in final_dupes:
165
166 if double_List[i][9] == x[0] and double_List[i][0] == x[2] and len(double_List[i][2].split("_")) >3 and double_List[i][2].split("_")[0]==x[1].split("_")[0]:
167 gg=str("_"+double_List[i][2].split("_")[-2]+"_"+double_List[i][2].split("_")[-1])
168 double_List[i][2] = x[1]+gg
169
170 if double_List[i][9]==x[0] and double_List[i][0]== x[2] and len(double_List[i][2].split("_"))==3 and double_List[i][2].split("_")[0]==x[1].split("_")[0]:
171 double_List[i][2]=x[1]
172 List[i][2] = x[1]
173
174 List.sort()
175 new_list=list(List for List,_ in itertools.groupby(List))
176
177 double_List.sort()
178 new_double_List = list(double_List for double_List, _ in itertools.groupby(double_List))
179
180 return new_list, new_double_List
181
182
183 #############################################################################################################################################################################################################
184
185 def sam(mature_mirnas,path,name,con,l,samples,data,names,unmap_seq,samples_mirna_names,deseq,LHE_names,umi,ini_sample,unmap_counts):
186
187 # read the sam file
188 ini_sam=read(path,0)
189 new_main_sam = [x.rstrip("\n").split("\t") for x in ini_sam if "@" not in x.split("\t")[0]]
190 unique_seq = [x for x in new_main_sam if x[1] == '0' and len(x[9])>=18 and len(x[9])<=26]
191
192 sorted_uni_arms = []
193
194 for i in range(len(mature_mirnas)):
195 tmp_count_reads = 0 # calculate the total number of reads
196 tmp_count_seq = 0 # calculate the total number of sequences
197 for j in range(len(unique_seq)):
198
199 if "{" in unique_seq[j][2].split("_")[0]:
200 official=unique_seq[j][2].split("_")[0][:-4]
201 else:
202 official=unique_seq[j][2].split("_")[0]
203
204 if mature_mirnas[i].split(" ")[0][1:] == official:
205
206 temp_mature = mature_mirnas[i+1].strip().replace("U", "T")
207 off_part = longestSubstring(temp_mature, unique_seq[j][9])
208
209 mat_diff = temp_mature.split(off_part)
210 mat_diff = [len(mat_diff[0]), len(mat_diff[1])]
211
212 unique_diff = unique_seq[j][9].split(off_part)
213 unique_diff = [len(unique_diff[0]), len(unique_diff[1])]
214
215 # Problem with hsa-miR-8485
216 if mat_diff[1]!=0 and unique_diff[1]!=0:
217 unique_seq[j]=1
218 pre_pos = 0
219 post_pos = 0
220
221 elif mat_diff[0]!=0 and unique_diff[0]!=0:
222 unique_seq[j]=1
223 pre_pos = 0
224 post_pos = 0
225
226 else:
227 pre_pos = mat_diff[0]-unique_diff[0]
228 post_pos = unique_diff[1]-mat_diff[1]
229 tmp_count_reads = tmp_count_reads + int(unique_seq[j][0].split("-")[1])
230 tmp_count_seq = tmp_count_seq+1
231
232 if pre_pos != 0 or post_pos != 0:
233 if pre_pos == 0:
234 unique_seq[j][2] = unique_seq[j][2] + "_" +str(pre_pos) + "_" + '{:+d}'.format(post_pos)
235 elif post_pos == 0:
236 unique_seq[j][2] = unique_seq[j][2] + "_" + '{:+d}'.format(pre_pos) + "_" + str(post_pos)
237 else:
238 unique_seq[j][2] = unique_seq[j][2]+"_"+'{:+d}'.format(pre_pos)+"_"+'{:+d}'.format(post_pos)
239
240 for x in range(unique_seq.count(1)):
241 unique_seq.remove(1)
242 if tmp_count_reads != 0 and tmp_count_seq != 0:
243 sorted_uni_arms.append([mature_mirnas[i].split(" ")[0][1:], tmp_count_seq, tmp_count_reads])
244 sorted_uni_arms = sorted(sorted_uni_arms, key=lambda x: x[1], reverse=True)
245 dedup_unique_seq,double_fil_uni_seq=duplicate_chroms_isoforms(unique_seq)
246
247 for y in sorted_uni_arms:
248 counts=0
249 seqs=0
250 for x in double_fil_uni_seq:
251 if y[0]==x[2].split("_")[0]:
252 counts+=int(x[0].split("-")[1])
253 seqs+=1
254
255 y[1]=seqs
256 y[2]=counts
257
258 LHE=[]
259 l.acquire()
260 if con=="c":
261 LHE.extend(z[2] for z in double_fil_uni_seq)
262 for y in double_fil_uni_seq:
263 samples_mirna_names.append([y[2],y[9]])
264 deseq.append([[x[2], x[0].split('-')[1], x[9]] for x in double_fil_uni_seq])
265 LHE_names.extend(LHE)
266 unmap_seq.value += sum([1 for x in new_main_sam if x[1] == '4'])
267 unmap_counts.value += sum([int(x[0].split("-")[1]) for x in new_main_sam if x[1] == '4'])
268 names.append(name)
269 samples.append(dedup_unique_seq)
270 data.append([con,name,double_fil_uni_seq,sorted_uni_arms])
271 ini_sample.append(new_main_sam)
272
273 if con=="t":
274 LHE.extend(z[2] for z in double_fil_uni_seq)
275 for y in double_fil_uni_seq:
276 samples_mirna_names.append([y[2],y[9]])
277 deseq.append([[x[2], x[0].split('-')[1], x[9]] for x in double_fil_uni_seq])
278 LHE_names.extend(LHE)
279 unmap_seq.value += sum([1 for x in new_main_sam if x[1] == '4'])
280 unmap_counts.value += sum([int(x[0].split("-")[1]) for x in new_main_sam if x[1] == '4'])
281 names.append(name)
282 samples.append(dedup_unique_seq)
283 data.append([con,name,double_fil_uni_seq,sorted_uni_arms])
284 ini_sample.append(new_main_sam)
285 l.release()
286
287
288 ######################################################################################################################################
289
290 """
291
292 Read a sam file from Bowtie and do the followings:
293
294 1) Remove reverse stranded mapped reads
295 2) Remove unmapped reads
296 3) Remove all sequences with reads less than 11 reads
297 4) Sort the arms with the most sequences in decreading rate
298 5) Sort the sequences of every arm with the most reads in decreasing rate
299 6) Calculate total number of sequences of every arm
300 7) Calculate total number of reads of sequences of every arm.
301 8) Store all the informations in a txt file
302
303 """
304
305 def non_sam(mature_mirnas,path,name,con,l,data,names,n_deseq,n_samples_mirna_names,n_LHE_names):
306
307 ini_sam=read(path,0)
308 new_main_sam = [x.rstrip("\n").split("\t") for x in ini_sam if "@" not in x.split("\t")[0]]
309 unique_seq=[]
310 unique_seq = [x for x in new_main_sam if x[1] == '4' and len(x[9])>=18 and len(x[9])<=26]
311
312 uni_seq=[]
313 # Calculate the shifted positions for every isomir and add them to the name of it
314 sorted_uni_arms = []
315 for i in range(1,len(mature_mirnas),2):
316 tmp_count_reads = 0 # calculate the total number of reads
317 tmp_count_seq = 0 # calculate the total number of sequences
318
319 for j in range(len(unique_seq)):
320
321 temp_mature = mature_mirnas[i].strip().replace("U", "T")
322
323 if temp_mature in unique_seq[j][9]:
324
325 off_part = longestSubstring(temp_mature, unique_seq[j][9])
326
327 mat_diff = temp_mature.split(off_part)
328 mat_diff = [len(mat_diff[0]), len(mat_diff[1])]
329
330 unique_diff = unique_seq[j][9].split(off_part)
331 if len(unique_diff)<=2:
332 unique_diff = [len(unique_diff[0]), len(unique_diff[1])]
333
334 pre_pos = mat_diff[0]-unique_diff[0]
335 post_pos = unique_diff[1]-mat_diff[1]
336
337 lengthofmir = len(off_part) + post_pos
338 if pre_pos == 0:
339 tmp_count_reads = tmp_count_reads + int(unique_seq[j][0].split("-")[1])
340 tmp_count_seq = tmp_count_seq + 1
341
342 if pre_pos == 0:
343
344 t_name=unique_seq[j].copy()
345 t_name[2]=mature_mirnas[i - 1].split(" ")[0][1:] + "__" + str(pre_pos) + "_" + '{:+d}'.format(post_pos) + "_" + str(unique_seq[j][9][len(off_part):])
346 uni_seq.append(t_name)
347
348
349 if tmp_count_reads != 0 and tmp_count_seq != 0:
350 sorted_uni_arms.append([mature_mirnas[i-1].split(" ")[0][1:], tmp_count_seq, tmp_count_reads])
351
352
353 sorted_uni_arms = sorted(sorted_uni_arms, key=lambda x: x[1], reverse=True)
354 unique_seq = list(map(list, OrderedDict.fromkeys(map(tuple,uni_seq))))
355
356 LHE=[]
357
358 l.acquire()
359 if con=="c":
360 LHE.extend(x[2] for x in unique_seq if x[2]!="*")
361 for x in unique_seq:
362 if x[2]!="*":
363 n_samples_mirna_names.append([x[2],x[9]])
364 n_deseq.append([[x[2], x[0].split('-')[1], x[9]] for x in unique_seq if x[2]!="*"])
365 n_LHE_names.extend(LHE)
366 names.append(name)
367 data.append([con,name,unique_seq,sorted_uni_arms])
368
369
370 if con=="t":
371 LHE.extend(x[2] for x in unique_seq if x[2]!="*")
372 for x in unique_seq:
373 if x[2]!="*":
374 n_samples_mirna_names.append([x[2],x[9]])
375 n_deseq.append([[x[2], x[0].split('-')[1], x[9]] for x in unique_seq if x[2]!="*"])
376 n_LHE_names.extend(LHE)
377 names.append(name)
378 data.append([con,name,unique_seq,sorted_uni_arms])
379 l.release()
380
381 #####################################################################################################################################################################################################################
382 def deseq2_temp(samples_mirna_names,deseq,con,l):
383
384 samples_mirna_names.sort(key=lambda x:[0])
385 for i in range(len(deseq)):
386 for y in samples_mirna_names:
387 flag = 0
388 for x in deseq[i]:
389 if y[0] == x[0]:
390 flag = 1
391 break
392
393 if flag == 0:
394 deseq[i].append([y[0], "0", y[1]])
395
396 [deseq[i].sort(key=lambda x: x[0]) for i, _ in enumerate(deseq)]
397 deseq_final = [[x[0],x[2]] for x in deseq[0]]
398 [deseq_final[z].append(deseq[i][j][1]) for z,_ in enumerate(deseq_final) for i, _ in enumerate(deseq) for j,_ in enumerate(deseq[i]) if deseq_final[z][0] == deseq[i][j][0]]
399
400 l.acquire()
401 if con=="c":
402 q1.put(deseq_final)
403
404 if con=="t":
405 q2.put(deseq_final)
406 l.release()
407
408
409 ####################################################################################################################################################################################################################
410
411 def main_temp(LH2E, LH2E_names, LH8E, LH8E_names,flag,names_con,names_tre,filter_LH8E,filter_LH2E,raw_LH8E,raw_LH2E,per,count):
412
413 LH8E_add_names = [x for x in LH2E_names if x not in LH8E_names]
414 LH2E_add_names = [x for x in LH8E_names if x not in LH2E_names]
415
416 LH8E_add_names.sort()
417 LH2E_add_names.sort()
418 LH8E_add_names = list(LH8E_add_names for LH8E_add_names,_ in itertools.groupby(LH8E_add_names))
419 LH2E_add_names = list(LH2E_add_names for LH2E_add_names,_ in itertools.groupby(LH2E_add_names))
420
421 LH2E.sort()
422 LH8E.sort()
423 LH2E = list(LH2E for LH2E,_ in itertools.groupby(LH2E))
424 LH8E = list(LH8E for LH8E,_ in itertools.groupby(LH8E))
425
426 zeros=["0"]*(len(LH8E[0])-2)
427 [LH8E_add_names[i].extend(zeros) for i,_ in enumerate(LH8E_add_names)]
428 LH8E=LH8E+LH8E_add_names
429
430 zeros=["0"]*(len(LH2E[0])-2)
431 [LH2E_add_names[i].extend(zeros) for i,_ in enumerate(LH2E_add_names)]
432 LH2E=LH2E+LH2E_add_names
433
434 dupes=[]
435 final_LH2E =[]
436
437 for num,_ in enumerate(LH2E):
438
439 if LH2E[num][1] not in final_LH2E and LH2E[num][0] not in final_LH2E:
440 final_LH2E.append(LH2E[num][1])
441 final_LH2E.append(LH2E[num][0])
442 else:
443 dupes.append(LH2E[num][1])
444
445
446 dupes=list(set(dupes))
447
448 dupes=[[x] for x in dupes]
449
450 for x in LH2E:
451 for y in dupes:
452 if x[1]==y[0]:
453 fl=0
454 if len(y)==1:
455 y.append(x[0])
456 else:
457 for i in range(1,len(y)):
458 if y[i].split("_")[0]==x[0].split("_")[0]:
459 fl=1
460 if len(x[0])<len(y[i]):
461 del y[i]
462 y.append(x[0])
463 break
464
465 if fl==0:
466 y.append((x[0]))
467
468 for y in dupes:
469 if len(y)>2:
470 for i in range(len(y)-1,1,-1):
471 y[1]=y[1]+"/"+y[i]
472 del y[i]
473
474 for x in LH2E:
475 for y in dupes:
476 if x[1]==y[0]:
477 x[0]=y[1]
478
479 for x in LH8E:
480 for y in dupes:
481 if x[1]==y[0]:
482 x[0]=y[1]
483
484
485 LH2E.sort()
486 LH2E=list(LH2E for LH2E,_ in itertools.groupby(LH2E))
487
488 LH8E.sort()
489 LH8E=list(LH8E for LH8E,_ in itertools.groupby(LH8E))
490
491 if int(per)!=-1:
492 percent=int(per)/100
493
494 c_col_filter=round(percent*(len(LH2E[1])-2))
495 t_col_filter=round(percent*(len(LH8E[1])-2))
496
497 for i, _ in enumerate(LH2E):
498 c_cols=0
499 t_cols=0
500
501 c_cols=sum([1 for j in range(len(LH2E[i])-2) if int(LH2E[i][j+2])>=int(count)])
502 t_cols=sum([1 for j in range(len(LH8E[i])-2) if int(LH8E[i][j+2])>=int(count)])
503
504 if c_cols>=c_col_filter or t_cols>=t_col_filter:
505 filter_LH8E.append(LH8E[i])
506 filter_LH2E.append(LH2E[i])
507
508 raw_LH2E.extend(LH2E)
509 raw_LH8E.extend(LH8E)
510
511 ##################################################################################################################################################################################################################
512
513 def write_main(raw_LH2E, raw_LH8E, fil_LH2E, fil_LH8E, names_con, names_tre, flag, per, n1, n2):
514
515 if flag == 1 and int(per)!=-1:
516 fp = open('Counts/Filtered '+n2 +' Templated Counts', 'w')
517 fp.write("Name\t")
518 fp.write("Sequence")
519 for y in names_tre:
520 fp.write("\t"+y)
521
522 for x in fil_LH8E:
523 fp.write("\n%s" % "\t".join(x))
524 fp.close()
525
526 fp = open('Counts/Filtered '+n1+' Templated Counts', 'w')
527 fp.write("Name\t")
528 fp.write("Sequence")
529 for y in names_con:
530 fp.write("\t"+y)
531
532 for x in fil_LH2E:
533 fp.write("\n%s" % "\t".join(x))
534 fp.close()
535
536
537 if flag == 2 and int(per)!=-1:
538 fp = open('Counts/Filtered '+n2+' Non-Templated Counts', 'w')
539 fp.write("Name\t")
540 fp.write("Sequence")
541 for y in names_tre:
542 fp.write("\t"+y)
543
544
545 for x in fil_LH8E:
546 fp.write("\n%s" % "\t".join(x))
547 fp.close()
548
549 fp = open('Counts/Filtered '+n1+' Non-Templated Counts', 'w')
550 fp.write("Name\t")
551 fp.write("Sequence")
552 for y in names_con:
553 fp.write("\t"+y)
554
555 for x in fil_LH2E:
556 fp.write("\n%s" % "\t".join(x))
557 fp.close()
558
559
560 if flag == 1:
561 fp = open('Counts/Raw '+n2+' Templated Counts', 'w')
562 fp.write("Name\t")
563 fp.write("Sequence")
564 for y in names_tre:
565 fp.write("\t"+y)
566
567 for x in raw_LH8E:
568 fp.write("\n%s" % "\t".join(x))
569 fp.close()
570
571 fp = open('Counts/Raw '+n1+' Templated Counts', 'w')
572 fp.write("Name\t")
573 fp.write("Sequence")
574 for y in names_con:
575 fp.write("\t"+y)
576
577 for x in raw_LH2E:
578 fp.write("\n%s" % "\t".join(x))
579 fp.close()
580
581
582 if flag == 2:
583 fp = open('Counts/Raw '+n2+' Non-Templated Counts', 'w')
584 fp.write("Name\t")
585 fp.write("Sequence")
586 for y in names_tre:
587 fp.write("\t"+y)
588
589
590 for x in raw_LH8E:
591 fp.write("\n%s" % "\t".join(x))
592 fp.close()
593
594 fp = open('Counts/Raw '+n1+' Non-Templated Counts', 'w')
595 fp.write("Name\t")
596 fp.write("Sequence")
597 for y in names_con:
598 fp.write("\t"+y)
599
600 for x in raw_LH2E:
601 fp.write("\n%s" % "\t".join(x))
602 fp.close()
603
604
605 #########################################################################################################################################
606
607 def ssamples(names,samp,folder,pro):
608
609 for i in range(2,len(samp[0])):
610
611 fp = open(folder+names[i-2]+'.txt','w')
612 fp.write("miRNA id"+"\t"+names[i-2]+"\n")
613
614 for x in samp:
615 fp.write("%s" % "\t".join([x[0],x[i]])+"\n")
616 fp.close()
617
618 ##################################################################################################################
619
620 def DB_write(con,name,unique_seq,sorted_uni_arms,f):
621
622 if f==1:
623 # Write a txt file with all the information
624 if con=="c":
625 fp = open('split1/'+name, 'w')
626
627 fp.write("%s\t%-42s\t%s\n\n" % ("Number of Reads","Name of isomir","Sequence"))
628 if con=="t":
629 fp = open('split2/'+name, 'w')
630 fp.write("%s\t%-42s\t%s\n\n" % ("Number of Reads","Name of isomir","Sequence"))
631
632
633 for i in range(len(sorted_uni_arms)):
634 temp = []
635 for j in range(len(unique_seq)):
636
637 if sorted_uni_arms[i][0] in unique_seq[j][2].split("_")[0]:
638
639 temp.append(unique_seq[j])
640
641 temp = sorted(temp, key=lambda x: int(x[0].split('-')[1]), reverse=True)
642 fp.write("*********************************************************************************************************\n")
643 fp.write("%-8s\t%-22s\t%-25s\t%-30s\t%s\n" % ("|",str(sorted_uni_arms[i][0]),"Sequence count = "+str(sorted_uni_arms[i][1]),"Total reads = "+str(sorted_uni_arms[i][2]),"|"))
644 fp.write("*********************************************************************************************************\n\n")
645 [fp.write("%-8s\t%-40s\t%s\n" % (x[0].split("-")[1], x[2],x[9])) for x in temp]
646 fp.write("\n" + "\n")
647 fp.close()
648
649 if f==2:
650
651 if con=="c":
652 fp = open('split3/'+name, 'w')
653 fp.write("%s\t%-42s\t%s\n\n" % ("Number of Reads","Name of isomir","Sequence"))
654 if con=="t":
655 fp = open('split4/'+name, 'w')
656 fp.write("%s\t%-42s\t%s\n\n" % ("Number of Reads","Name of isomir","Sequence"))
657
658
659 for i in range(len(sorted_uni_arms)):
660 temp = []
661 for j in range(len(unique_seq)):
662 if sorted_uni_arms[i][0]==unique_seq[j][2].split("__")[0]:
663 temp.append(unique_seq[j])
664 if temp!=[]:
665 temp = sorted(temp, key=lambda x: int(x[0].split('-')[1]), reverse=True)
666 fp.write("*********************************************************************************************************\n")
667 fp.write("%-8s\t%-22s\t%-25s\t%-30s\t%s\n" % ("|",str(sorted_uni_arms[i][0]),"Sequence count = "+str(sorted_uni_arms[i][1]),"Total reads = "+str(sorted_uni_arms[i][2]),"|"))
668 fp.write("*********************************************************************************************************\n\n")
669 [fp.write("%-8s\t%-40s\t%s\n" % (x[0].split("-")[1], x[2],x[9])) for x in temp]
670 fp.write("\n" + "\n")
671 fp.close()
672
673
674 ##########################################################################################################################
675
676 def new_mat_seq(pre_unique_seq,mat_mirnas,l):
677
678 unique_iso = []
679 for x in pre_unique_seq:
680 if len(x[2].split("_"))==3:
681 for y in pre_unique_seq:
682 if x[2] in y[2] and int(x[0].split("-")[1])<int(y[0].split("-")[1]):
683 if any(y[2] in lst2 for lst2 in unique_iso)==False:
684 y[2]=">"+y[2]
685 unique_iso.append(y)
686 l.acquire()
687 for x in unique_iso:
688 mat_mirnas.append(x[2])
689 mat_mirnas.append(x[9])
690 l.release()
691
692 #########################################################################################################################
693
694 def merging_names(LH2E_copy,new):
695
696 dupes=[]
697 final_LH2E =[]
698
699 for num in range(len(LH2E_copy)):
700
701 if LH2E_copy[num][1] not in final_LH2E and LH2E_copy[num][0] not in final_LH2E:
702 final_LH2E.append(LH2E_copy[num][1])
703 final_LH2E.append(LH2E_copy[num][0])
704 else:
705 dupes.append(LH2E_copy[num][1])
706
707 dupes=list(set(dupes))
708
709 for i in range(len(dupes)):
710 dupes[i]=[dupes[i]]
711
712 for x in LH2E_copy:
713 for y in dupes:
714 if x[1]==y[0]:
715 fl=0
716 if len(y)==1:
717 y.append(x[0])
718 else:
719 for i in range(1,len(y)):
720 if y[i].split("_")[0]==x[0].split("_")[0]:
721 fl=1
722 if len(x[0])<len(y[i]):
723 del y[i]
724 y.append(x[0])
725 break
726
727 if fl==0:
728 y.append((x[0]))
729
730 for y in dupes:
731 if len(y)>2:
732 for i in range(len(y)-1,1,-1):
733 y[1]=y[1]+"/"+y[i]
734 del y[i]
735
736
737 for x in LH2E_copy:
738 for y in dupes:
739 if x[1]==y[0]:
740 x[0]=y[1]
741
742 LH2E_copy.sort()
743 LH2E_copy=list(LH2E_copy for LH2E_copy,_ in itertools.groupby(LH2E_copy))
744
745 new.extend(LH2E_copy)
746
747
748 ######################################################################################################################################################
749
750 def ssamples1(tem_names,tem_samp,non_names,non_samp,folder,pro):
751
752 for i in range(2,len(tem_samp[0])):
753
754 fp = open(folder+tem_names[i-2]+'.txt','w')
755 fp.write("miRNA id"+"\t"+tem_names[i-2]+"\n")
756
757 for x in tem_samp:
758 fp.write("%s" % "\t".join([x[0],x[i]])+"\n")
759
760 for j in range(len(non_names)):
761 if non_names[j]==tem_names[i-2]:
762 for x in non_samp:
763 fp.write("%s" % "\t".join([x[0],x[j+2]])+"\n")
764 fp.close()
765
766 ###################################################################################################################################################################################################################
767
768 def download_matures(matures,org_name):
769
770 #url = 'ftp://mirbase.org/pub/mirbase/21/mature.fa.gz'
771 url = 'ftp://mirbase.org/pub/mirbase/CURRENT/mature.fa.gz'
772 data = urllib.request.urlopen(url).read()
773 file_mirna = gzip.decompress(data).decode('utf-8')
774 file_mirna = file_mirna.split("\n")
775
776 for i in range(0,len(file_mirna)-1,2):
777
778 if org_name in file_mirna[i]:
779 matures.append(file_mirna[i])
780 matures.append(file_mirna[i+1])
781
782 ###################################################################################################################################################################################################################
783 def non_template_ref(sc,st,all_isoforms):
784
785 pre_uni_seq_con = list(sc)
786 pre_uni_seq_tre = list(st)
787
788 for x in pre_uni_seq_con:
789 for y in x:
790 if ">"+y[2] not in all_isoforms and ")_" in y[2] :
791 all_isoforms.append(">"+y[2])
792 all_isoforms.append(y[9])
793
794
795 for x in pre_uni_seq_tre:
796 for y in x:
797 if ">"+y[2] not in all_isoforms and ")_" in y[2]:
798 all_isoforms.append(">"+y[2])
799 all_isoforms.append(y[9])
800
801 ################################################################################################################################################################################################
802
803 def deseqe2(sample,mir_names,l,new_d,sample_name,sample_order):
804
805 for y in mir_names:
806 flag=0
807 for x in sample:
808 if y[0]==x[0]:
809 flag=1
810 break
811 if flag==0:
812 sample.append([y[0],"0",y[1]])
813
814 sample.sort(key=lambda x: x[0])
815 sample=list(sample for sample,_ in itertools.groupby(sample))
816
817 l.acquire()
818 new_d.append(sample)
819 sample_order.append(sample_name)
820 l.release()
821
822 ###############################################################################################################################################################################################
823