comparison mirgene_functions.py @ 17:dc31f01cf21d draft

Uploaded
author glogobyte
date Thu, 22 Oct 2020 07:48:12 +0000
parents
children
comparison
equal deleted inserted replaced
16:e19c832c5368 17:dc31f01cf21d
1 import itertools
2 import time
3 import sys
4 import os
5 import urllib.request
6 import gzip
7 from multiprocessing import Process, Queue, Lock, Pool, Manager, Value
8 import subprocess
9 import argparse
10 from collections import OrderedDict
11 from matplotlib.backends.backend_pdf import PdfPages
12 import pandas as pd
13 from math import pi
14 import numpy as np
15 import matplotlib.pyplot as plt
16 from matplotlib.ticker import PercentFormatter
17 import seaborn as sns
18 import scipy.stats as stats
19 from plotnine import *
20 import math
21 import re
22 import matplotlib.ticker as mtick
23 import copy
24
25
26
27
28 """---------------------- Simple Functions -----------------------"""
29
30 # Read a file and return it as a list
31 def read(path, flag):
32 if flag == 0:
33 with open(path) as fp:
34 file=fp.readlines()
35 fp.close()
36 return file
37
38 if flag == 1:
39 with open(path) as fp:
40 file = fp.read().splitlines()
41 fp.close()
42 return file
43
44 # Write a list to a txt file
45 def write(path, list):
46 with open(path,'w') as fp:
47 for x in list:
48 fp.write(str("\t".join(x[1:-1])))
49 fp.close()
50
51 """---------------------- RNA-seq Functions ----------------------"""
52
53 # Detect the longest common substring sequence between two mirnas
54 def longestSubstring(str1, str2):
55
56 from difflib import SequenceMatcher
57 # initialize SequenceMatcher object with
58 # input string
59 seqMatch = SequenceMatcher(None, str1, str2)
60
61 # find match of longest sub-string
62 # output will be like Match(a=0, b=0, size=5)
63 match = seqMatch.find_longest_match(0, len(str1), 0, len(str2))
64
65 # print longest substring
66 if (match.size != 0):
67 return str1[match.a: match.a + match.size]
68 else:
69 print('No longest common sub-string found')
70
71
72
73 ########################################################################################################################################################
74 def collapse_sam(path):
75
76 ini_sam=read(path,0)
77 main_sam = [x.rstrip("\n").split("\t") for x in ini_sam if "@" not in x.split("\t")[0]]
78 intro_sam = [x.rstrip("\n").split("\t") for x in ini_sam if "@" in x.split("\t")[0]]
79
80 uni_seq = []
81 for x in main_sam:
82
83 if [x[2], x[9]] not in uni_seq:
84 uni_seq.append([x[2], x[9]])
85
86 new_main_sam=[]
87 incr_num=0
88 for i in range(len(uni_seq)):
89 count=0
90 incr_num+=1
91 for y in main_sam:
92 if uni_seq[i][1]==y[9] and uni_seq[i][0]==y[2]:
93 count+=1
94 temp=y
95 temp[10]="~~~~~~~~~~~~~~~~~~~~~~~~~~~~~"
96 temp[0]=str(incr_num)+"-"+str(count)
97 new_main_sam.append(temp)
98
99 new_sam=intro_sam+new_main_sam
100
101 return new_sam
102
103 #################################################################################################################################################################
104
105 def sam(mature_mirnas,path,name,con,l,samples,data,names,unmap_seq,samples_mirna_names,deseq,LHE_names,umi,ini_sample,unmap_counts):
106
107 # read the sam file
108
109 ini_sam=read(path,0)
110 new_main_sam = [x.rstrip("\n").split("\t") for x in ini_sam if "@" not in x.split("\t")[0]]
111 unique_seq = [x for x in new_main_sam if x[1] == '0' and len(x[9])>=18 and len(x[9])<=26]
112
113 # Calculate the shifted positions for every isomir and add them to the name of it
114 sorted_uni_arms = []
115
116 for i in range(len(mature_mirnas)):
117 tmp_count_reads = 0 # calculate the total number of reads
118 tmp_count_seq = 0 # calculate the total number of sequences
119 for j in range(len(unique_seq)):
120
121 if mature_mirnas[i] == unique_seq[j][2]:
122
123 temp_mature = mature_mirnas[i+1]
124 off_part = longestSubstring(temp_mature, unique_seq[j][9])
125
126 mat_diff = temp_mature.split(off_part)
127 mat_diff = [len(mat_diff[0]), len(mat_diff[1])]
128
129 unique_diff = unique_seq[j][9].split(off_part)
130 unique_diff = [len(unique_diff[0]), len(unique_diff[1])]
131
132 # Problem with hsa-miR-8485
133 if mat_diff[1]!=0 and unique_diff[1]!=0:
134 unique_seq[j]=1
135 pre_pos = 0
136 post_pos = 0
137
138 elif mat_diff[0]!=0 and unique_diff[0]!=0:
139 unique_seq[j]=1
140 pre_pos = 0
141 post_pos = 0
142
143 else:
144 pre_pos = mat_diff[0]-unique_diff[0]
145 post_pos = unique_diff[1]-mat_diff[1]
146 tmp_count_reads = tmp_count_reads + int(unique_seq[j][0].split("-")[1])
147 tmp_count_seq = tmp_count_seq+1
148
149
150 if pre_pos != 0 or post_pos != 0:
151 if pre_pos == 0:
152 unique_seq[j][2] = unique_seq[j][2] + "_" +str(pre_pos) + "_" + '{:+d}'.format(post_pos)
153 elif post_pos == 0:
154 unique_seq[j][2] = unique_seq[j][2] + "_" + '{:+d}'.format(pre_pos) + "_" + str(post_pos)
155 else:
156 unique_seq[j][2] = unique_seq[j][2]+"_"+'{:+d}'.format(pre_pos)+"_"+'{:+d}'.format(post_pos)
157
158 for x in range(unique_seq.count(1)):
159 unique_seq.remove(1)
160 if tmp_count_reads != 0 and tmp_count_seq != 0:
161 sorted_uni_arms.append([mature_mirnas[i], tmp_count_seq, tmp_count_reads])
162
163 # Store name of arms, number of sequences and number of reads
164 sorted_uni_arms = sorted(sorted_uni_arms, key=lambda x: x[1], reverse=True)
165
166 for y in sorted_uni_arms:
167 counts=0
168 seqs=0
169 for x in unique_seq:
170 if y[0]==x[2].split("_")[0]+"_"+x[2].split("_")[1]:
171 counts+=int(x[0].split("-")[1])
172 seqs+=1
173
174 y[1]=seqs
175 y[2]=counts
176
177 LHE=[]
178
179 l.acquire()
180 if con=="c":
181 LHE.extend(z[2] for z in unique_seq)
182 for y in unique_seq:
183 samples_mirna_names.append([y[2],y[9]])
184 deseq.append([[x[2], x[0].split('-')[1], x[9]] for x in unique_seq])
185 LHE_names.extend(LHE)
186 unmap_seq.value += sum([1 for x in new_main_sam if x[1] == '4'])
187 unmap_counts.value += sum([int(x[0].split("-")[1]) for x in new_main_sam if x[1] == '4'])
188 names.append(name)
189 samples.append(unique_seq)
190 data.append([con,name,unique_seq,sorted_uni_arms])
191 ini_sample.append(new_main_sam)
192
193 if con=="t":
194 LHE.extend(z[2] for z in unique_seq)
195 for y in unique_seq:
196 samples_mirna_names.append([y[2],y[9]])
197 deseq.append([[x[2], x[0].split('-')[1], x[9]] for x in unique_seq])
198 LHE_names.extend(LHE)
199 unmap_seq.value += sum([1 for x in new_main_sam if x[1] == '4'])
200 unmap_counts.value += sum([int(x[0].split("-")[1]) for x in new_main_sam if x[1] == '4'])
201 names.append(name)
202 samples.append(unique_seq)
203 data.append([con,name,unique_seq,sorted_uni_arms])
204 ini_sample.append(new_main_sam)
205 l.release()
206
207
208 ######################################################################################################################################
209 """
210 Read a sam file from Bowtie and do the followings:
211
212 1) Remove reverse stranded mapped reads
213 2) Remove unmapped reads
214 3) Remove all sequences with reads less than 11 reads
215 4) Sort the arms with the most sequences in decreading rate
216 5) Sort the sequences of every arm with the most reads in decreasing rate
217 6) Calculate total number of sequences of every arm
218 7) Calculate total number of reads of sequences of every arm.
219 8) Store all the informations in a txt file
220
221 """
222
223 def non_sam(mature_mirnas,path,name,con,l,data,names,n_deseq,n_samples_mirna_names,n_LHE_names):
224
225
226 ini_sam=read(path,0)
227 new_main_sam = [x.rstrip("\n").split("\t") for x in ini_sam if "@" not in x.split("\t")[0]]
228 unique_seq=[]
229 unique_seq = [x for x in new_main_sam if x[1] == '4' and len(x[9])>=18 and len(x[9])<=26]
230 uni_seq=[]
231
232 # Calculate the shifted positions for every isomir and add them to the name of it
233 sorted_uni_arms = []
234 for i in range(1,len(mature_mirnas),2):
235 tmp_count_reads = 0 # calculate the total number of reads
236 tmp_count_seq = 0 # calculate the total number of sequences
237
238 for j in range(len(unique_seq)):
239
240 temp_mature = mature_mirnas[i].strip().replace("U", "T")
241
242 if temp_mature in unique_seq[j][9]:
243
244 off_part = longestSubstring(temp_mature, unique_seq[j][9])
245
246 mat_diff = temp_mature.split(off_part)
247 mat_diff = [len(mat_diff[0]), len(mat_diff[1])]
248
249 unique_diff = unique_seq[j][9].split(off_part)
250 if len(unique_diff)<=2:
251 unique_diff = [len(unique_diff[0]), len(unique_diff[1])]
252
253 pre_pos = mat_diff[0]-unique_diff[0]
254 post_pos = unique_diff[1]-mat_diff[1]
255
256 lengthofmir = len(off_part) + post_pos
257 if pre_pos == 0:
258 tmp_count_reads = tmp_count_reads + int(unique_seq[j][0].split("-")[1])
259 tmp_count_seq = tmp_count_seq + 1
260
261 if pre_pos == 0:
262 t_name=copy.deepcopy(unique_seq[j])
263 t_name[2]=mature_mirnas[i - 1] + "__" + str(pre_pos) + "_" + '{:+d}'.format(post_pos) + "_" + str(unique_seq[j][9][len(off_part):])
264 uni_seq.append(t_name)
265
266 if tmp_count_reads != 0 and tmp_count_seq != 0:
267 sorted_uni_arms.append([mature_mirnas[i-1], tmp_count_seq, tmp_count_reads])
268
269
270 sorted_uni_arms = sorted(sorted_uni_arms, key=lambda x: x[1], reverse=True)
271 unique_seq = list(map(list, OrderedDict.fromkeys(map(tuple,uni_seq))))
272
273 LHE=[]
274
275 l.acquire()
276 if con=="c":
277 LHE.extend(x[2] for x in unique_seq if x[2]!="*")
278 for x in unique_seq:
279 if x[2]!="*":
280 n_samples_mirna_names.append([x[2],x[9]])
281 n_deseq.append([[x[2], x[0].split('-')[1], x[9]] for x in unique_seq if x[2]!="*"])
282 n_LHE_names.extend(LHE)
283 names.append(name)
284 data.append([con,name,unique_seq,sorted_uni_arms])
285
286
287 if con=="t":
288 LHE.extend(x[2] for x in unique_seq if x[2]!="*")
289 for x in unique_seq:
290 if x[2]!="*":
291 n_samples_mirna_names.append([x[2],x[9]])
292 n_deseq.append([[x[2], x[0].split('-')[1], x[9]] for x in unique_seq if x[2]!="*"])
293 n_LHE_names.extend(LHE)
294 names.append(name)
295 data.append([con,name,unique_seq,sorted_uni_arms])
296 l.release()
297
298 #################################################################################################################################################################################################################
299
300 def deseq2_temp(samples_mirna_names,deseq,con,l):
301
302 samples_mirna_names.sort(key=lambda x:[0])
303
304 for i in range(len(deseq)):
305 for y in samples_mirna_names:
306 flag = 0
307 for x in deseq[i]:
308 if y[0] == x[0]:
309 flag = 1
310 break
311
312 if flag == 0:
313 deseq[i].append([y[0], "0", y[1]])
314
315 [deseq[i].sort(key=lambda x: x[0]) for i, _ in enumerate(deseq)]
316 deseq_final = [[x[0],x[2]] for x in deseq[0]]
317 [deseq_final[z].append(deseq[i][j][1]) for z,_ in enumerate(deseq_final) for i, _ in enumerate(deseq) for j,_ in enumerate(deseq[i]) if deseq_final[z][0] == deseq[i][j][0]]
318
319 l.acquire()
320 if con=="c":
321 q1.put(deseq_final)
322
323 if con=="t":
324 q2.put(deseq_final)
325 l.release()
326
327
328 ##################################################################################################################################################################################################################
329
330 def main_temp(LH2E, LH2E_names, LH8E, LH8E_names,flag,names_con,names_tre,filter_LH8E,filter_LH2E,raw_LH8E,raw_LH2E,per,count):
331
332 LH8E_add_names = [x for x in LH2E_names if x not in LH8E_names]
333 LH2E_add_names = [x for x in LH8E_names if x not in LH2E_names]
334
335 LH8E_add_names.sort()
336 LH2E_add_names.sort()
337 LH8E_add_names = list(LH8E_add_names for LH8E_add_names,_ in itertools.groupby(LH8E_add_names))
338 LH2E_add_names = list(LH2E_add_names for LH2E_add_names,_ in itertools.groupby(LH2E_add_names))
339
340 LH2E.sort()
341 LH8E.sort()
342 LH2E = list(LH2E for LH2E,_ in itertools.groupby(LH2E))
343 LH8E = list(LH8E for LH8E,_ in itertools.groupby(LH8E))
344
345 zeros=["0"]*(len(LH8E[0])-2)
346 [LH8E_add_names[i].extend(zeros) for i,_ in enumerate(LH8E_add_names)]
347 LH8E=LH8E+LH8E_add_names
348
349 zeros=["0"]*(len(LH2E[0])-2)
350 [LH2E_add_names[i].extend(zeros) for i,_ in enumerate(LH2E_add_names)]
351 LH2E=LH2E+LH2E_add_names
352
353 dupes=[]
354 final_LH2E =[]
355
356 for num,_ in enumerate(LH2E):
357
358 if LH2E[num][1] not in final_LH2E and LH2E[num][0] not in final_LH2E:
359 final_LH2E.append(LH2E[num][1])
360 final_LH2E.append(LH2E[num][0])
361 else:
362 dupes.append(LH2E[num][1])
363
364 dupes=list(set(dupes))
365
366 dupes=[[x] for x in dupes]
367
368 for x in LH2E:
369 for y in dupes:
370 if x[1]==y[0]:
371 fl=0
372 if len(y)==1:
373 y.append(x[0])
374 else:
375 for i in range(1,len(y)):
376 if y[i].split("_")[0]+"_"+y[i].split("_")[1]==x[0].split("_")[0]+"_"+x[0].split("_")[1]:
377 fl=1
378 if len(x[0])<len(y[i]):
379 del y[i]
380 y.append(x[0])
381 break
382
383 if fl==0:
384 y.append((x[0]))
385
386 for y in dupes:
387 if len(y)>2:
388 for i in range(len(y)-1,1,-1):
389 y[1]=y[1]+"/"+y[i]
390 del y[i]
391
392
393 for x in LH2E:
394 for y in dupes:
395 if x[1]==y[0]:
396 x[0]=y[1]
397
398 for x in LH8E:
399 for y in dupes:
400 if x[1]==y[0]:
401 x[0]=y[1]
402
403
404
405 LH2E.sort()
406 LH2E=list(LH2E for LH2E,_ in itertools.groupby(LH2E))
407
408 LH8E.sort()
409 LH8E=list(LH8E for LH8E,_ in itertools.groupby(LH8E))
410
411 if int(per)!=-1:
412 percent=int(per)/100
413
414 c_col_filter=round(percent*(len(LH2E[1])-2))
415 t_col_filter=round(percent*(len(LH8E[1])-2))
416
417 for i, _ in enumerate(LH2E):
418 c_cols=0
419 t_cols=0
420
421 c_cols=sum([1 for j in range(len(LH2E[i])-2) if int(LH2E[i][j+2])>=int(count)])
422 t_cols=sum([1 for j in range(len(LH8E[i])-2) if int(LH8E[i][j+2])>=int(count)])
423
424 if c_cols>=c_col_filter or t_cols>=t_col_filter:
425 filter_LH8E.append(LH8E[i])
426 filter_LH2E.append(LH2E[i])
427
428 raw_LH2E.extend(LH2E)
429 raw_LH8E.extend(LH8E)
430
431 ##################################################################################################################################################################################################################
432
433 def write_main(raw_LH2E, raw_LH8E, fil_LH2E, fil_LH8E, names_con, names_tre, flag, per, n1, n2):
434
435 if flag == 1 and int(per)!=-1:
436 fp = open('Counts/Filtered '+n2 +' Templated Counts', 'w')
437 fp.write("Name\t")
438 fp.write("Sequence")
439 for y in names_tre:
440 fp.write("\t"+y)
441
442 for x in fil_LH8E:
443 fp.write("\n%s" % "\t".join(x))
444 fp.close()
445
446 fp = open('Counts/Filtered '+n1+' Templated Counts', 'w')
447 fp.write("Name\t")
448 fp.write("Sequence")
449 for y in names_con:
450 fp.write("\t"+y)
451
452 for x in fil_LH2E:
453 fp.write("\n%s" % "\t".join(x))
454 fp.close()
455
456
457 if flag == 2 and int(per)!=-1:
458 fp = open('Counts/Filtered '+n2+' Non-Templated Counts', 'w')
459 fp.write("Name\t")
460 fp.write("Sequence")
461 for y in names_tre:
462 fp.write("\t"+y)
463
464
465 for x in fil_LH8E:
466 fp.write("\n%s" % "\t".join(x))
467 fp.close()
468
469 fp = open('Counts/Filtered '+n1+' Non-Templated Counts', 'w')
470 fp.write("Name\t")
471 fp.write("Sequence")
472 for y in names_con:
473 fp.write("\t"+y)
474
475 for x in fil_LH2E:
476 fp.write("\n%s" % "\t".join(x))
477 fp.close()
478
479
480 if flag == 1:
481 fp = open('Counts/Raw '+n2+' Templated Counts', 'w')
482 fp.write("Name\t")
483 fp.write("Sequence")
484 for y in names_tre:
485 fp.write("\t"+y)
486
487 for x in raw_LH8E:
488 fp.write("\n%s" % "\t".join(x))
489 fp.close()
490
491 fp = open('Counts/Raw '+n1+' Templated Counts', 'w')
492 fp.write("Name\t")
493 fp.write("Sequence")
494 for y in names_con:
495 fp.write("\t"+y)
496
497 for x in raw_LH2E:
498 fp.write("\n%s" % "\t".join(x))
499 fp.close()
500
501 if flag == 2:
502 fp = open('Counts/Raw '+n2+' Non-Templated Counts', 'w')
503 fp.write("Name\t")
504 fp.write("Sequence")
505 for y in names_tre:
506 fp.write("\t"+y)
507
508
509 for x in raw_LH8E:
510 fp.write("\n%s" % "\t".join(x))
511 fp.close()
512
513 fp = open('Counts/Raw '+n1+' Non-Templated Counts', 'w')
514 fp.write("Name\t")
515 fp.write("Sequence")
516 for y in names_con:
517 fp.write("\t"+y)
518
519 for x in raw_LH2E:
520 fp.write("\n%s" % "\t".join(x))
521 fp.close()
522
523 ####################################################################################################################################################################################################################
524
525 def ssamples(names,samp,folder,pro):
526
527 for i in range(2,len(samp[0])):
528
529 fp = open(folder+names[i-2]+'.txt','w')
530 fp.write("miRNA id"+"\t"+names[i-2]+"\n")
531
532 for x in samp:
533 fp.write("%s" % "\t".join([x[0],x[i]])+"\n")
534 fp.close()
535
536 ####################################################################################################################################################################################################################
537
538 def DB_write(con,name,unique_seq,sorted_uni_arms,f):
539
540 if f==1:
541 # Write a txt file with all the information
542 if con=="c":
543 fp = open('split1/'+name, 'w')
544
545 fp.write("%s\t%-42s\t%s\n\n" % ("Number of Reads","Name of isomir","Sequence"))
546 if con=="t":
547 fp = open('split2/'+name, 'w')
548 fp.write("%s\t%-42s\t%s\n\n" % ("Number of Reads","Name of isomir","Sequence"))
549
550 for i in range(len(sorted_uni_arms)):
551 temp = []
552 for j in range(len(unique_seq)):
553
554 if sorted_uni_arms[i][0] in (unique_seq[j][2].split("_")[0]+"_"+unique_seq[j][2].split("_")[1]):
555
556 temp.append(unique_seq[j])
557
558 temp = sorted(temp, key=lambda x: int(x[0].split('-')[1]), reverse=True)
559 fp.write("*********************************************************************************************************\n")
560 fp.write("%-8s\t%-22s\t%-25s\t%-30s\t%s\n" % ("|",str(sorted_uni_arms[i][0]),"Sequence count = "+str(sorted_uni_arms[i][1]),"Total reads = "+str(sorted_uni_arms[i][2]),"|"))
561 fp.write("*********************************************************************************************************\n\n")
562 [fp.write("%-8s\t%-40s\t%s\n" % (x[0].split("-")[1], x[2],x[9])) for x in temp]
563 fp.write("\n" + "\n")
564 fp.close()
565
566 if f==2:
567
568 if con=="c":
569 fp = open('split3/'+name, 'w')
570 fp.write("%s\t%-42s\t%s\n\n" % ("Number of Reads","Name of isomir","Sequence"))
571 if con=="t":
572 fp = open('split4/'+name, 'w')
573 fp.write("%s\t%-42s\t%s\n\n" % ("Number of Reads","Name of isomir","Sequence"))
574
575 for i in range(len(sorted_uni_arms)):
576 temp = []
577 for j in range(len(unique_seq)):
578 if sorted_uni_arms[i][0]==unique_seq[j][2].split("__")[0]:
579 temp.append(unique_seq[j])
580 if temp!=[]:
581 temp = sorted(temp, key=lambda x: int(x[0].split('-')[1]), reverse=True)
582 fp.write("*********************************************************************************************************\n")
583 fp.write("%-8s\t%-22s\t%-25s\t%-30s\t%s\n" % ("|",str(sorted_uni_arms[i][0]),"Sequence count = "+str(sorted_uni_arms[i][1]),"Total reads = "+str(sorted_uni_arms[i][2]),"|"))
584 fp.write("*********************************************************************************************************\n\n")
585 [fp.write("%-8s\t%-40s\t%s\n" % (x[0].split("-")[1], x[2],x[9])) for x in temp]
586 fp.write("\n" + "\n")
587 fp.close()
588
589
590 ##########################################################################################################################
591
592 def new_mat_seq(pre_unique_seq,mat_mirnas,l):
593
594 unique_iso = []
595 for x in pre_unique_seq:
596 if len(x[2].split("_"))==3:
597 for y in pre_unique_seq:
598 if x[2] in y[2] and int(x[0].split("-")[1])<int(y[0].split("-")[1]):
599 if any(y[2] in lst2 for lst2 in unique_iso)==False:
600 y[2]=">"+y[2]
601 unique_iso.append(y)
602 l.acquire()
603 for x in unique_iso:
604 mat_mirnas.append(x[2])
605 mat_mirnas.append(x[9])
606 l.release()
607
608 #########################################################################################################################
609
610 def merging_names(LH2E_copy,new):
611
612 dupes=[]
613 final_LH2E =[]
614
615 for num in range(len(LH2E_copy)):
616
617 if LH2E_copy[num][1] not in final_LH2E and LH2E_copy[num][0] not in final_LH2E:
618 final_LH2E.append(LH2E_copy[num][1])
619 final_LH2E.append(LH2E_copy[num][0])
620 else:
621 dupes.append(LH2E_copy[num][1])
622
623 dupes=list(set(dupes))
624
625 for i in range(len(dupes)):
626 dupes[i]=[dupes[i]]
627
628 for x in LH2E_copy:
629 for y in dupes:
630 if x[1]==y[0]:
631 fl=0
632 if len(y)==1:
633 y.append(x[0])
634 else:
635 for i in range(1,len(y)):
636 if y[i].split("_")[0]+"_"+y[i].split("_")[1]==x[0].split("_")[0]+"_"+x[0].split("_")[1]:
637 fl=1
638 if len(x[0])<len(y[i]):
639 del y[i]
640 y.append(x[0])
641 break
642
643 if fl==0:
644 y.append((x[0]))
645
646 for y in dupes:
647 if len(y)>2:
648 for i in range(len(y)-1,1,-1):
649 y[1]=y[1]+"/"+y[i]
650 del y[i]
651
652
653 for x in LH2E_copy:
654 for y in dupes:
655 if x[1]==y[0]:
656 x[0]=y[1]
657
658 LH2E_copy.sort()
659 LH2E_copy=list(LH2E_copy for LH2E_copy,_ in itertools.groupby(LH2E_copy))
660 new.extend(LH2E_copy)
661
662 ######################################################################################################################################################
663
664 def ssamples1(tem_names,tem_samp,non_names,non_samp,folder,pro):
665
666 for i in range(2,len(tem_samp[0])):
667
668 fp = open(folder+tem_names[i-2]+'.txt','w')
669 fp.write("miRNA id"+"\t"+tem_names[i-2]+"\n")
670
671 for x in tem_samp:
672 fp.write("%s" % "\t".join([x[0],x[i]])+"\n")
673
674 for j in range(len(non_names)):
675 if non_names[j]==tem_names[i-2]:
676 for x in non_samp:
677 fp.write("%s" % "\t".join([x[0],x[j+2]])+"\n")
678 fp.close()
679
680 #################################################################################################################################################################################################################
681
682 def download_matures(matures,org_name):
683
684 mature_mir=[]
685
686 mat_url = 'http://mirgenedb.org/fasta/'+org_name+'?mat=1'
687 star_url = 'http://mirgenedb.org/fasta/'+org_name+'?star=1'
688
689 data = urllib.request.urlopen(mat_url).read()
690 file_mirna = data.decode('utf-8')
691 mature_mir = file_mirna.split("\n")
692 mature_mir = [x.replace(">","") for x in mature_mir]
693 del mature_mir[-1]
694
695 data = urllib.request.urlopen(star_url).read()
696 file_mirna = data.decode('utf-8')
697 star_mir = file_mirna.split("\n")
698 star_mir = [x.replace(">","") for x in star_mir]
699 del star_mir[-1]
700
701 mature_mir.extend(star_mir)
702
703 for i in range(1,len(mature_mir),2):
704 mature_mir[i]=mature_mir[i].replace("U","T")
705
706 matures.extend(mature_mir)
707
708 ###################################################################################################################
709
710 def non_template_ref(sc,st,all_isoforms):
711
712 pre_uni_seq_con = list(sc)
713 pre_uni_seq_tre = list(st)
714
715 for x in pre_uni_seq_con:
716 for y in x:
717 if y[2] not in all_isoforms and len(y[2].split("_"))>2:
718 all_isoforms.append(y[2])
719 all_isoforms.append(y[9])
720
721 for x in pre_uni_seq_tre:
722 for y in x:
723 if y[2] not in all_isoforms and len(y[2].split("_"))>2:
724 all_isoforms.append(y[2])
725 all_isoforms.append(y[9])
726
727 ################################################################################################################################################################################################
728
729 def deseqe2(sample,mir_names,l,new_d,sample_name,sample_order):
730
731 for y in mir_names:
732 flag=0
733 for x in sample:
734 if y[0]==x[0]:
735 flag=1
736 break
737 if flag==0:
738 sample.append([y[0],"0",y[1]])
739
740 sample.sort(key=lambda x: x[0])
741 sample=list(sample for sample,_ in itertools.groupby(sample))
742
743 l.acquire()
744 new_d.append(sample)
745 sample_order.append(sample_name)
746 l.release()
747
748 ###############################################################################################################################################################################################
749