comparison mirgene_functions.py @ 3:106c4aea4650 draft

Uploaded
author glogobyte
date Fri, 16 Oct 2020 18:54:12 +0000
parents
children
comparison
equal deleted inserted replaced
2:f0c4c70ceb00 3:106c4aea4650
1 import itertools
2 import time
3 import sys
4 import os
5 import urllib.request
6 import gzip
7 from multiprocessing import Process, Queue, Lock, Pool, Manager, Value
8 import subprocess
9 import argparse
10 from collections import OrderedDict
11 from matplotlib.backends.backend_pdf import PdfPages
12 import pandas as pd
13 from math import pi
14 import numpy as np
15 import matplotlib.pyplot as plt
16 from matplotlib.ticker import PercentFormatter
17 import seaborn as sns
18 import scipy.stats as stats
19 from plotnine import *
20 import math
21 import re
22 import matplotlib.ticker as mtick
23 import copy
24
25
26
27
28 """---------------------- Simple Functions -----------------------"""
29
30 # Read a file and return it as a list
31 def read(path, flag):
32 if flag == 0:
33 with open(path) as fp:
34 file=fp.readlines()
35 fp.close()
36 return file
37
38 if flag == 1:
39 with open(path) as fp:
40 file = fp.read().splitlines()
41 fp.close()
42 return file
43
44 # Write a list to a txt file
45 def write(path, list):
46 with open(path,'w') as fp:
47 for x in list:
48 fp.write(str("\t".join(x[1:-1])))
49 fp.close()
50
51 """---------------------- RNA-seq Functions ----------------------"""
52
53 # Detect the longest common substring sequence between two mirnas
54 def longestSubstring(str1, str2):
55
56 from difflib import SequenceMatcher
57 # initialize SequenceMatcher object with
58 # input string
59 seqMatch = SequenceMatcher(None, str1, str2)
60
61 # find match of longest sub-string
62 # output will be like Match(a=0, b=0, size=5)
63 match = seqMatch.find_longest_match(0, len(str1), 0, len(str2))
64
65 # print longest substring
66 if (match.size != 0):
67 return str1[match.a: match.a + match.size]
68 else:
69 print('No longest common sub-string found')
70
71
72
73 ########################################################################################################################################################
74 def collapse_sam(path):
75
76 ini_sam=read(path,0)
77 main_sam = [x.rstrip("\n").split("\t") for x in ini_sam if "@" not in x.split("\t")[0]]
78 intro_sam = [x.rstrip("\n").split("\t") for x in ini_sam if "@" in x.split("\t")[0]]
79
80 uni_seq = []
81 for x in main_sam:
82
83 if [x[2], x[9]] not in uni_seq:
84 uni_seq.append([x[2], x[9]])
85
86 new_main_sam=[]
87 incr_num=0
88 for i in range(len(uni_seq)):
89 count=0
90 incr_num+=1
91 for y in main_sam:
92 if uni_seq[i][1]==y[9] and uni_seq[i][0]==y[2]:
93 count+=1
94 temp=y
95 temp[10]="~~~~~~~~~~~~~~~~~~~~~~~~~~~~~"
96 temp[0]=str(incr_num)+"-"+str(count)
97 new_main_sam.append(temp)
98
99 new_sam=intro_sam+new_main_sam
100
101 return new_sam
102
103 #################################################################################################################################################################
104
105 def sam(mature_mirnas,path,name,con,l,samples,data,names,unmap_seq,samples_mirna_names,deseq,LHE_names,umi,ini_sample,unmap_counts):
106
107 # read the sam file
108
109 ini_sam=read(path,0)
110 new_main_sam = [x.rstrip("\n").split("\t") for x in ini_sam if "@" not in x.split("\t")[0]]
111 unique_seq = [x for x in new_main_sam if x[1] == '0' and len(x[9])>=18 and len(x[9])<=26]
112
113 # Calculate the shifted positions for every isomir and add them to the name of it
114 sorted_uni_arms = []
115
116 for i in range(len(mature_mirnas)):
117 tmp_count_reads = 0 # calculate the total number of reads
118 tmp_count_seq = 0 # calculate the total number of sequences
119 for j in range(len(unique_seq)):
120
121 if mature_mirnas[i] == unique_seq[j][2]:
122
123 temp_mature = mature_mirnas[i+1]
124 off_part = longestSubstring(temp_mature, unique_seq[j][9])
125
126 mat_diff = temp_mature.split(off_part)
127 mat_diff = [len(mat_diff[0]), len(mat_diff[1])]
128
129 unique_diff = unique_seq[j][9].split(off_part)
130 unique_diff = [len(unique_diff[0]), len(unique_diff[1])]
131
132 # Problem with hsa-miR-8485
133 if mat_diff[1]!=0 and unique_diff[1]!=0:
134 unique_seq[j]=1
135 pre_pos = 0
136 post_pos = 0
137
138 elif mat_diff[0]!=0 and unique_diff[0]!=0:
139 unique_seq[j]=1
140 pre_pos = 0
141 post_pos = 0
142
143 else:
144 pre_pos = mat_diff[0]-unique_diff[0]
145 post_pos = unique_diff[1]-mat_diff[1]
146 tmp_count_reads = tmp_count_reads + int(unique_seq[j][0].split("-")[1])
147 tmp_count_seq = tmp_count_seq+1
148
149
150 if pre_pos != 0 or post_pos != 0:
151 if pre_pos == 0:
152 unique_seq[j][2] = unique_seq[j][2] + "_" +str(pre_pos) + "_" + '{:+d}'.format(post_pos)
153 elif post_pos == 0:
154 unique_seq[j][2] = unique_seq[j][2] + "_" + '{:+d}'.format(pre_pos) + "_" + str(post_pos)
155 else:
156 unique_seq[j][2] = unique_seq[j][2]+"_"+'{:+d}'.format(pre_pos)+"_"+'{:+d}'.format(post_pos)
157
158 for x in range(unique_seq.count(1)):
159 unique_seq.remove(1)
160 if tmp_count_reads != 0 and tmp_count_seq != 0:
161 sorted_uni_arms.append([mature_mirnas[i], tmp_count_seq, tmp_count_reads])
162
163 # Store name of arms, number of sequences and number of reads
164 sorted_uni_arms = sorted(sorted_uni_arms, key=lambda x: x[1], reverse=True)
165
166 for y in sorted_uni_arms:
167 counts=0
168 seqs=0
169 for x in unique_seq:
170 if y[0]==x[2].split("_")[0]+"_"+x[2].split("_")[1]:
171 counts+=int(x[0].split("-")[1])
172 seqs+=1
173
174 y[1]=seqs
175 y[2]=counts
176
177 LHE=[]
178
179 l.acquire()
180 if con=="c":
181 LHE.extend(z[2] for z in unique_seq)
182 for y in unique_seq:
183 samples_mirna_names.append([y[2],y[9]])
184 deseq.append([[x[2], x[0].split('-')[1], x[9]] for x in unique_seq])
185 LHE_names.extend(LHE)
186 unmap_seq.value += sum([1 for x in new_main_sam if x[1] == '4'])
187 unmap_counts.value += sum([int(x[0].split("-")[1]) for x in new_main_sam if x[1] == '4'])
188 names.append(name)
189 samples.append(unique_seq)
190 data.append([con,name,unique_seq,sorted_uni_arms])
191 ini_sample.append(new_main_sam)
192
193 if con=="t":
194 LHE.extend(z[2] for z in unique_seq)
195 for y in unique_seq:
196 samples_mirna_names.append([y[2],y[9]])
197 deseq.append([[x[2], x[0].split('-')[1], x[9]] for x in unique_seq])
198 LHE_names.extend(LHE)
199 unmap_seq.value += sum([1 for x in new_main_sam if x[1] == '4'])
200 unmap_counts.value += sum([int(x[0].split("-")[1]) for x in new_main_sam if x[1] == '4'])
201 names.append(name)
202 samples.append(unique_seq)
203 data.append([con,name,unique_seq,sorted_uni_arms])
204 ini_sample.append(new_main_sam)
205 l.release()
206
207
208 ######################################################################################################################################
209 """
210 Read a sam file from Bowtie and do the followings:
211
212 1) Remove reverse stranded mapped reads
213 2) Remove unmapped reads
214 3) Remove all sequences with reads less than 11 reads
215 4) Sort the arms with the most sequences in decreading rate
216 5) Sort the sequences of every arm with the most reads in decreasing rate
217 6) Calculate total number of sequences of every arm
218 7) Calculate total number of reads of sequences of every arm.
219 8) Store all the informations in a txt file
220
221 """
222
223 def non_sam(mature_mirnas,path,name,con,l,data,names,n_deseq,n_samples_mirna_names,n_LHE_names):
224
225
226 ini_sam=read(path,0)
227 new_main_sam = [x.rstrip("\n").split("\t") for x in ini_sam if "@" not in x.split("\t")[0]]
228 unique_seq=[]
229 unique_seq = [x for x in new_main_sam if x[1] == '4' and len(x[9])>=18 and len(x[9])<=26]
230 uni_seq=[]
231
232 # Calculate the shifted positions for every isomir and add them to the name of it
233 sorted_uni_arms = []
234 for i in range(1,len(mature_mirnas),2):
235 tmp_count_reads = 0 # calculate the total number of reads
236 tmp_count_seq = 0 # calculate the total number of sequences
237
238 for j in range(len(unique_seq)):
239
240 temp_mature = mature_mirnas[i].strip().replace("U", "T")
241
242 if temp_mature in unique_seq[j][9]:
243
244 off_part = longestSubstring(temp_mature, unique_seq[j][9])
245
246 mat_diff = temp_mature.split(off_part)
247 mat_diff = [len(mat_diff[0]), len(mat_diff[1])]
248
249 unique_diff = unique_seq[j][9].split(off_part)
250 if len(unique_diff)<=2:
251 unique_diff = [len(unique_diff[0]), len(unique_diff[1])]
252
253 pre_pos = mat_diff[0]-unique_diff[0]
254 post_pos = unique_diff[1]-mat_diff[1]
255
256 lengthofmir = len(off_part) + post_pos
257 if pre_pos == 0:
258 tmp_count_reads = tmp_count_reads + int(unique_seq[j][0].split("-")[1])
259 tmp_count_seq = tmp_count_seq + 1
260
261 if pre_pos == 0:
262 t_name=copy.deepcopy(unique_seq[j])
263 t_name[2]=mature_mirnas[i - 1] + "__" + str(pre_pos) + "_" + '{:+d}'.format(post_pos) + "_" + str(unique_seq[j][9][len(off_part):])
264 uni_seq.append(t_name)
265
266 if tmp_count_reads != 0 and tmp_count_seq != 0:
267 sorted_uni_arms.append([mature_mirnas[i-1], tmp_count_seq, tmp_count_reads])
268
269
270 sorted_uni_arms = sorted(sorted_uni_arms, key=lambda x: x[1], reverse=True)
271 unique_seq = list(map(list, OrderedDict.fromkeys(map(tuple,uni_seq))))
272
273 LHE=[]
274
275 l.acquire()
276 if con=="c":
277 LHE.extend(x[2] for x in unique_seq if x[2]!="*")
278 for x in unique_seq:
279 if x[2]!="*":
280 n_samples_mirna_names.append([x[2],x[9]])
281 n_deseq.append([[x[2], x[0].split('-')[1], x[9]] for x in unique_seq if x[2]!="*"])
282 n_LHE_names.extend(LHE)
283 names.append(name)
284 data.append([con,name,unique_seq,sorted_uni_arms])
285
286
287 if con=="t":
288 LHE.extend(x[2] for x in unique_seq if x[2]!="*")
289 for x in unique_seq:
290 if x[2]!="*":
291 n_samples_mirna_names.append([x[2],x[9]])
292 n_deseq.append([[x[2], x[0].split('-')[1], x[9]] for x in unique_seq if x[2]!="*"])
293 n_LHE_names.extend(LHE)
294 names.append(name)
295 data.append([con,name,unique_seq,sorted_uni_arms])
296 l.release()
297
298 #################################################################################################################################################################################################################
299
300 def deseq2_temp(samples_mirna_names,deseq,con,l):
301
302 samples_mirna_names.sort(key=lambda x:[0])
303
304 for i in range(len(deseq)):
305 for y in samples_mirna_names:
306 flag = 0
307 for x in deseq[i]:
308 if y[0] == x[0]:
309 flag = 1
310 break
311
312 if flag == 0:
313 deseq[i].append([y[0], "0", y[1]])
314
315 [deseq[i].sort(key=lambda x: x[0]) for i, _ in enumerate(deseq)]
316 deseq_final = [[x[0],x[2]] for x in deseq[0]]
317 [deseq_final[z].append(deseq[i][j][1]) for z,_ in enumerate(deseq_final) for i, _ in enumerate(deseq) for j,_ in enumerate(deseq[i]) if deseq_final[z][0] == deseq[i][j][0]]
318
319 l.acquire()
320 if con=="c":
321 q1.put(deseq_final)
322
323 if con=="t":
324 q2.put(deseq_final)
325 l.release()
326
327
328 ##################################################################################################################################################################################################################
329
330 def main_temp(LH2E, LH2E_names, LH8E, LH8E_names,flag,names_con,names_tre,filter_LH8E,filter_LH2E,raw_LH8E,raw_LH2E,per,count):
331
332 LH8E_add_names = [x for x in LH2E_names if x not in LH8E_names]
333 LH2E_add_names = [x for x in LH8E_names if x not in LH2E_names]
334
335 LH8E_add_names.sort()
336 LH2E_add_names.sort()
337 LH8E_add_names = list(LH8E_add_names for LH8E_add_names,_ in itertools.groupby(LH8E_add_names))
338 LH2E_add_names = list(LH2E_add_names for LH2E_add_names,_ in itertools.groupby(LH2E_add_names))
339
340 LH2E.sort()
341 LH8E.sort()
342 LH2E = list(LH2E for LH2E,_ in itertools.groupby(LH2E))
343 LH8E = list(LH8E for LH8E,_ in itertools.groupby(LH8E))
344
345 print("LHE_names")
346 print([len(LH8E_add_names),len(LH2E_add_names)])
347 print([len(LH8E),len(LH2E)])
348
349 zeros=["0"]*(len(LH8E[0])-2)
350 [LH8E_add_names[i].extend(zeros) for i,_ in enumerate(LH8E_add_names)]
351 LH8E=LH8E+LH8E_add_names
352
353 zeros=["0"]*(len(LH2E[0])-2)
354 [LH2E_add_names[i].extend(zeros) for i,_ in enumerate(LH2E_add_names)]
355 LH2E=LH2E+LH2E_add_names
356
357 dupes=[]
358 final_LH2E =[]
359
360 for num,_ in enumerate(LH2E):
361
362 if LH2E[num][1] not in final_LH2E and LH2E[num][0] not in final_LH2E:
363 final_LH2E.append(LH2E[num][1])
364 final_LH2E.append(LH2E[num][0])
365 else:
366 dupes.append(LH2E[num][1])
367
368 dupes=list(set(dupes))
369
370 dupes=[[x] for x in dupes]
371
372 for x in LH2E:
373 for y in dupes:
374 if x[1]==y[0]:
375 fl=0
376 if len(y)==1:
377 y.append(x[0])
378 else:
379 for i in range(1,len(y)):
380 if y[i].split("_")[0]+"_"+y[i].split("_")[1]==x[0].split("_")[0]+"_"+x[0].split("_")[1]:
381 fl=1
382 if len(x[0])<len(y[i]):
383 del y[i]
384 y.append(x[0])
385 break
386
387 if fl==0:
388 y.append((x[0]))
389
390 for y in dupes:
391 if len(y)>2:
392 for i in range(len(y)-1,1,-1):
393 y[1]=y[1]+"/"+y[i]
394 del y[i]
395
396
397 for x in LH2E:
398 for y in dupes:
399 if x[1]==y[0]:
400 x[0]=y[1]
401
402 for x in LH8E:
403 for y in dupes:
404 if x[1]==y[0]:
405 x[0]=y[1]
406
407
408
409 LH2E.sort()
410 LH2E=list(LH2E for LH2E,_ in itertools.groupby(LH2E))
411
412 LH8E.sort()
413 LH8E=list(LH8E for LH8E,_ in itertools.groupby(LH8E))
414
415 if int(per)!=-1:
416 percent=int(per)/100
417 print(percent)
418 print(count)
419
420 c_col_filter=round(percent*(len(LH2E[1])-2))
421 t_col_filter=round(percent*(len(LH8E[1])-2))
422
423 for i, _ in enumerate(LH2E):
424 c_cols=0
425 t_cols=0
426
427 c_cols=sum([1 for j in range(len(LH2E[i])-2) if int(LH2E[i][j+2])>=int(count)])
428 t_cols=sum([1 for j in range(len(LH8E[i])-2) if int(LH8E[i][j+2])>=int(count)])
429
430 if c_cols>=c_col_filter or t_cols>=t_col_filter:
431 filter_LH8E.append(LH8E[i])
432 filter_LH2E.append(LH2E[i])
433
434 raw_LH2E.extend(LH2E)
435 raw_LH8E.extend(LH8E)
436
437 ##################################################################################################################################################################################################################
438
439 def write_main(raw_LH2E, raw_LH8E, fil_LH2E, fil_LH8E, names_con, names_tre, flag, per, n1, n2):
440
441 if flag == 1 and int(per)!=-1:
442 fp = open('Counts/Filtered '+n2 +' Templated Counts', 'w')
443 fp.write("Name\t")
444 fp.write("Sequence")
445 for y in names_tre:
446 fp.write("\t"+y)
447
448 for x in fil_LH8E:
449 fp.write("\n%s" % "\t".join(x))
450 fp.close()
451
452 fp = open('Counts/Filtered '+n1+' Templated Counts', 'w')
453 fp.write("Name\t")
454 fp.write("Sequence")
455 for y in names_con:
456 fp.write("\t"+y)
457
458 for x in fil_LH2E:
459 fp.write("\n%s" % "\t".join(x))
460 fp.close()
461
462
463 if flag == 2 and int(per)!=-1:
464 fp = open('Counts/Filtered '+n2+' Non-Templated Counts', 'w')
465 fp.write("Name\t")
466 fp.write("Sequence")
467 for y in names_tre:
468 fp.write("\t"+y)
469
470
471 for x in fil_LH8E:
472 fp.write("\n%s" % "\t".join(x))
473 fp.close()
474
475 fp = open('Counts/Filtered '+n1+' Non-Templated Counts', 'w')
476 fp.write("Name\t")
477 fp.write("Sequence")
478 for y in names_con:
479 fp.write("\t"+y)
480
481 for x in fil_LH2E:
482 fp.write("\n%s" % "\t".join(x))
483 fp.close()
484
485
486 if flag == 1:
487 fp = open('Counts/Raw '+n2+' Templated Counts', 'w')
488 fp.write("Name\t")
489 fp.write("Sequence")
490 for y in names_tre:
491 fp.write("\t"+y)
492
493 for x in raw_LH8E:
494 fp.write("\n%s" % "\t".join(x))
495 fp.close()
496
497 fp = open('Counts/Raw '+n1+' Templated Counts', 'w')
498 fp.write("Name\t")
499 fp.write("Sequence")
500 for y in names_con:
501 fp.write("\t"+y)
502
503 for x in raw_LH2E:
504 fp.write("\n%s" % "\t".join(x))
505 fp.close()
506
507 if flag == 2:
508 fp = open('Counts/Raw '+n2+' Non-Templated Counts', 'w')
509 fp.write("Name\t")
510 fp.write("Sequence")
511 for y in names_tre:
512 fp.write("\t"+y)
513
514
515 for x in raw_LH8E:
516 fp.write("\n%s" % "\t".join(x))
517 fp.close()
518
519 fp = open('Counts/Raw '+n1+' Non-Templated Counts', 'w')
520 fp.write("Name\t")
521 fp.write("Sequence")
522 for y in names_con:
523 fp.write("\t"+y)
524
525 for x in raw_LH2E:
526 fp.write("\n%s" % "\t".join(x))
527 fp.close()
528
529 ####################################################################################################################################################################################################################
530
531 def ssamples(names,samp,folder,pro):
532
533 for i in range(2,len(samp[0])):
534
535 fp = open(folder+names[i-2]+'.txt','w')
536 fp.write("miRNA id"+"\t"+names[i-2]+"\n")
537
538 for x in samp:
539 fp.write("%s" % "\t".join([x[0],x[i]])+"\n")
540 fp.close()
541
542 ####################################################################################################################################################################################################################
543
544 def DB_write(con,name,unique_seq,sorted_uni_arms,f):
545
546 if f==1:
547 # Write a txt file with all the information
548 if con=="c":
549 fp = open('split1/'+name, 'w')
550
551 fp.write("%s\t%-42s\t%s\n\n" % ("Number of Reads","Name of isomir","Sequence"))
552 if con=="t":
553 fp = open('split2/'+name, 'w')
554 fp.write("%s\t%-42s\t%s\n\n" % ("Number of Reads","Name of isomir","Sequence"))
555
556 for i in range(len(sorted_uni_arms)):
557 temp = []
558 for j in range(len(unique_seq)):
559
560 if sorted_uni_arms[i][0] in (unique_seq[j][2].split("_")[0]+"_"+unique_seq[j][2].split("_")[1]):
561
562 temp.append(unique_seq[j])
563
564 temp = sorted(temp, key=lambda x: int(x[0].split('-')[1]), reverse=True)
565 fp.write("*********************************************************************************************************\n")
566 fp.write("%-8s\t%-22s\t%-25s\t%-30s\t%s\n" % ("|",str(sorted_uni_arms[i][0]),"Sequence count = "+str(sorted_uni_arms[i][1]),"Total reads = "+str(sorted_uni_arms[i][2]),"|"))
567 fp.write("*********************************************************************************************************\n\n")
568 [fp.write("%-8s\t%-40s\t%s\n" % (x[0].split("-")[1], x[2],x[9])) for x in temp]
569 fp.write("\n" + "\n")
570 fp.close()
571
572 if f==2:
573
574 if con=="c":
575 fp = open('split3/'+name, 'w')
576 fp.write("%s\t%-42s\t%s\n\n" % ("Number of Reads","Name of isomir","Sequence"))
577 if con=="t":
578 fp = open('split4/'+name, 'w')
579 fp.write("%s\t%-42s\t%s\n\n" % ("Number of Reads","Name of isomir","Sequence"))
580
581 for i in range(len(sorted_uni_arms)):
582 temp = []
583 for j in range(len(unique_seq)):
584 if sorted_uni_arms[i][0]==unique_seq[j][2].split("__")[0]:
585 temp.append(unique_seq[j])
586 if temp!=[]:
587 temp = sorted(temp, key=lambda x: int(x[0].split('-')[1]), reverse=True)
588 fp.write("*********************************************************************************************************\n")
589 fp.write("%-8s\t%-22s\t%-25s\t%-30s\t%s\n" % ("|",str(sorted_uni_arms[i][0]),"Sequence count = "+str(sorted_uni_arms[i][1]),"Total reads = "+str(sorted_uni_arms[i][2]),"|"))
590 fp.write("*********************************************************************************************************\n\n")
591 [fp.write("%-8s\t%-40s\t%s\n" % (x[0].split("-")[1], x[2],x[9])) for x in temp]
592 fp.write("\n" + "\n")
593 fp.close()
594
595
596 ##########################################################################################################################
597
598 def new_mat_seq(pre_unique_seq,mat_mirnas,l):
599
600 unique_iso = []
601 for x in pre_unique_seq:
602 if len(x[2].split("_"))==3:
603 for y in pre_unique_seq:
604 if x[2] in y[2] and int(x[0].split("-")[1])<int(y[0].split("-")[1]):
605 if any(y[2] in lst2 for lst2 in unique_iso)==False:
606 y[2]=">"+y[2]
607 unique_iso.append(y)
608 l.acquire()
609 for x in unique_iso:
610 mat_mirnas.append(x[2])
611 mat_mirnas.append(x[9])
612 l.release()
613
614 #########################################################################################################################
615
616 def merging_names(LH2E_copy,new):
617
618 dupes=[]
619 final_LH2E =[]
620
621 for num in range(len(LH2E_copy)):
622
623 if LH2E_copy[num][1] not in final_LH2E and LH2E_copy[num][0] not in final_LH2E:
624 final_LH2E.append(LH2E_copy[num][1])
625 final_LH2E.append(LH2E_copy[num][0])
626 else:
627 dupes.append(LH2E_copy[num][1])
628
629 dupes=list(set(dupes))
630
631 for i in range(len(dupes)):
632 dupes[i]=[dupes[i]]
633
634 for x in LH2E_copy:
635 for y in dupes:
636 if x[1]==y[0]:
637 fl=0
638 if len(y)==1:
639 y.append(x[0])
640 else:
641 for i in range(1,len(y)):
642 if y[i].split("_")[0]+"_"+y[i].split("_")[1]==x[0].split("_")[0]+"_"+x[0].split("_")[1]:
643 fl=1
644 if len(x[0])<len(y[i]):
645 del y[i]
646 y.append(x[0])
647 break
648
649 if fl==0:
650 y.append((x[0]))
651
652 for y in dupes:
653 if len(y)>2:
654 for i in range(len(y)-1,1,-1):
655 y[1]=y[1]+"/"+y[i]
656 del y[i]
657
658
659 for x in LH2E_copy:
660 for y in dupes:
661 if x[1]==y[0]:
662 x[0]=y[1]
663
664 LH2E_copy.sort()
665 LH2E_copy=list(LH2E_copy for LH2E_copy,_ in itertools.groupby(LH2E_copy))
666 new.extend(LH2E_copy)
667
668 ######################################################################################################################################################
669
670 def ssamples1(tem_names,tem_samp,non_names,non_samp,folder,pro):
671
672 for i in range(2,len(tem_samp[0])):
673
674 fp = open(folder+tem_names[i-2]+'.txt','w')
675 fp.write("miRNA id"+"\t"+tem_names[i-2]+"\n")
676
677 for x in tem_samp:
678 fp.write("%s" % "\t".join([x[0],x[i]])+"\n")
679
680 for j in range(len(non_names)):
681 if non_names[j]==tem_names[i-2]:
682 for x in non_samp:
683 fp.write("%s" % "\t".join([x[0],x[j+2]])+"\n")
684 fp.close()
685
686 #################################################################################################################################################################################################################
687
688 def download_matures(matures,org_name):
689
690 mature_mir=[]
691
692 mat_url = 'http://mirgenedb.org/fasta/'+org_name+'?mat=1'
693 star_url = 'http://mirgenedb.org/fasta/'+org_name+'?star=1'
694
695 data = urllib.request.urlopen(mat_url).read()
696 file_mirna = data.decode('utf-8')
697 mature_mir = file_mirna.split("\n")
698 mature_mir = [x.replace(">","") for x in mature_mir]
699 del mature_mir[-1]
700
701 data = urllib.request.urlopen(star_url).read()
702 file_mirna = data.decode('utf-8')
703 star_mir = file_mirna.split("\n")
704 star_mir = [x.replace(">","") for x in star_mir]
705 del star_mir[-1]
706
707 mature_mir.extend(star_mir)
708
709 for i in range(1,len(mature_mir),2):
710 mature_mir[i]=mature_mir[i].replace("U","T")
711
712 matures.extend(mature_mir)
713
714 ###################################################################################################################
715
716 def non_template_ref(sc,st,all_isoforms):
717
718 pre_uni_seq_con = list(sc)
719 pre_uni_seq_tre = list(st)
720
721 for x in pre_uni_seq_con:
722 for y in x:
723 if y[2] not in all_isoforms and len(y[2].split("_"))>2:
724 all_isoforms.append(y[2])
725 all_isoforms.append(y[9])
726
727 for x in pre_uni_seq_tre:
728 for y in x:
729 if y[2] not in all_isoforms and len(y[2].split("_"))>2:
730 all_isoforms.append(y[2])
731 all_isoforms.append(y[9])
732
733 ################################################################################################################################################################################################
734
735 def deseqe2(sample,mir_names,l,new_d,sample_name,sample_order):
736
737 for y in mir_names:
738 flag=0
739 for x in sample:
740 if y[0]==x[0]:
741 flag=1
742 break
743 if flag==0:
744 sample.append([y[0],"0",y[1]])
745
746 sample.sort(key=lambda x: x[0])
747 sample=list(sample for sample,_ in itertools.groupby(sample))
748
749 l.acquire()
750 new_d.append(sample)
751 sample_order.append(sample_name)
752 l.release()
753
754 ###############################################################################################################################################################################################
755