0
|
1 import itertools
|
|
2 import time
|
|
3 import sys
|
|
4 import os
|
|
5 import urllib.request
|
|
6 import gzip
|
|
7 from multiprocessing import Process, Queue, Lock, Pool, Manager, Value
|
|
8 import subprocess
|
|
9 import argparse
|
|
10 from collections import OrderedDict
|
|
11 from matplotlib.backends.backend_pdf import PdfPages
|
|
12 import pandas as pd
|
|
13 from math import pi
|
|
14 import numpy as np
|
|
15 import matplotlib.pyplot as plt
|
|
16 from matplotlib.ticker import PercentFormatter
|
|
17 import seaborn as sns
|
|
18 import scipy.stats as stats
|
|
19 from plotnine import *
|
|
20 import math
|
|
21 import re
|
|
22 import matplotlib.ticker as mtick
|
|
23 import copy
|
|
24
|
|
25
|
|
26 """---------------------- Simple Functions -----------------------"""
|
|
27
|
|
28 # Read a file and return it as a list
|
|
29 def read(path, flag):
|
|
30 if flag == 0:
|
|
31 with open(path) as fp:
|
|
32 file=fp.readlines()
|
|
33 fp.close()
|
|
34 return file
|
|
35
|
|
36 if flag == 1:
|
|
37 with open(path) as fp:
|
|
38 file = fp.read().splitlines()
|
|
39 fp.close()
|
|
40 return file
|
|
41
|
|
42 # Write a list to a txt file
|
|
43 def write(path, list):
|
|
44 with open(path,'w') as fp:
|
|
45 for x in list:
|
|
46 fp.write(str("\t".join(x[1:-1])))
|
|
47 fp.close()
|
|
48
|
|
49 """---------------------- RNA-seq Functions ----------------------"""
|
|
50
|
|
51 # Detect the longest common substring sequence between two mirnas
|
|
52 def longestSubstring(str1, str2):
|
|
53
|
|
54 from difflib import SequenceMatcher
|
|
55 # initialize SequenceMatcher object with
|
|
56 # input string
|
|
57 seqMatch = SequenceMatcher(None, str1, str2)
|
|
58
|
|
59 # find match of longest sub-string
|
|
60 # output will be like Match(a=0, b=0, size=5)
|
|
61 match = seqMatch.find_longest_match(0, len(str1), 0, len(str2))
|
|
62
|
|
63 # print longest substring
|
|
64 if (match.size != 0):
|
|
65 return str1[match.a: match.a + match.size]
|
|
66 else:
|
|
67 print('No longest common sub-string found')
|
|
68
|
|
69
|
|
70
|
|
71 ########################################################################################################################################################
|
|
72
|
|
73 def collapse_sam(path):
|
|
74
|
|
75 ini_sam=read(path,0)
|
|
76 main_sam = [x.rstrip("\n").split("\t") for x in ini_sam if "@" not in x.split("\t")[0]]
|
|
77 intro_sam = [x.rstrip("\n").split("\t") for x in ini_sam if "@" in x.split("\t")[0]]
|
|
78
|
|
79 uni_seq = []
|
|
80 for x in main_sam:
|
|
81
|
|
82 if [x[2], x[9]] not in uni_seq:
|
|
83 uni_seq.append([x[2], x[9]])
|
|
84
|
|
85 new_main_sam=[]
|
|
86 incr_num=0
|
|
87 for i in range(len(uni_seq)):
|
|
88 count=0
|
|
89 incr_num+=1
|
|
90 for y in main_sam:
|
|
91 if uni_seq[i][1]==y[9] and uni_seq[i][0]==y[2]:
|
|
92 count+=1
|
|
93 temp=y
|
|
94 temp[10]="~~~~~~~~~~~~~~~~~~~~~~~~~~~~~"
|
|
95 temp[0]=str(incr_num)+"-"+str(count)
|
|
96 new_main_sam.append(temp)
|
|
97
|
|
98 new_sam=intro_sam+new_main_sam
|
|
99
|
|
100 return new_sam
|
|
101
|
|
102 #################################################################################################################################################################################################################
|
|
103
|
|
104 def duplicate_chroms_isoforms(List):
|
|
105
|
|
106 dupes=[]
|
|
107
|
|
108 for num in range(len(List)):
|
|
109
|
|
110 if [List[num][9],List[num][0],List[num][2]] not in dupes :
|
|
111 dupes.append([List[num][9],List[num][0],List[num][2]])
|
|
112
|
|
113 for x in List:
|
|
114 for y in dupes:
|
|
115 if x[9]==y[0] and x[0]==y[1] and x[2].split("_")[0]==y[2].split("_")[0] and x[2]!=y[2]:
|
|
116 y.append(x[2])
|
|
117
|
|
118
|
|
119 double_List = [x[:] for x in List]
|
|
120
|
|
121 chr_order=[]
|
|
122 for x in dupes:
|
|
123 temp = []
|
|
124 for i in range(2,len(x)):
|
|
125 if x[i].split("chr")[1].split("(")[0].isdigit():
|
|
126 temp.append(int(x[i].split("chr")[1].split("(")[1][0]+x[i].split("chr")[1].split("(")[0]))
|
|
127 else:
|
|
128 temp.append(x[i].split("chr")[1][0:4])
|
|
129
|
|
130 for z in temp:
|
|
131 if 'X(-)'==z or 'Y(-)'==z or 'X(+)'==z or 'Y(+)'==z:
|
|
132 temp = [str(j) for j in temp]
|
|
133 temp=list(set(temp))
|
|
134 temp.sort()
|
|
135 chr_order.append(temp)
|
|
136
|
|
137 final_dupes=[]
|
|
138 for i in range(len(dupes)):
|
|
139 final_dupes.append([dupes[i][0],dupes[i][2].split("_")[0],dupes[i][1]])
|
|
140 for x in chr_order[i]:
|
|
141 result = re.match("[-+]?\d+$", str(x))
|
|
142 if len(chr_order[i]) == len(set(chr_order[i])):
|
|
143 if result is not None:
|
|
144
|
|
145 if int(x)<0:
|
|
146 final_dupes[i][1]=final_dupes[i][1]+"_chr"+str(abs(int(x)))+"(-)"
|
|
147 else:
|
|
148 final_dupes[i][1] = final_dupes[i][1] + "_chr" + str(abs(int(x)))+"(+)"
|
|
149 else:
|
|
150 final_dupes[i][1] = final_dupes[i][1] + "_chr" + str(x)
|
|
151 else:
|
|
152 if result is not None:
|
|
153 if int(x) < 0:
|
|
154 final_dupes[i][1] = final_dupes[i][1] +dupes[i][2].split("_")[1]+ "_chr" + str(abs(int(x))) + "(-)"
|
|
155 else:
|
|
156 final_dupes[i][1] = final_dupes[i][1] +dupes[i][2].split("_")[1]+ "_chr" + str(abs(int(x))) + "(+)"
|
|
157 else:
|
|
158 final_dupes[i][1] = final_dupes[i][1] +dupes[i][2].split("_")[1]+ "_chr" + str(x)
|
|
159
|
|
160 final_dupes.sort()
|
|
161 final_dupes=list(final_dupes for final_dupes,_ in itertools.groupby(final_dupes))
|
|
162
|
|
163 for i in range(len(double_List)):
|
|
164 for x in final_dupes:
|
|
165
|
|
166 if double_List[i][9] == x[0] and double_List[i][0] == x[2] and len(double_List[i][2].split("_")) >3 and double_List[i][2].split("_")[0]==x[1].split("_")[0]:
|
|
167 gg=str("_"+double_List[i][2].split("_")[-2]+"_"+double_List[i][2].split("_")[-1])
|
|
168 double_List[i][2] = x[1]+gg
|
|
169
|
|
170 if double_List[i][9]==x[0] and double_List[i][0]== x[2] and len(double_List[i][2].split("_"))==3 and double_List[i][2].split("_")[0]==x[1].split("_")[0]:
|
|
171 double_List[i][2]=x[1]
|
|
172 List[i][2] = x[1]
|
|
173
|
|
174 List.sort()
|
|
175 new_list=list(List for List,_ in itertools.groupby(List))
|
|
176
|
|
177 double_List.sort()
|
|
178 new_double_List = list(double_List for double_List, _ in itertools.groupby(double_List))
|
|
179
|
|
180 return new_list, new_double_List
|
|
181
|
|
182
|
|
183 #############################################################################################################################################################################################################
|
|
184
|
|
185 def sam(mature_mirnas,path,name,con,l,samples,data,names,unmap_seq,samples_mirna_names,deseq,LHE_names,umi,ini_sample,unmap_counts):
|
|
186
|
|
187 # read the sam file
|
|
188 ini_sam=read(path,0)
|
|
189 new_main_sam = [x.rstrip("\n").split("\t") for x in ini_sam if "@" not in x.split("\t")[0]]
|
|
190 unique_seq = [x for x in new_main_sam if x[1] == '0' and len(x[9])>=18 and len(x[9])<=26]
|
|
191
|
|
192 sorted_uni_arms = []
|
|
193
|
|
194 for i in range(len(mature_mirnas)):
|
|
195 tmp_count_reads = 0 # calculate the total number of reads
|
|
196 tmp_count_seq = 0 # calculate the total number of sequences
|
|
197 for j in range(len(unique_seq)):
|
|
198
|
|
199 if "{" in unique_seq[j][2].split("_")[0]:
|
|
200 official=unique_seq[j][2].split("_")[0][:-4]
|
|
201 else:
|
|
202 official=unique_seq[j][2].split("_")[0]
|
|
203
|
|
204 if mature_mirnas[i].split(" ")[0][1:] == official:
|
|
205
|
|
206 temp_mature = mature_mirnas[i+1].strip().replace("U", "T")
|
|
207 off_part = longestSubstring(temp_mature, unique_seq[j][9])
|
|
208
|
|
209 mat_diff = temp_mature.split(off_part)
|
|
210 mat_diff = [len(mat_diff[0]), len(mat_diff[1])]
|
|
211
|
|
212 unique_diff = unique_seq[j][9].split(off_part)
|
|
213 unique_diff = [len(unique_diff[0]), len(unique_diff[1])]
|
|
214
|
|
215 # Problem with hsa-miR-8485
|
|
216 if mat_diff[1]!=0 and unique_diff[1]!=0:
|
|
217 unique_seq[j]=1
|
|
218 pre_pos = 0
|
|
219 post_pos = 0
|
|
220
|
|
221 elif mat_diff[0]!=0 and unique_diff[0]!=0:
|
|
222 unique_seq[j]=1
|
|
223 pre_pos = 0
|
|
224 post_pos = 0
|
|
225
|
|
226 else:
|
|
227 pre_pos = mat_diff[0]-unique_diff[0]
|
|
228 post_pos = unique_diff[1]-mat_diff[1]
|
|
229 tmp_count_reads = tmp_count_reads + int(unique_seq[j][0].split("-")[1])
|
|
230 tmp_count_seq = tmp_count_seq+1
|
|
231
|
|
232 if pre_pos != 0 or post_pos != 0:
|
|
233 if pre_pos == 0:
|
|
234 unique_seq[j][2] = unique_seq[j][2] + "_" +str(pre_pos) + "_" + '{:+d}'.format(post_pos)
|
|
235 elif post_pos == 0:
|
|
236 unique_seq[j][2] = unique_seq[j][2] + "_" + '{:+d}'.format(pre_pos) + "_" + str(post_pos)
|
|
237 else:
|
|
238 unique_seq[j][2] = unique_seq[j][2]+"_"+'{:+d}'.format(pre_pos)+"_"+'{:+d}'.format(post_pos)
|
|
239
|
|
240 for x in range(unique_seq.count(1)):
|
|
241 unique_seq.remove(1)
|
|
242 if tmp_count_reads != 0 and tmp_count_seq != 0:
|
|
243 sorted_uni_arms.append([mature_mirnas[i].split(" ")[0][1:], tmp_count_seq, tmp_count_reads])
|
|
244 sorted_uni_arms = sorted(sorted_uni_arms, key=lambda x: x[1], reverse=True)
|
|
245 dedup_unique_seq,double_fil_uni_seq=duplicate_chroms_isoforms(unique_seq)
|
|
246
|
|
247 for y in sorted_uni_arms:
|
|
248 counts=0
|
|
249 seqs=0
|
|
250 for x in double_fil_uni_seq:
|
|
251 if y[0]==x[2].split("_")[0]:
|
|
252 counts+=int(x[0].split("-")[1])
|
|
253 seqs+=1
|
|
254
|
|
255 y[1]=seqs
|
|
256 y[2]=counts
|
|
257
|
|
258 LHE=[]
|
|
259 l.acquire()
|
|
260 if con=="c":
|
|
261 LHE.extend(z[2] for z in double_fil_uni_seq)
|
|
262 for y in double_fil_uni_seq:
|
|
263 samples_mirna_names.append([y[2],y[9]])
|
|
264 deseq.append([[x[2], x[0].split('-')[1], x[9]] for x in double_fil_uni_seq])
|
|
265 LHE_names.extend(LHE)
|
|
266 unmap_seq.value += sum([1 for x in new_main_sam if x[1] == '4'])
|
|
267 unmap_counts.value += sum([int(x[0].split("-")[1]) for x in new_main_sam if x[1] == '4'])
|
|
268 names.append(name)
|
|
269 samples.append(dedup_unique_seq)
|
|
270 data.append([con,name,double_fil_uni_seq,sorted_uni_arms])
|
|
271 ini_sample.append(new_main_sam)
|
|
272
|
|
273 if con=="t":
|
|
274 LHE.extend(z[2] for z in double_fil_uni_seq)
|
|
275 for y in double_fil_uni_seq:
|
|
276 samples_mirna_names.append([y[2],y[9]])
|
|
277 deseq.append([[x[2], x[0].split('-')[1], x[9]] for x in double_fil_uni_seq])
|
|
278 LHE_names.extend(LHE)
|
|
279 unmap_seq.value += sum([1 for x in new_main_sam if x[1] == '4'])
|
|
280 unmap_counts.value += sum([int(x[0].split("-")[1]) for x in new_main_sam if x[1] == '4'])
|
|
281 names.append(name)
|
|
282 samples.append(dedup_unique_seq)
|
|
283 data.append([con,name,double_fil_uni_seq,sorted_uni_arms])
|
|
284 ini_sample.append(new_main_sam)
|
|
285 l.release()
|
|
286
|
|
287
|
|
288 ######################################################################################################################################
|
|
289
|
|
290 """
|
|
291
|
|
292 Read a sam file from Bowtie and do the followings:
|
|
293
|
|
294 1) Remove reverse stranded mapped reads
|
|
295 2) Remove unmapped reads
|
|
296 3) Remove all sequences with reads less than 11 reads
|
|
297 4) Sort the arms with the most sequences in decreading rate
|
|
298 5) Sort the sequences of every arm with the most reads in decreasing rate
|
|
299 6) Calculate total number of sequences of every arm
|
|
300 7) Calculate total number of reads of sequences of every arm.
|
|
301 8) Store all the informations in a txt file
|
|
302
|
|
303 """
|
|
304
|
|
305 def non_sam(mature_mirnas,path,name,con,l,data,names,n_deseq,n_samples_mirna_names,n_LHE_names):
|
|
306
|
|
307 ini_sam=read(path,0)
|
|
308 new_main_sam = [x.rstrip("\n").split("\t") for x in ini_sam if "@" not in x.split("\t")[0]]
|
|
309 unique_seq=[]
|
|
310 unique_seq = [x for x in new_main_sam if x[1] == '4' and len(x[9])>=18 and len(x[9])<=26]
|
|
311
|
|
312 uni_seq=[]
|
|
313 # Calculate the shifted positions for every isomir and add them to the name of it
|
|
314 sorted_uni_arms = []
|
|
315 for i in range(1,len(mature_mirnas),2):
|
|
316 tmp_count_reads = 0 # calculate the total number of reads
|
|
317 tmp_count_seq = 0 # calculate the total number of sequences
|
|
318
|
|
319 for j in range(len(unique_seq)):
|
|
320
|
|
321 temp_mature = mature_mirnas[i].strip().replace("U", "T")
|
|
322
|
|
323 if temp_mature in unique_seq[j][9]:
|
|
324
|
|
325 off_part = longestSubstring(temp_mature, unique_seq[j][9])
|
|
326
|
|
327 mat_diff = temp_mature.split(off_part)
|
|
328 mat_diff = [len(mat_diff[0]), len(mat_diff[1])]
|
|
329
|
|
330 unique_diff = unique_seq[j][9].split(off_part)
|
|
331 if len(unique_diff)<=2:
|
|
332 unique_diff = [len(unique_diff[0]), len(unique_diff[1])]
|
|
333
|
|
334 pre_pos = mat_diff[0]-unique_diff[0]
|
|
335 post_pos = unique_diff[1]-mat_diff[1]
|
|
336
|
|
337 lengthofmir = len(off_part) + post_pos
|
|
338 if pre_pos == 0:
|
|
339 tmp_count_reads = tmp_count_reads + int(unique_seq[j][0].split("-")[1])
|
|
340 tmp_count_seq = tmp_count_seq + 1
|
|
341
|
|
342 if pre_pos == 0:
|
|
343
|
|
344 t_name=unique_seq[j].copy()
|
|
345 t_name[2]=mature_mirnas[i - 1].split(" ")[0][1:] + "__" + str(pre_pos) + "_" + '{:+d}'.format(post_pos) + "_" + str(unique_seq[j][9][len(off_part):])
|
|
346 uni_seq.append(t_name)
|
|
347
|
|
348
|
|
349 if tmp_count_reads != 0 and tmp_count_seq != 0:
|
|
350 sorted_uni_arms.append([mature_mirnas[i-1].split(" ")[0][1:], tmp_count_seq, tmp_count_reads])
|
|
351
|
|
352
|
|
353 sorted_uni_arms = sorted(sorted_uni_arms, key=lambda x: x[1], reverse=True)
|
|
354 unique_seq = list(map(list, OrderedDict.fromkeys(map(tuple,uni_seq))))
|
|
355
|
|
356 LHE=[]
|
|
357
|
|
358 l.acquire()
|
|
359 if con=="c":
|
|
360 LHE.extend(x[2] for x in unique_seq if x[2]!="*")
|
|
361 for x in unique_seq:
|
|
362 if x[2]!="*":
|
|
363 n_samples_mirna_names.append([x[2],x[9]])
|
|
364 n_deseq.append([[x[2], x[0].split('-')[1], x[9]] for x in unique_seq if x[2]!="*"])
|
|
365 n_LHE_names.extend(LHE)
|
|
366 names.append(name)
|
|
367 data.append([con,name,unique_seq,sorted_uni_arms])
|
|
368
|
|
369
|
|
370 if con=="t":
|
|
371 LHE.extend(x[2] for x in unique_seq if x[2]!="*")
|
|
372 for x in unique_seq:
|
|
373 if x[2]!="*":
|
|
374 n_samples_mirna_names.append([x[2],x[9]])
|
|
375 n_deseq.append([[x[2], x[0].split('-')[1], x[9]] for x in unique_seq if x[2]!="*"])
|
|
376 n_LHE_names.extend(LHE)
|
|
377 names.append(name)
|
|
378 data.append([con,name,unique_seq,sorted_uni_arms])
|
|
379 l.release()
|
|
380
|
|
381 #####################################################################################################################################################################################################################
|
|
382 def deseq2_temp(samples_mirna_names,deseq,con,l):
|
|
383
|
|
384 samples_mirna_names.sort(key=lambda x:[0])
|
|
385 for i in range(len(deseq)):
|
|
386 for y in samples_mirna_names:
|
|
387 flag = 0
|
|
388 for x in deseq[i]:
|
|
389 if y[0] == x[0]:
|
|
390 flag = 1
|
|
391 break
|
|
392
|
|
393 if flag == 0:
|
|
394 deseq[i].append([y[0], "0", y[1]])
|
|
395
|
|
396 [deseq[i].sort(key=lambda x: x[0]) for i, _ in enumerate(deseq)]
|
|
397 deseq_final = [[x[0],x[2]] for x in deseq[0]]
|
|
398 [deseq_final[z].append(deseq[i][j][1]) for z,_ in enumerate(deseq_final) for i, _ in enumerate(deseq) for j,_ in enumerate(deseq[i]) if deseq_final[z][0] == deseq[i][j][0]]
|
|
399
|
|
400 l.acquire()
|
|
401 if con=="c":
|
|
402 q1.put(deseq_final)
|
|
403
|
|
404 if con=="t":
|
|
405 q2.put(deseq_final)
|
|
406 l.release()
|
|
407
|
|
408
|
|
409 ####################################################################################################################################################################################################################
|
|
410
|
|
411 def main_temp(LH2E, LH2E_names, LH8E, LH8E_names,flag,names_con,names_tre,filter_LH8E,filter_LH2E,raw_LH8E,raw_LH2E,per,count):
|
|
412
|
|
413 LH8E_add_names = [x for x in LH2E_names if x not in LH8E_names]
|
|
414 LH2E_add_names = [x for x in LH8E_names if x not in LH2E_names]
|
|
415
|
|
416 LH8E_add_names.sort()
|
|
417 LH2E_add_names.sort()
|
|
418 LH8E_add_names = list(LH8E_add_names for LH8E_add_names,_ in itertools.groupby(LH8E_add_names))
|
|
419 LH2E_add_names = list(LH2E_add_names for LH2E_add_names,_ in itertools.groupby(LH2E_add_names))
|
|
420
|
|
421 LH2E.sort()
|
|
422 LH8E.sort()
|
|
423 LH2E = list(LH2E for LH2E,_ in itertools.groupby(LH2E))
|
|
424 LH8E = list(LH8E for LH8E,_ in itertools.groupby(LH8E))
|
|
425
|
|
426 print("LHE_names")
|
|
427 print([len(LH8E_add_names),len(LH2E_add_names)])
|
|
428 print([len(LH8E),len(LH2E)])
|
|
429
|
|
430 zeros=["0"]*(len(LH8E[0])-2)
|
|
431 [LH8E_add_names[i].extend(zeros) for i,_ in enumerate(LH8E_add_names)]
|
|
432 LH8E=LH8E+LH8E_add_names
|
|
433
|
|
434 zeros=["0"]*(len(LH2E[0])-2)
|
|
435 [LH2E_add_names[i].extend(zeros) for i,_ in enumerate(LH2E_add_names)]
|
|
436 LH2E=LH2E+LH2E_add_names
|
|
437
|
|
438 dupes=[]
|
|
439 final_LH2E =[]
|
|
440
|
|
441 for num,_ in enumerate(LH2E):
|
|
442
|
|
443 if LH2E[num][1] not in final_LH2E and LH2E[num][0] not in final_LH2E:
|
|
444 final_LH2E.append(LH2E[num][1])
|
|
445 final_LH2E.append(LH2E[num][0])
|
|
446 else:
|
|
447 dupes.append(LH2E[num][1])
|
|
448
|
|
449
|
|
450 dupes=list(set(dupes))
|
|
451
|
|
452 dupes=[[x] for x in dupes]
|
|
453
|
|
454 for x in LH2E:
|
|
455 for y in dupes:
|
|
456 if x[1]==y[0]:
|
|
457 fl=0
|
|
458 if len(y)==1:
|
|
459 y.append(x[0])
|
|
460 else:
|
|
461 for i in range(1,len(y)):
|
|
462 if y[i].split("_")[0]==x[0].split("_")[0]:
|
|
463 fl=1
|
|
464 if len(x[0])<len(y[i]):
|
|
465 del y[i]
|
|
466 y.append(x[0])
|
|
467 break
|
|
468
|
|
469 if fl==0:
|
|
470 y.append((x[0]))
|
|
471
|
|
472 for y in dupes:
|
|
473 if len(y)>2:
|
|
474 for i in range(len(y)-1,1,-1):
|
|
475 y[1]=y[1]+"/"+y[i]
|
|
476 del y[i]
|
|
477
|
|
478 for x in LH2E:
|
|
479 for y in dupes:
|
|
480 if x[1]==y[0]:
|
|
481 x[0]=y[1]
|
|
482
|
|
483 for x in LH8E:
|
|
484 for y in dupes:
|
|
485 if x[1]==y[0]:
|
|
486 x[0]=y[1]
|
|
487
|
|
488
|
|
489 LH2E.sort()
|
|
490 LH2E=list(LH2E for LH2E,_ in itertools.groupby(LH2E))
|
|
491
|
|
492 LH8E.sort()
|
|
493 LH8E=list(LH8E for LH8E,_ in itertools.groupby(LH8E))
|
|
494
|
|
495 if int(per)!=-1:
|
|
496 percent=int(per)/100
|
|
497 print(percent)
|
|
498 print(count)
|
|
499
|
|
500 c_col_filter=round(percent*(len(LH2E[1])-2))
|
|
501 t_col_filter=round(percent*(len(LH8E[1])-2))
|
|
502
|
|
503 for i, _ in enumerate(LH2E):
|
|
504 c_cols=0
|
|
505 t_cols=0
|
|
506
|
|
507 c_cols=sum([1 for j in range(len(LH2E[i])-2) if int(LH2E[i][j+2])>=int(count)])
|
|
508 t_cols=sum([1 for j in range(len(LH8E[i])-2) if int(LH8E[i][j+2])>=int(count)])
|
|
509
|
|
510 if c_cols>=c_col_filter or t_cols>=t_col_filter:
|
|
511 filter_LH8E.append(LH8E[i])
|
|
512 filter_LH2E.append(LH2E[i])
|
|
513
|
|
514 raw_LH2E.extend(LH2E)
|
|
515 raw_LH8E.extend(LH8E)
|
|
516
|
|
517 ##################################################################################################################################################################################################################
|
|
518
|
|
519 def write_main(raw_LH2E, raw_LH8E, fil_LH2E, fil_LH8E, names_con, names_tre, flag, per, n1, n2):
|
|
520
|
|
521 if flag == 1 and int(per)!=-1:
|
|
522 fp = open('Counts/Filtered '+n2 +' Templated Counts', 'w')
|
|
523 fp.write("Name\t")
|
|
524 fp.write("Sequence")
|
|
525 for y in names_tre:
|
|
526 fp.write("\t"+y)
|
|
527
|
|
528 for x in fil_LH8E:
|
|
529 fp.write("\n%s" % "\t".join(x))
|
|
530 fp.close()
|
|
531
|
|
532 fp = open('Counts/Filtered '+n1+' Templated Counts', 'w')
|
|
533 fp.write("Name\t")
|
|
534 fp.write("Sequence")
|
|
535 for y in names_con:
|
|
536 fp.write("\t"+y)
|
|
537
|
|
538 for x in fil_LH2E:
|
|
539 fp.write("\n%s" % "\t".join(x))
|
|
540 fp.close()
|
|
541
|
|
542
|
|
543 if flag == 2 and int(per)!=-1:
|
|
544 fp = open('Counts/Filtered '+n2+' Non-Templated Counts', 'w')
|
|
545 fp.write("Name\t")
|
|
546 fp.write("Sequence")
|
|
547 for y in names_tre:
|
|
548 fp.write("\t"+y)
|
|
549
|
|
550
|
|
551 for x in fil_LH8E:
|
|
552 fp.write("\n%s" % "\t".join(x))
|
|
553 fp.close()
|
|
554
|
|
555 fp = open('Counts/Filtered '+n1+' Non-Templated Counts', 'w')
|
|
556 fp.write("Name\t")
|
|
557 fp.write("Sequence")
|
|
558 for y in names_con:
|
|
559 fp.write("\t"+y)
|
|
560
|
|
561 for x in fil_LH2E:
|
|
562 fp.write("\n%s" % "\t".join(x))
|
|
563 fp.close()
|
|
564
|
|
565
|
|
566 if flag == 1:
|
|
567 fp = open('Counts/Raw '+n2+' Templated Counts', 'w')
|
|
568 fp.write("Name\t")
|
|
569 fp.write("Sequence")
|
|
570 for y in names_tre:
|
|
571 fp.write("\t"+y)
|
|
572
|
|
573 for x in raw_LH8E:
|
|
574 fp.write("\n%s" % "\t".join(x))
|
|
575 fp.close()
|
|
576
|
|
577 fp = open('Counts/Raw '+n1+' Templated Counts', 'w')
|
|
578 fp.write("Name\t")
|
|
579 fp.write("Sequence")
|
|
580 for y in names_con:
|
|
581 fp.write("\t"+y)
|
|
582
|
|
583 for x in raw_LH2E:
|
|
584 fp.write("\n%s" % "\t".join(x))
|
|
585 fp.close()
|
|
586
|
|
587
|
|
588 if flag == 2:
|
|
589 fp = open('Counts/Raw '+n2+' Non-Templated Counts', 'w')
|
|
590 fp.write("Name\t")
|
|
591 fp.write("Sequence")
|
|
592 for y in names_tre:
|
|
593 fp.write("\t"+y)
|
|
594
|
|
595
|
|
596 for x in raw_LH8E:
|
|
597 fp.write("\n%s" % "\t".join(x))
|
|
598 fp.close()
|
|
599
|
|
600 fp = open('Counts/Raw '+n1+' Non-Templated Counts', 'w')
|
|
601 fp.write("Name\t")
|
|
602 fp.write("Sequence")
|
|
603 for y in names_con:
|
|
604 fp.write("\t"+y)
|
|
605
|
|
606 for x in raw_LH2E:
|
|
607 fp.write("\n%s" % "\t".join(x))
|
|
608 fp.close()
|
|
609
|
|
610
|
|
611 #########################################################################################################################################
|
|
612
|
|
613 def ssamples(names,samp,folder,pro):
|
|
614
|
|
615 for i in range(2,len(samp[0])):
|
|
616
|
|
617 fp = open(folder+names[i-2]+'.txt','w')
|
|
618 fp.write("miRNA id"+"\t"+names[i-2]+"\n")
|
|
619
|
|
620 for x in samp:
|
|
621 fp.write("%s" % "\t".join([x[0],x[i]])+"\n")
|
|
622 fp.close()
|
|
623
|
|
624 ##################################################################################################################
|
|
625
|
|
626 def DB_write(con,name,unique_seq,sorted_uni_arms,f):
|
|
627
|
|
628 if f==1:
|
|
629 # Write a txt file with all the information
|
|
630 if con=="c":
|
|
631 fp = open('split1/'+name, 'w')
|
|
632
|
|
633 fp.write("%s\t%-42s\t%s\n\n" % ("Number of Reads","Name of isomir","Sequence"))
|
|
634 if con=="t":
|
|
635 fp = open('split2/'+name, 'w')
|
|
636 fp.write("%s\t%-42s\t%s\n\n" % ("Number of Reads","Name of isomir","Sequence"))
|
|
637
|
|
638
|
|
639 for i in range(len(sorted_uni_arms)):
|
|
640 temp = []
|
|
641 for j in range(len(unique_seq)):
|
|
642
|
|
643 if sorted_uni_arms[i][0] in unique_seq[j][2].split("_")[0]:
|
|
644
|
|
645 temp.append(unique_seq[j])
|
|
646
|
|
647 temp = sorted(temp, key=lambda x: int(x[0].split('-')[1]), reverse=True)
|
|
648 fp.write("*********************************************************************************************************\n")
|
|
649 fp.write("%-8s\t%-22s\t%-25s\t%-30s\t%s\n" % ("|",str(sorted_uni_arms[i][0]),"Sequence count = "+str(sorted_uni_arms[i][1]),"Total reads = "+str(sorted_uni_arms[i][2]),"|"))
|
|
650 fp.write("*********************************************************************************************************\n\n")
|
|
651 [fp.write("%-8s\t%-40s\t%s\n" % (x[0].split("-")[1], x[2],x[9])) for x in temp]
|
|
652 fp.write("\n" + "\n")
|
|
653 fp.close()
|
|
654
|
|
655 if f==2:
|
|
656
|
|
657 if con=="c":
|
|
658 fp = open('split3/'+name, 'w')
|
|
659 fp.write("%s\t%-42s\t%s\n\n" % ("Number of Reads","Name of isomir","Sequence"))
|
|
660 if con=="t":
|
|
661 fp = open('split4/'+name, 'w')
|
|
662 fp.write("%s\t%-42s\t%s\n\n" % ("Number of Reads","Name of isomir","Sequence"))
|
|
663
|
|
664
|
|
665 for i in range(len(sorted_uni_arms)):
|
|
666 temp = []
|
|
667 for j in range(len(unique_seq)):
|
|
668 if sorted_uni_arms[i][0]==unique_seq[j][2].split("__")[0]:
|
|
669 temp.append(unique_seq[j])
|
|
670 if temp!=[]:
|
|
671 temp = sorted(temp, key=lambda x: int(x[0].split('-')[1]), reverse=True)
|
|
672 fp.write("*********************************************************************************************************\n")
|
|
673 fp.write("%-8s\t%-22s\t%-25s\t%-30s\t%s\n" % ("|",str(sorted_uni_arms[i][0]),"Sequence count = "+str(sorted_uni_arms[i][1]),"Total reads = "+str(sorted_uni_arms[i][2]),"|"))
|
|
674 fp.write("*********************************************************************************************************\n\n")
|
|
675 [fp.write("%-8s\t%-40s\t%s\n" % (x[0].split("-")[1], x[2],x[9])) for x in temp]
|
|
676 fp.write("\n" + "\n")
|
|
677 fp.close()
|
|
678
|
|
679
|
|
680 ##########################################################################################################################
|
|
681
|
|
682 def new_mat_seq(pre_unique_seq,mat_mirnas,l):
|
|
683
|
|
684 unique_iso = []
|
|
685 for x in pre_unique_seq:
|
|
686 if len(x[2].split("_"))==3:
|
|
687 for y in pre_unique_seq:
|
|
688 if x[2] in y[2] and int(x[0].split("-")[1])<int(y[0].split("-")[1]):
|
|
689 if any(y[2] in lst2 for lst2 in unique_iso)==False:
|
|
690 y[2]=">"+y[2]
|
|
691 unique_iso.append(y)
|
|
692 l.acquire()
|
|
693 for x in unique_iso:
|
|
694 mat_mirnas.append(x[2])
|
|
695 mat_mirnas.append(x[9])
|
|
696 l.release()
|
|
697
|
|
698 #########################################################################################################################
|
|
699
|
|
700 def merging_names(LH2E_copy,new):
|
|
701
|
|
702 dupes=[]
|
|
703 final_LH2E =[]
|
|
704
|
|
705 for num in range(len(LH2E_copy)):
|
|
706
|
|
707 if LH2E_copy[num][1] not in final_LH2E and LH2E_copy[num][0] not in final_LH2E:
|
|
708 final_LH2E.append(LH2E_copy[num][1])
|
|
709 final_LH2E.append(LH2E_copy[num][0])
|
|
710 else:
|
|
711 dupes.append(LH2E_copy[num][1])
|
|
712
|
|
713 dupes=list(set(dupes))
|
|
714
|
|
715 for i in range(len(dupes)):
|
|
716 dupes[i]=[dupes[i]]
|
|
717
|
|
718 for x in LH2E_copy:
|
|
719 for y in dupes:
|
|
720 if x[1]==y[0]:
|
|
721 fl=0
|
|
722 if len(y)==1:
|
|
723 y.append(x[0])
|
|
724 else:
|
|
725 for i in range(1,len(y)):
|
|
726 if y[i].split("_")[0]==x[0].split("_")[0]:
|
|
727 fl=1
|
|
728 if len(x[0])<len(y[i]):
|
|
729 del y[i]
|
|
730 y.append(x[0])
|
|
731 break
|
|
732
|
|
733 if fl==0:
|
|
734 y.append((x[0]))
|
|
735
|
|
736 for y in dupes:
|
|
737 if len(y)>2:
|
|
738 for i in range(len(y)-1,1,-1):
|
|
739 y[1]=y[1]+"/"+y[i]
|
|
740 del y[i]
|
|
741
|
|
742
|
|
743 for x in LH2E_copy:
|
|
744 for y in dupes:
|
|
745 if x[1]==y[0]:
|
|
746 x[0]=y[1]
|
|
747
|
|
748 LH2E_copy.sort()
|
|
749 LH2E_copy=list(LH2E_copy for LH2E_copy,_ in itertools.groupby(LH2E_copy))
|
|
750
|
|
751 new.extend(LH2E_copy)
|
|
752
|
|
753
|
|
754 ######################################################################################################################################################
|
|
755
|
|
756 def ssamples1(tem_names,tem_samp,non_names,non_samp,folder,pro):
|
|
757
|
|
758 for i in range(2,len(tem_samp[0])):
|
|
759
|
|
760 fp = open(folder+tem_names[i-2]+'.txt','w')
|
|
761 fp.write("miRNA id"+"\t"+tem_names[i-2]+"\n")
|
|
762
|
|
763 for x in tem_samp:
|
|
764 fp.write("%s" % "\t".join([x[0],x[i]])+"\n")
|
|
765
|
|
766 for j in range(len(non_names)):
|
|
767 if non_names[j]==tem_names[i-2]:
|
|
768 for x in non_samp:
|
|
769 fp.write("%s" % "\t".join([x[0],x[j+2]])+"\n")
|
|
770 fp.close()
|
|
771
|
|
772 ###################################################################################################################################################################################################################
|
|
773
|
|
774 def download_matures(matures,org_name):
|
|
775
|
|
776 #url = 'ftp://mirbase.org/pub/mirbase/21/mature.fa.gz'
|
|
777 url = 'ftp://mirbase.org/pub/mirbase/CURRENT/mature.fa.gz'
|
|
778 data = urllib.request.urlopen(url).read()
|
|
779 file_mirna = gzip.decompress(data).decode('utf-8')
|
|
780 file_mirna = file_mirna.split("\n")
|
|
781
|
|
782 for i in range(0,len(file_mirna)-1,2):
|
|
783
|
|
784 if org_name in file_mirna[i]:
|
|
785 matures.append(file_mirna[i])
|
|
786 matures.append(file_mirna[i+1])
|
|
787
|
|
788 ###################################################################################################################################################################################################################
|
|
789 def non_template_ref(sc,st,all_isoforms):
|
|
790
|
|
791 pre_uni_seq_con = list(sc)
|
|
792 pre_uni_seq_tre = list(st)
|
|
793
|
|
794 for x in pre_uni_seq_con:
|
|
795 for y in x:
|
|
796 if ">"+y[2] not in all_isoforms and ")_" in y[2] :
|
|
797 all_isoforms.append(">"+y[2])
|
|
798 all_isoforms.append(y[9])
|
|
799
|
|
800
|
|
801 for x in pre_uni_seq_tre:
|
|
802 for y in x:
|
|
803 if ">"+y[2] not in all_isoforms and ")_" in y[2]:
|
|
804 all_isoforms.append(">"+y[2])
|
|
805 all_isoforms.append(y[9])
|
|
806
|
|
807 ################################################################################################################################################################################################
|
|
808
|
|
809 def deseqe2(sample,mir_names,l,new_d,sample_name,sample_order):
|
|
810
|
|
811 for y in mir_names:
|
|
812 flag=0
|
|
813 for x in sample:
|
|
814 if y[0]==x[0]:
|
|
815 flag=1
|
|
816 break
|
|
817 if flag==0:
|
|
818 sample.append([y[0],"0",y[1]])
|
|
819
|
|
820 sample.sort(key=lambda x: x[0])
|
|
821 sample=list(sample for sample,_ in itertools.groupby(sample))
|
|
822
|
|
823 l.acquire()
|
|
824 new_d.append(sample)
|
|
825 sample_order.append(sample_name)
|
|
826 l.release()
|
|
827
|
|
828 ###############################################################################################################################################################################################
|
|
829
|