Mercurial > repos > glogobyte > viztool
comparison mirbase_functions.py @ 0:258aaaa465f3 draft
Uploaded
author | glogobyte |
---|---|
date | Fri, 16 Oct 2020 10:48:17 +0000 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:258aaaa465f3 |
---|---|
1 import itertools | |
2 import time | |
3 import sys | |
4 import os | |
5 import urllib.request | |
6 import gzip | |
7 from multiprocessing import Process, Queue, Lock, Pool, Manager, Value | |
8 import subprocess | |
9 import argparse | |
10 from collections import OrderedDict | |
11 from matplotlib.backends.backend_pdf import PdfPages | |
12 import pandas as pd | |
13 from math import pi | |
14 import numpy as np | |
15 import matplotlib.pyplot as plt | |
16 from matplotlib.ticker import PercentFormatter | |
17 import seaborn as sns | |
18 import scipy.stats as stats | |
19 from plotnine import * | |
20 import math | |
21 import re | |
22 import matplotlib.ticker as mtick | |
23 import copy | |
24 | |
25 | |
26 """---------------------- Simple Functions -----------------------""" | |
27 | |
28 # Read a file and return it as a list | |
29 def read(path, flag): | |
30 if flag == 0: | |
31 with open(path) as fp: | |
32 file=fp.readlines() | |
33 fp.close() | |
34 return file | |
35 | |
36 if flag == 1: | |
37 with open(path) as fp: | |
38 file = fp.read().splitlines() | |
39 fp.close() | |
40 return file | |
41 | |
42 # Write a list to a txt file | |
43 def write(path, list): | |
44 with open(path,'w') as fp: | |
45 for x in list: | |
46 fp.write(str("\t".join(x[1:-1]))) | |
47 fp.close() | |
48 | |
49 """---------------------- RNA-seq Functions ----------------------""" | |
50 | |
51 # Detect the longest common substring sequence between two mirnas | |
52 def longestSubstring(str1, str2): | |
53 | |
54 from difflib import SequenceMatcher | |
55 # initialize SequenceMatcher object with | |
56 # input string | |
57 seqMatch = SequenceMatcher(None, str1, str2) | |
58 | |
59 # find match of longest sub-string | |
60 # output will be like Match(a=0, b=0, size=5) | |
61 match = seqMatch.find_longest_match(0, len(str1), 0, len(str2)) | |
62 | |
63 # print longest substring | |
64 if (match.size != 0): | |
65 return str1[match.a: match.a + match.size] | |
66 else: | |
67 print('No longest common sub-string found') | |
68 | |
69 | |
70 | |
71 ######################################################################################################################################################## | |
72 | |
73 def collapse_sam(path): | |
74 | |
75 ini_sam=read(path,0) | |
76 main_sam = [x.rstrip("\n").split("\t") for x in ini_sam if "@" not in x.split("\t")[0]] | |
77 intro_sam = [x.rstrip("\n").split("\t") for x in ini_sam if "@" in x.split("\t")[0]] | |
78 | |
79 uni_seq = [] | |
80 for x in main_sam: | |
81 | |
82 if [x[2], x[9]] not in uni_seq: | |
83 uni_seq.append([x[2], x[9]]) | |
84 | |
85 new_main_sam=[] | |
86 incr_num=0 | |
87 for i in range(len(uni_seq)): | |
88 count=0 | |
89 incr_num+=1 | |
90 for y in main_sam: | |
91 if uni_seq[i][1]==y[9] and uni_seq[i][0]==y[2]: | |
92 count+=1 | |
93 temp=y | |
94 temp[10]="~~~~~~~~~~~~~~~~~~~~~~~~~~~~~" | |
95 temp[0]=str(incr_num)+"-"+str(count) | |
96 new_main_sam.append(temp) | |
97 | |
98 new_sam=intro_sam+new_main_sam | |
99 | |
100 return new_sam | |
101 | |
102 ################################################################################################################################################################################################################# | |
103 | |
104 def duplicate_chroms_isoforms(List): | |
105 | |
106 dupes=[] | |
107 | |
108 for num in range(len(List)): | |
109 | |
110 if [List[num][9],List[num][0],List[num][2]] not in dupes : | |
111 dupes.append([List[num][9],List[num][0],List[num][2]]) | |
112 | |
113 for x in List: | |
114 for y in dupes: | |
115 if x[9]==y[0] and x[0]==y[1] and x[2].split("_")[0]==y[2].split("_")[0] and x[2]!=y[2]: | |
116 y.append(x[2]) | |
117 | |
118 | |
119 double_List = [x[:] for x in List] | |
120 | |
121 chr_order=[] | |
122 for x in dupes: | |
123 temp = [] | |
124 for i in range(2,len(x)): | |
125 if x[i].split("chr")[1].split("(")[0].isdigit(): | |
126 temp.append(int(x[i].split("chr")[1].split("(")[1][0]+x[i].split("chr")[1].split("(")[0])) | |
127 else: | |
128 temp.append(x[i].split("chr")[1][0:4]) | |
129 | |
130 for z in temp: | |
131 if 'X(-)'==z or 'Y(-)'==z or 'X(+)'==z or 'Y(+)'==z: | |
132 temp = [str(j) for j in temp] | |
133 temp=list(set(temp)) | |
134 temp.sort() | |
135 chr_order.append(temp) | |
136 | |
137 final_dupes=[] | |
138 for i in range(len(dupes)): | |
139 final_dupes.append([dupes[i][0],dupes[i][2].split("_")[0],dupes[i][1]]) | |
140 for x in chr_order[i]: | |
141 result = re.match("[-+]?\d+$", str(x)) | |
142 if len(chr_order[i]) == len(set(chr_order[i])): | |
143 if result is not None: | |
144 | |
145 if int(x)<0: | |
146 final_dupes[i][1]=final_dupes[i][1]+"_chr"+str(abs(int(x)))+"(-)" | |
147 else: | |
148 final_dupes[i][1] = final_dupes[i][1] + "_chr" + str(abs(int(x)))+"(+)" | |
149 else: | |
150 final_dupes[i][1] = final_dupes[i][1] + "_chr" + str(x) | |
151 else: | |
152 if result is not None: | |
153 if int(x) < 0: | |
154 final_dupes[i][1] = final_dupes[i][1] +dupes[i][2].split("_")[1]+ "_chr" + str(abs(int(x))) + "(-)" | |
155 else: | |
156 final_dupes[i][1] = final_dupes[i][1] +dupes[i][2].split("_")[1]+ "_chr" + str(abs(int(x))) + "(+)" | |
157 else: | |
158 final_dupes[i][1] = final_dupes[i][1] +dupes[i][2].split("_")[1]+ "_chr" + str(x) | |
159 | |
160 final_dupes.sort() | |
161 final_dupes=list(final_dupes for final_dupes,_ in itertools.groupby(final_dupes)) | |
162 | |
163 for i in range(len(double_List)): | |
164 for x in final_dupes: | |
165 | |
166 if double_List[i][9] == x[0] and double_List[i][0] == x[2] and len(double_List[i][2].split("_")) >3 and double_List[i][2].split("_")[0]==x[1].split("_")[0]: | |
167 gg=str("_"+double_List[i][2].split("_")[-2]+"_"+double_List[i][2].split("_")[-1]) | |
168 double_List[i][2] = x[1]+gg | |
169 | |
170 if double_List[i][9]==x[0] and double_List[i][0]== x[2] and len(double_List[i][2].split("_"))==3 and double_List[i][2].split("_")[0]==x[1].split("_")[0]: | |
171 double_List[i][2]=x[1] | |
172 List[i][2] = x[1] | |
173 | |
174 List.sort() | |
175 new_list=list(List for List,_ in itertools.groupby(List)) | |
176 | |
177 double_List.sort() | |
178 new_double_List = list(double_List for double_List, _ in itertools.groupby(double_List)) | |
179 | |
180 return new_list, new_double_List | |
181 | |
182 | |
183 ############################################################################################################################################################################################################# | |
184 | |
185 def sam(mature_mirnas,path,name,con,l,samples,data,names,unmap_seq,samples_mirna_names,deseq,LHE_names,umi,ini_sample,unmap_counts): | |
186 | |
187 # read the sam file | |
188 ini_sam=read(path,0) | |
189 new_main_sam = [x.rstrip("\n").split("\t") for x in ini_sam if "@" not in x.split("\t")[0]] | |
190 unique_seq = [x for x in new_main_sam if x[1] == '0' and len(x[9])>=18 and len(x[9])<=26] | |
191 | |
192 sorted_uni_arms = [] | |
193 | |
194 for i in range(len(mature_mirnas)): | |
195 tmp_count_reads = 0 # calculate the total number of reads | |
196 tmp_count_seq = 0 # calculate the total number of sequences | |
197 for j in range(len(unique_seq)): | |
198 | |
199 if "{" in unique_seq[j][2].split("_")[0]: | |
200 official=unique_seq[j][2].split("_")[0][:-4] | |
201 else: | |
202 official=unique_seq[j][2].split("_")[0] | |
203 | |
204 if mature_mirnas[i].split(" ")[0][1:] == official: | |
205 | |
206 temp_mature = mature_mirnas[i+1].strip().replace("U", "T") | |
207 off_part = longestSubstring(temp_mature, unique_seq[j][9]) | |
208 | |
209 mat_diff = temp_mature.split(off_part) | |
210 mat_diff = [len(mat_diff[0]), len(mat_diff[1])] | |
211 | |
212 unique_diff = unique_seq[j][9].split(off_part) | |
213 unique_diff = [len(unique_diff[0]), len(unique_diff[1])] | |
214 | |
215 # Problem with hsa-miR-8485 | |
216 if mat_diff[1]!=0 and unique_diff[1]!=0: | |
217 unique_seq[j]=1 | |
218 pre_pos = 0 | |
219 post_pos = 0 | |
220 | |
221 elif mat_diff[0]!=0 and unique_diff[0]!=0: | |
222 unique_seq[j]=1 | |
223 pre_pos = 0 | |
224 post_pos = 0 | |
225 | |
226 else: | |
227 pre_pos = mat_diff[0]-unique_diff[0] | |
228 post_pos = unique_diff[1]-mat_diff[1] | |
229 tmp_count_reads = tmp_count_reads + int(unique_seq[j][0].split("-")[1]) | |
230 tmp_count_seq = tmp_count_seq+1 | |
231 | |
232 if pre_pos != 0 or post_pos != 0: | |
233 if pre_pos == 0: | |
234 unique_seq[j][2] = unique_seq[j][2] + "_" +str(pre_pos) + "_" + '{:+d}'.format(post_pos) | |
235 elif post_pos == 0: | |
236 unique_seq[j][2] = unique_seq[j][2] + "_" + '{:+d}'.format(pre_pos) + "_" + str(post_pos) | |
237 else: | |
238 unique_seq[j][2] = unique_seq[j][2]+"_"+'{:+d}'.format(pre_pos)+"_"+'{:+d}'.format(post_pos) | |
239 | |
240 for x in range(unique_seq.count(1)): | |
241 unique_seq.remove(1) | |
242 if tmp_count_reads != 0 and tmp_count_seq != 0: | |
243 sorted_uni_arms.append([mature_mirnas[i].split(" ")[0][1:], tmp_count_seq, tmp_count_reads]) | |
244 sorted_uni_arms = sorted(sorted_uni_arms, key=lambda x: x[1], reverse=True) | |
245 dedup_unique_seq,double_fil_uni_seq=duplicate_chroms_isoforms(unique_seq) | |
246 | |
247 for y in sorted_uni_arms: | |
248 counts=0 | |
249 seqs=0 | |
250 for x in double_fil_uni_seq: | |
251 if y[0]==x[2].split("_")[0]: | |
252 counts+=int(x[0].split("-")[1]) | |
253 seqs+=1 | |
254 | |
255 y[1]=seqs | |
256 y[2]=counts | |
257 | |
258 LHE=[] | |
259 l.acquire() | |
260 if con=="c": | |
261 LHE.extend(z[2] for z in double_fil_uni_seq) | |
262 for y in double_fil_uni_seq: | |
263 samples_mirna_names.append([y[2],y[9]]) | |
264 deseq.append([[x[2], x[0].split('-')[1], x[9]] for x in double_fil_uni_seq]) | |
265 LHE_names.extend(LHE) | |
266 unmap_seq.value += sum([1 for x in new_main_sam if x[1] == '4']) | |
267 unmap_counts.value += sum([int(x[0].split("-")[1]) for x in new_main_sam if x[1] == '4']) | |
268 names.append(name) | |
269 samples.append(dedup_unique_seq) | |
270 data.append([con,name,double_fil_uni_seq,sorted_uni_arms]) | |
271 ini_sample.append(new_main_sam) | |
272 | |
273 if con=="t": | |
274 LHE.extend(z[2] for z in double_fil_uni_seq) | |
275 for y in double_fil_uni_seq: | |
276 samples_mirna_names.append([y[2],y[9]]) | |
277 deseq.append([[x[2], x[0].split('-')[1], x[9]] for x in double_fil_uni_seq]) | |
278 LHE_names.extend(LHE) | |
279 unmap_seq.value += sum([1 for x in new_main_sam if x[1] == '4']) | |
280 unmap_counts.value += sum([int(x[0].split("-")[1]) for x in new_main_sam if x[1] == '4']) | |
281 names.append(name) | |
282 samples.append(dedup_unique_seq) | |
283 data.append([con,name,double_fil_uni_seq,sorted_uni_arms]) | |
284 ini_sample.append(new_main_sam) | |
285 l.release() | |
286 | |
287 | |
288 ###################################################################################################################################### | |
289 | |
290 """ | |
291 | |
292 Read a sam file from Bowtie and do the followings: | |
293 | |
294 1) Remove reverse stranded mapped reads | |
295 2) Remove unmapped reads | |
296 3) Remove all sequences with reads less than 11 reads | |
297 4) Sort the arms with the most sequences in decreading rate | |
298 5) Sort the sequences of every arm with the most reads in decreasing rate | |
299 6) Calculate total number of sequences of every arm | |
300 7) Calculate total number of reads of sequences of every arm. | |
301 8) Store all the informations in a txt file | |
302 | |
303 """ | |
304 | |
305 def non_sam(mature_mirnas,path,name,con,l,data,names,n_deseq,n_samples_mirna_names,n_LHE_names): | |
306 | |
307 ini_sam=read(path,0) | |
308 new_main_sam = [x.rstrip("\n").split("\t") for x in ini_sam if "@" not in x.split("\t")[0]] | |
309 unique_seq=[] | |
310 unique_seq = [x for x in new_main_sam if x[1] == '4' and len(x[9])>=18 and len(x[9])<=26] | |
311 | |
312 uni_seq=[] | |
313 # Calculate the shifted positions for every isomir and add them to the name of it | |
314 sorted_uni_arms = [] | |
315 for i in range(1,len(mature_mirnas),2): | |
316 tmp_count_reads = 0 # calculate the total number of reads | |
317 tmp_count_seq = 0 # calculate the total number of sequences | |
318 | |
319 for j in range(len(unique_seq)): | |
320 | |
321 temp_mature = mature_mirnas[i].strip().replace("U", "T") | |
322 | |
323 if temp_mature in unique_seq[j][9]: | |
324 | |
325 off_part = longestSubstring(temp_mature, unique_seq[j][9]) | |
326 | |
327 mat_diff = temp_mature.split(off_part) | |
328 mat_diff = [len(mat_diff[0]), len(mat_diff[1])] | |
329 | |
330 unique_diff = unique_seq[j][9].split(off_part) | |
331 if len(unique_diff)<=2: | |
332 unique_diff = [len(unique_diff[0]), len(unique_diff[1])] | |
333 | |
334 pre_pos = mat_diff[0]-unique_diff[0] | |
335 post_pos = unique_diff[1]-mat_diff[1] | |
336 | |
337 lengthofmir = len(off_part) + post_pos | |
338 if pre_pos == 0: | |
339 tmp_count_reads = tmp_count_reads + int(unique_seq[j][0].split("-")[1]) | |
340 tmp_count_seq = tmp_count_seq + 1 | |
341 | |
342 if pre_pos == 0: | |
343 | |
344 t_name=unique_seq[j].copy() | |
345 t_name[2]=mature_mirnas[i - 1].split(" ")[0][1:] + "__" + str(pre_pos) + "_" + '{:+d}'.format(post_pos) + "_" + str(unique_seq[j][9][len(off_part):]) | |
346 uni_seq.append(t_name) | |
347 | |
348 | |
349 if tmp_count_reads != 0 and tmp_count_seq != 0: | |
350 sorted_uni_arms.append([mature_mirnas[i-1].split(" ")[0][1:], tmp_count_seq, tmp_count_reads]) | |
351 | |
352 | |
353 sorted_uni_arms = sorted(sorted_uni_arms, key=lambda x: x[1], reverse=True) | |
354 unique_seq = list(map(list, OrderedDict.fromkeys(map(tuple,uni_seq)))) | |
355 | |
356 LHE=[] | |
357 | |
358 l.acquire() | |
359 if con=="c": | |
360 LHE.extend(x[2] for x in unique_seq if x[2]!="*") | |
361 for x in unique_seq: | |
362 if x[2]!="*": | |
363 n_samples_mirna_names.append([x[2],x[9]]) | |
364 n_deseq.append([[x[2], x[0].split('-')[1], x[9]] for x in unique_seq if x[2]!="*"]) | |
365 n_LHE_names.extend(LHE) | |
366 names.append(name) | |
367 data.append([con,name,unique_seq,sorted_uni_arms]) | |
368 | |
369 | |
370 if con=="t": | |
371 LHE.extend(x[2] for x in unique_seq if x[2]!="*") | |
372 for x in unique_seq: | |
373 if x[2]!="*": | |
374 n_samples_mirna_names.append([x[2],x[9]]) | |
375 n_deseq.append([[x[2], x[0].split('-')[1], x[9]] for x in unique_seq if x[2]!="*"]) | |
376 n_LHE_names.extend(LHE) | |
377 names.append(name) | |
378 data.append([con,name,unique_seq,sorted_uni_arms]) | |
379 l.release() | |
380 | |
381 ##################################################################################################################################################################################################################### | |
382 def deseq2_temp(samples_mirna_names,deseq,con,l): | |
383 | |
384 samples_mirna_names.sort(key=lambda x:[0]) | |
385 for i in range(len(deseq)): | |
386 for y in samples_mirna_names: | |
387 flag = 0 | |
388 for x in deseq[i]: | |
389 if y[0] == x[0]: | |
390 flag = 1 | |
391 break | |
392 | |
393 if flag == 0: | |
394 deseq[i].append([y[0], "0", y[1]]) | |
395 | |
396 [deseq[i].sort(key=lambda x: x[0]) for i, _ in enumerate(deseq)] | |
397 deseq_final = [[x[0],x[2]] for x in deseq[0]] | |
398 [deseq_final[z].append(deseq[i][j][1]) for z,_ in enumerate(deseq_final) for i, _ in enumerate(deseq) for j,_ in enumerate(deseq[i]) if deseq_final[z][0] == deseq[i][j][0]] | |
399 | |
400 l.acquire() | |
401 if con=="c": | |
402 q1.put(deseq_final) | |
403 | |
404 if con=="t": | |
405 q2.put(deseq_final) | |
406 l.release() | |
407 | |
408 | |
409 #################################################################################################################################################################################################################### | |
410 | |
411 def main_temp(LH2E, LH2E_names, LH8E, LH8E_names,flag,names_con,names_tre,filter_LH8E,filter_LH2E,raw_LH8E,raw_LH2E,per,count): | |
412 | |
413 LH8E_add_names = [x for x in LH2E_names if x not in LH8E_names] | |
414 LH2E_add_names = [x for x in LH8E_names if x not in LH2E_names] | |
415 | |
416 LH8E_add_names.sort() | |
417 LH2E_add_names.sort() | |
418 LH8E_add_names = list(LH8E_add_names for LH8E_add_names,_ in itertools.groupby(LH8E_add_names)) | |
419 LH2E_add_names = list(LH2E_add_names for LH2E_add_names,_ in itertools.groupby(LH2E_add_names)) | |
420 | |
421 LH2E.sort() | |
422 LH8E.sort() | |
423 LH2E = list(LH2E for LH2E,_ in itertools.groupby(LH2E)) | |
424 LH8E = list(LH8E for LH8E,_ in itertools.groupby(LH8E)) | |
425 | |
426 print("LHE_names") | |
427 print([len(LH8E_add_names),len(LH2E_add_names)]) | |
428 print([len(LH8E),len(LH2E)]) | |
429 | |
430 zeros=["0"]*(len(LH8E[0])-2) | |
431 [LH8E_add_names[i].extend(zeros) for i,_ in enumerate(LH8E_add_names)] | |
432 LH8E=LH8E+LH8E_add_names | |
433 | |
434 zeros=["0"]*(len(LH2E[0])-2) | |
435 [LH2E_add_names[i].extend(zeros) for i,_ in enumerate(LH2E_add_names)] | |
436 LH2E=LH2E+LH2E_add_names | |
437 | |
438 dupes=[] | |
439 final_LH2E =[] | |
440 | |
441 for num,_ in enumerate(LH2E): | |
442 | |
443 if LH2E[num][1] not in final_LH2E and LH2E[num][0] not in final_LH2E: | |
444 final_LH2E.append(LH2E[num][1]) | |
445 final_LH2E.append(LH2E[num][0]) | |
446 else: | |
447 dupes.append(LH2E[num][1]) | |
448 | |
449 | |
450 dupes=list(set(dupes)) | |
451 | |
452 dupes=[[x] for x in dupes] | |
453 | |
454 for x in LH2E: | |
455 for y in dupes: | |
456 if x[1]==y[0]: | |
457 fl=0 | |
458 if len(y)==1: | |
459 y.append(x[0]) | |
460 else: | |
461 for i in range(1,len(y)): | |
462 if y[i].split("_")[0]==x[0].split("_")[0]: | |
463 fl=1 | |
464 if len(x[0])<len(y[i]): | |
465 del y[i] | |
466 y.append(x[0]) | |
467 break | |
468 | |
469 if fl==0: | |
470 y.append((x[0])) | |
471 | |
472 for y in dupes: | |
473 if len(y)>2: | |
474 for i in range(len(y)-1,1,-1): | |
475 y[1]=y[1]+"/"+y[i] | |
476 del y[i] | |
477 | |
478 for x in LH2E: | |
479 for y in dupes: | |
480 if x[1]==y[0]: | |
481 x[0]=y[1] | |
482 | |
483 for x in LH8E: | |
484 for y in dupes: | |
485 if x[1]==y[0]: | |
486 x[0]=y[1] | |
487 | |
488 | |
489 LH2E.sort() | |
490 LH2E=list(LH2E for LH2E,_ in itertools.groupby(LH2E)) | |
491 | |
492 LH8E.sort() | |
493 LH8E=list(LH8E for LH8E,_ in itertools.groupby(LH8E)) | |
494 | |
495 if int(per)!=-1: | |
496 percent=int(per)/100 | |
497 print(percent) | |
498 print(count) | |
499 | |
500 c_col_filter=round(percent*(len(LH2E[1])-2)) | |
501 t_col_filter=round(percent*(len(LH8E[1])-2)) | |
502 | |
503 for i, _ in enumerate(LH2E): | |
504 c_cols=0 | |
505 t_cols=0 | |
506 | |
507 c_cols=sum([1 for j in range(len(LH2E[i])-2) if int(LH2E[i][j+2])>=int(count)]) | |
508 t_cols=sum([1 for j in range(len(LH8E[i])-2) if int(LH8E[i][j+2])>=int(count)]) | |
509 | |
510 if c_cols>=c_col_filter or t_cols>=t_col_filter: | |
511 filter_LH8E.append(LH8E[i]) | |
512 filter_LH2E.append(LH2E[i]) | |
513 | |
514 raw_LH2E.extend(LH2E) | |
515 raw_LH8E.extend(LH8E) | |
516 | |
517 ################################################################################################################################################################################################################## | |
518 | |
519 def write_main(raw_LH2E, raw_LH8E, fil_LH2E, fil_LH8E, names_con, names_tre, flag, per, n1, n2): | |
520 | |
521 if flag == 1 and int(per)!=-1: | |
522 fp = open('Counts/Filtered '+n2 +' Templated Counts', 'w') | |
523 fp.write("Name\t") | |
524 fp.write("Sequence") | |
525 for y in names_tre: | |
526 fp.write("\t"+y) | |
527 | |
528 for x in fil_LH8E: | |
529 fp.write("\n%s" % "\t".join(x)) | |
530 fp.close() | |
531 | |
532 fp = open('Counts/Filtered '+n1+' Templated Counts', 'w') | |
533 fp.write("Name\t") | |
534 fp.write("Sequence") | |
535 for y in names_con: | |
536 fp.write("\t"+y) | |
537 | |
538 for x in fil_LH2E: | |
539 fp.write("\n%s" % "\t".join(x)) | |
540 fp.close() | |
541 | |
542 | |
543 if flag == 2 and int(per)!=-1: | |
544 fp = open('Counts/Filtered '+n2+' Non-Templated Counts', 'w') | |
545 fp.write("Name\t") | |
546 fp.write("Sequence") | |
547 for y in names_tre: | |
548 fp.write("\t"+y) | |
549 | |
550 | |
551 for x in fil_LH8E: | |
552 fp.write("\n%s" % "\t".join(x)) | |
553 fp.close() | |
554 | |
555 fp = open('Counts/Filtered '+n1+' Non-Templated Counts', 'w') | |
556 fp.write("Name\t") | |
557 fp.write("Sequence") | |
558 for y in names_con: | |
559 fp.write("\t"+y) | |
560 | |
561 for x in fil_LH2E: | |
562 fp.write("\n%s" % "\t".join(x)) | |
563 fp.close() | |
564 | |
565 | |
566 if flag == 1: | |
567 fp = open('Counts/Raw '+n2+' Templated Counts', 'w') | |
568 fp.write("Name\t") | |
569 fp.write("Sequence") | |
570 for y in names_tre: | |
571 fp.write("\t"+y) | |
572 | |
573 for x in raw_LH8E: | |
574 fp.write("\n%s" % "\t".join(x)) | |
575 fp.close() | |
576 | |
577 fp = open('Counts/Raw '+n1+' Templated Counts', 'w') | |
578 fp.write("Name\t") | |
579 fp.write("Sequence") | |
580 for y in names_con: | |
581 fp.write("\t"+y) | |
582 | |
583 for x in raw_LH2E: | |
584 fp.write("\n%s" % "\t".join(x)) | |
585 fp.close() | |
586 | |
587 | |
588 if flag == 2: | |
589 fp = open('Counts/Raw '+n2+' Non-Templated Counts', 'w') | |
590 fp.write("Name\t") | |
591 fp.write("Sequence") | |
592 for y in names_tre: | |
593 fp.write("\t"+y) | |
594 | |
595 | |
596 for x in raw_LH8E: | |
597 fp.write("\n%s" % "\t".join(x)) | |
598 fp.close() | |
599 | |
600 fp = open('Counts/Raw '+n1+' Non-Templated Counts', 'w') | |
601 fp.write("Name\t") | |
602 fp.write("Sequence") | |
603 for y in names_con: | |
604 fp.write("\t"+y) | |
605 | |
606 for x in raw_LH2E: | |
607 fp.write("\n%s" % "\t".join(x)) | |
608 fp.close() | |
609 | |
610 | |
611 ######################################################################################################################################### | |
612 | |
613 def ssamples(names,samp,folder,pro): | |
614 | |
615 for i in range(2,len(samp[0])): | |
616 | |
617 fp = open(folder+names[i-2]+'.txt','w') | |
618 fp.write("miRNA id"+"\t"+names[i-2]+"\n") | |
619 | |
620 for x in samp: | |
621 fp.write("%s" % "\t".join([x[0],x[i]])+"\n") | |
622 fp.close() | |
623 | |
624 ################################################################################################################## | |
625 | |
626 def DB_write(con,name,unique_seq,sorted_uni_arms,f): | |
627 | |
628 if f==1: | |
629 # Write a txt file with all the information | |
630 if con=="c": | |
631 fp = open('split1/'+name, 'w') | |
632 | |
633 fp.write("%s\t%-42s\t%s\n\n" % ("Number of Reads","Name of isomir","Sequence")) | |
634 if con=="t": | |
635 fp = open('split2/'+name, 'w') | |
636 fp.write("%s\t%-42s\t%s\n\n" % ("Number of Reads","Name of isomir","Sequence")) | |
637 | |
638 | |
639 for i in range(len(sorted_uni_arms)): | |
640 temp = [] | |
641 for j in range(len(unique_seq)): | |
642 | |
643 if sorted_uni_arms[i][0] in unique_seq[j][2].split("_")[0]: | |
644 | |
645 temp.append(unique_seq[j]) | |
646 | |
647 temp = sorted(temp, key=lambda x: int(x[0].split('-')[1]), reverse=True) | |
648 fp.write("*********************************************************************************************************\n") | |
649 fp.write("%-8s\t%-22s\t%-25s\t%-30s\t%s\n" % ("|",str(sorted_uni_arms[i][0]),"Sequence count = "+str(sorted_uni_arms[i][1]),"Total reads = "+str(sorted_uni_arms[i][2]),"|")) | |
650 fp.write("*********************************************************************************************************\n\n") | |
651 [fp.write("%-8s\t%-40s\t%s\n" % (x[0].split("-")[1], x[2],x[9])) for x in temp] | |
652 fp.write("\n" + "\n") | |
653 fp.close() | |
654 | |
655 if f==2: | |
656 | |
657 if con=="c": | |
658 fp = open('split3/'+name, 'w') | |
659 fp.write("%s\t%-42s\t%s\n\n" % ("Number of Reads","Name of isomir","Sequence")) | |
660 if con=="t": | |
661 fp = open('split4/'+name, 'w') | |
662 fp.write("%s\t%-42s\t%s\n\n" % ("Number of Reads","Name of isomir","Sequence")) | |
663 | |
664 | |
665 for i in range(len(sorted_uni_arms)): | |
666 temp = [] | |
667 for j in range(len(unique_seq)): | |
668 if sorted_uni_arms[i][0]==unique_seq[j][2].split("__")[0]: | |
669 temp.append(unique_seq[j]) | |
670 if temp!=[]: | |
671 temp = sorted(temp, key=lambda x: int(x[0].split('-')[1]), reverse=True) | |
672 fp.write("*********************************************************************************************************\n") | |
673 fp.write("%-8s\t%-22s\t%-25s\t%-30s\t%s\n" % ("|",str(sorted_uni_arms[i][0]),"Sequence count = "+str(sorted_uni_arms[i][1]),"Total reads = "+str(sorted_uni_arms[i][2]),"|")) | |
674 fp.write("*********************************************************************************************************\n\n") | |
675 [fp.write("%-8s\t%-40s\t%s\n" % (x[0].split("-")[1], x[2],x[9])) for x in temp] | |
676 fp.write("\n" + "\n") | |
677 fp.close() | |
678 | |
679 | |
680 ########################################################################################################################## | |
681 | |
682 def new_mat_seq(pre_unique_seq,mat_mirnas,l): | |
683 | |
684 unique_iso = [] | |
685 for x in pre_unique_seq: | |
686 if len(x[2].split("_"))==3: | |
687 for y in pre_unique_seq: | |
688 if x[2] in y[2] and int(x[0].split("-")[1])<int(y[0].split("-")[1]): | |
689 if any(y[2] in lst2 for lst2 in unique_iso)==False: | |
690 y[2]=">"+y[2] | |
691 unique_iso.append(y) | |
692 l.acquire() | |
693 for x in unique_iso: | |
694 mat_mirnas.append(x[2]) | |
695 mat_mirnas.append(x[9]) | |
696 l.release() | |
697 | |
698 ######################################################################################################################### | |
699 | |
700 def merging_names(LH2E_copy,new): | |
701 | |
702 dupes=[] | |
703 final_LH2E =[] | |
704 | |
705 for num in range(len(LH2E_copy)): | |
706 | |
707 if LH2E_copy[num][1] not in final_LH2E and LH2E_copy[num][0] not in final_LH2E: | |
708 final_LH2E.append(LH2E_copy[num][1]) | |
709 final_LH2E.append(LH2E_copy[num][0]) | |
710 else: | |
711 dupes.append(LH2E_copy[num][1]) | |
712 | |
713 dupes=list(set(dupes)) | |
714 | |
715 for i in range(len(dupes)): | |
716 dupes[i]=[dupes[i]] | |
717 | |
718 for x in LH2E_copy: | |
719 for y in dupes: | |
720 if x[1]==y[0]: | |
721 fl=0 | |
722 if len(y)==1: | |
723 y.append(x[0]) | |
724 else: | |
725 for i in range(1,len(y)): | |
726 if y[i].split("_")[0]==x[0].split("_")[0]: | |
727 fl=1 | |
728 if len(x[0])<len(y[i]): | |
729 del y[i] | |
730 y.append(x[0]) | |
731 break | |
732 | |
733 if fl==0: | |
734 y.append((x[0])) | |
735 | |
736 for y in dupes: | |
737 if len(y)>2: | |
738 for i in range(len(y)-1,1,-1): | |
739 y[1]=y[1]+"/"+y[i] | |
740 del y[i] | |
741 | |
742 | |
743 for x in LH2E_copy: | |
744 for y in dupes: | |
745 if x[1]==y[0]: | |
746 x[0]=y[1] | |
747 | |
748 LH2E_copy.sort() | |
749 LH2E_copy=list(LH2E_copy for LH2E_copy,_ in itertools.groupby(LH2E_copy)) | |
750 | |
751 new.extend(LH2E_copy) | |
752 | |
753 | |
754 ###################################################################################################################################################### | |
755 | |
756 def ssamples1(tem_names,tem_samp,non_names,non_samp,folder,pro): | |
757 | |
758 for i in range(2,len(tem_samp[0])): | |
759 | |
760 fp = open(folder+tem_names[i-2]+'.txt','w') | |
761 fp.write("miRNA id"+"\t"+tem_names[i-2]+"\n") | |
762 | |
763 for x in tem_samp: | |
764 fp.write("%s" % "\t".join([x[0],x[i]])+"\n") | |
765 | |
766 for j in range(len(non_names)): | |
767 if non_names[j]==tem_names[i-2]: | |
768 for x in non_samp: | |
769 fp.write("%s" % "\t".join([x[0],x[j+2]])+"\n") | |
770 fp.close() | |
771 | |
772 ################################################################################################################################################################################################################### | |
773 | |
774 def download_matures(matures,org_name): | |
775 | |
776 #url = 'ftp://mirbase.org/pub/mirbase/21/mature.fa.gz' | |
777 url = 'ftp://mirbase.org/pub/mirbase/CURRENT/mature.fa.gz' | |
778 data = urllib.request.urlopen(url).read() | |
779 file_mirna = gzip.decompress(data).decode('utf-8') | |
780 file_mirna = file_mirna.split("\n") | |
781 | |
782 for i in range(0,len(file_mirna)-1,2): | |
783 | |
784 if org_name in file_mirna[i]: | |
785 matures.append(file_mirna[i]) | |
786 matures.append(file_mirna[i+1]) | |
787 | |
788 ################################################################################################################################################################################################################### | |
789 def non_template_ref(sc,st,all_isoforms): | |
790 | |
791 pre_uni_seq_con = list(sc) | |
792 pre_uni_seq_tre = list(st) | |
793 | |
794 for x in pre_uni_seq_con: | |
795 for y in x: | |
796 if ">"+y[2] not in all_isoforms and ")_" in y[2] : | |
797 all_isoforms.append(">"+y[2]) | |
798 all_isoforms.append(y[9]) | |
799 | |
800 | |
801 for x in pre_uni_seq_tre: | |
802 for y in x: | |
803 if ">"+y[2] not in all_isoforms and ")_" in y[2]: | |
804 all_isoforms.append(">"+y[2]) | |
805 all_isoforms.append(y[9]) | |
806 | |
807 ################################################################################################################################################################################################ | |
808 | |
809 def deseqe2(sample,mir_names,l,new_d,sample_name,sample_order): | |
810 | |
811 for y in mir_names: | |
812 flag=0 | |
813 for x in sample: | |
814 if y[0]==x[0]: | |
815 flag=1 | |
816 break | |
817 if flag==0: | |
818 sample.append([y[0],"0",y[1]]) | |
819 | |
820 sample.sort(key=lambda x: x[0]) | |
821 sample=list(sample for sample,_ in itertools.groupby(sample)) | |
822 | |
823 l.acquire() | |
824 new_d.append(sample) | |
825 sample_order.append(sample_name) | |
826 l.release() | |
827 | |
828 ############################################################################################################################################################################################### | |
829 |