Mercurial > repos > glogobyte > isoread
comparison mirgene_functions.py @ 17:dc31f01cf21d draft
Uploaded
author | glogobyte |
---|---|
date | Thu, 22 Oct 2020 07:48:12 +0000 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
16:e19c832c5368 | 17:dc31f01cf21d |
---|---|
1 import itertools | |
2 import time | |
3 import sys | |
4 import os | |
5 import urllib.request | |
6 import gzip | |
7 from multiprocessing import Process, Queue, Lock, Pool, Manager, Value | |
8 import subprocess | |
9 import argparse | |
10 from collections import OrderedDict | |
11 from matplotlib.backends.backend_pdf import PdfPages | |
12 import pandas as pd | |
13 from math import pi | |
14 import numpy as np | |
15 import matplotlib.pyplot as plt | |
16 from matplotlib.ticker import PercentFormatter | |
17 import seaborn as sns | |
18 import scipy.stats as stats | |
19 from plotnine import * | |
20 import math | |
21 import re | |
22 import matplotlib.ticker as mtick | |
23 import copy | |
24 | |
25 | |
26 | |
27 | |
28 """---------------------- Simple Functions -----------------------""" | |
29 | |
30 # Read a file and return it as a list | |
31 def read(path, flag): | |
32 if flag == 0: | |
33 with open(path) as fp: | |
34 file=fp.readlines() | |
35 fp.close() | |
36 return file | |
37 | |
38 if flag == 1: | |
39 with open(path) as fp: | |
40 file = fp.read().splitlines() | |
41 fp.close() | |
42 return file | |
43 | |
44 # Write a list to a txt file | |
45 def write(path, list): | |
46 with open(path,'w') as fp: | |
47 for x in list: | |
48 fp.write(str("\t".join(x[1:-1]))) | |
49 fp.close() | |
50 | |
51 """---------------------- RNA-seq Functions ----------------------""" | |
52 | |
53 # Detect the longest common substring sequence between two mirnas | |
54 def longestSubstring(str1, str2): | |
55 | |
56 from difflib import SequenceMatcher | |
57 # initialize SequenceMatcher object with | |
58 # input string | |
59 seqMatch = SequenceMatcher(None, str1, str2) | |
60 | |
61 # find match of longest sub-string | |
62 # output will be like Match(a=0, b=0, size=5) | |
63 match = seqMatch.find_longest_match(0, len(str1), 0, len(str2)) | |
64 | |
65 # print longest substring | |
66 if (match.size != 0): | |
67 return str1[match.a: match.a + match.size] | |
68 else: | |
69 print('No longest common sub-string found') | |
70 | |
71 | |
72 | |
73 ######################################################################################################################################################## | |
74 def collapse_sam(path): | |
75 | |
76 ini_sam=read(path,0) | |
77 main_sam = [x.rstrip("\n").split("\t") for x in ini_sam if "@" not in x.split("\t")[0]] | |
78 intro_sam = [x.rstrip("\n").split("\t") for x in ini_sam if "@" in x.split("\t")[0]] | |
79 | |
80 uni_seq = [] | |
81 for x in main_sam: | |
82 | |
83 if [x[2], x[9]] not in uni_seq: | |
84 uni_seq.append([x[2], x[9]]) | |
85 | |
86 new_main_sam=[] | |
87 incr_num=0 | |
88 for i in range(len(uni_seq)): | |
89 count=0 | |
90 incr_num+=1 | |
91 for y in main_sam: | |
92 if uni_seq[i][1]==y[9] and uni_seq[i][0]==y[2]: | |
93 count+=1 | |
94 temp=y | |
95 temp[10]="~~~~~~~~~~~~~~~~~~~~~~~~~~~~~" | |
96 temp[0]=str(incr_num)+"-"+str(count) | |
97 new_main_sam.append(temp) | |
98 | |
99 new_sam=intro_sam+new_main_sam | |
100 | |
101 return new_sam | |
102 | |
103 ################################################################################################################################################################# | |
104 | |
105 def sam(mature_mirnas,path,name,con,l,samples,data,names,unmap_seq,samples_mirna_names,deseq,LHE_names,umi,ini_sample,unmap_counts): | |
106 | |
107 # read the sam file | |
108 | |
109 ini_sam=read(path,0) | |
110 new_main_sam = [x.rstrip("\n").split("\t") for x in ini_sam if "@" not in x.split("\t")[0]] | |
111 unique_seq = [x for x in new_main_sam if x[1] == '0' and len(x[9])>=18 and len(x[9])<=26] | |
112 | |
113 # Calculate the shifted positions for every isomir and add them to the name of it | |
114 sorted_uni_arms = [] | |
115 | |
116 for i in range(len(mature_mirnas)): | |
117 tmp_count_reads = 0 # calculate the total number of reads | |
118 tmp_count_seq = 0 # calculate the total number of sequences | |
119 for j in range(len(unique_seq)): | |
120 | |
121 if mature_mirnas[i] == unique_seq[j][2]: | |
122 | |
123 temp_mature = mature_mirnas[i+1] | |
124 off_part = longestSubstring(temp_mature, unique_seq[j][9]) | |
125 | |
126 mat_diff = temp_mature.split(off_part) | |
127 mat_diff = [len(mat_diff[0]), len(mat_diff[1])] | |
128 | |
129 unique_diff = unique_seq[j][9].split(off_part) | |
130 unique_diff = [len(unique_diff[0]), len(unique_diff[1])] | |
131 | |
132 # Problem with hsa-miR-8485 | |
133 if mat_diff[1]!=0 and unique_diff[1]!=0: | |
134 unique_seq[j]=1 | |
135 pre_pos = 0 | |
136 post_pos = 0 | |
137 | |
138 elif mat_diff[0]!=0 and unique_diff[0]!=0: | |
139 unique_seq[j]=1 | |
140 pre_pos = 0 | |
141 post_pos = 0 | |
142 | |
143 else: | |
144 pre_pos = mat_diff[0]-unique_diff[0] | |
145 post_pos = unique_diff[1]-mat_diff[1] | |
146 tmp_count_reads = tmp_count_reads + int(unique_seq[j][0].split("-")[1]) | |
147 tmp_count_seq = tmp_count_seq+1 | |
148 | |
149 | |
150 if pre_pos != 0 or post_pos != 0: | |
151 if pre_pos == 0: | |
152 unique_seq[j][2] = unique_seq[j][2] + "_" +str(pre_pos) + "_" + '{:+d}'.format(post_pos) | |
153 elif post_pos == 0: | |
154 unique_seq[j][2] = unique_seq[j][2] + "_" + '{:+d}'.format(pre_pos) + "_" + str(post_pos) | |
155 else: | |
156 unique_seq[j][2] = unique_seq[j][2]+"_"+'{:+d}'.format(pre_pos)+"_"+'{:+d}'.format(post_pos) | |
157 | |
158 for x in range(unique_seq.count(1)): | |
159 unique_seq.remove(1) | |
160 if tmp_count_reads != 0 and tmp_count_seq != 0: | |
161 sorted_uni_arms.append([mature_mirnas[i], tmp_count_seq, tmp_count_reads]) | |
162 | |
163 # Store name of arms, number of sequences and number of reads | |
164 sorted_uni_arms = sorted(sorted_uni_arms, key=lambda x: x[1], reverse=True) | |
165 | |
166 for y in sorted_uni_arms: | |
167 counts=0 | |
168 seqs=0 | |
169 for x in unique_seq: | |
170 if y[0]==x[2].split("_")[0]+"_"+x[2].split("_")[1]: | |
171 counts+=int(x[0].split("-")[1]) | |
172 seqs+=1 | |
173 | |
174 y[1]=seqs | |
175 y[2]=counts | |
176 | |
177 LHE=[] | |
178 | |
179 l.acquire() | |
180 if con=="c": | |
181 LHE.extend(z[2] for z in unique_seq) | |
182 for y in unique_seq: | |
183 samples_mirna_names.append([y[2],y[9]]) | |
184 deseq.append([[x[2], x[0].split('-')[1], x[9]] for x in unique_seq]) | |
185 LHE_names.extend(LHE) | |
186 unmap_seq.value += sum([1 for x in new_main_sam if x[1] == '4']) | |
187 unmap_counts.value += sum([int(x[0].split("-")[1]) for x in new_main_sam if x[1] == '4']) | |
188 names.append(name) | |
189 samples.append(unique_seq) | |
190 data.append([con,name,unique_seq,sorted_uni_arms]) | |
191 ini_sample.append(new_main_sam) | |
192 | |
193 if con=="t": | |
194 LHE.extend(z[2] for z in unique_seq) | |
195 for y in unique_seq: | |
196 samples_mirna_names.append([y[2],y[9]]) | |
197 deseq.append([[x[2], x[0].split('-')[1], x[9]] for x in unique_seq]) | |
198 LHE_names.extend(LHE) | |
199 unmap_seq.value += sum([1 for x in new_main_sam if x[1] == '4']) | |
200 unmap_counts.value += sum([int(x[0].split("-")[1]) for x in new_main_sam if x[1] == '4']) | |
201 names.append(name) | |
202 samples.append(unique_seq) | |
203 data.append([con,name,unique_seq,sorted_uni_arms]) | |
204 ini_sample.append(new_main_sam) | |
205 l.release() | |
206 | |
207 | |
208 ###################################################################################################################################### | |
209 """ | |
210 Read a sam file from Bowtie and do the followings: | |
211 | |
212 1) Remove reverse stranded mapped reads | |
213 2) Remove unmapped reads | |
214 3) Remove all sequences with reads less than 11 reads | |
215 4) Sort the arms with the most sequences in decreading rate | |
216 5) Sort the sequences of every arm with the most reads in decreasing rate | |
217 6) Calculate total number of sequences of every arm | |
218 7) Calculate total number of reads of sequences of every arm. | |
219 8) Store all the informations in a txt file | |
220 | |
221 """ | |
222 | |
223 def non_sam(mature_mirnas,path,name,con,l,data,names,n_deseq,n_samples_mirna_names,n_LHE_names): | |
224 | |
225 | |
226 ini_sam=read(path,0) | |
227 new_main_sam = [x.rstrip("\n").split("\t") for x in ini_sam if "@" not in x.split("\t")[0]] | |
228 unique_seq=[] | |
229 unique_seq = [x for x in new_main_sam if x[1] == '4' and len(x[9])>=18 and len(x[9])<=26] | |
230 uni_seq=[] | |
231 | |
232 # Calculate the shifted positions for every isomir and add them to the name of it | |
233 sorted_uni_arms = [] | |
234 for i in range(1,len(mature_mirnas),2): | |
235 tmp_count_reads = 0 # calculate the total number of reads | |
236 tmp_count_seq = 0 # calculate the total number of sequences | |
237 | |
238 for j in range(len(unique_seq)): | |
239 | |
240 temp_mature = mature_mirnas[i].strip().replace("U", "T") | |
241 | |
242 if temp_mature in unique_seq[j][9]: | |
243 | |
244 off_part = longestSubstring(temp_mature, unique_seq[j][9]) | |
245 | |
246 mat_diff = temp_mature.split(off_part) | |
247 mat_diff = [len(mat_diff[0]), len(mat_diff[1])] | |
248 | |
249 unique_diff = unique_seq[j][9].split(off_part) | |
250 if len(unique_diff)<=2: | |
251 unique_diff = [len(unique_diff[0]), len(unique_diff[1])] | |
252 | |
253 pre_pos = mat_diff[0]-unique_diff[0] | |
254 post_pos = unique_diff[1]-mat_diff[1] | |
255 | |
256 lengthofmir = len(off_part) + post_pos | |
257 if pre_pos == 0: | |
258 tmp_count_reads = tmp_count_reads + int(unique_seq[j][0].split("-")[1]) | |
259 tmp_count_seq = tmp_count_seq + 1 | |
260 | |
261 if pre_pos == 0: | |
262 t_name=copy.deepcopy(unique_seq[j]) | |
263 t_name[2]=mature_mirnas[i - 1] + "__" + str(pre_pos) + "_" + '{:+d}'.format(post_pos) + "_" + str(unique_seq[j][9][len(off_part):]) | |
264 uni_seq.append(t_name) | |
265 | |
266 if tmp_count_reads != 0 and tmp_count_seq != 0: | |
267 sorted_uni_arms.append([mature_mirnas[i-1], tmp_count_seq, tmp_count_reads]) | |
268 | |
269 | |
270 sorted_uni_arms = sorted(sorted_uni_arms, key=lambda x: x[1], reverse=True) | |
271 unique_seq = list(map(list, OrderedDict.fromkeys(map(tuple,uni_seq)))) | |
272 | |
273 LHE=[] | |
274 | |
275 l.acquire() | |
276 if con=="c": | |
277 LHE.extend(x[2] for x in unique_seq if x[2]!="*") | |
278 for x in unique_seq: | |
279 if x[2]!="*": | |
280 n_samples_mirna_names.append([x[2],x[9]]) | |
281 n_deseq.append([[x[2], x[0].split('-')[1], x[9]] for x in unique_seq if x[2]!="*"]) | |
282 n_LHE_names.extend(LHE) | |
283 names.append(name) | |
284 data.append([con,name,unique_seq,sorted_uni_arms]) | |
285 | |
286 | |
287 if con=="t": | |
288 LHE.extend(x[2] for x in unique_seq if x[2]!="*") | |
289 for x in unique_seq: | |
290 if x[2]!="*": | |
291 n_samples_mirna_names.append([x[2],x[9]]) | |
292 n_deseq.append([[x[2], x[0].split('-')[1], x[9]] for x in unique_seq if x[2]!="*"]) | |
293 n_LHE_names.extend(LHE) | |
294 names.append(name) | |
295 data.append([con,name,unique_seq,sorted_uni_arms]) | |
296 l.release() | |
297 | |
298 ################################################################################################################################################################################################################# | |
299 | |
300 def deseq2_temp(samples_mirna_names,deseq,con,l): | |
301 | |
302 samples_mirna_names.sort(key=lambda x:[0]) | |
303 | |
304 for i in range(len(deseq)): | |
305 for y in samples_mirna_names: | |
306 flag = 0 | |
307 for x in deseq[i]: | |
308 if y[0] == x[0]: | |
309 flag = 1 | |
310 break | |
311 | |
312 if flag == 0: | |
313 deseq[i].append([y[0], "0", y[1]]) | |
314 | |
315 [deseq[i].sort(key=lambda x: x[0]) for i, _ in enumerate(deseq)] | |
316 deseq_final = [[x[0],x[2]] for x in deseq[0]] | |
317 [deseq_final[z].append(deseq[i][j][1]) for z,_ in enumerate(deseq_final) for i, _ in enumerate(deseq) for j,_ in enumerate(deseq[i]) if deseq_final[z][0] == deseq[i][j][0]] | |
318 | |
319 l.acquire() | |
320 if con=="c": | |
321 q1.put(deseq_final) | |
322 | |
323 if con=="t": | |
324 q2.put(deseq_final) | |
325 l.release() | |
326 | |
327 | |
328 ################################################################################################################################################################################################################## | |
329 | |
330 def main_temp(LH2E, LH2E_names, LH8E, LH8E_names,flag,names_con,names_tre,filter_LH8E,filter_LH2E,raw_LH8E,raw_LH2E,per,count): | |
331 | |
332 LH8E_add_names = [x for x in LH2E_names if x not in LH8E_names] | |
333 LH2E_add_names = [x for x in LH8E_names if x not in LH2E_names] | |
334 | |
335 LH8E_add_names.sort() | |
336 LH2E_add_names.sort() | |
337 LH8E_add_names = list(LH8E_add_names for LH8E_add_names,_ in itertools.groupby(LH8E_add_names)) | |
338 LH2E_add_names = list(LH2E_add_names for LH2E_add_names,_ in itertools.groupby(LH2E_add_names)) | |
339 | |
340 LH2E.sort() | |
341 LH8E.sort() | |
342 LH2E = list(LH2E for LH2E,_ in itertools.groupby(LH2E)) | |
343 LH8E = list(LH8E for LH8E,_ in itertools.groupby(LH8E)) | |
344 | |
345 zeros=["0"]*(len(LH8E[0])-2) | |
346 [LH8E_add_names[i].extend(zeros) for i,_ in enumerate(LH8E_add_names)] | |
347 LH8E=LH8E+LH8E_add_names | |
348 | |
349 zeros=["0"]*(len(LH2E[0])-2) | |
350 [LH2E_add_names[i].extend(zeros) for i,_ in enumerate(LH2E_add_names)] | |
351 LH2E=LH2E+LH2E_add_names | |
352 | |
353 dupes=[] | |
354 final_LH2E =[] | |
355 | |
356 for num,_ in enumerate(LH2E): | |
357 | |
358 if LH2E[num][1] not in final_LH2E and LH2E[num][0] not in final_LH2E: | |
359 final_LH2E.append(LH2E[num][1]) | |
360 final_LH2E.append(LH2E[num][0]) | |
361 else: | |
362 dupes.append(LH2E[num][1]) | |
363 | |
364 dupes=list(set(dupes)) | |
365 | |
366 dupes=[[x] for x in dupes] | |
367 | |
368 for x in LH2E: | |
369 for y in dupes: | |
370 if x[1]==y[0]: | |
371 fl=0 | |
372 if len(y)==1: | |
373 y.append(x[0]) | |
374 else: | |
375 for i in range(1,len(y)): | |
376 if y[i].split("_")[0]+"_"+y[i].split("_")[1]==x[0].split("_")[0]+"_"+x[0].split("_")[1]: | |
377 fl=1 | |
378 if len(x[0])<len(y[i]): | |
379 del y[i] | |
380 y.append(x[0]) | |
381 break | |
382 | |
383 if fl==0: | |
384 y.append((x[0])) | |
385 | |
386 for y in dupes: | |
387 if len(y)>2: | |
388 for i in range(len(y)-1,1,-1): | |
389 y[1]=y[1]+"/"+y[i] | |
390 del y[i] | |
391 | |
392 | |
393 for x in LH2E: | |
394 for y in dupes: | |
395 if x[1]==y[0]: | |
396 x[0]=y[1] | |
397 | |
398 for x in LH8E: | |
399 for y in dupes: | |
400 if x[1]==y[0]: | |
401 x[0]=y[1] | |
402 | |
403 | |
404 | |
405 LH2E.sort() | |
406 LH2E=list(LH2E for LH2E,_ in itertools.groupby(LH2E)) | |
407 | |
408 LH8E.sort() | |
409 LH8E=list(LH8E for LH8E,_ in itertools.groupby(LH8E)) | |
410 | |
411 if int(per)!=-1: | |
412 percent=int(per)/100 | |
413 | |
414 c_col_filter=round(percent*(len(LH2E[1])-2)) | |
415 t_col_filter=round(percent*(len(LH8E[1])-2)) | |
416 | |
417 for i, _ in enumerate(LH2E): | |
418 c_cols=0 | |
419 t_cols=0 | |
420 | |
421 c_cols=sum([1 for j in range(len(LH2E[i])-2) if int(LH2E[i][j+2])>=int(count)]) | |
422 t_cols=sum([1 for j in range(len(LH8E[i])-2) if int(LH8E[i][j+2])>=int(count)]) | |
423 | |
424 if c_cols>=c_col_filter or t_cols>=t_col_filter: | |
425 filter_LH8E.append(LH8E[i]) | |
426 filter_LH2E.append(LH2E[i]) | |
427 | |
428 raw_LH2E.extend(LH2E) | |
429 raw_LH8E.extend(LH8E) | |
430 | |
431 ################################################################################################################################################################################################################## | |
432 | |
433 def write_main(raw_LH2E, raw_LH8E, fil_LH2E, fil_LH8E, names_con, names_tre, flag, per, n1, n2): | |
434 | |
435 if flag == 1 and int(per)!=-1: | |
436 fp = open('Counts/Filtered '+n2 +' Templated Counts', 'w') | |
437 fp.write("Name\t") | |
438 fp.write("Sequence") | |
439 for y in names_tre: | |
440 fp.write("\t"+y) | |
441 | |
442 for x in fil_LH8E: | |
443 fp.write("\n%s" % "\t".join(x)) | |
444 fp.close() | |
445 | |
446 fp = open('Counts/Filtered '+n1+' Templated Counts', 'w') | |
447 fp.write("Name\t") | |
448 fp.write("Sequence") | |
449 for y in names_con: | |
450 fp.write("\t"+y) | |
451 | |
452 for x in fil_LH2E: | |
453 fp.write("\n%s" % "\t".join(x)) | |
454 fp.close() | |
455 | |
456 | |
457 if flag == 2 and int(per)!=-1: | |
458 fp = open('Counts/Filtered '+n2+' Non-Templated Counts', 'w') | |
459 fp.write("Name\t") | |
460 fp.write("Sequence") | |
461 for y in names_tre: | |
462 fp.write("\t"+y) | |
463 | |
464 | |
465 for x in fil_LH8E: | |
466 fp.write("\n%s" % "\t".join(x)) | |
467 fp.close() | |
468 | |
469 fp = open('Counts/Filtered '+n1+' Non-Templated Counts', 'w') | |
470 fp.write("Name\t") | |
471 fp.write("Sequence") | |
472 for y in names_con: | |
473 fp.write("\t"+y) | |
474 | |
475 for x in fil_LH2E: | |
476 fp.write("\n%s" % "\t".join(x)) | |
477 fp.close() | |
478 | |
479 | |
480 if flag == 1: | |
481 fp = open('Counts/Raw '+n2+' Templated Counts', 'w') | |
482 fp.write("Name\t") | |
483 fp.write("Sequence") | |
484 for y in names_tre: | |
485 fp.write("\t"+y) | |
486 | |
487 for x in raw_LH8E: | |
488 fp.write("\n%s" % "\t".join(x)) | |
489 fp.close() | |
490 | |
491 fp = open('Counts/Raw '+n1+' Templated Counts', 'w') | |
492 fp.write("Name\t") | |
493 fp.write("Sequence") | |
494 for y in names_con: | |
495 fp.write("\t"+y) | |
496 | |
497 for x in raw_LH2E: | |
498 fp.write("\n%s" % "\t".join(x)) | |
499 fp.close() | |
500 | |
501 if flag == 2: | |
502 fp = open('Counts/Raw '+n2+' Non-Templated Counts', 'w') | |
503 fp.write("Name\t") | |
504 fp.write("Sequence") | |
505 for y in names_tre: | |
506 fp.write("\t"+y) | |
507 | |
508 | |
509 for x in raw_LH8E: | |
510 fp.write("\n%s" % "\t".join(x)) | |
511 fp.close() | |
512 | |
513 fp = open('Counts/Raw '+n1+' Non-Templated Counts', 'w') | |
514 fp.write("Name\t") | |
515 fp.write("Sequence") | |
516 for y in names_con: | |
517 fp.write("\t"+y) | |
518 | |
519 for x in raw_LH2E: | |
520 fp.write("\n%s" % "\t".join(x)) | |
521 fp.close() | |
522 | |
523 #################################################################################################################################################################################################################### | |
524 | |
525 def ssamples(names,samp,folder,pro): | |
526 | |
527 for i in range(2,len(samp[0])): | |
528 | |
529 fp = open(folder+names[i-2]+'.txt','w') | |
530 fp.write("miRNA id"+"\t"+names[i-2]+"\n") | |
531 | |
532 for x in samp: | |
533 fp.write("%s" % "\t".join([x[0],x[i]])+"\n") | |
534 fp.close() | |
535 | |
536 #################################################################################################################################################################################################################### | |
537 | |
538 def DB_write(con,name,unique_seq,sorted_uni_arms,f): | |
539 | |
540 if f==1: | |
541 # Write a txt file with all the information | |
542 if con=="c": | |
543 fp = open('split1/'+name, 'w') | |
544 | |
545 fp.write("%s\t%-42s\t%s\n\n" % ("Number of Reads","Name of isomir","Sequence")) | |
546 if con=="t": | |
547 fp = open('split2/'+name, 'w') | |
548 fp.write("%s\t%-42s\t%s\n\n" % ("Number of Reads","Name of isomir","Sequence")) | |
549 | |
550 for i in range(len(sorted_uni_arms)): | |
551 temp = [] | |
552 for j in range(len(unique_seq)): | |
553 | |
554 if sorted_uni_arms[i][0] in (unique_seq[j][2].split("_")[0]+"_"+unique_seq[j][2].split("_")[1]): | |
555 | |
556 temp.append(unique_seq[j]) | |
557 | |
558 temp = sorted(temp, key=lambda x: int(x[0].split('-')[1]), reverse=True) | |
559 fp.write("*********************************************************************************************************\n") | |
560 fp.write("%-8s\t%-22s\t%-25s\t%-30s\t%s\n" % ("|",str(sorted_uni_arms[i][0]),"Sequence count = "+str(sorted_uni_arms[i][1]),"Total reads = "+str(sorted_uni_arms[i][2]),"|")) | |
561 fp.write("*********************************************************************************************************\n\n") | |
562 [fp.write("%-8s\t%-40s\t%s\n" % (x[0].split("-")[1], x[2],x[9])) for x in temp] | |
563 fp.write("\n" + "\n") | |
564 fp.close() | |
565 | |
566 if f==2: | |
567 | |
568 if con=="c": | |
569 fp = open('split3/'+name, 'w') | |
570 fp.write("%s\t%-42s\t%s\n\n" % ("Number of Reads","Name of isomir","Sequence")) | |
571 if con=="t": | |
572 fp = open('split4/'+name, 'w') | |
573 fp.write("%s\t%-42s\t%s\n\n" % ("Number of Reads","Name of isomir","Sequence")) | |
574 | |
575 for i in range(len(sorted_uni_arms)): | |
576 temp = [] | |
577 for j in range(len(unique_seq)): | |
578 if sorted_uni_arms[i][0]==unique_seq[j][2].split("__")[0]: | |
579 temp.append(unique_seq[j]) | |
580 if temp!=[]: | |
581 temp = sorted(temp, key=lambda x: int(x[0].split('-')[1]), reverse=True) | |
582 fp.write("*********************************************************************************************************\n") | |
583 fp.write("%-8s\t%-22s\t%-25s\t%-30s\t%s\n" % ("|",str(sorted_uni_arms[i][0]),"Sequence count = "+str(sorted_uni_arms[i][1]),"Total reads = "+str(sorted_uni_arms[i][2]),"|")) | |
584 fp.write("*********************************************************************************************************\n\n") | |
585 [fp.write("%-8s\t%-40s\t%s\n" % (x[0].split("-")[1], x[2],x[9])) for x in temp] | |
586 fp.write("\n" + "\n") | |
587 fp.close() | |
588 | |
589 | |
590 ########################################################################################################################## | |
591 | |
592 def new_mat_seq(pre_unique_seq,mat_mirnas,l): | |
593 | |
594 unique_iso = [] | |
595 for x in pre_unique_seq: | |
596 if len(x[2].split("_"))==3: | |
597 for y in pre_unique_seq: | |
598 if x[2] in y[2] and int(x[0].split("-")[1])<int(y[0].split("-")[1]): | |
599 if any(y[2] in lst2 for lst2 in unique_iso)==False: | |
600 y[2]=">"+y[2] | |
601 unique_iso.append(y) | |
602 l.acquire() | |
603 for x in unique_iso: | |
604 mat_mirnas.append(x[2]) | |
605 mat_mirnas.append(x[9]) | |
606 l.release() | |
607 | |
608 ######################################################################################################################### | |
609 | |
610 def merging_names(LH2E_copy,new): | |
611 | |
612 dupes=[] | |
613 final_LH2E =[] | |
614 | |
615 for num in range(len(LH2E_copy)): | |
616 | |
617 if LH2E_copy[num][1] not in final_LH2E and LH2E_copy[num][0] not in final_LH2E: | |
618 final_LH2E.append(LH2E_copy[num][1]) | |
619 final_LH2E.append(LH2E_copy[num][0]) | |
620 else: | |
621 dupes.append(LH2E_copy[num][1]) | |
622 | |
623 dupes=list(set(dupes)) | |
624 | |
625 for i in range(len(dupes)): | |
626 dupes[i]=[dupes[i]] | |
627 | |
628 for x in LH2E_copy: | |
629 for y in dupes: | |
630 if x[1]==y[0]: | |
631 fl=0 | |
632 if len(y)==1: | |
633 y.append(x[0]) | |
634 else: | |
635 for i in range(1,len(y)): | |
636 if y[i].split("_")[0]+"_"+y[i].split("_")[1]==x[0].split("_")[0]+"_"+x[0].split("_")[1]: | |
637 fl=1 | |
638 if len(x[0])<len(y[i]): | |
639 del y[i] | |
640 y.append(x[0]) | |
641 break | |
642 | |
643 if fl==0: | |
644 y.append((x[0])) | |
645 | |
646 for y in dupes: | |
647 if len(y)>2: | |
648 for i in range(len(y)-1,1,-1): | |
649 y[1]=y[1]+"/"+y[i] | |
650 del y[i] | |
651 | |
652 | |
653 for x in LH2E_copy: | |
654 for y in dupes: | |
655 if x[1]==y[0]: | |
656 x[0]=y[1] | |
657 | |
658 LH2E_copy.sort() | |
659 LH2E_copy=list(LH2E_copy for LH2E_copy,_ in itertools.groupby(LH2E_copy)) | |
660 new.extend(LH2E_copy) | |
661 | |
662 ###################################################################################################################################################### | |
663 | |
664 def ssamples1(tem_names,tem_samp,non_names,non_samp,folder,pro): | |
665 | |
666 for i in range(2,len(tem_samp[0])): | |
667 | |
668 fp = open(folder+tem_names[i-2]+'.txt','w') | |
669 fp.write("miRNA id"+"\t"+tem_names[i-2]+"\n") | |
670 | |
671 for x in tem_samp: | |
672 fp.write("%s" % "\t".join([x[0],x[i]])+"\n") | |
673 | |
674 for j in range(len(non_names)): | |
675 if non_names[j]==tem_names[i-2]: | |
676 for x in non_samp: | |
677 fp.write("%s" % "\t".join([x[0],x[j+2]])+"\n") | |
678 fp.close() | |
679 | |
680 ################################################################################################################################################################################################################# | |
681 | |
682 def download_matures(matures,org_name): | |
683 | |
684 mature_mir=[] | |
685 | |
686 mat_url = 'http://mirgenedb.org/fasta/'+org_name+'?mat=1' | |
687 star_url = 'http://mirgenedb.org/fasta/'+org_name+'?star=1' | |
688 | |
689 data = urllib.request.urlopen(mat_url).read() | |
690 file_mirna = data.decode('utf-8') | |
691 mature_mir = file_mirna.split("\n") | |
692 mature_mir = [x.replace(">","") for x in mature_mir] | |
693 del mature_mir[-1] | |
694 | |
695 data = urllib.request.urlopen(star_url).read() | |
696 file_mirna = data.decode('utf-8') | |
697 star_mir = file_mirna.split("\n") | |
698 star_mir = [x.replace(">","") for x in star_mir] | |
699 del star_mir[-1] | |
700 | |
701 mature_mir.extend(star_mir) | |
702 | |
703 for i in range(1,len(mature_mir),2): | |
704 mature_mir[i]=mature_mir[i].replace("U","T") | |
705 | |
706 matures.extend(mature_mir) | |
707 | |
708 ################################################################################################################### | |
709 | |
710 def non_template_ref(sc,st,all_isoforms): | |
711 | |
712 pre_uni_seq_con = list(sc) | |
713 pre_uni_seq_tre = list(st) | |
714 | |
715 for x in pre_uni_seq_con: | |
716 for y in x: | |
717 if y[2] not in all_isoforms and len(y[2].split("_"))>2: | |
718 all_isoforms.append(y[2]) | |
719 all_isoforms.append(y[9]) | |
720 | |
721 for x in pre_uni_seq_tre: | |
722 for y in x: | |
723 if y[2] not in all_isoforms and len(y[2].split("_"))>2: | |
724 all_isoforms.append(y[2]) | |
725 all_isoforms.append(y[9]) | |
726 | |
727 ################################################################################################################################################################################################ | |
728 | |
729 def deseqe2(sample,mir_names,l,new_d,sample_name,sample_order): | |
730 | |
731 for y in mir_names: | |
732 flag=0 | |
733 for x in sample: | |
734 if y[0]==x[0]: | |
735 flag=1 | |
736 break | |
737 if flag==0: | |
738 sample.append([y[0],"0",y[1]]) | |
739 | |
740 sample.sort(key=lambda x: x[0]) | |
741 sample=list(sample for sample,_ in itertools.groupby(sample)) | |
742 | |
743 l.acquire() | |
744 new_d.append(sample) | |
745 sample_order.append(sample_name) | |
746 l.release() | |
747 | |
748 ############################################################################################################################################################################################### | |
749 |