annotate mirbase_functions.py @ 16:e19c832c5368 draft

Uploaded
author glogobyte
date Thu, 22 Oct 2020 07:47:58 +0000
parents
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
16
e19c832c5368 Uploaded
glogobyte
parents:
diff changeset
1 import itertools
e19c832c5368 Uploaded
glogobyte
parents:
diff changeset
2 import time
e19c832c5368 Uploaded
glogobyte
parents:
diff changeset
3 import sys
e19c832c5368 Uploaded
glogobyte
parents:
diff changeset
4 import os
e19c832c5368 Uploaded
glogobyte
parents:
diff changeset
5 import urllib.request
e19c832c5368 Uploaded
glogobyte
parents:
diff changeset
6 import gzip
e19c832c5368 Uploaded
glogobyte
parents:
diff changeset
7 from multiprocessing import Process, Queue, Lock, Pool, Manager, Value
e19c832c5368 Uploaded
glogobyte
parents:
diff changeset
8 import subprocess
e19c832c5368 Uploaded
glogobyte
parents:
diff changeset
9 import argparse
e19c832c5368 Uploaded
glogobyte
parents:
diff changeset
10 from collections import OrderedDict
e19c832c5368 Uploaded
glogobyte
parents:
diff changeset
11 from matplotlib.backends.backend_pdf import PdfPages
e19c832c5368 Uploaded
glogobyte
parents:
diff changeset
12 import pandas as pd
e19c832c5368 Uploaded
glogobyte
parents:
diff changeset
13 from math import pi
e19c832c5368 Uploaded
glogobyte
parents:
diff changeset
14 import numpy as np
e19c832c5368 Uploaded
glogobyte
parents:
diff changeset
15 import matplotlib.pyplot as plt
e19c832c5368 Uploaded
glogobyte
parents:
diff changeset
16 from matplotlib.ticker import PercentFormatter
e19c832c5368 Uploaded
glogobyte
parents:
diff changeset
17 import seaborn as sns
e19c832c5368 Uploaded
glogobyte
parents:
diff changeset
18 import scipy.stats as stats
e19c832c5368 Uploaded
glogobyte
parents:
diff changeset
19 from plotnine import *
e19c832c5368 Uploaded
glogobyte
parents:
diff changeset
20 import math
e19c832c5368 Uploaded
glogobyte
parents:
diff changeset
21 import re
e19c832c5368 Uploaded
glogobyte
parents:
diff changeset
22 import matplotlib.ticker as mtick
e19c832c5368 Uploaded
glogobyte
parents:
diff changeset
23 import copy
e19c832c5368 Uploaded
glogobyte
parents:
diff changeset
24
e19c832c5368 Uploaded
glogobyte
parents:
diff changeset
25
e19c832c5368 Uploaded
glogobyte
parents:
diff changeset
26 """---------------------- Simple Functions -----------------------"""
e19c832c5368 Uploaded
glogobyte
parents:
diff changeset
27
e19c832c5368 Uploaded
glogobyte
parents:
diff changeset
28 # Read a file and return it as a list
e19c832c5368 Uploaded
glogobyte
parents:
diff changeset
def read(path, flag):
    """Read a text file and return its lines as a list.

    Args:
        path: Path of the file to read.
        flag: Selects how lines are returned.
            0 -> ``readlines()``: every line keeps its trailing newline.
            1 -> ``read().splitlines()``: line terminators are removed.

    Returns:
        A list of strings, or ``None`` when ``flag`` is neither 0 nor 1
        (in that case the file is not opened at all).
    """
    if flag not in (0, 1):
        return None

    # The context manager closes the file; no explicit close() is needed.
    with open(path) as fp:
        if flag == 0:
            return fp.readlines()
        return fp.read().splitlines()
e19c832c5368 Uploaded
glogobyte
parents:
diff changeset
41
e19c832c5368 Uploaded
glogobyte
parents:
diff changeset
42 # Write a list to a txt file
e19c832c5368 Uploaded
glogobyte
parents:
diff changeset
def write(path, list):
    """Write the inner fields of every row to a text file.

    For each element ``x`` of ``list`` the slice ``x[1:-1]`` (everything
    except the first and the last item) is joined with tabs and written.

    Args:
        path: Destination file; it is created or truncated.
        list: Iterable of sequences of strings. The parameter name shadows
            the builtin but is kept because it is part of the signature.

    NOTE(review): nothing is written between rows (no newline is added);
    confirm that callers rely on line terminators already present in the data.
    """
    # The context manager closes the file; no explicit close() is needed.
    with open(path, 'w') as fp:
        fp.writelines("\t".join(x[1:-1]) for x in list)
e19c832c5368 Uploaded
glogobyte
parents:
diff changeset
48
e19c832c5368 Uploaded
glogobyte
parents:
diff changeset
49 """---------------------- RNA-seq Functions ----------------------"""
e19c832c5368 Uploaded
glogobyte
parents:
diff changeset
50
e19c832c5368 Uploaded
glogobyte
parents:
diff changeset
51 # Detect the longest common substring sequence between two mirnas
e19c832c5368 Uploaded
glogobyte
parents:
diff changeset
def longestSubstring(str1, str2):
    """Return the longest common contiguous substring of two sequences.

    Args:
        str1: First string; the returned substring is sliced from it.
        str2: Second string.

    Returns:
        The longest block shared by both strings. When several blocks have
        the maximal length, ``difflib`` picks the one starting earliest in
        ``str1``. If the strings share no character at all, a message is
        printed and ``None`` is returned.
    """
    from difflib import SequenceMatcher

    # autojunk must be disabled: with the default heuristic, any element that
    # is "popular" in a second sequence of 200+ items is ignored. With a
    # four-letter nucleotide alphabet that discards every character, so long
    # inputs would wrongly report no match.
    matcher = SequenceMatcher(None, str1, str2, autojunk=False)

    # The result looks like Match(a=0, b=0, size=5).
    match = matcher.find_longest_match(0, len(str1), 0, len(str2))

    if match.size != 0:
        return str1[match.a: match.a + match.size]

    print('No longest common sub-string found')
    return None
e19c832c5368 Uploaded
glogobyte
parents:
diff changeset
68
e19c832c5368 Uploaded
glogobyte
parents:
diff changeset
69
e19c832c5368 Uploaded
glogobyte
parents:
diff changeset
70
e19c832c5368 Uploaded
glogobyte
parents:
diff changeset
71 ########################################################################################################################################################
e19c832c5368 Uploaded
glogobyte
parents:
diff changeset
72
e19c832c5368 Uploaded
glogobyte
parents:
diff changeset
def collapse_sam(path):
    """Collapse rows of a SAM file that share reference name and sequence.

    Lines whose first tab-separated field contains "@" are treated as header
    rows and returned unchanged (split into fields). Every other row is
    grouped by the pair (field 2, field 9). For each distinct pair, in
    first-seen order, a single row is kept: the last row of the group, with
      * field 10 overwritten by a fixed run of "~" characters, and
      * field 0 replaced by "<group index>-<group size>" (index starts at 1).

    Args:
        path: Path of the SAM file, read through ``read(path, 0)``.

    Returns:
        A list of rows (lists of strings): header rows first, then one
        collapsed row per distinct (field 2, field 9) pair.
    """
    ini_sam = read(path, 0)

    intro_sam = []
    main_sam = []
    for line in ini_sam:
        fields = line.rstrip("\n").split("\t")
        if "@" in fields[0]:
            intro_sam.append(fields)
        else:
            main_sam.append(fields)

    # One pass instead of rescanning every row for every unique pair:
    # key -> [number of rows seen, last row seen], kept in first-seen order.
    groups = OrderedDict()
    for row in main_sam:
        key = (row[2], row[9])
        entry = groups.get(key)
        if entry is None:
            groups[key] = [1, row]
        else:
            entry[0] += 1
            entry[1] = row

    new_main_sam = []
    for incr_num, (count, last_row) in enumerate(groups.values(), start=1):
        # The kept row is modified in place, as before.
        last_row[10] = "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~"
        last_row[0] = str(incr_num) + "-" + str(count)
        new_main_sam.append(last_row)

    return intro_sam + new_main_sam
e19c832c5368 Uploaded
glogobyte
parents:
diff changeset
101
e19c832c5368 Uploaded
glogobyte
parents:
diff changeset
102 #################################################################################################################################################################################################################
e19c832c5368 Uploaded
glogobyte
parents:
diff changeset
103
e19c832c5368 Uploaded
glogobyte
parents:
diff changeset
def duplicate_chroms_isoforms(List):
    """Merge rows that map the same read to one miRNA on several chromosomes.

    Each row is a list of strings where field 0 is the read identifier,
    field 2 the reference name and field 9 the sequence (presumably SAM
    columns; confirm against the caller). Reference names are split on "_";
    the first part is the base name, and the text after "chr" holds the
    chromosome and strand, e.g. "chr20(+)".

    Rows sharing identifier, sequence and base name but carrying different
    reference names are given one combined name of the form
    "<base>_chr<A>(<strand>)_chr<B>(<strand>)...":
      * a reference name with more than 3 "_" parts keeps its last two parts
        as a suffix and is renamed only in the copied list;
      * a reference name with exactly 3 "_" parts is renamed in both the
        copied list and ``List`` itself.

    Args:
        List: list of rows. It is modified in place (renamed and sorted).

    Returns:
        ``(new_list, new_double_List)``: the sorted, de-duplicated ``List``
        and the sorted, de-duplicated renamed copy.
    """
    # Unique [sequence, id, reference name] triplets in first-seen order.
    dupes = []
    seen = set()
    for row in List:
        key = (row[9], row[0], row[2])
        if key not in seen:
            seen.add(key)
            dupes.append(list(key))

    # Append to every triplet the *other* reference names that share its
    # sequence, id and base name. Triplets are indexed by that key so each
    # row only visits its own candidates.
    by_base = {}
    for entry in dupes:
        by_base.setdefault((entry[0], entry[1], entry[2].split("_")[0]), []).append(entry)
    for row in List:
        for entry in by_base.get((row[9], row[0], row[2].split("_")[0]), ()):
            if row[2] != entry[2]:
                entry.append(row[2])

    # Independent copy of the rows, taken before any renaming.
    double_List = [row[:] for row in List]

    # Per triplet: sorted, de-duplicated chromosome keys. Numeric chromosomes
    # become signed ints (sign = strand); others keep 4 chars, e.g. "X(+)".
    chr_order = []
    for entry in dupes:
        temp = []
        for ref_name in entry[2:]:
            chrom = ref_name.split("chr")[1]
            number = chrom.split("(")[0]
            if number.isdigit():
                temp.append(int(chrom.split("(")[1][0] + number))
            else:
                temp.append(chrom[0:4])

        # ints and strings cannot be sorted together; previously only X/Y
        # triggered this conversion, so other names (e.g. "M(+)") next to a
        # numeric chromosome raised TypeError in sort().
        if any(isinstance(z, str) for z in temp):
            temp = [str(j) for j in temp]
        chr_order.append(sorted(set(temp)))

    signed_int = re.compile(r"[-+]?\d+$")

    final_dupes = []
    for entry, chroms in zip(dupes, chr_order):
        label = entry[2].split("_")[0]
        # chroms comes from a set, so this is always True here; the
        # alternative naming branch is kept as it was in the original code.
        all_distinct = len(chroms) == len(set(chroms))
        for chrom in chroms:
            if signed_int.match(str(chrom)) is not None:
                strand = "(-)" if int(chrom) < 0 else "(+)"
                suffix = "_chr" + str(abs(int(chrom))) + strand
            else:
                suffix = "_chr" + str(chrom)
            if not all_distinct:
                suffix = entry[2].split("_")[1] + suffix
            label = label + suffix
        final_dupes.append([entry[0], label, entry[1]])

    final_dupes.sort()
    final_dupes = [key for key, _ in itertools.groupby(final_dupes)]

    # Index the combined names by (sequence, id), keeping sorted order.
    by_read = {}
    for merged in final_dupes:
        by_read.setdefault((merged[0], merged[2]), []).append(merged)

    for i, row in enumerate(double_List):
        for merged in by_read.get((row[9], row[0]), ()):
            merged_base = merged[1].split("_")[0]

            parts = row[2].split("_")
            if len(parts) > 3 and parts[0] == merged_base:
                row[2] = merged[1] + "_" + parts[-2] + "_" + parts[-1]

            # Re-split: the branch above may just have renamed the row.
            parts = row[2].split("_")
            if len(parts) == 3 and parts[0] == merged_base:
                row[2] = merged[1]
                List[i][2] = merged[1]

    List.sort()
    new_list = [key for key, _ in itertools.groupby(List)]

    double_List.sort()
    new_double_List = [key for key, _ in itertools.groupby(double_List)]

    return new_list, new_double_List
e19c832c5368 Uploaded
glogobyte
parents:
diff changeset
181
e19c832c5368 Uploaded
glogobyte
parents:
diff changeset
182
e19c832c5368 Uploaded
glogobyte
parents:
diff changeset
183 #############################################################################################################################################################################################################
e19c832c5368 Uploaded
glogobyte
parents:
diff changeset
184
e19c832c5368 Uploaded
glogobyte
parents:
diff changeset
def sam(mature_mirnas,path,name,con,l,samples,data,names,unmap_seq,samples_mirna_names,deseq,LHE_names,umi,ini_sample,unmap_counts):
    """Annotate the mapped reads of one SAM file and publish the results.

    Steps:
      1. Read the file and drop header lines (first field contains "@").
      2. Keep rows whose field 1 is '0' and whose sequence (field 9) is
         18 to 26 characters long.
      3. For every entry of ``mature_mirnas`` whose name equals the row's
         reference base name, compare the read with the mature sequence
         stored on the following entry. Reads that differ from the mature
         sequence inside the shared part are discarded; for the others the
         5'/3' length offsets are appended to the reference name as
         "_<pre>_<post>" when at least one offset is non-zero.
      4. Count sequences and reads per mature miRNA, merge multi-chromosome
         names with ``duplicate_chroms_isoforms`` and recount.
      5. While holding ``l``, append the results to the shared containers,
         but only when ``con`` is "c" or "t".

    Args:
        mature_mirnas: list of strings; entry ``i`` is a header whose first
            space-separated token (minus its first character) is the name
            and entry ``i + 1`` is the sequence (presumably FASTA lines;
            confirm against the caller).
        path: SAM file to read.
        name: sample name appended to ``names`` and ``data``.
        con: condition flag; only "c" and "t" publish results.
        l: lock-like object exposing ``acquire()`` and ``release()``.
        samples, data, names, samples_mirna_names, deseq, LHE_names,
        ini_sample: list-like containers receiving the results.
        unmap_seq, unmap_counts: objects with a numeric ``value`` attribute,
            increased by the number of rows with field 1 == '4' and by their
            summed read counts.
        umi: unused.

    Returns:
        None. All results are delivered through the containers above.
    """
    # read the sam file
    ini_sam = read(path, 0)
    new_main_sam = [x.rstrip("\n").split("\t") for x in ini_sam if "@" not in x.split("\t")[0]]
    unique_seq = [x for x in new_main_sam if x[1] == '0' and 18 <= len(x[9]) <= 26]

    sorted_uni_arms = []

    for i in range(len(mature_mirnas)):
        mature_name = mature_mirnas[i].split(" ")[0][1:]
        tmp_count_reads = 0   # total number of reads
        tmp_count_seq = 0     # total number of sequences

        for j in range(len(unique_seq)):
            row = unique_seq[j]

            base = row[2].split("_")[0]
            # Names containing "{" carry a 4-character tail that is dropped.
            official = base[:-4] if "{" in base else base

            if mature_name != official:
                continue

            temp_mature = mature_mirnas[i+1].strip().replace("U", "T")
            off_part = longestSubstring(temp_mature, row[9])

            mat_diff = temp_mature.split(off_part)
            mat_diff = [len(mat_diff[0]), len(mat_diff[1])]

            unique_diff = row[9].split(off_part)
            unique_diff = [len(unique_diff[0]), len(unique_diff[1])]

            # Problem with hsa-miR-8485
            # Both sequences continue past the shared part on the same side:
            # the read is not a plain length variant, so mark it for removal.
            if (mat_diff[1] != 0 and unique_diff[1] != 0) or (mat_diff[0] != 0 and unique_diff[0] != 0):
                unique_seq[j] = 1
                continue

            pre_pos = mat_diff[0] - unique_diff[0]
            post_pos = unique_diff[1] - mat_diff[1]
            tmp_count_reads = tmp_count_reads + int(row[0].split("-")[1])
            tmp_count_seq = tmp_count_seq + 1

            if pre_pos != 0 or post_pos != 0:
                # A zero offset is written as "0", any other with its sign.
                if pre_pos == 0:
                    row[2] = row[2] + "_" + str(pre_pos) + "_" + '{:+d}'.format(post_pos)
                elif post_pos == 0:
                    row[2] = row[2] + "_" + '{:+d}'.format(pre_pos) + "_" + str(post_pos)
                else:
                    row[2] = row[2] + "_" + '{:+d}'.format(pre_pos) + "_" + '{:+d}'.format(post_pos)

        # Drop the rows marked with 1 before the next mature miRNA is checked.
        unique_seq = [row for row in unique_seq if row != 1]

        if tmp_count_reads != 0 and tmp_count_seq != 0:
            sorted_uni_arms.append([mature_name, tmp_count_seq, tmp_count_reads])

    sorted_uni_arms = sorted(sorted_uni_arms, key=lambda x: x[1], reverse=True)
    dedup_unique_seq, double_fil_uni_seq = duplicate_chroms_isoforms(unique_seq)

    # Recount per arm after merging; the order fixed by the sort above stays.
    for arm in sorted_uni_arms:
        counts = 0
        seqs = 0
        for x in double_fil_uni_seq:
            if arm[0] == x[2].split("_")[0]:
                counts += int(x[0].split("-")[1])
                seqs += 1

        arm[1] = seqs
        arm[2] = counts

    # try/finally guarantees the lock is released even if an append fails;
    # otherwise every other holder of the lock would block forever.
    l.acquire()
    try:
        # The "c" and "t" branches were identical, so they are handled once.
        if con in ("c", "t"):
            LHE = [z[2] for z in double_fil_uni_seq]
            for y in double_fil_uni_seq:
                samples_mirna_names.append([y[2], y[9]])
            deseq.append([[x[2], x[0].split('-')[1], x[9]] for x in double_fil_uni_seq])
            LHE_names.extend(LHE)
            unmapped = [x for x in new_main_sam if x[1] == '4']
            unmap_seq.value += len(unmapped)
            unmap_counts.value += sum(int(x[0].split("-")[1]) for x in unmapped)
            names.append(name)
            samples.append(dedup_unique_seq)
            data.append([con, name, double_fil_uni_seq, sorted_uni_arms])
            ini_sample.append(new_main_sam)
    finally:
        l.release()
e19c832c5368 Uploaded
glogobyte
parents:
diff changeset
286
e19c832c5368 Uploaded
glogobyte
parents:
diff changeset
287
e19c832c5368 Uploaded
glogobyte
parents:
diff changeset
288 ######################################################################################################################################
e19c832c5368 Uploaded
glogobyte
parents:
diff changeset
289
e19c832c5368 Uploaded
glogobyte
parents:
diff changeset
290 """
e19c832c5368 Uploaded
glogobyte
parents:
diff changeset
291
e19c832c5368 Uploaded
glogobyte
parents:
diff changeset
292 Read a sam file from Bowtie and do the following:
e19c832c5368 Uploaded
glogobyte
parents:
diff changeset
293
e19c832c5368 Uploaded
glogobyte
parents:
diff changeset
294 1) Remove reverse stranded mapped reads
e19c832c5368 Uploaded
glogobyte
parents:
diff changeset
295 2) Remove unmapped reads
e19c832c5368 Uploaded
glogobyte
parents:
diff changeset
296 3) Remove all sequences with reads less than 11 reads
e19c832c5368 Uploaded
glogobyte
parents:
diff changeset
297 4) Sort the arms with the most sequences in decreasing rate
e19c832c5368 Uploaded
glogobyte
parents:
diff changeset
298 5) Sort the sequences of every arm with the most reads in decreasing rate
e19c832c5368 Uploaded
glogobyte
parents:
diff changeset
299 6) Calculate total number of sequences of every arm
e19c832c5368 Uploaded
glogobyte
parents:
diff changeset
300 7) Calculate total number of reads of sequences of every arm.
e19c832c5368 Uploaded
glogobyte
parents:
diff changeset
301 8) Store all the information in a txt file
e19c832c5368 Uploaded
glogobyte
parents:
diff changeset
302
e19c832c5368 Uploaded
glogobyte
parents:
diff changeset
303 """
e19c832c5368 Uploaded
glogobyte
parents:
diff changeset
304
e19c832c5368 Uploaded
glogobyte
parents:
diff changeset
305 def non_sam(mature_mirnas,path,name,con,l,data,names,n_deseq,n_samples_mirna_names,n_LHE_names):
e19c832c5368 Uploaded
glogobyte
parents:
diff changeset
306
e19c832c5368 Uploaded
glogobyte
parents:
diff changeset
307 ini_sam=read(path,0)
e19c832c5368 Uploaded
glogobyte
parents:
diff changeset
308 new_main_sam = [x.rstrip("\n").split("\t") for x in ini_sam if "@" not in x.split("\t")[0]]
e19c832c5368 Uploaded
glogobyte
parents:
diff changeset
309 unique_seq=[]
e19c832c5368 Uploaded
glogobyte
parents:
diff changeset
310 unique_seq = [x for x in new_main_sam if x[1] == '4' and len(x[9])>=18 and len(x[9])<=26]
e19c832c5368 Uploaded
glogobyte
parents:
diff changeset
311
e19c832c5368 Uploaded
glogobyte
parents:
diff changeset
312 uni_seq=[]
e19c832c5368 Uploaded
glogobyte
parents:
diff changeset
313 # Calculate the shifted positions for every isomir and add them to the name of it
e19c832c5368 Uploaded
glogobyte
parents:
diff changeset
314 sorted_uni_arms = []
e19c832c5368 Uploaded
glogobyte
parents:
diff changeset
315 for i in range(1,len(mature_mirnas),2):
e19c832c5368 Uploaded
glogobyte
parents:
diff changeset
316 tmp_count_reads = 0 # calculate the total number of reads
e19c832c5368 Uploaded
glogobyte
parents:
diff changeset
317 tmp_count_seq = 0 # calculate the total number of sequences
e19c832c5368 Uploaded
glogobyte
parents:
diff changeset
318
e19c832c5368 Uploaded
glogobyte
parents:
diff changeset
319 for j in range(len(unique_seq)):
e19c832c5368 Uploaded
glogobyte
parents:
diff changeset
320
e19c832c5368 Uploaded
glogobyte
parents:
diff changeset
321 temp_mature = mature_mirnas[i].strip().replace("U", "T")
e19c832c5368 Uploaded
glogobyte
parents:
diff changeset
322
e19c832c5368 Uploaded
glogobyte
parents:
diff changeset
323 if temp_mature in unique_seq[j][9]:
e19c832c5368 Uploaded
glogobyte
parents:
diff changeset
324
e19c832c5368 Uploaded
glogobyte
parents:
diff changeset
325 off_part = longestSubstring(temp_mature, unique_seq[j][9])
e19c832c5368 Uploaded
glogobyte
parents:
diff changeset
326
e19c832c5368 Uploaded
glogobyte
parents:
diff changeset
327 mat_diff = temp_mature.split(off_part)
e19c832c5368 Uploaded
glogobyte
parents:
diff changeset
328 mat_diff = [len(mat_diff[0]), len(mat_diff[1])]
e19c832c5368 Uploaded
glogobyte
parents:
diff changeset
329
e19c832c5368 Uploaded
glogobyte
parents:
diff changeset
330 unique_diff = unique_seq[j][9].split(off_part)
e19c832c5368 Uploaded
glogobyte
parents:
diff changeset
331 if len(unique_diff)<=2:
e19c832c5368 Uploaded
glogobyte
parents:
diff changeset
332 unique_diff = [len(unique_diff[0]), len(unique_diff[1])]
e19c832c5368 Uploaded
glogobyte
parents:
diff changeset
333
e19c832c5368 Uploaded
glogobyte
parents:
diff changeset
334 pre_pos = mat_diff[0]-unique_diff[0]
e19c832c5368 Uploaded
glogobyte
parents:
diff changeset
335 post_pos = unique_diff[1]-mat_diff[1]
e19c832c5368 Uploaded
glogobyte
parents:
diff changeset
336
e19c832c5368 Uploaded
glogobyte
parents:
diff changeset
337 lengthofmir = len(off_part) + post_pos
e19c832c5368 Uploaded
glogobyte
parents:
diff changeset
338 if pre_pos == 0:
e19c832c5368 Uploaded
glogobyte
parents:
diff changeset
339 tmp_count_reads = tmp_count_reads + int(unique_seq[j][0].split("-")[1])
e19c832c5368 Uploaded
glogobyte
parents:
diff changeset
340 tmp_count_seq = tmp_count_seq + 1
e19c832c5368 Uploaded
glogobyte
parents:
diff changeset
341
e19c832c5368 Uploaded
glogobyte
parents:
diff changeset
342 if pre_pos == 0:
e19c832c5368 Uploaded
glogobyte
parents:
diff changeset
343
e19c832c5368 Uploaded
glogobyte
parents:
diff changeset
344 t_name=unique_seq[j].copy()
e19c832c5368 Uploaded
glogobyte
parents:
diff changeset
345 t_name[2]=mature_mirnas[i - 1].split(" ")[0][1:] + "__" + str(pre_pos) + "_" + '{:+d}'.format(post_pos) + "_" + str(unique_seq[j][9][len(off_part):])
e19c832c5368 Uploaded
glogobyte
parents:
diff changeset
346 uni_seq.append(t_name)
e19c832c5368 Uploaded
glogobyte
parents:
diff changeset
347
e19c832c5368 Uploaded
glogobyte
parents:
diff changeset
348
e19c832c5368 Uploaded
glogobyte
parents:
diff changeset
349 if tmp_count_reads != 0 and tmp_count_seq != 0:
e19c832c5368 Uploaded
glogobyte
parents:
diff changeset
350 sorted_uni_arms.append([mature_mirnas[i-1].split(" ")[0][1:], tmp_count_seq, tmp_count_reads])
e19c832c5368 Uploaded
glogobyte
parents:
diff changeset
351
e19c832c5368 Uploaded
glogobyte
parents:
diff changeset
352
e19c832c5368 Uploaded
glogobyte
parents:
diff changeset
353 sorted_uni_arms = sorted(sorted_uni_arms, key=lambda x: x[1], reverse=True)
e19c832c5368 Uploaded
glogobyte
parents:
diff changeset
354 unique_seq = list(map(list, OrderedDict.fromkeys(map(tuple,uni_seq))))
e19c832c5368 Uploaded
glogobyte
parents:
diff changeset
355
e19c832c5368 Uploaded
glogobyte
parents:
diff changeset
356 LHE=[]
e19c832c5368 Uploaded
glogobyte
parents:
diff changeset
357
e19c832c5368 Uploaded
glogobyte
parents:
diff changeset
358 l.acquire()
e19c832c5368 Uploaded
glogobyte
parents:
diff changeset
359 if con=="c":
e19c832c5368 Uploaded
glogobyte
parents:
diff changeset
360 LHE.extend(x[2] for x in unique_seq if x[2]!="*")
e19c832c5368 Uploaded
glogobyte
parents:
diff changeset
361 for x in unique_seq:
e19c832c5368 Uploaded
glogobyte
parents:
diff changeset
362 if x[2]!="*":
e19c832c5368 Uploaded
glogobyte
parents:
diff changeset
363 n_samples_mirna_names.append([x[2],x[9]])
e19c832c5368 Uploaded
glogobyte
parents:
diff changeset
364 n_deseq.append([[x[2], x[0].split('-')[1], x[9]] for x in unique_seq if x[2]!="*"])
e19c832c5368 Uploaded
glogobyte
parents:
diff changeset
365 n_LHE_names.extend(LHE)
e19c832c5368 Uploaded
glogobyte
parents:
diff changeset
366 names.append(name)
e19c832c5368 Uploaded
glogobyte
parents:
diff changeset
367 data.append([con,name,unique_seq,sorted_uni_arms])
e19c832c5368 Uploaded
glogobyte
parents:
diff changeset
368
e19c832c5368 Uploaded
glogobyte
parents:
diff changeset
369
e19c832c5368 Uploaded
glogobyte
parents:
diff changeset
370 if con=="t":
e19c832c5368 Uploaded
glogobyte
parents:
diff changeset
371 LHE.extend(x[2] for x in unique_seq if x[2]!="*")
e19c832c5368 Uploaded
glogobyte
parents:
diff changeset
372 for x in unique_seq:
e19c832c5368 Uploaded
glogobyte
parents:
diff changeset
373 if x[2]!="*":
e19c832c5368 Uploaded
glogobyte
parents:
diff changeset
374 n_samples_mirna_names.append([x[2],x[9]])
e19c832c5368 Uploaded
glogobyte
parents:
diff changeset
375 n_deseq.append([[x[2], x[0].split('-')[1], x[9]] for x in unique_seq if x[2]!="*"])
e19c832c5368 Uploaded
glogobyte
parents:
diff changeset
376 n_LHE_names.extend(LHE)
e19c832c5368 Uploaded
glogobyte
parents:
diff changeset
377 names.append(name)
e19c832c5368 Uploaded
glogobyte
parents:
diff changeset
378 data.append([con,name,unique_seq,sorted_uni_arms])
e19c832c5368 Uploaded
glogobyte
parents:
diff changeset
379 l.release()
e19c832c5368 Uploaded
glogobyte
parents:
diff changeset
380
e19c832c5368 Uploaded
glogobyte
parents:
diff changeset
381 #####################################################################################################################################################################################################################
e19c832c5368 Uploaded
glogobyte
parents:
diff changeset
def deseq2_temp(samples_mirna_names,deseq,con,l):
    """Give every sample the same set of names and queue the merged count table.

    Parameters
    ----------
    samples_mirna_names : list
        ``[name, sequence]`` pairs. Sorted in place by name.
    deseq : list
        One list per sample, each holding ``[name, count, sequence]`` rows.
        Modified in place: names missing from a sample are appended with a
        count of ``"0"`` and every sample is then sorted by name.
    con : str
        ``"c"`` puts the merged table on ``q1``, ``"t"`` puts it on ``q2``.
        Any other value builds the table but queues nothing.
    l : lock
        Held while the table is queued; must support the ``with`` statement.

    The merged table has one row per entry of the first sample, shaped
    ``[name, sequence, count, count, ...]`` with the counts taken from the
    samples in order.
    """

    # The previous key was ``lambda x:[0]`` (a constant list), which left the
    # list unsorted; sort on the name as the expression was meant to do.
    samples_mirna_names.sort(key=lambda entry: entry[0])

    for sample in deseq:
        present = {row[0] for row in sample}
        for entry in samples_mirna_names:
            if entry[0] not in present:
                sample.append([entry[0], "0", entry[1]])
                present.add(entry[0])
        sample.sort(key=lambda row: row[0])

    # An empty input yields an empty table instead of an IndexError.
    deseq_final = [[row[0], row[2]] for row in deseq[0]] if deseq else []

    # Index the counts of each sample by name once, so that building the
    # merged rows does not rescan every sample for every name.
    counts_per_sample = []
    for sample in deseq:
        counts_by_name = {}
        for row in sample:
            counts_by_name.setdefault(row[0], []).append(row[1])
        counts_per_sample.append(counts_by_name)

    for merged_row in deseq_final:
        for counts_by_name in counts_per_sample:
            merged_row.extend(counts_by_name.get(merged_row[0], ()))

    # NOTE(review): q1 and q2 are not parameters and are not defined in this
    # function; presumably they are module-level queues - confirm they exist
    # before this function runs.
    with l:
        if con=="c":
            q1.put(deseq_final)

        if con=="t":
            q2.put(deseq_final)
e19c832c5368 Uploaded
glogobyte
parents:
diff changeset
407
e19c832c5368 Uploaded
glogobyte
parents:
diff changeset
408
e19c832c5368 Uploaded
glogobyte
parents:
diff changeset
409 ####################################################################################################################################################################################################################
e19c832c5368 Uploaded
glogobyte
parents:
diff changeset
410
e19c832c5368 Uploaded
glogobyte
parents:
diff changeset
def _dedupe_sorted_rows(rows):
    """Return the distinct rows of an already sorted list, keeping their order."""
    return [row for row, _ in itertools.groupby(rows)]


def _missing_name_rows(own_names, other_names, zero_count):
    """Build zero-count rows for the entries of other_names absent from own_names."""
    known = {tuple(entry) for entry in own_names}
    # list(entry) copies each entry so the caller's name lists are not padded.
    missing = sorted(list(entry) for entry in other_names if tuple(entry) not in known)
    return [row + ["0"] * zero_count for row in _dedupe_sorted_rows(missing)]


def _merged_names_by_sequence(rows):
    """Map each sequence flagged as duplicated in rows to its combined name."""
    seen = set()
    shared = {}
    for row in rows:
        # A row is flagged when its sequence or its name was met earlier.
        if row[1] in seen or row[0] in seen:
            shared.setdefault(row[1], [])
        else:
            seen.add(row[1])
            seen.add(row[0])

    for row in rows:
        names = shared.get(row[1])
        if names is None:
            continue
        prefix = row[0].split("_")[0]
        for pos, known_name in enumerate(names):
            if known_name.split("_")[0] == prefix:
                # Same prefix: keep only the shorter of the two names.
                if len(row[0]) < len(known_name):
                    del names[pos]
                    names.append(row[0])
                break
        else:
            names.append(row[0])

    # First collected name, then the remaining ones from last to first.
    return {seq: "/".join(names[:1] + names[:0:-1]) for seq, names in shared.items()}


def main_temp(LH2E, LH2E_names, LH8E, LH8E_names,flag,names_con,names_tre,filter_LH8E,filter_LH2E,raw_LH8E,raw_LH2E,per,count):
    """Align the two count tables, merge shared-sequence names and filter rows.

    Parameters
    ----------
    LH2E, LH8E : list
        Count tables, one ``[name, sequence, count, ...]`` row per entry.
        Both lists are sorted in place and their rows may be renamed in place.
    LH2E_names, LH8E_names : list
        ``[name, sequence]`` entries of each table. An entry found in only
        one of them is added to the other table with ``"0"`` counts.
    flag : unused.
    names_con, names_tre : list
        Only consulted when LH2E (resp. LH8E) has no rows: their length then
        gives the number of zero counts to pad with.
    filter_LH8E, filter_LH2E : list
        Receive the rows that pass the filter (only when ``per`` is not -1).
    raw_LH8E, raw_LH2E : list
        Receive every row of the aligned tables.
    per : int or str
        Percentage of count columns that must reach ``count``; -1 disables
        filtering.
    count : int or str
        Minimum value a count must reach to be tallied by the filter.

    A row is kept by the filter when either table has at least the required
    number of columns reaching ``count``; rows of the two tables are paired
    by position.
    """

    LH2E.sort()
    LH8E.sort()
    LH2E = _dedupe_sorted_rows(LH2E)
    LH8E = _dedupe_sorted_rows(LH8E)

    # Number of count columns per table. With no rows to measure, fall back
    # on the number of sample names instead of failing on row 0.
    treated_width = len(LH8E[0]) - 2 if LH8E else len(names_tre)
    control_width = len(LH2E[0]) - 2 if LH2E else len(names_con)

    LH8E = LH8E + _missing_name_rows(LH8E_names, LH2E_names, treated_width)
    LH2E = LH2E + _missing_name_rows(LH2E_names, LH8E_names, control_width)

    # Duplicates are detected on LH2E only; the resulting names are applied
    # to both tables so that they stay aligned.
    merged_names = _merged_names_by_sequence(LH2E)
    for rows in (LH2E, LH8E):
        for row in rows:
            if row[1] in merged_names:
                row[0] = merged_names[row[1]]

    LH2E.sort()
    LH2E = _dedupe_sorted_rows(LH2E)

    LH8E.sort()
    LH8E = _dedupe_sorted_rows(LH8E)

    if int(per) != -1 and LH2E:
        percent = int(per) / 100
        threshold = int(count)

        # Row 0 is used for the width (row 1 does not exist in a one-row table).
        c_col_filter = round(percent * (len(LH2E[0]) - 2))
        t_col_filter = round(percent * (len(LH8E[0]) - 2))

        for i, control_row in enumerate(LH2E):
            treated_row = LH8E[i]
            c_cols = sum(1 for cell in control_row[2:] if int(cell) >= threshold)
            t_cols = sum(1 for cell in treated_row[2:] if int(cell) >= threshold)

            if c_cols >= c_col_filter or t_cols >= t_col_filter:
                filter_LH8E.append(treated_row)
                filter_LH2E.append(control_row)

    raw_LH2E.extend(LH2E)
    raw_LH8E.extend(LH8E)
e19c832c5368 Uploaded
glogobyte
parents:
diff changeset
510
e19c832c5368 Uploaded
glogobyte
parents:
diff changeset
511 ##################################################################################################################################################################################################################
e19c832c5368 Uploaded
glogobyte
parents:
diff changeset
512
e19c832c5368 Uploaded
glogobyte
parents:
diff changeset
def _write_counts_table(path, sample_names, rows):
    """Write one tab-separated counts table: a header line, then one line per row."""
    with open(path, 'w') as fp:
        fp.write("Name\t")
        fp.write("Sequence")
        for sample in sample_names:
            fp.write("\t" + sample)

        for row in rows:
            fp.write("\n%s" % "\t".join(row))


def write_main(raw_LH2E, raw_LH8E, fil_LH2E, fil_LH8E, names_con, names_tre, flag, per, n1, n2):
    """Write the raw and, when filtering is enabled, the filtered count tables.

    Files are created in the ``Counts/`` directory, which must already exist.

    Parameters
    ----------
    raw_LH2E, fil_LH2E : list
        Rows written to the ``n1`` tables, under the ``names_con`` header.
    raw_LH8E, fil_LH8E : list
        Rows written to the ``n2`` tables, under the ``names_tre`` header.
    names_con, names_tre : list of str
        Column titles that follow ``Name`` and ``Sequence``.
    flag : int
        1 writes the ``Templated`` files, 2 the ``Non-Templated`` files; any
        other value writes nothing.
    per : int or str
        The ``Filtered`` files are written only when ``int(per)`` is not -1.
    n1, n2 : str
        Labels inserted in the file names.

    Every row must be a list of strings. Files end without a trailing newline.
    """

    if flag == 1:
        label = 'Templated'
    elif flag == 2:
        label = 'Non-Templated'
    else:
        return

    if int(per) != -1:
        _write_counts_table('Counts/Filtered ' + n2 + ' ' + label + ' Counts', names_tre, fil_LH8E)
        _write_counts_table('Counts/Filtered ' + n1 + ' ' + label + ' Counts', names_con, fil_LH2E)

    _write_counts_table('Counts/Raw ' + n2 + ' ' + label + ' Counts', names_tre, raw_LH8E)
    _write_counts_table('Counts/Raw ' + n1 + ' ' + label + ' Counts', names_con, raw_LH2E)
e19c832c5368 Uploaded
glogobyte
parents:
diff changeset
603
e19c832c5368 Uploaded
glogobyte
parents:
diff changeset
604
e19c832c5368 Uploaded
glogobyte
parents:
diff changeset
605 #########################################################################################################################################
e19c832c5368 Uploaded
glogobyte
parents:
diff changeset
606
e19c832c5368 Uploaded
glogobyte
parents:
diff changeset
def ssamples(names,samp,folder,pro):
    """Write one two-column text file per count column of ``samp``.

    Parameters
    ----------
    names : list of str
        File and column title for each count column; ``names[k]`` belongs to
        column ``k + 2`` of the rows.
    samp : list
        Rows whose first item is written to the first output column and
        whose items from index 2 onwards are the per-file values. The number
        of files is taken from the length of the first row. All written
        items must be strings.
    folder : str
        Prefix prepended as-is to every file name (include the trailing
        separator).
    pro : unused.

    Nothing is written when ``samp`` is empty.
    """

    if not samp:
        return

    for column in range(2, len(samp[0])):
        sample = names[column - 2]

        # ``with`` closes the file even if a row is malformed.
        with open(folder + sample + '.txt', 'w') as fp:
            fp.write("miRNA id" + "\t" + sample + "\n")

            for row in samp:
                fp.write("\t".join([row[0], row[column]]) + "\n")
e19c832c5368 Uploaded
glogobyte
parents:
diff changeset
617
e19c832c5368 Uploaded
glogobyte
parents:
diff changeset
618 ##################################################################################################################
e19c832c5368 Uploaded
glogobyte
parents:
diff changeset
619
e19c832c5368 Uploaded
glogobyte
parents:
diff changeset
def DB_write(con,name,unique_seq,sorted_uni_arms,f):
    """Write a text report listing, for each arm, the sequences assigned to it.

    Parameters
    ----------
    con : str
        ``"c"`` or ``"t"``; together with ``f`` it selects the output folder
        (``split1/`` and ``split2/`` for ``f == 1``, ``split3/`` and
        ``split4/`` for ``f == 2``). The folder must already exist.
    name : str
        File name inside the selected folder.
    unique_seq : list
        Rows read at index 0 (an ``<id>-<reads>`` string), index 2 (name) and
        index 9 (sequence).
    sorted_uni_arms : list
        ``[arm name, sequence count, total reads]`` entries, reported in the
        given order.
    f : int
        1: a row belongs to an arm when the arm name is contained in the part
        of the row name before the first ``"_"``; arms without rows are still
        reported. 2: the arm name must equal the part of the row name before
        the first ``"__"``; arms without rows are skipped. Any other value
        writes nothing.

    Raises
    ------
    ValueError
        If ``f`` is 1 or 2 and ``con`` is neither ``"c"`` nor ``"t"``
        (previously this failed on the unopened file handle).
    """

    if f not in (1, 2):
        return

    folders = {
        (1, "c"): 'split1/',
        (1, "t"): 'split2/',
        (2, "c"): 'split3/',
        (2, "t"): 'split4/',
    }
    if (f, con) not in folders:
        raise ValueError("con must be 'c' or 't', got %r" % (con,))

    banner = "*********************************************************************************************************\n"

    # Write a txt file with all the information
    with open(folders[(f, con)] + name, 'w') as fp:
        fp.write("%s\t%-42s\t%s\n\n" % ("Number of Reads","Name of isomir","Sequence"))

        for arm in sorted_uni_arms:
            if f == 1:
                # Substring match on the name part before the first "_".
                members = [row for row in unique_seq if arm[0] in row[2].split("_")[0]]
            else:
                # Exact match on the name part before the first "__".
                members = [row for row in unique_seq if arm[0] == row[2].split("__")[0]]
                if not members:
                    continue

            # Most abundant sequences first.
            members.sort(key=lambda row: int(row[0].split('-')[1]), reverse=True)

            fp.write(banner)
            fp.write("%-8s\t%-22s\t%-25s\t%-30s\t%s\n" % ("|",str(arm[0]),"Sequence count = "+str(arm[1]),"Total reads = "+str(arm[2]),"|"))
            fp.write(banner + "\n")
            for row in members:
                fp.write("%-8s\t%-40s\t%s\n" % (row[0].split("-")[1], row[2], row[9]))
            fp.write("\n" + "\n")
e19c832c5368 Uploaded
glogobyte
parents:
diff changeset
672
e19c832c5368 Uploaded
glogobyte
parents:
diff changeset
673
e19c832c5368 Uploaded
glogobyte
parents:
diff changeset
674 ##########################################################################################################################
e19c832c5368 Uploaded
glogobyte
parents:
diff changeset
675
e19c832c5368 Uploaded
glogobyte
parents:
diff changeset
def new_mat_seq(pre_unique_seq,mat_mirnas,l):
    """Collect rows that out-read a three-part-named row and publish them.

    For every row ``x`` whose name (index 2) splits into exactly three parts
    on ``"_"``, each row ``y`` whose name contains ``x``'s name and whose read
    count is strictly greater than ``x``'s is selected once. The read count
    is the number after the ``"-"`` in index 0.

    Parameters
    ----------
    pre_unique_seq : list
        Rows read at index 0 (``<id>-<reads>``), index 2 (name) and index 9.
        The name of every selected row is prefixed with ``">"`` in place.
    mat_mirnas : list
        Receives, for each selected row, its prefixed name followed by its
        item at index 9.
    l : lock
        Held while ``mat_mirnas`` is extended; must support ``with``.
    """

    unique_iso = []
    for x in pre_unique_seq:
        if len(x[2].split("_"))==3:
            for y in pre_unique_seq:
                if x[2] in y[2] and int(x[0].split("-")[1])<int(y[0].split("-")[1]):
                    # A selected row already carries its ">" name, so finding
                    # the current name inside a selected row means y is in.
                    if not any(y[2] in selected for selected in unique_iso):
                        y[2]=">"+y[2]
                        unique_iso.append(y)

    # ``with`` releases the lock even if appending fails.
    with l:
        for row in unique_iso:
            mat_mirnas.append(row[2])
            mat_mirnas.append(row[9])
e19c832c5368 Uploaded
glogobyte
parents:
diff changeset
691
e19c832c5368 Uploaded
glogobyte
parents:
diff changeset
692 #########################################################################################################################
e19c832c5368 Uploaded
glogobyte
parents:
diff changeset
693
e19c832c5368 Uploaded
glogobyte
parents:
diff changeset
def merging_names(LH2E_copy,new):
    """Give rows that share a sequence one combined name and collect them.

    Parameters
    ----------
    LH2E_copy : list
        ``[name, sequence, ...]`` rows. Rows are renamed in place and the
        list itself is sorted in place.
    new : list
        Receives the sorted rows with consecutive identical rows collapsed.

    A sequence is treated as shared when a row's sequence or name already
    appeared in an earlier row. Its names are gathered in row order; among
    names with the same prefix (text before the first ``"_"``) only the
    shortest is kept. The combined name is the first gathered name followed
    by the others from last to first, joined with ``"/"``.
    """

    # A set and a dict replace the repeated list scans of the earlier
    # version; the rows are visited in the same order, so names are equal.
    seen = set()
    shared = {}
    for row in LH2E_copy:
        if row[1] in seen or row[0] in seen:
            shared.setdefault(row[1], [])
        else:
            seen.add(row[1])
            seen.add(row[0])

    for row in LH2E_copy:
        names = shared.get(row[1])
        if names is None:
            continue
        prefix = row[0].split("_")[0]
        for pos, known_name in enumerate(names):
            if known_name.split("_")[0] == prefix:
                # Same prefix: keep only the shorter of the two names.
                if len(row[0]) < len(known_name):
                    del names[pos]
                    names.append(row[0])
                break
        else:
            names.append(row[0])

    for row in LH2E_copy:
        names = shared.get(row[1])
        if names is not None:
            row[0] = "/".join(names[:1] + names[:0:-1])

    LH2E_copy.sort()
    new.extend([row for row, _ in itertools.groupby(LH2E_copy)])
e19c832c5368 Uploaded
glogobyte
parents:
diff changeset
746
e19c832c5368 Uploaded
glogobyte
parents:
diff changeset
747
e19c832c5368 Uploaded
glogobyte
parents:
diff changeset
748 ######################################################################################################################################################
e19c832c5368 Uploaded
glogobyte
parents:
diff changeset
749
e19c832c5368 Uploaded
glogobyte
parents:
diff changeset
def ssamples1(tem_names,tem_samp,non_names,non_samp,folder,pro):
    """Write one tab-separated ``<sample name>.txt`` file per sample column.

    For every column index ``i >= 2`` of ``tem_samp`` a file named
    ``folder + tem_names[i-2] + '.txt'`` is created containing:

    * a header line ``"miRNA id\\t<sample name>"``;
    * one line ``row[0]\\trow[i]`` for every row of ``tem_samp``;
    * for every entry ``non_names[j]`` equal to the sample name, one line
      ``row[0]\\trow[j+2]`` for every row of ``non_samp``.

    Parameters
    ----------
    tem_names : sequence of str
        Sample names; ``tem_names[k]`` labels column ``k + 2`` of ``tem_samp``.
    tem_samp : sequence of sequences of str
        Rows whose item 0 is written as the id and whose items from index 2
        onward are the per-sample values. Must be non-empty, because the
        number of columns is read from ``tem_samp[0]``.
    non_names : sequence of str
        Sample names; ``non_names[j]`` labels column ``j + 2`` of ``non_samp``.
    non_samp : sequence of sequences of str
        Rows laid out like ``tem_samp``.
    folder : str
        Prefix concatenated directly in front of the file name (no path
        separator is inserted here).
    pro
        Not used by this function; kept so existing callers keep working.
    """
    for col in range(2, len(tem_samp[0])):
        name = tem_names[col-2]

        # The context manager closes the file even if a write fails part-way
        # (e.g. a short row raising IndexError).
        with open(folder+name+'.txt','w') as fp:
            fp.write("miRNA id"+"\t"+name+"\n")

            for row in tem_samp:
                fp.write("\t".join([row[0],row[col]])+"\n")

            # Append the rows of every second-table column that carries the
            # same sample name.
            for j, non_name in enumerate(non_names):
                if non_name==name:
                    for row in non_samp:
                        fp.write("\t".join([row[0],row[j+2]])+"\n")
e19c832c5368 Uploaded
glogobyte
parents:
diff changeset
765
e19c832c5368 Uploaded
glogobyte
parents:
diff changeset
766 ###################################################################################################################################################################################################################
e19c832c5368 Uploaded
glogobyte
parents:
diff changeset
767
e19c832c5368 Uploaded
glogobyte
parents:
diff changeset
def download_matures(matures,org_name,url='ftp://mirbase.org/pub/mirbase/CURRENT/mature.fa.gz'):
    """Fetch a gzipped mature-miRNA FASTA file and collect matching records.

    The file at ``url`` is downloaded, gunzipped, decoded as UTF-8 and split
    on newlines. Lines are then read as (header, sequence) pairs at indices
    ``(0, 1), (2, 3), ...``; for every pair whose header line contains
    ``org_name`` as a substring, the header and then the sequence are
    appended to ``matures``.

    Parameters
    ----------
    matures : list-like
        Output container; only its ``append`` method is used. It is mutated
        in place and nothing is returned.
    org_name : str
        Substring looked for in each header line.
    url : str, optional
        Location of the ``mature.fa.gz`` file. Defaults to the URL that was
        previously hard-coded; pass another one to pin a release, e.g.
        ``'ftp://mirbase.org/pub/mirbase/21/mature.fa.gz'``.
    """
    # Close the connection deterministically instead of relying on garbage
    # collection of the response object.
    with urllib.request.urlopen(url) as response:
        data = response.read()

    file_mirna = gzip.decompress(data).decode('utf-8').split("\n")

    # Records are assumed to be strictly two lines each (header, sequence);
    # stopping at len-1 guarantees that file_mirna[i+1] exists.
    for i in range(0,len(file_mirna)-1,2):
        header = file_mirna[i]
        if org_name in header:
            matures.append(header)
            matures.append(file_mirna[i+1])
e19c832c5368 Uploaded
glogobyte
parents:
diff changeset
781
e19c832c5368 Uploaded
glogobyte
parents:
diff changeset
782 ###################################################################################################################################################################################################################
e19c832c5368 Uploaded
glogobyte
parents:
diff changeset
def non_template_ref(sc,st,all_isoforms):
    """Collect unique ``">name"`` / sequence pairs for names containing ``")_"``.

    ``sc`` is processed first, then ``st``. Both are iterables of samples and
    each sample is an iterable of records. For every record whose item 2
    contains the substring ``")_"`` and whose ``">" + record[2]`` is not yet
    present in ``all_isoforms``, two entries are appended to ``all_isoforms``:
    ``">" + record[2]`` followed by ``record[9]``.

    Parameters
    ----------
    sc, st : iterable of iterables of sequences
        Records must be indexable at positions 2 and 9.
        NOTE(review): judging by the names, these look like the two sample
        groups and ``")_"`` presumably marks non-templated isoforms - confirm
        against the caller.
    all_isoforms : list
        Output list, extended in place; nothing is returned. Its existing
        entries and ``record[9]`` are assumed to be hashable (strings).
    """
    # Mirror of all_isoforms used for O(1) membership tests instead of
    # scanning the whole list for every record.
    seen = set(all_isoforms)

    for samples in (sc, st):
        for sample in samples:
            for record in sample:
                header = ">"+record[2]
                if ")_" in record[2] and header not in seen:
                    all_isoforms.append(header)
                    all_isoforms.append(record[9])
                    # Track both appended items so membership behaves exactly
                    # like a test against the list itself.
                    seen.add(header)
                    seen.add(record[9])
e19c832c5368 Uploaded
glogobyte
parents:
diff changeset
800
e19c832c5368 Uploaded
glogobyte
parents:
diff changeset
801 ################################################################################################################################################################################################
e19c832c5368 Uploaded
glogobyte
parents:
diff changeset
802
e19c832c5368 Uploaded
glogobyte
parents:
diff changeset
def deseqe2(sample,mir_names,l,new_d,sample_name,sample_order):
    """Pad a sample with zero rows for missing names and publish it.

    Steps, in order:

    1. For every entry ``y`` of ``mir_names`` whose ``y[0]`` does not match
       the first item of any row in ``sample``, the row
       ``[y[0], "0", y[1]]`` is appended to ``sample`` (in place).
    2. ``sample`` is sorted in place by the first item of each row.
    3. Runs of adjacent identical rows are collapsed to a single row; this
       produces a new list, so the caller's ``sample`` keeps any duplicates.
    4. While holding ``l``, the de-duplicated list is appended to ``new_d``
       and ``sample_name`` to ``sample_order``.

    Parameters
    ----------
    sample : list of lists
        Rows whose item 0 is the name compared against ``mir_names``; the
        names are assumed to be hashable (strings). Mutated in place.
    mir_names : iterable of sequences
        Entries must be indexable at positions 0 and 1.
    l : lock
        Object usable as a context manager (e.g. a ``multiprocessing`` or
        ``threading`` lock) guarding ``new_d`` and ``sample_order``.
    new_d : list-like
        Receives the processed sample.
    sample_name
        Value appended to ``sample_order`` alongside the sample.
    sample_order : list-like
        Receives ``sample_name``.
    """
    # Names already present; kept up to date as rows are added so repeated
    # entries in mir_names add only one zero row, as a rescan of sample would.
    present = {row[0] for row in sample}

    for entry in mir_names:
        if entry[0] not in present:
            sample.append([entry[0],"0",entry[1]])
            present.add(entry[0])

    sample.sort(key=lambda row: row[0])
    deduped = [row for row,_ in itertools.groupby(sample)]

    # `with` releases the lock even if one of the appends raises, so other
    # workers are not left blocked forever.
    with l:
        new_d.append(deduped)
        sample_order.append(sample_name)
e19c832c5368 Uploaded
glogobyte
parents:
diff changeset
821
e19c832c5368 Uploaded
glogobyte
parents:
diff changeset
822 ###############################################################################################################################################################################################
e19c832c5368 Uploaded
glogobyte
parents:
diff changeset
823