7
|
1 import argparse
|
|
2 from functions import *
|
|
3 from viz_graphs import *
|
|
4 import sys
|
|
5 import pandas as pd
|
|
6 import matplotlib.pyplot as plt
|
|
7 import matplotlib.patches as mpatches
|
|
8 import matplotlib.font_manager as font_manager
|
|
9 import time
|
|
10 from multiprocessing import Process, Queue, Lock, Pool, Manager, Value
|
|
11
|
|
12
|
|
13 ##################################################################################################################################################################################################################
|
|
14
|
|
def top_diff(miRNA_info, number, flag, l=None):
    """Draw a horizontal bar chart of the entries with the largest |log2FC|.

    Args:
        miRNA_info: list of rows; ``row[0]`` is the name shown on the y axis
            and ``row[1]`` is the numeric log2 fold change.
        number: how many of the highest-|log2FC| rows to keep.
        flag: ``'t'`` saves the chart as ``tem.png``, ``'nt'`` saves it as
            ``non.png``; any other value draws the chart without saving it.
        l: unused. Kept (now optional) because some callers pass a lock as a
            fourth argument while others pass only three arguments.

    Returns:
        None. Nothing is drawn when no rows are selected.
    """
    # sorted() rather than list.sort(): the caller's list must not be reordered.
    top_rows = sorted(miRNA_info, key=lambda row: abs(row[1]), reverse=True)[:number]
    top_rows.sort(key=lambda row: row[0])

    if not top_rows:
        return

    # True -> upregulated, False -> downregulated, "Zero" -> neither.
    kinds = []
    for row in top_rows:
        if row[1] > 0:
            kinds.append(True)
        elif row[1] < 0:
            kinds.append(False)
        else:
            kinds.append("Zero")

    df_miRNA = pd.DataFrame(data={
        "Names": [row[0] for row in top_rows],
        "Log2FC": [row[1] for row in top_rows],
        "Kind": kinds,
    })
    df_miRNA = df_miRNA.sort_values(by=['Names'])

    bars = df_miRNA.plot.barh(
        x='Names',
        y='Log2FC',
        color=df_miRNA.Kind.map({True: 'g', False: 'r', 'Zero': 'k'}),
    )
    figure = plt.gcf()  # the figure pandas has just drawn on
    figure.set_size_inches(5, 12)

    up_reg = mpatches.Patch(color='green', label='Upregulated')
    down_reg = mpatches.Patch(color='red', label='Downregulated')
    # Legend is anchored outside the axes, to the right of the plot.
    plt.legend(handles=[up_reg, down_reg], bbox_to_anchor=(1.04, 0.5),
               loc="center left", borderaxespad=0)
    bars.set_ylabel(" ", fontsize=3, fontweight='bold')
    bars.set_xlabel("Log2FC", fontsize=12, fontweight='bold')
    plt.axvline(x=0, color="k")

    plt.grid(axis='y', linewidth=0.2)
    plt.grid(axis='x', linewidth=0.2)
    if flag == 't':
        plt.savefig('tem.png', bbox_inches='tight', dpi=300)
    if flag == 'nt':
        plt.savefig('non.png', bbox_inches='tight', dpi=300)

    # Release the figure so repeated calls do not accumulate open figures.
    plt.close(figure)
|
|
55
|
|
56 ####################################################################################################################################################################################################################
|
|
57
|
|
def unique(sequence):
    """Return the distinct items of ``sequence`` in order of first appearance.

    Items must be hashable. Relies on dicts preserving insertion order.
    """
    return list(dict.fromkeys(sequence))
|
|
61
|
|
62 ###########################################################################################################################################################################################################################################################################
|
|
63
|
|
def top_scatter_non(matures, isoforms, non_temp, uni_names, number):
    """Scatter-plot log2FC per miRNA name for three entry groups and save ``a2.png``.

    Names are taken from ``uni_names`` in order; the first ``number`` names
    that have at least one entry in any group are plotted.

    Args:
        matures: rows plotted in green and labelled "Reference miRNA";
            ``row[0]`` is the full entry name, ``row[1]`` the log2FC.
        isoforms: rows plotted in red and labelled "Template isomiRs".
        non_temp: rows plotted in blue and labelled "Non-template".
        uni_names: ordered names; an entry belongs to a name when the part of
            ``row[0]`` before the first ``"_"`` equals that name.
        number: maximum number of names to plot.
    """
    def group_by_name(entries):
        # Map name prefix -> log2FC values, keeping the entries' input order.
        grouped = {}
        for entry in entries:
            grouped.setdefault(entry[0].split("_")[0], []).append(entry[1])
        return grouped

    # Exact prefix match (not substring), so a name that is a prefix of
    # another name (e.g. "miR-612" vs "miR-6124") does not pick up its rows.
    sources = [group_by_name(matures), group_by_name(isoforms), group_by_name(non_temp)]
    picked = [([], []) for _ in sources]  # (names, log2fc) per group

    count = 0
    for name in uni_names:
        if count >= number:
            break
        found = False
        for grouped, (names, values) in zip(sources, picked):
            hits = grouped.get(name)
            if hits:
                names.extend([name] * len(hits))
                values.extend(hits)
                found = True
        if found:
            count += 1

    (mat_names, mat_log2fc), (iso_names, iso_log2fc), (non_names, non_log2fc) = picked
    mat_df = pd.DataFrame(dict(names=mat_names, log2fc=mat_log2fc))
    iso_df = pd.DataFrame(dict(names=iso_names, log2fc=iso_log2fc))
    non_df = pd.DataFrame(dict(names=non_names, log2fc=non_log2fc))
    # The frames are deliberately left in uni_names order: the previous
    # sort_values() calls discarded their result and never reordered anything.

    fig, ax = plt.subplots()

    # Isoforms are drawn first, then references, then non-template entries.
    h3 = ax.scatter(iso_df['log2fc'], iso_df['names'], edgecolors='k', linewidth=1, marker='o', c='red')
    h1 = ax.scatter(mat_df['log2fc'], mat_df['names'], edgecolors='k', linewidth=1, marker='o', c='green')
    h2 = ax.scatter(non_df['log2fc'], non_df['names'], edgecolors='k', linewidth=1, marker='o', c='blue')

    plt.legend([h1, h2, h3], ["Reference miRNA", "Non-template", "Template isomiRs"],
               bbox_to_anchor=(1.04, 0.5), loc="center left", borderaxespad=0)
    plt.axvline(x=0, color="k")
    plt.grid(axis='y', linewidth=0.2)
    plt.grid(axis='x', linewidth=0.2)
    plt.xlabel("Log2FC", fontsize=12, fontweight='bold')
    plt.yticks(rotation=0, ha="right", fontsize=10)
    plt.xticks(rotation=0, ha="right", fontsize=10)
    plt.tight_layout()
    fig.set_size_inches(16, 12)
    plt.savefig('a2.png', bbox_inches='tight', dpi=300)

    # Release the figure once it has been written to disk.
    plt.close(fig)
|
|
122
|
|
123 #########################################################################################################################################################################################################################################
|
|
def top_scatter_tem(matures, isoforms, uni_names, number):
    """Scatter-plot log2FC per miRNA name for two entry groups and save ``a2.png``.

    Names are taken from ``uni_names`` in order; the first ``number`` names
    that have at least one entry in either group are plotted.

    Args:
        matures: rows plotted in green and labelled "Reference miRNA";
            ``row[0]`` is the full entry name, ``row[1]`` the log2FC.
        isoforms: rows plotted in red and labelled "Template isomiRs".
        uni_names: ordered names; an entry belongs to a name when the part of
            ``row[0]`` before the first ``"_"`` equals that name.
        number: maximum number of names to plot.
    """
    def group_by_name(entries):
        # Map name prefix -> log2FC values, keeping the entries' input order.
        grouped = {}
        for entry in entries:
            grouped.setdefault(entry[0].split("_")[0], []).append(entry[1])
        return grouped

    # Exact prefix match (not substring), so a name that is a prefix of
    # another name (e.g. "miR-612" vs "miR-6124") does not pick up its rows.
    mat_by_name = group_by_name(matures)
    iso_by_name = group_by_name(isoforms)

    mat_names, mat_log2fc = [], []
    iso_names, iso_log2fc = [], []

    count = 0
    for name in uni_names:
        if count >= number:
            break
        mat_hits = mat_by_name.get(name, [])
        iso_hits = iso_by_name.get(name, [])
        mat_names.extend([name] * len(mat_hits))
        mat_log2fc.extend(mat_hits)
        iso_names.extend([name] * len(iso_hits))
        iso_log2fc.extend(iso_hits)
        if mat_hits or iso_hits:
            count += 1

    mat_df = pd.DataFrame(dict(names=mat_names, log2fc=mat_log2fc))
    iso_df = pd.DataFrame(dict(names=iso_names, log2fc=iso_log2fc))
    # The frames are deliberately left in uni_names order: the previous
    # sort_values() calls discarded their result and never reordered anything.

    fig, ax = plt.subplots()

    # Isoforms are drawn first, then the reference entries.
    h3 = ax.scatter(iso_df['log2fc'], iso_df['names'], edgecolors='k', linewidth=1, marker='o', c='red')
    h1 = ax.scatter(mat_df['log2fc'], mat_df['names'], edgecolors='k', linewidth=1, marker='o', c='green')

    plt.legend([h1, h3], ["Reference miRNA", "Template isomiRs"],
               bbox_to_anchor=(1.04, 0.5), loc="center left", borderaxespad=0)
    plt.axvline(x=0, color="k")
    plt.grid(axis='y', linewidth=0.2)
    plt.grid(axis='x', linewidth=0.2)
    plt.xlabel("Log2FC", fontsize=12, fontweight='bold')
    plt.yticks(rotation=0, ha="right", fontsize=10)
    plt.xticks(rotation=0, ha="right", fontsize=10)
    plt.tight_layout()
    fig.set_size_inches(16, 12)
    plt.savefig('a2.png', bbox_inches='tight', dpi=300)

    # Release the figure once it has been written to disk.
    plt.close(fig)
|
|
171
|
|
172
|
|
173 ##############################################################################################################################################################################################################################################
|
|
def preproccess(non_templated, matures, isoforms, log2fc, pval):
    """Filter the three entry groups by fold change and p-value and rank names.

    Each input row is ``[name, log2fc, statistic]`` with the two numbers given
    as strings (or anything ``float()`` accepts).

    Args:
        non_templated: rows of non-templated entries.
        matures: rows of reference (mature) entries.
        isoforms: rows of templated isoform entries.
        log2fc: rows are kept when ``abs(log2fc_value) > log2fc``.
        pval: rows are kept when ``statistic < pval``.

    Returns:
        A tuple ``(diff_matures, diff_isoforms, diff_non_templated, uni_names,
        non_temp, mat_iso)`` where

        * ``uni_names`` are the name prefixes (text before the first ``"_"``)
          of all kept rows, unique, ordered by decreasing |log2FC|;
        * ``diff_*`` are the rows of each group with |log2FC| > 1 and
          ``statistic < pval`` whose prefix is in ``uni_names``, sorted by
          decreasing |log2FC|;
        * ``non_temp`` are the kept non-templated rows, sorted by decreasing
          |log2FC|;
        * ``mat_iso`` are the kept mature rows followed by the kept isoform
          rows, in input order.

    Raises:
        SystemExit: when no row of any group passes the filters.
    """
    def significant(rows, min_abs_fc):
        # Convert to floats and keep rows passing both thresholds.
        return [
            [row[0], float(row[1]), float(row[2])]
            for row in rows
            if abs(float(row[1])) > min_abs_fc and float(row[2]) < pval
        ]

    def abs_fc(row):
        return abs(row[1])

    non_temp = significant(non_templated, log2fc)
    mat = significant(matures, log2fc)
    iso = significant(isoforms, log2fc)
    mat_iso = mat + iso

    if not non_temp and not mat and not iso:
        sys.exit("There aren't entries which meet these criteria")

    non_temp.sort(key=abs_fc, reverse=True)

    # The sort is stable, so ranking the concatenation directly yields the
    # same order as sorting each group first.
    ranked = sorted(mat + iso + non_temp, key=abs_fc, reverse=True)
    uni_names = list(dict.fromkeys(row[0].split("_")[0] for row in ranked))
    selected = set(uni_names)  # O(1) membership for the filters below

    def of_selected_names(rows):
        # NOTE(review): this threshold is the literal 1, not the `log2fc`
        # argument -- kept as in the original; confirm whether intended.
        kept = [row for row in significant(rows, 1)
                if row[0].split("_")[0] in selected]
        kept.sort(key=abs_fc, reverse=True)
        return kept

    diff_non_templated = of_selected_names(non_templated)
    diff_matures = of_selected_names(matures)
    diff_isoforms = of_selected_names(isoforms)

    return diff_matures, diff_isoforms, diff_non_templated, uni_names, non_temp, mat_iso
|
|
202
|
|
203 ##################################################################################################################################################################################################################################################################
|
|
def _split_by_type(rows, fc_col, stat_col):
    """Split result rows into (non_templated, matures, isoforms) triples.

    Each returned row is ``[name, rows[fc_col], rows[stat_col]]``. Rows whose
    fold-change or statistic cell is the string "NA" are skipped. A name
    containing "__" is non-templated; otherwise it is a mature entry when the
    text after its last "_" contains "chr", and an isoform when it does not.
    """
    def usable(row):
        return row[fc_col] != "NA" and row[stat_col] != "NA"

    def triple(row):
        return [row[0], row[fc_col], row[stat_col]]

    non_templated = [triple(r) for r in rows
                     if "__" in r[0] and usable(r)]
    matures = [triple(r) for r in rows
               if 'chr' in r[0].split("_")[-1] and "__" not in r[0] and usable(r)]
    isoforms = [triple(r) for r in rows
                if 'chr' not in r[0].split("_")[-1] and "__" not in r[0] and usable(r)]
    return non_templated, matures, isoforms


def main():
    """Parse the command line, build the plots in child processes, then the PDF."""
    starttime = time.time()

    # NOTE(review): several help strings below look copy-pasted and do not
    # describe their option; left unchanged because they are runtime text.
    parser = argparse.ArgumentParser()
    parser.add_argument("-in", "--input", help="choose type of analysis", action="store")
    parser.add_argument("-p_value", "--pval", help="choose type of analysis", action="store")
    parser.add_argument("-fc", "--log2fc", help="choose type of analysis", action="store")
    parser.add_argument("-top", "--top_mirnas", help="choose type of analysis", action="store")
    parser.add_argument("-tool_dir", "--tool_directory", help="tool directory path", action="store")
    parser.add_argument("-statistic", "--stat", help="tool directory path", action="store")
    parser.add_argument("-diff_tool", "--tool", help="tool directory path", action="store")

    args = parser.parse_args()

    l = Lock()
    number = int(args.top_mirnas)
    log2fc = float(args.log2fc)
    pval = float(args.pval)

    if args.tool == "2":
        # EdgeR table: first row is dropped, fold change is column 1,
        # statistic is column 4 (--stat 1) or column 5 (otherwise).
        rows = [x.rstrip("\n").split("\t") for x in read(args.input, 0)]
        del rows[0]
        fc_col = 1
        stat_col = 4 if args.stat == "1" else 5
    elif args.tool == "1":
        # Deseq table: fold change is column 2, statistic is column 5
        # (--stat 1) or column 6 (otherwise).
        # NOTE(review): unlike the EdgeR branch no first row is dropped here;
        # confirm the Deseq input has no header line.
        rows = [x.rstrip("\n").split("\t") for x in read(args.input, 0)]
        fc_col = 2
        stat_col = 5 if args.stat == "1" else 6
    else:
        # Previously this fell through and failed later with a NameError.
        parser.error("--tool must be '1' or '2'")

    non_templated, matures, isoforms = _split_by_type(rows, fc_col, stat_col)

    diff_matures, diff_isoforms, diff_non_templated, names, non_temp, mat_iso = preproccess(
        non_templated, matures, isoforms, log2fc, pval)

    if non_templated:
        analysis = "2"
        jobs = [
            Process(target=top_diff, args=(non_temp, number, "nt", l)),
            Process(target=top_diff, args=(mat_iso, number, "t", l)),
            Process(target=top_scatter_non,
                    args=(diff_matures, diff_isoforms, diff_non_templated, names, number)),
        ]
    else:
        analysis = "1"
        jobs = [
            # The lock is passed here too: top_diff takes it as fourth argument.
            Process(target=top_diff, args=(mat_iso, number, "t", l)),
            Process(target=top_scatter_tem,
                    args=(diff_matures, diff_isoforms, names, number)),
        ]

    for job in jobs:
        job.start()
    for job in jobs:
        job.join()

    pdf_after_DE(analysis, args.top_mirnas)

    print('That took {} seconds'.format(time.time() - starttime))


# Guarded entry point: with the "spawn" start method each child re-imports
# this module, which must not start the whole pipeline again.
if __name__ == "__main__":
    main()
|
|
271
|