annotate mirgene_functions.py @ 23:d2eea02053a0 draft

Deleted selected files
author glogobyte
date Wed, 28 Oct 2020 08:13:30 +0000
parents dc31f01cf21d
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
17
dc31f01cf21d Uploaded
glogobyte
parents:
diff changeset
1 import itertools
dc31f01cf21d Uploaded
glogobyte
parents:
diff changeset
2 import time
dc31f01cf21d Uploaded
glogobyte
parents:
diff changeset
3 import sys
dc31f01cf21d Uploaded
glogobyte
parents:
diff changeset
4 import os
dc31f01cf21d Uploaded
glogobyte
parents:
diff changeset
5 import urllib.request
dc31f01cf21d Uploaded
glogobyte
parents:
diff changeset
6 import gzip
dc31f01cf21d Uploaded
glogobyte
parents:
diff changeset
7 from multiprocessing import Process, Queue, Lock, Pool, Manager, Value
dc31f01cf21d Uploaded
glogobyte
parents:
diff changeset
8 import subprocess
dc31f01cf21d Uploaded
glogobyte
parents:
diff changeset
9 import argparse
dc31f01cf21d Uploaded
glogobyte
parents:
diff changeset
10 from collections import OrderedDict
dc31f01cf21d Uploaded
glogobyte
parents:
diff changeset
11 from matplotlib.backends.backend_pdf import PdfPages
dc31f01cf21d Uploaded
glogobyte
parents:
diff changeset
12 import pandas as pd
dc31f01cf21d Uploaded
glogobyte
parents:
diff changeset
13 from math import pi
dc31f01cf21d Uploaded
glogobyte
parents:
diff changeset
14 import numpy as np
dc31f01cf21d Uploaded
glogobyte
parents:
diff changeset
15 import matplotlib.pyplot as plt
dc31f01cf21d Uploaded
glogobyte
parents:
diff changeset
16 from matplotlib.ticker import PercentFormatter
dc31f01cf21d Uploaded
glogobyte
parents:
diff changeset
17 import seaborn as sns
dc31f01cf21d Uploaded
glogobyte
parents:
diff changeset
18 import scipy.stats as stats
dc31f01cf21d Uploaded
glogobyte
parents:
diff changeset
19 from plotnine import *
dc31f01cf21d Uploaded
glogobyte
parents:
diff changeset
20 import math
dc31f01cf21d Uploaded
glogobyte
parents:
diff changeset
21 import re
dc31f01cf21d Uploaded
glogobyte
parents:
diff changeset
22 import matplotlib.ticker as mtick
dc31f01cf21d Uploaded
glogobyte
parents:
diff changeset
23 import copy
dc31f01cf21d Uploaded
glogobyte
parents:
diff changeset
24
dc31f01cf21d Uploaded
glogobyte
parents:
diff changeset
25
dc31f01cf21d Uploaded
glogobyte
parents:
diff changeset
26
dc31f01cf21d Uploaded
glogobyte
parents:
diff changeset
27
dc31f01cf21d Uploaded
glogobyte
parents:
diff changeset
28 """---------------------- Simple Functions -----------------------"""
dc31f01cf21d Uploaded
glogobyte
parents:
diff changeset
29
dc31f01cf21d Uploaded
glogobyte
parents:
diff changeset
30 # Read a file and return it as a list
dc31f01cf21d Uploaded
glogobyte
parents:
diff changeset
def read(path, flag):
    """Read a text file and return its lines as a list.

    Parameters
    ----------
    path : path of the file to open (text mode, default encoding).
    flag : selects how the lines are returned.
        0 -- ``readlines()``: every line keeps its trailing newline.
        1 -- ``read().splitlines()``: line terminators are removed.

    Returns
    -------
    list of str for flag 0 or 1; ``None`` for any other flag value
    (the file is not opened in that case).
    """
    if flag not in (0, 1):
        # Unsupported mode: keep the historical "fall through" result.
        return None

    # The context manager closes the handle; no explicit close() is needed.
    with open(path) as fp:
        if flag == 0:
            return fp.readlines()
        return fp.read().splitlines()
dc31f01cf21d Uploaded
glogobyte
parents:
diff changeset
43
dc31f01cf21d Uploaded
glogobyte
parents:
diff changeset
44 # Write a list to a txt file
dc31f01cf21d Uploaded
glogobyte
parents:
diff changeset
def write(path, list):
    """Write the inner fields of every row to ``path``, tab separated.

    Parameters
    ----------
    path : path of the output file; it is created or truncated.
    list : iterable of rows. For each row the first and the last element are
        dropped (``row[1:-1]``) and the remaining elements, which must be
        strings, are joined with a tab.

    NOTE(review): no line terminator is written after a row, so consecutive
    rows end up on the same line. Confirm this is what callers expect.
    """
    # The parameter keeps its historical name (it shadows the builtin) so
    # that keyword callers continue to work.
    with open(path, 'w') as fp:
        for row in list:
            fp.write("\t".join(row[1:-1]))
dc31f01cf21d Uploaded
glogobyte
parents:
diff changeset
50
dc31f01cf21d Uploaded
glogobyte
parents:
diff changeset
51 """---------------------- RNA-seq Functions ----------------------"""
dc31f01cf21d Uploaded
glogobyte
parents:
diff changeset
52
dc31f01cf21d Uploaded
glogobyte
parents:
diff changeset
53 # Detect the longest common substring sequence between two mirnas
dc31f01cf21d Uploaded
glogobyte
parents:
diff changeset
def longestSubstring(str1, str2):
    """Return the longest contiguous substring shared by ``str1`` and ``str2``.

    Parameters
    ----------
    str1, str2 : the two sequences to compare.

    Returns
    -------
    The shared substring, taken from ``str1``. When several candidates have
    the same maximal length, the one starting earliest in ``str1`` wins.
    When the strings share no character at all, a message is printed and
    ``None`` is returned.
    """
    from difflib import SequenceMatcher

    # autojunk=False: the default heuristic marks frequent symbols as junk as
    # soon as the second sequence has 200+ items, which on a four-letter
    # nucleotide alphabet would hide genuine matches.
    seqMatch = SequenceMatcher(None, str1, str2, autojunk=False)

    # Result looks like Match(a=0, b=0, size=5).
    match = seqMatch.find_longest_match(0, len(str1), 0, len(str2))

    if match.size == 0:
        print('No longest common sub-string found')
        return None

    return str1[match.a: match.a + match.size]
dc31f01cf21d Uploaded
glogobyte
parents:
diff changeset
70
dc31f01cf21d Uploaded
glogobyte
parents:
diff changeset
71
dc31f01cf21d Uploaded
glogobyte
parents:
diff changeset
72
dc31f01cf21d Uploaded
glogobyte
parents:
diff changeset
73 ########################################################################################################################################################
dc31f01cf21d Uploaded
glogobyte
parents:
diff changeset
def collapse_sam(path):
    """Group the alignment rows of a SAM file by (reference, sequence).

    The file is loaded with ``read(path, 0)``. Lines whose first tab-separated
    field contains "@" are treated as header lines and are returned first,
    unchanged apart from being split into fields.

    Every other row is placed in a group keyed by its reference name
    (field 2) and its read sequence (field 9). Groups are numbered from 1 in
    order of first appearance, rows inside a group are numbered from 1 in
    file order, and each row is rewritten in place:

    * field 0  becomes "<group number>-<row number inside the group>"
    * field 10 is replaced by a fixed run of "~" characters

    Returns
    -------
    list of list of str: header rows followed by the regrouped alignment rows.

    Raises
    ------
    IndexError if an alignment row has fewer than 11 fields.
    """
    header_rows = []
    alignment_rows = []
    for line in read(path, 0):
        fields = line.rstrip("\n").split("\t")
        if "@" in fields[0]:
            header_rows.append(fields)
        else:
            alignment_rows.append(fields)

    # One pass builds the groups; a dict keeps first-appearance order, which
    # replaces the former list-membership test plus full rescan per group.
    groups = {}
    for row in alignment_rows:
        groups.setdefault((row[2], row[9]), []).append(row)

    new_main_sam = []
    for group_number, rows in enumerate(groups.values(), start=1):
        for member_number, row in enumerate(rows, start=1):
            row[10] = "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~"
            row[0] = str(group_number) + "-" + str(member_number)
            new_main_sam.append(row)

    return header_rows + new_main_sam
dc31f01cf21d Uploaded
glogobyte
parents:
diff changeset
102
dc31f01cf21d Uploaded
glogobyte
parents:
diff changeset
103 #################################################################################################################################################################
dc31f01cf21d Uploaded
glogobyte
parents:
diff changeset
104
dc31f01cf21d Uploaded
glogobyte
parents:
diff changeset
def sam(mature_mirnas,path,name,con,l,samples,data,names,unmap_seq,samples_mirna_names,deseq,LHE_names,umi,ini_sample,unmap_counts):
    """Annotate the mapped reads of one SAM file and publish the results.

    Steps
    -----
    1. Load the file with ``read(path, 0)`` and drop the lines whose first
       field contains "@".
    2. Keep the rows whose flag field (index 1) is '0' and whose sequence
       (index 9) is 18 to 26 characters long.
    3. For every index ``i`` of ``mature_mirnas`` whose entry equals a row's
       reference name (index 2), compare the row's sequence with
       ``mature_mirnas[i + 1]`` through their longest common substring:
         * if both sequences have a leftover on the same side of the shared
           part, the row is discarded;
         * otherwise the 5' / 3' offsets are computed and, when one of them
           is non-zero, "_<pre>_<post>" is appended to the reference name
           (non-zero offsets carry an explicit sign).
    4. Build ``[name, number of sequences, number of reads]`` per matched
       name, sort by number of sequences (descending) and recount both
       numbers from the surviving rows.
    5. While holding ``l``, and only when ``con`` is "c" or "t", append the
       results to the shared containers.

    Parameters
    ----------
    mature_mirnas : sequence in which a name is followed by its sequence.
    path : SAM file to read.
    name : sample label appended to ``names`` / stored in ``data``.
    con : condition tag; results are published only for "c" or "t".
    l : object exposing ``acquire()`` and ``release()``.
    samples, data, names, samples_mirna_names, deseq, LHE_names, ini_sample :
        containers that are appended to / extended.
    unmap_seq, unmap_counts : objects whose ``value`` attribute is increased
        by the number of rows with flag '4' and by the sum of their read
        counts (presumably ``multiprocessing.Value`` -- confirm with caller).
    umi : not used by this function.

    The read count of a row is the integer after the "-" in its field 0.

    NOTE(review): block nesting was reconstructed from a listing that had
    lost its indentation -- compare with the repository copy before merging.
    """

    def signed(offset):
        # Zero is written bare, any other offset with an explicit sign.
        return str(offset) if offset == 0 else '{:+d}'.format(offset)

    # read the sam file
    ini_sam = read(path, 0)
    new_main_sam = [x.rstrip("\n").split("\t") for x in ini_sam if "@" not in x.split("\t")[0]]
    unique_seq = [x for x in new_main_sam if x[1] == '0' and 18 <= len(x[9]) <= 26]

    # Calculate the shifted positions for every isomir and add them to its name
    sorted_uni_arms = []

    for i in range(len(mature_mirnas)):
        tmp_count_reads = 0   # total number of reads for this entry
        tmp_count_seq = 0     # total number of sequences for this entry
        rejected = []

        for row in unique_seq:
            if mature_mirnas[i] != row[2]:
                continue

            temp_mature = mature_mirnas[i + 1]
            off_part = longestSubstring(temp_mature, row[9])

            mature_parts = temp_mature.split(off_part)
            mat_diff = [len(mature_parts[0]), len(mature_parts[1])]

            read_parts = row[9].split(off_part)
            unique_diff = [len(read_parts[0]), len(read_parts[1])]

            # Problem with hsa-miR-8485: both sequences keep a leftover on the
            # same side of the shared part, so the row is dropped.
            if (mat_diff[1] != 0 and unique_diff[1] != 0) or (mat_diff[0] != 0 and unique_diff[0] != 0):
                rejected.append(row)
                continue

            pre_pos = mat_diff[0] - unique_diff[0]
            post_pos = unique_diff[1] - mat_diff[1]
            tmp_count_reads += int(row[0].split("-")[1])
            tmp_count_seq += 1

            if pre_pos != 0 or post_pos != 0:
                row[2] = row[2] + "_" + signed(pre_pos) + "_" + signed(post_pos)

        if rejected:
            # Filter by identity so that equal-looking rows are not affected.
            rejected_ids = {id(row) for row in rejected}
            unique_seq = [row for row in unique_seq if id(row) not in rejected_ids]

        if tmp_count_reads != 0 and tmp_count_seq != 0:
            sorted_uni_arms.append([mature_mirnas[i], tmp_count_seq, tmp_count_reads])

    # Store name of arms, number of sequences and number of reads
    sorted_uni_arms = sorted(sorted_uni_arms, key=lambda x: x[1], reverse=True)

    for arm in sorted_uni_arms:
        counts = 0
        seqs = 0
        for x in unique_seq:
            name_parts = x[2].split("_")
            # The first two "_"-separated parts are the name without offsets.
            if arm[0] == name_parts[0] + "_" + name_parts[1]:
                counts += int(x[0].split("-")[1])
                seqs += 1

        arm[1] = seqs
        arm[2] = counts

    l.acquire()
    try:
        # Release in ``finally`` so a failure cannot leave the lock held.
        if con == "c" or con == "t":
            LHE = [z[2] for z in unique_seq]
            for y in unique_seq:
                samples_mirna_names.append([y[2], y[9]])
            deseq.append([[x[2], x[0].split('-')[1], x[9]] for x in unique_seq])
            LHE_names.extend(LHE)
            unmap_seq.value += sum([1 for x in new_main_sam if x[1] == '4'])
            unmap_counts.value += sum([int(x[0].split("-")[1]) for x in new_main_sam if x[1] == '4'])
            names.append(name)
            samples.append(unique_seq)
            data.append([con, name, unique_seq, sorted_uni_arms])
            ini_sample.append(new_main_sam)
    finally:
        l.release()
dc31f01cf21d Uploaded
glogobyte
parents:
diff changeset
206
dc31f01cf21d Uploaded
glogobyte
parents:
diff changeset
207
dc31f01cf21d Uploaded
glogobyte
parents:
diff changeset
208 ######################################################################################################################################
dc31f01cf21d Uploaded
glogobyte
parents:
diff changeset
209 """
dc31f01cf21d Uploaded
glogobyte
parents:
diff changeset
210 Read a sam file from Bowtie and do the following:
dc31f01cf21d Uploaded
glogobyte
parents:
diff changeset
211
dc31f01cf21d Uploaded
glogobyte
parents:
diff changeset
212 1) Remove reverse stranded mapped reads
dc31f01cf21d Uploaded
glogobyte
parents:
diff changeset
213 2) Remove unmapped reads
dc31f01cf21d Uploaded
glogobyte
parents:
diff changeset
214 3) Remove all sequences with fewer than 11 reads
dc31f01cf21d Uploaded
glogobyte
parents:
diff changeset
215 4) Sort the arms with the most sequences in decreasing order
dc31f01cf21d Uploaded
glogobyte
parents:
diff changeset
216 5) Sort the sequences of every arm with the most reads in decreasing rate
dc31f01cf21d Uploaded
glogobyte
parents:
diff changeset
217 6) Calculate total number of sequences of every arm
dc31f01cf21d Uploaded
glogobyte
parents:
diff changeset
218 7) Calculate total number of reads of sequences of every arm.
dc31f01cf21d Uploaded
glogobyte
parents:
diff changeset
219 8) Store all the information in a txt file
dc31f01cf21d Uploaded
glogobyte
parents:
diff changeset
220
dc31f01cf21d Uploaded
glogobyte
parents:
diff changeset
221 """
dc31f01cf21d Uploaded
glogobyte
parents:
diff changeset
222
dc31f01cf21d Uploaded
glogobyte
parents:
diff changeset
def non_sam(mature_mirnas,path,name,con,l,data,names,n_deseq,n_samples_mirna_names,n_LHE_names):
    """Look for mature sequences extended at the 3' end among unmapped reads.

    Steps
    -----
    1. Load the file with ``read(path, 0)`` and drop the lines whose first
       field contains "@".
    2. Keep the rows whose flag field (index 1) is '4' and whose sequence
       (index 9) is 18 to 26 characters long.
    3. ``mature_mirnas`` is walked in (name, sequence) pairs. The sequence is
       stripped and "U" is replaced by "T". A read is accepted for a pair
       when it contains that sequence, splitting the read on the longest
       common substring gives at most two pieces, and nothing precedes the
       shared part in the read (5' offset of 0).
    4. Each accepted read is copied and the copy's reference field (index 2)
       becomes ``<name>__0_<signed 3' offset>_<read text after the first
       len(shared part) characters>``.
    5. Identical copies are removed (first occurrence kept), and per-name
       ``[name, number of sequences, number of reads]`` records are sorted by
       number of sequences, descending.
    6. While holding ``l``, and only when ``con`` is "c" or "t", the results
       are appended to the shared containers.

    Parameters
    ----------
    mature_mirnas : sequence alternating name (even index) and sequence
        (odd index).
    path : SAM file to read.
    name : sample label appended to ``names`` / stored in ``data``.
    con : condition tag; results are published only for "c" or "t".
    l : object exposing ``acquire()`` and ``release()``.
    data, names, n_deseq, n_samples_mirna_names, n_LHE_names : containers
        that are appended to / extended.

    The read count of a row is the integer after the "-" in its field 0.

    NOTE(review): block nesting was reconstructed from a listing that had
    lost its indentation -- compare with the repository copy before merging.
    """
    ini_sam = read(path, 0)
    new_main_sam = [x.rstrip("\n").split("\t") for x in ini_sam if "@" not in x.split("\t")[0]]
    unique_seq = [x for x in new_main_sam if x[1] == '4' and 18 <= len(x[9]) <= 26]
    uni_seq = []

    # Calculate the shifted positions for every isomir and add them to its name
    sorted_uni_arms = []
    for i in range(1, len(mature_mirnas), 2):
        tmp_count_reads = 0   # total number of reads for this pair
        tmp_count_seq = 0     # total number of sequences for this pair

        # Does not depend on the read, so it is prepared once per pair.
        temp_mature = mature_mirnas[i].strip().replace("U", "T")

        for row in unique_seq:
            read_seq = row[9]
            if temp_mature not in read_seq:
                continue

            off_part = longestSubstring(temp_mature, read_seq)

            mature_parts = temp_mature.split(off_part)
            mat_diff = [len(mature_parts[0]), len(mature_parts[1])]

            read_parts = read_seq.split(off_part)
            if len(read_parts) > 2:
                # The shared part occurs more than once in the read.
                continue
            unique_diff = [len(read_parts[0]), len(read_parts[1])]

            pre_pos = mat_diff[0] - unique_diff[0]
            post_pos = unique_diff[1] - mat_diff[1]
            if pre_pos != 0:
                continue

            tmp_count_reads += int(row[0].split("-")[1])
            tmp_count_seq += 1

            # Rows are flat lists of strings, so a shallow copy is a full copy.
            t_name = list(row)
            t_name[2] = mature_mirnas[i - 1] + "__" + str(pre_pos) + "_" + '{:+d}'.format(post_pos) + "_" + str(read_seq[len(off_part):])
            uni_seq.append(t_name)

        if tmp_count_reads != 0 and tmp_count_seq != 0:
            sorted_uni_arms.append([mature_mirnas[i - 1], tmp_count_seq, tmp_count_reads])

    sorted_uni_arms = sorted(sorted_uni_arms, key=lambda x: x[1], reverse=True)
    # Drop exact duplicates while keeping first-appearance order.
    unique_seq = list(map(list, OrderedDict.fromkeys(map(tuple, uni_seq))))

    l.acquire()
    try:
        # Release in ``finally`` so a failure cannot leave the lock held.
        if con == "c" or con == "t":
            named_rows = [x for x in unique_seq if x[2] != "*"]
            LHE = [x[2] for x in named_rows]
            for x in named_rows:
                n_samples_mirna_names.append([x[2], x[9]])
            n_deseq.append([[x[2], x[0].split('-')[1], x[9]] for x in named_rows])
            n_LHE_names.extend(LHE)
            names.append(name)
            data.append([con, name, unique_seq, sorted_uni_arms])
    finally:
        l.release()
dc31f01cf21d Uploaded
glogobyte
parents:
diff changeset
297
dc31f01cf21d Uploaded
glogobyte
parents:
diff changeset
298 #################################################################################################################################################################################################################
dc31f01cf21d Uploaded
glogobyte
parents:
diff changeset
299
dc31f01cf21d Uploaded
glogobyte
parents:
diff changeset
def deseq2_temp(samples_mirna_names,deseq,con,l):
    """Build one count table for a condition and put it on that condition's queue.

    Parameters
    ----------
    samples_mirna_names : list of ``[name, sequence]`` pairs; sorted in place
        by name.
    deseq : list with one entry per sample, each a list of
        ``[name, count, sequence]`` rows. Every sample is completed in place
        with a ``[name, "0", sequence]`` row for each name it lacks, then
        sorted by name. Names are assumed to be hashable (strings).
    con : "c" sends the table to ``q1``, "t" sends it to ``q2``; any other
        value sends nothing.
    l : object exposing ``acquire()`` and ``release()``; held while the table
        is put on the queue.

    The table has one row per row of the first sample,
    ``[name, sequence, count, count, ...]``, where the counts are gathered
    sample by sample from the rows carrying the same name.

    Raises
    ------
    IndexError if ``deseq`` is empty.

    NOTE(review): ``q1`` and ``q2`` are neither parameters nor defined in the
    visible part of this module -- confirm they exist as module globals.
    """
    # The previous key (``lambda x: [0]``) was constant, which made this sort
    # a no-op; ordering by the name is what the key was meant to do.
    samples_mirna_names.sort(key=lambda x: x[0])

    for sample_rows in deseq:
        known_names = {row[0] for row in sample_rows}
        for y in samples_mirna_names:
            if y[0] not in known_names:
                sample_rows.append([y[0], "0", y[1]])
                known_names.add(y[0])

    for sample_rows in deseq:
        sample_rows.sort(key=lambda x: x[0])

    # Per-sample lookup name -> counts (in row order) replaces a rescan of
    # every sample for every output row.
    counts_per_sample = []
    for sample_rows in deseq:
        lookup = {}
        for row in sample_rows:
            lookup.setdefault(row[0], []).append(row[1])
        counts_per_sample.append(lookup)

    deseq_final = [[x[0], x[2]] for x in deseq[0]]
    for final_row in deseq_final:
        row_name = final_row[0]
        for lookup in counts_per_sample:
            final_row.extend(lookup.get(row_name, ()))

    l.acquire()
    try:
        # Release in ``finally`` so a failure cannot leave the lock held.
        if con == "c":
            q1.put(deseq_final)

        if con == "t":
            q2.put(deseq_final)
    finally:
        l.release()
dc31f01cf21d Uploaded
glogobyte
parents:
diff changeset
326
dc31f01cf21d Uploaded
glogobyte
parents:
diff changeset
327
dc31f01cf21d Uploaded
glogobyte
parents:
diff changeset
328 ##################################################################################################################################################################################################################
dc31f01cf21d Uploaded
glogobyte
parents:
diff changeset
329
dc31f01cf21d Uploaded
glogobyte
parents:
diff changeset
330 def main_temp(LH2E, LH2E_names, LH8E, LH8E_names,flag,names_con,names_tre,filter_LH8E,filter_LH2E,raw_LH8E,raw_LH2E,per,count):
dc31f01cf21d Uploaded
glogobyte
parents:
diff changeset
331
dc31f01cf21d Uploaded
glogobyte
parents:
diff changeset
332 LH8E_add_names = [x for x in LH2E_names if x not in LH8E_names]
dc31f01cf21d Uploaded
glogobyte
parents:
diff changeset
333 LH2E_add_names = [x for x in LH8E_names if x not in LH2E_names]
dc31f01cf21d Uploaded
glogobyte
parents:
diff changeset
334
dc31f01cf21d Uploaded
glogobyte
parents:
diff changeset
335 LH8E_add_names.sort()
dc31f01cf21d Uploaded
glogobyte
parents:
diff changeset
336 LH2E_add_names.sort()
dc31f01cf21d Uploaded
glogobyte
parents:
diff changeset
337 LH8E_add_names = list(LH8E_add_names for LH8E_add_names,_ in itertools.groupby(LH8E_add_names))
dc31f01cf21d Uploaded
glogobyte
parents:
diff changeset
338 LH2E_add_names = list(LH2E_add_names for LH2E_add_names,_ in itertools.groupby(LH2E_add_names))
dc31f01cf21d Uploaded
glogobyte
parents:
diff changeset
339
dc31f01cf21d Uploaded
glogobyte
parents:
diff changeset
340 LH2E.sort()
dc31f01cf21d Uploaded
glogobyte
parents:
diff changeset
341 LH8E.sort()
dc31f01cf21d Uploaded
glogobyte
parents:
diff changeset
342 LH2E = list(LH2E for LH2E,_ in itertools.groupby(LH2E))
dc31f01cf21d Uploaded
glogobyte
parents:
diff changeset
343 LH8E = list(LH8E for LH8E,_ in itertools.groupby(LH8E))
dc31f01cf21d Uploaded
glogobyte
parents:
diff changeset
344
dc31f01cf21d Uploaded
glogobyte
parents:
diff changeset
345 zeros=["0"]*(len(LH8E[0])-2)
dc31f01cf21d Uploaded
glogobyte
parents:
diff changeset
346 [LH8E_add_names[i].extend(zeros) for i,_ in enumerate(LH8E_add_names)]
dc31f01cf21d Uploaded
glogobyte
parents:
diff changeset
347 LH8E=LH8E+LH8E_add_names
dc31f01cf21d Uploaded
glogobyte
parents:
diff changeset
348
dc31f01cf21d Uploaded
glogobyte
parents:
diff changeset
349 zeros=["0"]*(len(LH2E[0])-2)
dc31f01cf21d Uploaded
glogobyte
parents:
diff changeset
350 [LH2E_add_names[i].extend(zeros) for i,_ in enumerate(LH2E_add_names)]
dc31f01cf21d Uploaded
glogobyte
parents:
diff changeset
351 LH2E=LH2E+LH2E_add_names
dc31f01cf21d Uploaded
glogobyte
parents:
diff changeset
352
dc31f01cf21d Uploaded
glogobyte
parents:
diff changeset
353 dupes=[]
dc31f01cf21d Uploaded
glogobyte
parents:
diff changeset
354 final_LH2E =[]
dc31f01cf21d Uploaded
glogobyte
parents:
diff changeset
355
dc31f01cf21d Uploaded
glogobyte
parents:
diff changeset
356 for num,_ in enumerate(LH2E):
dc31f01cf21d Uploaded
glogobyte
parents:
diff changeset
357
dc31f01cf21d Uploaded
glogobyte
parents:
diff changeset
358 if LH2E[num][1] not in final_LH2E and LH2E[num][0] not in final_LH2E:
dc31f01cf21d Uploaded
glogobyte
parents:
diff changeset
359 final_LH2E.append(LH2E[num][1])
dc31f01cf21d Uploaded
glogobyte
parents:
diff changeset
360 final_LH2E.append(LH2E[num][0])
dc31f01cf21d Uploaded
glogobyte
parents:
diff changeset
361 else:
dc31f01cf21d Uploaded
glogobyte
parents:
diff changeset
362 dupes.append(LH2E[num][1])
dc31f01cf21d Uploaded
glogobyte
parents:
diff changeset
363
dc31f01cf21d Uploaded
glogobyte
parents:
diff changeset
364 dupes=list(set(dupes))
dc31f01cf21d Uploaded
glogobyte
parents:
diff changeset
365
dc31f01cf21d Uploaded
glogobyte
parents:
diff changeset
366 dupes=[[x] for x in dupes]
dc31f01cf21d Uploaded
glogobyte
parents:
diff changeset
367
dc31f01cf21d Uploaded
glogobyte
parents:
diff changeset
368 for x in LH2E:
dc31f01cf21d Uploaded
glogobyte
parents:
diff changeset
369 for y in dupes:
dc31f01cf21d Uploaded
glogobyte
parents:
diff changeset
370 if x[1]==y[0]:
dc31f01cf21d Uploaded
glogobyte
parents:
diff changeset
371 fl=0
dc31f01cf21d Uploaded
glogobyte
parents:
diff changeset
372 if len(y)==1:
dc31f01cf21d Uploaded
glogobyte
parents:
diff changeset
373 y.append(x[0])
dc31f01cf21d Uploaded
glogobyte
parents:
diff changeset
374 else:
dc31f01cf21d Uploaded
glogobyte
parents:
diff changeset
375 for i in range(1,len(y)):
dc31f01cf21d Uploaded
glogobyte
parents:
diff changeset
376 if y[i].split("_")[0]+"_"+y[i].split("_")[1]==x[0].split("_")[0]+"_"+x[0].split("_")[1]:
dc31f01cf21d Uploaded
glogobyte
parents:
diff changeset
377 fl=1
dc31f01cf21d Uploaded
glogobyte
parents:
diff changeset
378 if len(x[0])<len(y[i]):
dc31f01cf21d Uploaded
glogobyte
parents:
diff changeset
379 del y[i]
dc31f01cf21d Uploaded
glogobyte
parents:
diff changeset
380 y.append(x[0])
dc31f01cf21d Uploaded
glogobyte
parents:
diff changeset
381 break
dc31f01cf21d Uploaded
glogobyte
parents:
diff changeset
382
dc31f01cf21d Uploaded
glogobyte
parents:
diff changeset
383 if fl==0:
dc31f01cf21d Uploaded
glogobyte
parents:
diff changeset
384 y.append((x[0]))
dc31f01cf21d Uploaded
glogobyte
parents:
diff changeset
385
dc31f01cf21d Uploaded
glogobyte
parents:
diff changeset
386 for y in dupes:
dc31f01cf21d Uploaded
glogobyte
parents:
diff changeset
387 if len(y)>2:
dc31f01cf21d Uploaded
glogobyte
parents:
diff changeset
388 for i in range(len(y)-1,1,-1):
dc31f01cf21d Uploaded
glogobyte
parents:
diff changeset
389 y[1]=y[1]+"/"+y[i]
dc31f01cf21d Uploaded
glogobyte
parents:
diff changeset
390 del y[i]
dc31f01cf21d Uploaded
glogobyte
parents:
diff changeset
391
dc31f01cf21d Uploaded
glogobyte
parents:
diff changeset
392
dc31f01cf21d Uploaded
glogobyte
parents:
diff changeset
393 for x in LH2E:
dc31f01cf21d Uploaded
glogobyte
parents:
diff changeset
394 for y in dupes:
dc31f01cf21d Uploaded
glogobyte
parents:
diff changeset
395 if x[1]==y[0]:
dc31f01cf21d Uploaded
glogobyte
parents:
diff changeset
396 x[0]=y[1]
dc31f01cf21d Uploaded
glogobyte
parents:
diff changeset
397
dc31f01cf21d Uploaded
glogobyte
parents:
diff changeset
398 for x in LH8E:
dc31f01cf21d Uploaded
glogobyte
parents:
diff changeset
399 for y in dupes:
dc31f01cf21d Uploaded
glogobyte
parents:
diff changeset
400 if x[1]==y[0]:
dc31f01cf21d Uploaded
glogobyte
parents:
diff changeset
401 x[0]=y[1]
dc31f01cf21d Uploaded
glogobyte
parents:
diff changeset
402
dc31f01cf21d Uploaded
glogobyte
parents:
diff changeset
403
dc31f01cf21d Uploaded
glogobyte
parents:
diff changeset
404
dc31f01cf21d Uploaded
glogobyte
parents:
diff changeset
405 LH2E.sort()
dc31f01cf21d Uploaded
glogobyte
parents:
diff changeset
406 LH2E=list(LH2E for LH2E,_ in itertools.groupby(LH2E))
dc31f01cf21d Uploaded
glogobyte
parents:
diff changeset
407
dc31f01cf21d Uploaded
glogobyte
parents:
diff changeset
408 LH8E.sort()
dc31f01cf21d Uploaded
glogobyte
parents:
diff changeset
409 LH8E=list(LH8E for LH8E,_ in itertools.groupby(LH8E))
dc31f01cf21d Uploaded
glogobyte
parents:
diff changeset
410
dc31f01cf21d Uploaded
glogobyte
parents:
diff changeset
411 if int(per)!=-1:
dc31f01cf21d Uploaded
glogobyte
parents:
diff changeset
412 percent=int(per)/100
dc31f01cf21d Uploaded
glogobyte
parents:
diff changeset
413
dc31f01cf21d Uploaded
glogobyte
parents:
diff changeset
414 c_col_filter=round(percent*(len(LH2E[1])-2))
dc31f01cf21d Uploaded
glogobyte
parents:
diff changeset
415 t_col_filter=round(percent*(len(LH8E[1])-2))
dc31f01cf21d Uploaded
glogobyte
parents:
diff changeset
416
dc31f01cf21d Uploaded
glogobyte
parents:
diff changeset
417 for i, _ in enumerate(LH2E):
dc31f01cf21d Uploaded
glogobyte
parents:
diff changeset
418 c_cols=0
dc31f01cf21d Uploaded
glogobyte
parents:
diff changeset
419 t_cols=0
dc31f01cf21d Uploaded
glogobyte
parents:
diff changeset
420
dc31f01cf21d Uploaded
glogobyte
parents:
diff changeset
421 c_cols=sum([1 for j in range(len(LH2E[i])-2) if int(LH2E[i][j+2])>=int(count)])
dc31f01cf21d Uploaded
glogobyte
parents:
diff changeset
422 t_cols=sum([1 for j in range(len(LH8E[i])-2) if int(LH8E[i][j+2])>=int(count)])
dc31f01cf21d Uploaded
glogobyte
parents:
diff changeset
423
dc31f01cf21d Uploaded
glogobyte
parents:
diff changeset
424 if c_cols>=c_col_filter or t_cols>=t_col_filter:
dc31f01cf21d Uploaded
glogobyte
parents:
diff changeset
425 filter_LH8E.append(LH8E[i])
dc31f01cf21d Uploaded
glogobyte
parents:
diff changeset
426 filter_LH2E.append(LH2E[i])
dc31f01cf21d Uploaded
glogobyte
parents:
diff changeset
427
dc31f01cf21d Uploaded
glogobyte
parents:
diff changeset
428 raw_LH2E.extend(LH2E)
dc31f01cf21d Uploaded
glogobyte
parents:
diff changeset
429 raw_LH8E.extend(LH8E)
dc31f01cf21d Uploaded
glogobyte
parents:
diff changeset
430
dc31f01cf21d Uploaded
glogobyte
parents:
diff changeset
431 ##################################################################################################################################################################################################################
dc31f01cf21d Uploaded
glogobyte
parents:
diff changeset
432
dc31f01cf21d Uploaded
glogobyte
parents:
diff changeset
def write_main(raw_LH2E, raw_LH8E, fil_LH2E, fil_LH8E, names_con, names_tre, flag, per, n1, n2):
    """Write the count tables of both groups into the ``Counts/`` folder.

    For ``flag == 1`` the files are labelled "Templated", for ``flag == 2``
    "Non-Templated"; any other ``flag`` writes nothing.  The "Raw" tables are
    always written for a recognised flag; the "Filtered" tables are written
    only when ``int(per) != -1``.

    Parameters
    ----------
    raw_LH2E, fil_LH2E : iterable of rows written to the files named after
        ``n1``, with ``names_con`` as the sample columns of the header.
    raw_LH8E, fil_LH8E : iterable of rows written to the files named after
        ``n2``, with ``names_tre`` as the sample columns of the header.
    names_con, names_tre : iterable of str, the per-sample column titles.
    flag : 1 or 2, selects the "Templated" / "Non-Templated" label.
    per : value accepted by ``int()``; ``-1`` disables the filtered tables.
    n1, n2 : str, group names embedded in the file names.

    Every row must be an iterable of strings (it is joined with tabs).
    The ``Counts`` directory must already exist.
    """
    if flag == 1:
        label = 'Templated'
    elif flag == 2:
        label = 'Non-Templated'
    else:
        # Unknown flag: the function has never written anything in this case.
        return

    # Same order as before: filtered tables first (n2 then n1), then raw ones.
    if int(per) != -1:
        _write_counts_table('Counts/Filtered ' + n2 + ' ' + label + ' Counts', names_tre, fil_LH8E)
        _write_counts_table('Counts/Filtered ' + n1 + ' ' + label + ' Counts', names_con, fil_LH2E)

    _write_counts_table('Counts/Raw ' + n2 + ' ' + label + ' Counts', names_tre, raw_LH8E)
    _write_counts_table('Counts/Raw ' + n1 + ' ' + label + ' Counts', names_con, raw_LH2E)


def _write_counts_table(path, sample_names, rows):
    """Write one tab-separated table: a header line, then one line per row.

    No trailing newline is emitted after the last row.  The file is closed
    even if a write fails part-way through.
    """
    with open(path, 'w') as fp:
        fp.write("Name\t")
        fp.write("Sequence")
        for sample in sample_names:
            fp.write("\t" + sample)
        for row in rows:
            fp.write("\n%s" % "\t".join(row))
dc31f01cf21d Uploaded
glogobyte
parents:
diff changeset
522
dc31f01cf21d Uploaded
glogobyte
parents:
diff changeset
523 ####################################################################################################################################################################################################################
dc31f01cf21d Uploaded
glogobyte
parents:
diff changeset
524
dc31f01cf21d Uploaded
glogobyte
parents:
diff changeset
def ssamples(names,samp,folder,pro):
    """Write one two-column text file per sample column of ``samp``.

    Columns 0 and 1 of every row are treated as non-sample columns; column
    ``i`` (``i >= 2``) belongs to the sample ``names[i-2]``.  For each such
    column a file ``folder + names[i-2] + '.txt'`` is written with the header
    ``miRNA id<TAB>sample name`` followed by ``row[0]<TAB>row[i]`` lines.

    Parameters
    ----------
    names : sequence of str, sample names (one per column from index 2 on).
    samp : sequence of rows of str; the first row fixes the column count.
    folder : str, path prefix that is concatenated as-is with the file name.
    pro : unused; kept so existing callers keep working.

    An empty ``samp`` writes nothing (it used to raise IndexError).
    """
    if len(samp) == 0:
        return

    for col in range(2, len(samp[0])):
        sample_name = names[col - 2]
        # 'with' guarantees the handle is closed even if a row is malformed.
        with open(folder + sample_name + '.txt', 'w') as fp:
            fp.write("miRNA id" + "\t" + sample_name + "\n")
            for row in samp:
                fp.write("\t".join([row[0], row[col]]) + "\n")
dc31f01cf21d Uploaded
glogobyte
parents:
diff changeset
535
dc31f01cf21d Uploaded
glogobyte
parents:
diff changeset
536 ####################################################################################################################################################################################################################
dc31f01cf21d Uploaded
glogobyte
parents:
diff changeset
537
dc31f01cf21d Uploaded
glogobyte
parents:
diff changeset
def DB_write(con,name,unique_seq,sorted_uni_arms,f):
    """Write a grouped, human-readable report of sequences to a split folder.

    The output file is ``split1/`` (f=1, con="c"), ``split2/`` (f=1, con="t"),
    ``split3/`` (f=2, con="c") or ``split4/`` (f=2, con="t") plus ``name``.
    Any other value of ``f`` does nothing.

    For every entry of ``sorted_uni_arms`` a boxed title is written, followed
    by the matching rows of ``unique_seq`` ordered by read count (descending):

    * ``f == 1``: a row matches when the entry name is contained in the first
      two "_"-separated fields of the row name; the title is written even
      when no row matches.
    * ``f == 2``: a row matches when the entry name equals the part of the
      row name before the first "__"; entries without rows are skipped.

    Parameters
    ----------
    con : "c" or "t", selects the folder.
    name : str, file name inside the folder.
    unique_seq : sequence of rows; ``row[0]`` is a string whose second
        "-"-separated field is the integer read count, ``row[2]`` the name
        and ``row[9]`` the sequence.
    sorted_uni_arms : sequence of entries ``(name, sequence count, total reads)``.
    f : 1 or 2, selects folder pair and matching rule.

    Raises
    ------
    ValueError
        If ``f`` is 1 or 2 and ``con`` is neither "c" nor "t" (this used to
        surface as an UnboundLocalError on the first write).
    """
    if f == 1:
        folders = {"c": 'split1/', "t": 'split2/'}
    elif f == 2:
        folders = {"c": 'split3/', "t": 'split4/'}
    else:
        return

    folder = folders.get(con)
    if folder is None:
        raise ValueError("con must be 'c' or 't', got %r" % (con,))

    rule = "*********************************************************************************************************"

    with open(folder + name, 'w') as fp:
        fp.write("%s\t%-42s\t%s\n\n" % ("Number of Reads","Name of isomir","Sequence"))

        for arm in sorted_uni_arms:
            members = [row for row in unique_seq if _db_write_matches(arm[0], row[2], f)]
            if f == 2 and not members:
                # Only the f == 2 report omits groups without any sequence.
                continue

            # Stable sort: rows with equal read counts keep their input order.
            members.sort(key=lambda row: int(row[0].split('-')[1]), reverse=True)

            fp.write(rule + "\n")
            fp.write("%-8s\t%-22s\t%-25s\t%-30s\t%s\n" % ("|",str(arm[0]),"Sequence count = "+str(arm[1]),"Total reads = "+str(arm[2]),"|"))
            fp.write(rule + "\n\n")
            for row in members:
                fp.write("%-8s\t%-40s\t%s\n" % (row[0].split("-")[1], row[2], row[9]))
            fp.write("\n" + "\n")


def _db_write_matches(arm_name, seq_name, f):
    """Return True when the row named ``seq_name`` belongs to ``arm_name`` under rule ``f``."""
    if f == 1:
        parts = seq_name.split("_")
        # Substring test (not equality), exactly as the report always did.
        return arm_name in (parts[0] + "_" + parts[1])
    return arm_name == seq_name.split("__")[0]
dc31f01cf21d Uploaded
glogobyte
parents:
diff changeset
588
dc31f01cf21d Uploaded
glogobyte
parents:
diff changeset
589
dc31f01cf21d Uploaded
glogobyte
parents:
diff changeset
590 ##########################################################################################################################
dc31f01cf21d Uploaded
glogobyte
parents:
diff changeset
591
dc31f01cf21d Uploaded
glogobyte
parents:
diff changeset
def new_mat_seq(pre_unique_seq,mat_mirnas,l):
    """Collect more abundant variants of three-field names into ``mat_mirnas``.

    For every row ``x`` whose name ``x[2]`` has exactly three "_"-separated
    fields, every row ``y`` is selected when ``x[2]`` is a substring of
    ``y[2]`` and ``y`` has strictly more reads than ``x`` (read counts are the
    second "-"-separated field of ``row[0]``).  A selected row gets ">"
    prepended to its name IN PLACE and is recorded once.

    Finally, under the lock ``l``, the (renamed) name ``row[2]`` and the value
    ``row[9]`` of every recorded row are appended to ``mat_mirnas``.

    Parameters
    ----------
    pre_unique_seq : sequence of mutable rows (index 0, 2 and 9 are used).
    mat_mirnas : list-like with ``append``; receives name/value pairs.
    l : lock object exposing ``acquire()`` and ``release()``.
    """
    unique_iso = []
    for x in pre_unique_seq:
        if len(x[2].split("_")) != 3:
            continue
        x_reads = int(x[0].split("-")[1])
        for y in pre_unique_seq:
            if x[2] in y[2] and x_reads < int(y[0].split("-")[1]):
                # A row already recorded carries the ">" prefix and is found
                # inside its own recorded entry, so it is not added twice.
                if not any(y[2] in recorded for recorded in unique_iso):
                    y[2] = ">" + y[2]
                    unique_iso.append(y)

    l.acquire()
    try:
        for row in unique_iso:
            mat_mirnas.append(row[2])
            mat_mirnas.append(row[9])
    finally:
        # Release even when an append fails, so other workers are not blocked.
        l.release()
dc31f01cf21d Uploaded
glogobyte
parents:
diff changeset
607
dc31f01cf21d Uploaded
glogobyte
parents:
diff changeset
608 #########################################################################################################################
dc31f01cf21d Uploaded
glogobyte
parents:
diff changeset
609
dc31f01cf21d Uploaded
glogobyte
parents:
diff changeset
def merging_names(LH2E_copy,new):
    """Give rows that share a sequence one merged name and append them to ``new``.

    A row is ``[name, sequence, ...]``.  A sequence is flagged when, scanning
    the rows in order, its sequence or its name has been seen before.  For
    each flagged sequence the names of all rows carrying it are collected,
    keeping one name per arm (the first two "_"-separated fields of a name);
    when two names share an arm the strictly shorter one wins and moves to
    the end of the collection.  The collected names are joined with "/" as
    ``first/last/.../second`` and written into ``row[0]`` of every row with
    that sequence.

    ``LH2E_copy`` is modified in place (renamed rows, then sorted).  The
    sorted rows, with consecutive duplicates removed, are appended to ``new``.

    Parameters
    ----------
    LH2E_copy : list of mutable rows of str.
    new : list-like with ``extend``; receives the merged, de-duplicated rows.
    """
    # Set membership replaces the repeated scans of a growing list.
    seen = set()
    flagged = set()
    for row in LH2E_copy:
        if row[1] not in seen and row[0] not in seen:
            seen.add(row[1])
            seen.add(row[0])
        else:
            flagged.add(row[1])

    # sequence -> names collected so far (at most one name per arm).
    names_by_seq = {seq: [] for seq in flagged}
    for row in LH2E_copy:
        names = names_by_seq.get(row[1])
        if names is None:
            continue
        name = row[0]
        if not names:
            names.append(name)
            continue
        arm = _merge_arm_prefix(name)
        for idx, kept in enumerate(names):
            if _merge_arm_prefix(kept) == arm:
                if len(name) < len(kept):
                    del names[idx]
                    names.append(name)
                # Arms are unique within ``names``, so no later entry can match.
                break
        else:
            names.append(name)

    # First name, then the remaining ones from last to second.
    merged = {seq: "/".join(names[:1] + names[:0:-1])
              for seq, names in names_by_seq.items()}

    for row in LH2E_copy:
        if row[1] in merged:
            row[0] = merged[row[1]]

    LH2E_copy.sort()
    new.extend([row for row, _ in itertools.groupby(LH2E_copy)])


def _merge_arm_prefix(name):
    """Return the first two "_"-separated fields of ``name`` joined by "_"."""
    parts = name.split("_")
    return parts[0] + "_" + parts[1]
dc31f01cf21d Uploaded
glogobyte
parents:
diff changeset
661
dc31f01cf21d Uploaded
glogobyte
parents:
diff changeset
662 ######################################################################################################################################################
dc31f01cf21d Uploaded
glogobyte
parents:
diff changeset
663
dc31f01cf21d Uploaded
glogobyte
parents:
diff changeset
def ssamples1(tem_names,tem_samp,non_names,non_samp,folder,pro):
    """Write one two-column text file per sample, combining two count tables.

    For every sample column ``i >= 2`` of ``tem_samp`` a file
    ``folder + tem_names[i-2] + '.txt'`` is written with the header
    ``miRNA id<TAB>sample name`` and a ``row[0]<TAB>row[i]`` line per row of
    ``tem_samp``.  Then, for every position ``j`` where ``non_names[j]``
    equals the sample name, the lines ``row[0]<TAB>row[j+2]`` of ``non_samp``
    are added to the same file.

    Parameters
    ----------
    tem_names : sequence of str, sample names of ``tem_samp`` columns 2...
    tem_samp : sequence of rows of str; the first row fixes the column count.
    non_names : sequence of str, sample names of ``non_samp`` columns 2...
    non_samp : sequence of rows of str.
    folder : str, path prefix that is concatenated as-is with the file name.
    pro : unused; kept so existing callers keep working.

    An empty ``tem_samp`` writes nothing (it used to raise IndexError).
    """
    if len(tem_samp) == 0:
        return

    for col in range(2, len(tem_samp[0])):
        sample_name = tem_names[col - 2]
        # 'with' guarantees the handle is closed even if a row is malformed.
        with open(folder + sample_name + '.txt', 'w') as fp:
            fp.write("miRNA id" + "\t" + sample_name + "\n")

            for row in tem_samp:
                fp.write("\t".join([row[0], row[col]]) + "\n")

            for j, non_name in enumerate(non_names):
                if non_name == sample_name:
                    for row in non_samp:
                        fp.write("\t".join([row[0], row[j + 2]]) + "\n")
dc31f01cf21d Uploaded
glogobyte
parents:
diff changeset
679
dc31f01cf21d Uploaded
glogobyte
parents:
diff changeset
680 #################################################################################################################################################################################################################
dc31f01cf21d Uploaded
glogobyte
parents:
diff changeset
681
dc31f01cf21d Uploaded
glogobyte
parents:
diff changeset
def download_matures(matures,org_name):
    """Download the mature and star FASTA listings of one organism.

    Two URLs are fetched, ``http://mirgenedb.org/fasta/<org_name>?mat=1`` and
    ``...?star=1``.  Each response is decoded as UTF-8, split into lines and
    stripped of ">" characters; the lines of the second listing follow those
    of the first.  In the combined list every element at an odd index has "U"
    replaced by "T"  (assumes lines alternate header / sequence - confirm
    against the server output).  The result is appended to ``matures``.

    Parameters
    ----------
    matures : list-like with ``extend``; receives the downloaded lines.
    org_name : str, organism identifier placed in the URL.

    Raises
    ------
    urllib.error.URLError
        Propagated from ``urllib.request.urlopen`` when a download fails.
    """
    base_url = 'http://mirgenedb.org/fasta/' + org_name

    mature_mir = _fetch_fasta_lines(base_url + '?mat=1')
    mature_mir.extend(_fetch_fasta_lines(base_url + '?star=1'))

    for i in range(1, len(mature_mir), 2):
        mature_mir[i] = mature_mir[i].replace("U", "T")

    matures.extend(mature_mir)


def _fetch_fasta_lines(url):
    """Return the lines of the text at ``url`` with every ">" removed.

    The response is closed as soon as it has been read.  Only an empty final
    element (left by a trailing newline) is dropped, so a body that does not
    end with a newline no longer loses its last line.
    """
    with urllib.request.urlopen(url) as response:
        text = response.read().decode('utf-8')

    lines = [line.replace(">", "") for line in text.split("\n")]
    if lines and lines[-1] == "":
        del lines[-1]
    return lines
dc31f01cf21d Uploaded
glogobyte
parents:
diff changeset
707
dc31f01cf21d Uploaded
glogobyte
parents:
diff changeset
708 ###################################################################################################################
dc31f01cf21d Uploaded
glogobyte
parents:
diff changeset
709
dc31f01cf21d Uploaded
glogobyte
parents:
diff changeset
def non_template_ref(sc,st,all_isoforms):
    """Append unseen multi-field names and their sequences to ``all_isoforms``.

    ``sc`` and ``st`` are iterables of samples; a sample is an iterable of
    rows.  Rows are visited in order (all of ``sc`` first, then ``st``).  A
    row contributes when its name ``row[2]`` has more than two "_"-separated
    fields and is not yet contained in ``all_isoforms``; then ``row[2]`` and
    ``row[9]`` are appended, in that order.

    Parameters
    ----------
    sc, st : iterables of samples (each a sequence of rows).
    all_isoforms : list-like with ``append``; extended in place.  Entries
        already present when the function is called count as seen.
    """
    # Mirror of the list contents for O(1) membership tests.
    seen = set(all_isoforms)

    # Both inputs are materialised up front, as before.
    for samples in (list(sc), list(st)):
        for sample in samples:
            for row in sample:
                name = row[2]
                if name not in seen and len(name.split("_")) > 2:
                    all_isoforms.append(name)
                    seen.add(name)
                    sequence = row[9]
                    all_isoforms.append(sequence)
                    seen.add(sequence)
dc31f01cf21d Uploaded
glogobyte
parents:
diff changeset
726
dc31f01cf21d Uploaded
glogobyte
parents:
diff changeset
727 ################################################################################################################################################################################################
dc31f01cf21d Uploaded
glogobyte
parents:
diff changeset
728
dc31f01cf21d Uploaded
glogobyte
parents:
diff changeset
def deseqe2(sample,mir_names,l,new_d,sample_name,sample_order):
    """Pad one sample with zero rows for missing names and publish it.

    For every entry of ``mir_names`` whose first field does not match the
    first field of any row in ``sample``, a row ``[name, "0", entry[1]]`` is
    appended.  The sample is then sorted by its first field, adjacent
    identical rows are collapsed, and the result is appended to ``new_d``
    together with ``sample_name`` in ``sample_order`` while holding ``l``.

    Parameters:
        sample: list of rows (indexable, first field is the key).  It is
            extended and sorted in place.  Keys must be hashable; they are
            expected to be strings - TODO confirm.
        mir_names: iterable of entries with at least two fields
            (key, second value copied into the padding row).
        l: lock-like object supporting the context-manager protocol
            (e.g. ``multiprocessing.Lock``).
        new_d: shared list receiving the de-duplicated sample.
        sample_name: label appended to ``sample_order``.
        sample_order: shared list recording the order in which samples were
            appended to ``new_d``.

    Returns:
        None.  Results are delivered through ``new_d`` and ``sample_order``.
    """
    # Keys already present in the sample; kept in sync with the appends below
    # so that a repeated entry in mir_names is only padded once.
    known = {row[0] for row in sample}
    for entry in mir_names:
        key = entry[0]
        if key not in known:
            sample.append([key, "0", entry[1]])
            known.add(key)

    sample.sort(key=lambda row: row[0])
    # groupby without a key collapses runs of *adjacent* equal rows only.
    unique_rows = [row for row, _ in itertools.groupby(sample)]

    # `with` guarantees the lock is released even if an append fails, so the
    # two shared lists stay aligned and other workers are never blocked.
    with l:
        new_d.append(unique_rows)
        sample_order.append(sample_name)
dc31f01cf21d Uploaded
glogobyte
parents:
diff changeset
747
dc31f01cf21d Uploaded
glogobyte
parents:
diff changeset
748 ###############################################################################################################################################################################################
dc31f01cf21d Uploaded
glogobyte
parents:
diff changeset
749